2 namespace wcf\system\html\input\node
;
3 use wcf\system\bbcode\BBCodeHandler
;
4 use wcf\system\event\EventHandler
;
5 use wcf\system\html\node\AbstractHtmlNodeProcessor
;
6 use wcf\system\html\node\IHtmlNode
;
8 use wcf\util\StringUtil
;
11 * Processes HTML nodes and handles bbcodes.
13 * @author Alexander Ebert
14 * @copyright 2001-2018 WoltLab GmbH
15 * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
16 * @package WoltLabSuite\Core\System\Html\Input\Node
19 class HtmlInputNodeProcessor
extends AbstractHtmlNodeProcessor
{
21 * list of allowed CSS class names per tag name
// Configuration tables and state of the processor. Documentation for all
// members is collected here at the top of the block.
//
// $allowedClassNames: whitelist of CSS class names keyed by tag name. process()
//   keeps only the listed classes per element. An entry that equals the
//   asterisk string instead of a list is tested for separately in process().
// $allowedStyleElements: fixDom() removes the style attribute from every
//   element whose tag name is not in this list.
// $emptyTags: appearsToBeEmpty() tests the tag name of every element below
//   body against this list.
// $inlineElements: fixDom() hands elements with one of these tag names to its
//   paragraph-wrapping closure while walking the children of body.
// $embeddedContent: reset by processEmbeddedContent(), overwritten by
//   parseEmbeddedContent() and extended by addEmbeddedContent().
// $nodeInterface: holds the class name of IHtmlInputNode. It is not read in
//   the visible part of this file - presumably consumed by the parent class,
//   TODO confirm.
//
// NOTE(review): this listing skips source lines, so the array literals below
// are incomplete as shown (missing keys, entries and closing brackets).
// Consult the full file before editing any of the lists.
24 public static $allowedClassNames = [
25 'h2' => ['text-center', 'text-justify', 'text-right'],
26 'h3' => ['text-center', 'text-justify', 'text-right'],
27 'h4' => ['text-center', 'text-justify', 'text-right'],
30 'messageFloatObjectLeft', 'messageFloatObjectRight',
33 'smiley', 'woltlabAttachment', 'woltlabSuiteMedia',
35 'li' => ['text-center', 'text-justify', 'text-right'],
36 'p' => ['text-center', 'text-justify', 'text-right'],
37 'pre' => ['woltlabHtml'],
38 'td' => ['text-center', 'text-justify', 'text-right'],
42 * List of HTML elements that should allow for custom CSS using
43 * the `style`-attribute.
45 * Unfortunately, HTMLPurifier offers no *sane* way to limit this
46 * attribute to some elements only.
50 public static $allowedStyleElements = [
55 * list of HTML elements that are treated as empty, that means
56 * they don't generate any (indirect) output at all
60 public static $emptyTags = [
65 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
68 'table', 'tbody', 'thead', 'tr', 'th', 'td', 'colgroup', 'col',
74 'a', 'kbd', 'woltlab-quote', 'woltlab-spoiler', 'pre', 'sub', 'sup',
78 * list of tag names that represent inline content in the HTML 5 standard
81 public static $inlineElements = [
82 'a', 'abbr', 'acronym', 'audio', 'b', 'bdi', 'bdo', 'big', 'br', 'button',
83 'canvas', 'cite', 'code', 'data', 'datalist', 'del', 'dfn', 'em', 'embed',
84 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'map', 'mark', 'meter',
85 'noscript', 'object', 'output', 'picture', 'progress', 'q', 'ruby', 's',
86 'samp', 'script', 'select', 'slot', 'small', 'span', 'strong', 'sub', 'sup',
87 'svg', 'template', 'textarea', 'time', 'u', 'tt', 'var', 'video', 'wbr',
91 * list of embedded content grouped by type
94 protected $embeddedContent = [];
99 protected $nodeInterface = IHtmlInputNode
::class;
// Runs the complete input pipeline on the current document. Visible steps, in
// order: fire the beforeProcess event, invoke the metacode marker and metacode
// handlers, optionally enforce the quote depth, handle images, invoke the
// dynamic node handlers, run the text parser, strip class names that are not
// whitelisted, fire beforeEmbeddedProcess, extract embedded content and fire
// afterProcess.
// NOTE(review): this listing skips source lines (blank lines, closing braces
// and some short statements). The statements announced by the comments about
// fixing invalid html and about removing leading/trailing whitespace are not
// shown - presumably calls to fixDom() and trim(); confirm against the full
// file.
104 public function process() {
105 EventHandler
::getInstance()->fireAction($this, 'beforeProcess');
107 // fix invalid html such as metacode markers outside of block elements
110 // process metacode markers first
111 $this->invokeHtmlNode(new HtmlInputNodeWoltlabMetacodeMarker());
113 // handle static converters
114 $this->invokeHtmlNode(new HtmlInputNodeWoltlabMetacode());
// a falsy MESSAGE_MAX_QUOTE_DEPTH (for example 0) skips the depth check entirely
116 if (MESSAGE_MAX_QUOTE_DEPTH
) {
117 $this->enforceQuoteDepth(MESSAGE_MAX_QUOTE_DEPTH
);
// the image handler is kept in a variable because its smiley count is handed
// to the text parser further down
120 $imgNodeHandler = new HtmlInputNodeImg();
121 $this->invokeHtmlNode($imgNodeHandler);
122 $smileyCount = $imgNodeHandler->getSmileyCount();
124 // dynamic node handlers
// NOTE(review): the second argument names img and woltlab-metacode, both of
// which were already handled above - presumably a skip list; confirm in
// invokeNodeHandlers()
125 $this->invokeNodeHandlers('wcf\system\html\input\node\HtmlInputNode', ['img', 'woltlab-metacode']);
127 // remove whitespace at the start/end of the message
130 // detect mentions, urls, emails and smileys
131 $textParser = new HtmlInputNodeTextParser($this, $smileyCount);
132 $textParser->parse();
134 // handle HTML bbcode
// $allowHtml is only read by the class filter below, where it gates the
// woltlabHtml class on pre elements
135 $allowHtml = BBCodeHandler
::getInstance()->isAvailableBBCode('html');
137 // strip invalid class names
138 /** @var \DOMElement $element */
139 foreach ($this->getXPath()->query('//*[@class]') as $element) {
140 $nodeName = $element->nodeName
;
141 if (isset(self
::$allowedClassNames[$nodeName])) {
// an entry equal to the asterisk string is tested before any per-class
// filtering; the branch body is not visible in this listing
142 if (self
::$allowedClassNames[$nodeName] === '*') {
146 $classNames = explode(' ', $element->getAttribute('class'));
147 $classNames = array_filter($classNames, function ($className) use ($allowHtml, $nodeName) {
// special case: woltlabHtml on pre is looked at separately when the html
// bbcode is unavailable (branch body not visible - presumably the class is
// rejected; confirm)
148 if (!$allowHtml && $nodeName === 'pre' && $className === 'woltlabHtml') {
// keep only non-empty names that are whitelisted for this tag
152 return ($className && in_array($className, self
::$allowedClassNames[$nodeName]));
155 if (!empty($classNames)) {
156 $element->setAttribute('class', implode(' ', $classNames));
// NOTE(review): presumably reached when no allowed class remains or the tag
// has no whitelist entry; the surrounding control flow is not visible here
161 $element->removeAttribute('class');
// a span left without any attribute is removed through DOMUtil::removeNode()
// with the second argument set to true (presumably: keep its children; confirm)
163 if ($nodeName === 'span' && $element->attributes
->length
=== 0) {
164 DOMUtil
::removeNode($element, true);
168 EventHandler
::getInstance()->fireAction($this, 'beforeEmbeddedProcess');
170 // extract embedded content
171 $this->processEmbeddedContent();
173 EventHandler
::getInstance()->fireAction($this, 'afterProcess');
177 * Enforces the maximum depth of nested quotes.
179 * @param integer $depth
// Visits every woltlab-quote element of the document. For each one the
// ancestors are walked upwards and tested for being a woltlab-quote as well;
// a comparison against $depth precedes the final DOMUtil::removeNode() call.
// NOTE(review): several source lines are missing from this listing, among
// them the initialisation of $quotes, $level and $parent, the condition that
// guards the first removeNode() call and the bodies of the visible branches,
// so the exact control flow cannot be read off here.
181 public function enforceQuoteDepth($depth) {
183 /** @var \DOMElement $quote */
184 foreach ($this->getDocument()->getElementsByTagName('woltlab-quote') as $quote) {
// second pass over $quotes - presumably a snapshot filled by the loop above so
// that removing nodes does not disturb the node list being iterated; the fill
// statement is not visible
188 foreach ($quotes as $quote) {
// a quote without a parent is no longer attached to the document (presumably
// because an enclosing quote was removed in an earlier iteration)
189 if (!$quote->parentNode
) {
194 DOMUtil
::removeNode($quote);
// climb towards the root and inspect every ancestor
199 while ($parent = $parent->parentNode
) {
200 if ($parent->nodeName
=== 'woltlab-quote') {
205 if ($level < $depth) {
209 DOMUtil
::removeNode($quote);
215 * Fixes malformed HTML with metacode markers and text being placed
216 * outside of paragraphs.
// Repairs the document structure in three visible stages:
//   1. div elements are removed or turned into paragraphs,
//   2. metacode markers, inline elements and text nodes found while walking
//      the children of body are moved into a paragraph,
//   3. style attributes are dropped from elements that are not listed in
//      $allowedStyleElements.
// NOTE(review): this listing skips source lines (loop header of stage 2,
// else keywords, closing braces, the return of the closure), so the control
// flow between the visible statements is partly inferred.
218 protected function fixDom() {
219 // remove or convert any <div> found
220 $elements = $this->getDocument()->getElementsByTagName('div');
// the loop always takes item 0 and relies on the list shrinking whenever a div
// is removed or replaced
221 while ($elements->length
) {
222 $element = $elements->item(0);
// NOTE(review): nodeName is compared with an upper-case P here but with a
// lower-case p everywhere else in this file. For documents loaded as HTML the
// node names are normally lower-case, so this branch may never match -
// confirm the intent.
224 if ($element->parentNode
->nodeName
=== 'P') {
225 DOMUtil
::removeNode($element, true);
228 DOMUtil
::replaceElement($element, $element->ownerDocument
->createElement('p'), true);
// helper: appends $node to the paragraph that directly precedes it; when the
// previous sibling is missing or is not a p, a new p is created and inserted
// in front of $node first. The return statement is not visible in this
// listing; callers continue the sibling walk from the returned value.
232 $appendToPreviousParagraph = function ($node) {
233 /** @var \DOMElement $paragraph */
234 $paragraph = $node->previousSibling
;
236 if (!$paragraph ||
$paragraph->nodeName
!== 'p') {
237 $paragraph = $node->ownerDocument
->createElement('p');
238 $node->parentNode
->insertBefore($paragraph, $node);
241 $paragraph->appendChild($node);
// walk over the children of body, starting with the first one
246 /** @var \DOMNode $node */
247 $node = $this->getDocument()->getElementsByTagName('body')->item(0)->firstChild
;
249 if ($node->nodeType
=== XML_ELEMENT_NODE
&& $node->nodeName
=== 'woltlab-metacode-marker') {
250 $node = $appendToPreviousParagraph($node);
252 else if ($node->nodeType
=== XML_ELEMENT_NODE
&& in_array($node->nodeName
, self
::$inlineElements)) {
253 $node = $appendToPreviousParagraph($node);
255 else if ($node->nodeType
=== XML_TEXT_NODE
) {
256 // text node contains only a line break
257 if ($node->textContent
=== "\n" ||
$node->textContent
=== "\r\n") {
258 // check if the previous node is a <p>, otherwise ignore this node entirely
259 if ($node->previousSibling
=== null ||
$node->previousSibling
->nodeName
!== 'p') {
260 $node = $node->nextSibling
;
265 $node = $appendToPreviousParagraph($node);
268 $node = $node->nextSibling
;
271 // remove style attributes from non-whitelisted elements
272 $elements = $this->getDocument()->getElementsByTagName('*');
// the length is read once up front; removing an attribute does not change the
// number of elements
273 for ($i = 0, $length = $elements->length
; $i < $length; $i++
) {
274 $element = $elements->item($i);
275 if ($element->hasAttribute('style') && !in_array($element->nodeName
, self
::$allowedStyleElements)) {
276 $element->removeAttribute('style');
282 * Trims leading and trailing whitespace. It will only remove text nodes containing
283 * just whitespaces and <p><br></p> (including any whitespace-only text nodes).
285 * It is still possible to work around this by inserting useless text formats such
286 * as bold to circumvent this check. The point of this method is to remove unintentional
287 * and/or potentially unwanted whitespace, not guarding against people being jerks.
// Removes unintentional whitespace in four visible stages:
//   1. whitespace-only text nodes and paragraphs made up of nothing but
//      whitespace text and br elements are removed from the start and from
//      the end of body,
//   2. paragraphs without any child node are removed,
//   3. leading and trailing whitespace is stripped from the first and last
//      text node of every paragraph,
//   4. inside every woltlab-quote, paragraphs holding a single br are
//      collected (front to back, then back to front) and removed.
// NOTE(review): this listing skips source lines, including the break and
// continue statements that terminate the loops, so the exit conditions are
// partly inferred from the inline comments.
289 protected function trim() {
290 $body = $this->getDocument()->getElementsByTagName('body')->item(0);
// the same stripping logic runs twice: from the front via firstChild and from
// the back via lastChild
292 foreach (['firstChild', 'lastChild'] as $property) {
293 while ($node = $body->$property) {
294 if ($node->nodeType
=== XML_TEXT_NODE
) {
295 if (StringUtil
::trim($node->textContent
) === '') {
296 $body->removeChild($node);
303 /** @var \DOMElement $node */
// a paragraph is inspected child by child: any text with visible content or
// any element other than br stops the trimming for this side
304 if ($node->nodeName
=== 'p') {
305 for ($i = 0, $length = $node->childNodes
->length
; $i < $length; $i++
) {
306 $child = $node->childNodes
->item($i);
307 if ($child->nodeType
=== XML_TEXT_NODE
) {
308 if (StringUtil
::trim($child->textContent
) !== '') {
309 // terminate for() and while()
313 else if ($child->nodeName
!== 'br') {
314 // terminate for() and while()
319 $body->removeChild($node);
328 // strip empty <p></p> (zero content, not even whitespaces)
329 $paragraphs = DOMUtil
::getElements($this->getDocument(), 'p');
330 foreach ($paragraphs as $paragraph) {
331 if ($paragraph->childNodes
->length
=== 0) {
332 DOMUtil
::removeNode($paragraph);
// strip leading and trailing whitespace, including Unicode space separators,
// inside every paragraph
337 /** @var \DOMElement $paragraph */
338 foreach ($this->getDocument()->getElementsByTagName('p') as $paragraph) {
339 DOMUtil
::normalize($paragraph);
// the text node is swapped for a freshly created one instead of being edited
// in place
341 if ($paragraph->firstChild
&& $paragraph->firstChild
->nodeType
=== XML_TEXT_NODE
) {
342 $oldNode = $paragraph->firstChild
;
343 $newNode = $paragraph->ownerDocument
->createTextNode(preg_replace('/^[\p{Zs}\s]+/u', '', $oldNode->textContent
));
344 $paragraph->insertBefore($newNode, $oldNode);
345 $paragraph->removeChild($oldNode);
349 if ($paragraph->lastChild
&& $paragraph->lastChild
->nodeType
=== XML_TEXT_NODE
) {
350 $oldNode = $paragraph->lastChild
;
351 $newNode = $paragraph->ownerDocument
->createTextNode(preg_replace('/[\p{Zs}\s]+$/u', '', $oldNode->textContent
));
352 $paragraph->insertBefore($newNode, $oldNode);
353 $paragraph->removeChild($oldNode);
// quotes: paragraphs that hold exactly one br are queued in $removeElements
// and removed after each scan; the first scan runs forwards
359 /** @var \DOMElement $quote */
360 foreach ($this->getDocument()->getElementsByTagName('woltlab-quote') as $quote) {
361 $removeElements = [];
362 for ($i = 0, $length = $quote->childNodes
->length
; $i < $length; $i++
) {
363 $node = $quote->childNodes
->item($i);
364 if ($node->nodeType
=== XML_TEXT_NODE
) {
368 if ($node->nodeName
=== 'p' && $node->childNodes
->length
=== 1) {
369 $child = $node->childNodes
->item(0);
370 if ($child->nodeType
=== XML_ELEMENT_NODE
&& $child->nodeName
=== 'br') {
371 $removeElements[] = $node;
382 foreach ($removeElements as $removeElement) {
383 $quote->removeChild($removeElement);
// second scan: same test, but running backwards from the last child
386 $removeElements = [];
387 for ($i = $quote->childNodes
->length
- 1; $i >= 0; $i--) {
388 $node = $quote->childNodes
->item($i);
389 if ($node->nodeType
=== XML_TEXT_NODE
) {
393 if ($node->nodeName
=== 'p' && $node->childNodes
->length
=== 1) {
394 $child = $node->childNodes
->item(0);
395 if ($child->nodeType
=== XML_ELEMENT_NODE
&& $child->nodeName
=== 'br') {
396 $removeElements[] = $node;
407 foreach ($removeElements as $removeElement) {
408 $quote->removeChild($removeElement);
414 * Checks the input html for disallowed bbcodes and returns any matches.
416 * @return string[] list of matched disallowed bbcodes
// Gathers disallowed entries from three visible sources: the dynamic node
// handlers (through isAllowed()), custom tags without a dedicated handler,
// and inline style properties that belong to unavailable bbcodes.
// NOTE(review): this listing skips source lines, among them the
// initialisation of $result, the head of the $customTags map, the bodies of
// several branches and the return statement.
418 public function validate() {
// every handler reports its own findings; they are merged into $result, which
// the callback captures by reference
421 $this->invokeNodeHandlers('wcf\system\html\input\node\HtmlInputNode', [], function(IHtmlNode
$nodeHandler) use (&$result) {
422 $disallowed = $nodeHandler->isAllowed($this);
424 $result = array_merge($result, $disallowed);
428 // handle custom nodes that have no dedicated handler
430 'spoiler' => 'woltlab-spoiler',
434 foreach ($customTags as $bbcode => $tagName) {
// NOTE(review): branch body not visible - presumably available bbcodes are
// skipped; confirm
435 if (BBCodeHandler
::getInstance()->isAvailableBBCode($bbcode)) {
439 if ($this->getDocument()->getElementsByTagName($tagName)->length
) {
// map of bbcode name to css property; filtering by key keeps only the entries
// whose bbcode is NOT available
444 $inlineStyles = array_filter([
446 'font' => 'font-family',
447 'size' => 'font-size',
448 ], function($bbcode) {
449 return !BBCodeHandler
::getInstance()->isAvailableBBCode($bbcode);
450 }, ARRAY_FILTER_USE_KEY
);
452 if (!empty($inlineStyles)) {
454 /** @var \DOMElement $element */
455 foreach ($this->getXPath()->query('//*[@style]') as $element) {
456 $tmp = array_filter(explode(';', $element->getAttribute('style')));
457 foreach ($tmp as $style) {
// NOTE(review): $property is not trimmed, so a declaration that starts with
// whitespace (as after a semicolon followed by a space) would not match. Also
// the css property name, not the bbcode name, is what gets appended to
// $result - confirm that both are intended.
458 $property = explode(':', $style, 2)[0];
459 if (in_array($property, $inlineStyles) && !in_array($property, $result)) {
460 $result[] = $property;
/**
 * Returns the trimmed plain-text content of the document body.
 *
 * The `getTextContent` event receives a deep copy of `<body>` through the
 * `body` parameter, so listeners can alter the copy without making permanent
 * changes to the live document.
 *
 * @return	string		raw text content
 */
public function getTextContent() {
	$document = $this->getDocument();
	$bodyCopy = $document->getElementsByTagName('body')->item(0)->cloneNode(true);
	
	// the text is read back from the parameter array, not from the local copy
	$eventParameters = ['body' => $bodyCopy];
	EventHandler::getInstance()->fireAction($this, 'getTextContent', $eventParameters);
	
	$text = $eventParameters['body']->textContent;
	
	return StringUtil::trim($text);
}
486 * Returns true if the message appears to be empty.
488 * @return boolean true if message appears to be empty
// Two-step emptiness check: the text content is compared with the empty
// string first, then the tag name of every element below body is tested
// against $emptyTags.
// NOTE(review): none of the return statements is visible in this listing -
// presumably false is returned as soon as text or an element outside of
// $emptyTags is found, and true otherwise; confirm against the full file.
490 public function appearsToBeEmpty() {
491 if ($this->getTextContent() !== '') {
495 /** @var \DOMElement $body */
496 $body = $this->getDocument()->getElementsByTagName('body')->item(0);
498 /** @var \DOMElement $element */
499 foreach ($body->getElementsByTagName('*') as $element) {
500 if (!in_array($element->nodeName
, self
::$emptyTags)) {
/**
 * Processes embedded content: discards whatever was collected so far and
 * parses the document again.
 */
public function processEmbeddedContent() {
	// start from a clean slate before collecting again
	$this->embeddedContent = [];
	
	$this->parseEmbeddedContent();
}
/**
 * Returns the embedded content grouped by type, exactly as it is currently
 * stored on this processor.
 *
 * @return	array
 */
public function getEmbeddedContent() {
	return $this->embeddedContent;
}
/**
 * Adds embedded content for the provided type. Data for a type that is
 * already known is merged with the existing entries using `array_merge()`,
 * otherwise the data is stored as-is.
 *
 * @param	string		$type		type name
 * @param	array		$data		embedded content
 */
public function addEmbeddedContent($type, array $data) {
	// first occurrence of this type: nothing to merge with
	if (!isset($this->embeddedContent[$type])) {
		$this->embeddedContent[$type] = $data;
		
		return;
	}
	
	$this->embeddedContent[$type] = array_merge($this->embeddedContent[$type], $data);
}
/**
 * Parses embedded content contained in metacode elements.
 *
 * The parsed attributes of every `<woltlab-metacode>` are grouped by the
 * value of its `data-name` attribute and replace the current embedded
 * content; afterwards the `parseEmbeddedContent` event is fired.
 */
protected function parseEmbeddedContent() {
	// handle `woltlab-metacode`
	$groupedAttributes = [];
	
	/** @var \DOMElement $metacode */
	foreach ($this->getDocument()->getElementsByTagName('woltlab-metacode') as $metacode) {
		$metacodeName = $metacode->getAttribute('data-name');
		$parsedAttributes = $this->parseAttributes($metacode->getAttribute('data-attributes'));
		
		// appending creates the per-name list on first use
		$groupedAttributes[$metacodeName][] = $parsedAttributes;
	}
	
	$this->embeddedContent = $groupedAttributes;
	
	EventHandler::getInstance()->fireAction($this, 'parseEmbeddedContent');
}
564 * Creates a new `<woltlab-metacode>` element contained in the same document
565 * as the provided `$node`.
567 * @param \DOMNode $node reference node used to extract the owner document
568 * @param string $name metacode name
569 * @param mixed[] $attributes list of attributes
570 * @return \DOMElement new metacode element
572 public function createMetacodeElement(\DOMNode
$node, $name, array $attributes) {
573 $element = $node->ownerDocument
->createElement('woltlab-metacode');
574 $element->setAttribute('data-name', $name);
575 $element->setAttribute('data-attributes', base64_encode(json_encode($attributes)));