Merge branch '3.0'
[GitHub/WoltLab/WCF.git] / wcfsetup / install / files / lib / system / html / input / node / HtmlInputNodeProcessor.class.php
1 <?php
2 namespace wcf\system\html\input\node;
3 use wcf\system\bbcode\BBCodeHandler;
4 use wcf\system\event\EventHandler;
5 use wcf\system\html\node\AbstractHtmlNodeProcessor;
6 use wcf\system\html\node\IHtmlNode;
7 use wcf\util\DOMUtil;
8 use wcf\util\StringUtil;
9
10 /**
11 * Processes HTML nodes and handles bbcodes.
12 *
13 * @author Alexander Ebert
14 * @copyright 2001-2018 WoltLab GmbH
15 * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
16 * @package WoltLabSuite\Core\System\Html\Input\Node
17 * @since 3.0
18 */
19 class HtmlInputNodeProcessor extends AbstractHtmlNodeProcessor {
/**
 * Whitelist of CSS class names, keyed by tag name.
 *
 * `process()` drops every class name that is not listed for the tag of an
 * element and removes the `class` attribute entirely if nothing remains or
 * if the tag has no entry at all. Instead of a list, an entry may hold the
 * string `'*'`; `process()` then leaves the classes of such elements as-is.
 *
 * @var array<string, string[]|string>
 */
public static $allowedClassNames = [
    'h2' => ['text-center', 'text-justify', 'text-right'],
    'h3' => ['text-center', 'text-justify', 'text-right'],
    'h4' => ['text-center', 'text-justify', 'text-right'],
    'img' => [
        // float left/right
        'messageFloatObjectLeft', 'messageFloatObjectRight',

        // built-in
        'smiley', 'woltlabAttachment', 'woltlabSuiteMedia',
    ],
    'li' => ['text-center', 'text-justify', 'text-right'],
    'p' => ['text-center', 'text-justify', 'text-right'],
    // `process()` only keeps this class if the `html` bbcode is available
    'pre' => ['woltlabHtml'],
    'td' => ['text-center', 'text-justify', 'text-right'],
];
40
/**
 * List of HTML elements that should allow for custom CSS using
 * the `style`-attribute. `fixDom()` removes the attribute from every
 * element whose tag name is not part of this list.
 *
 * Unfortunately, HTMLPurifier offers no *sane* way to limit this
 * attribute to some elements only.
 *
 * @var string[]
 */
public static $allowedStyleElements = [
    'span',
];
53
/**
 * List of HTML elements that are treated as empty, that means
 * they don't generate any (indirect) output at all.
 *
 * `appearsToBeEmpty()` considers a message without text content to be
 * non-empty as soon as it contains an element that is not listed here.
 *
 * @var string[]
 */
public static $emptyTags = [
    // typical wrappers
    'div', 'p', 'span',

    // headlines
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',

    // tables
    'table', 'tbody', 'thead', 'tr', 'th', 'td', 'colgroup', 'col',

    // lists
    'ul', 'ol', 'li',

    // other
    'a', 'kbd', 'woltlab-quote', 'woltlab-spoiler', 'pre', 'sub', 'sup',
];
76
/**
 * List of tag names that are treated as inline content. `fixDom()` moves
 * elements with these tag names into a paragraph when they appear as
 * direct children of the `<body>`.
 *
 * NOTE(review): described as the HTML 5 inline elements, but the list also
 * contains elements that are obsolete in HTML 5 (`acronym`, `big`, `tt`).
 *
 * @var string[]
 */
public static $inlineElements = [
    'a', 'abbr', 'acronym', 'audio', 'b', 'bdi', 'bdo', 'big', 'br', 'button',
    'canvas', 'cite', 'code', 'data', 'datalist', 'del', 'dfn', 'em', 'embed',
    'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'map', 'mark', 'meter',
    'noscript', 'object', 'output', 'picture', 'progress', 'q', 'ruby', 's',
    'samp', 'script', 'select', 'slot', 'small', 'span', 'strong', 'sub', 'sup',
    'svg', 'template', 'textarea', 'time', 'u', 'tt', 'var', 'video', 'wbr',
];
89
/**
 * List of embedded content grouped by type. It is rebuilt from the
 * `<woltlab-metacode>` elements of the document by `parseEmbeddedContent()`
 * (one list of attribute sets per metacode name) and can be extended
 * through `addEmbeddedContent()`.
 *
 * @var array
 */
protected $embeddedContent = [];

/**
 * Name of the interface for input nodes; the property is declared by the
 * parent class, its use is not visible in this file.
 *
 * @inheritDoc
 */
protected $nodeInterface = IHtmlInputNode::class;
100
/**
 * Runs the input pipeline on the current document: repairs the DOM, converts
 * metacode markers and metacodes, enforces the quote depth, invokes the image
 * and the dynamic node handlers, trims the message, parses plain text, filters
 * CSS class names and finally collects the embedded content.
 *
 * Fires the events `beforeProcess`, `beforeEmbeddedProcess` and `afterProcess`.
 *
 * @inheritDoc
 */
public function process() {
    EventHandler::getInstance()->fireAction($this, 'beforeProcess');

    // repair invalid html first, e.g. metacode markers outside of block elements
    $this->fixDom();

    // markers have to be processed before the static metacode converter runs
    $this->invokeHtmlNode(new HtmlInputNodeWoltlabMetacodeMarker());
    $this->invokeHtmlNode(new HtmlInputNodeWoltlabMetacode());

    if (MESSAGE_MAX_QUOTE_DEPTH) {
        $this->enforceQuoteDepth(MESSAGE_MAX_QUOTE_DEPTH);
    }

    // the image handler also reports how many smilies it came across
    $imageHandler = new HtmlInputNodeImg();
    $this->invokeHtmlNode($imageHandler);
    $smileyCount = $imageHandler->getSmileyCount();

    // dynamic node handlers; `img` and `woltlab-metacode` were handled above
    // and are presumably excluded by the second argument
    $this->invokeNodeHandlers('wcf\system\html\input\node\HtmlInputNode', ['img', 'woltlab-metacode']);

    // remove whitespace at the start/end of the message
    $this->trim();

    // detect mentions, urls, emails and smileys
    $parser = new HtmlInputNodeTextParser($this, $smileyCount);
    $parser->parse();

    // the class `woltlabHtml` on <pre> is bound to the HTML bbcode
    $allowHtml = BBCodeHandler::getInstance()->isAvailableBBCode('html');

    // strip invalid class names
    /** @var \DOMElement $element */
    foreach ($this->getXPath()->query('//*[@class]') as $element) {
        $tagName = $element->nodeName;
        $whitelist = isset(self::$allowedClassNames[$tagName]) ? self::$allowedClassNames[$tagName] : null;

        // wildcard: every class name is acceptable for this tag
        if ($whitelist === '*') {
            continue;
        }

        if ($whitelist !== null) {
            $accepted = [];
            foreach (explode(' ', $element->getAttribute('class')) as $className) {
                if (!$className) {
                    continue;
                }

                if (!$allowHtml && $tagName === 'pre' && $className === 'woltlabHtml') {
                    continue;
                }

                if (in_array($className, $whitelist)) {
                    $accepted[] = $className;
                }
            }

            if ($accepted) {
                $element->setAttribute('class', implode(' ', $accepted));
                continue;
            }
        }

        // no acceptable class name left (or tag not whitelisted at all)
        $element->removeAttribute('class');

        // a <span> without any attribute serves no purpose, discard it
        // (second argument presumably keeps its children - confirm in DOMUtil)
        if ($tagName === 'span' && $element->attributes->length === 0) {
            DOMUtil::removeNode($element, true);
        }
    }

    EventHandler::getInstance()->fireAction($this, 'beforeEmbeddedProcess');

    // extract embedded content
    $this->processEmbeddedContent();

    EventHandler::getInstance()->fireAction($this, 'afterProcess');
}
175
/**
 * Enforces the maximum depth of nested quotes by removing every
 * `<woltlab-quote>` that has `$depth` or more quote ancestors. A depth
 * of `0` removes all quotes.
 *
 * @param       integer         $depth          maximum number of nested quote levels
 */
public function enforceQuoteDepth($depth) {
    // copy the matches into a plain array, nodes are removed while iterating
    $candidates = [];
    /** @var \DOMElement $quote */
    foreach ($this->getDocument()->getElementsByTagName('woltlab-quote') as $quote) {
        $candidates[] = $quote;
    }

    foreach ($candidates as $candidate) {
        // no parent means the node is no longer attached anywhere
        if (!$candidate->parentNode) {
            continue;
        }

        if ($depth !== 0) {
            // count the enclosing quotes by walking up to the root
            $level = 0;
            for ($ancestor = $candidate->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
                if ($ancestor->nodeName === 'woltlab-quote') {
                    $level++;
                }
            }

            if ($level < $depth) {
                continue;
            }
        }

        DOMUtil::removeNode($candidate);
    }
}
213
/**
 * Fixes malformed HTML with metacode markers and text being placed
 * outside of paragraphs.
 *
 * The method performs three steps:
 *  1. every `<div>` is either unwrapped (if its parent is a paragraph) or
 *     replaced with a `<p>`,
 *  2. metacode markers, inline elements and text nodes that are direct
 *     children of the `<body>` are moved into a paragraph,
 *  3. the `style` attribute is removed from all elements that are not
 *     listed in `$allowedStyleElements`.
 */
protected function fixDom() {
    // remove or convert any <div> found; the loop relies on the node list
    // shrinking with every processed element
    $elements = $this->getDocument()->getElementsByTagName('div');
    while ($elements->length) {
        $element = $elements->item(0);

        // Compare case-insensitively: all other checks in this class (and the
        // elements created below) use lowercase names, the former comparison
        // against an uppercase `P` could never match such a parent.
        if (strtolower($element->parentNode->nodeName) === 'p') {
            // second argument presumably preserves the children - confirm in DOMUtil
            DOMUtil::removeNode($element, true);
        }
        else {
            DOMUtil::replaceElement($element, $element->ownerDocument->createElement('p'), true);
        }
    }

    // Moves `$node` to the end of the paragraph directly preceding it, a new
    // paragraph is inserted in front of the node if there is none. Returns
    // the paragraph that now contains the node.
    $appendToPreviousParagraph = function ($node) {
        /** @var \DOMElement $paragraph */
        $paragraph = $node->previousSibling;

        if (!$paragraph || $paragraph->nodeName !== 'p') {
            $paragraph = $node->ownerDocument->createElement('p');
            $node->parentNode->insertBefore($paragraph, $node);
        }

        $paragraph->appendChild($node);

        return $paragraph;
    };

    // only the direct children of the <body> are inspected
    /** @var \DOMNode $node */
    $node = $this->getDocument()->getElementsByTagName('body')->item(0)->firstChild;
    while ($node) {
        if ($node->nodeType === XML_ELEMENT_NODE && $node->nodeName === 'woltlab-metacode-marker') {
            // continue with the paragraph, its next sibling is the node that
            // originally followed the moved node
            $node = $appendToPreviousParagraph($node);
        }
        else if ($node->nodeType === XML_ELEMENT_NODE && in_array($node->nodeName, self::$inlineElements, true)) {
            $node = $appendToPreviousParagraph($node);
        }
        else if ($node->nodeType === XML_TEXT_NODE) {
            // text node contains only a line break
            if ($node->textContent === "\n" || $node->textContent === "\r\n") {
                // check if the previous node is a <p>, otherwise ignore this node entirely
                if ($node->previousSibling === null || $node->previousSibling->nodeName !== 'p') {
                    $node = $node->nextSibling;
                    continue;
                }
            }

            $node = $appendToPreviousParagraph($node);
        }

        $node = $node->nextSibling;
    }

    // remove style attributes from non-whitelisted elements
    /** @var \DOMElement $element */
    foreach ($this->getDocument()->getElementsByTagName('*') as $element) {
        if ($element->hasAttribute('style') && !in_array($element->nodeName, self::$allowedStyleElements, true)) {
            $element->removeAttribute('style');
        }
    }
}
280
/**
 * Trims leading and trailing whitespace. It will only remove text nodes containing
 * just whitespaces and <p><br></p> (including any whitespace-only text nodes).
 *
 * Afterwards it strips paragraphs without any child node, trims the whitespace at
 * the start and the end of every paragraph and removes leading and trailing
 * `<p><br></p>` from quotes.
 *
 * It is still possible to work around this by inserting useless text formats such
 * as bold to circumvent this check. The point of this method is to remove unintentional
 * and/or potentially unwanted whitespace, not guarding against people being jerks.
 */
protected function trim() {
    $body = $this->getDocument()->getElementsByTagName('body')->item(0);

    // true if the paragraph holds nothing but <br> and whitespace-only text nodes
    $isBlankParagraph = function ($paragraph) {
        foreach ($paragraph->childNodes as $child) {
            if ($child->nodeType === XML_TEXT_NODE) {
                if (StringUtil::trim($child->textContent) !== '') {
                    return false;
                }
            }
            else if ($child->nodeName !== 'br') {
                return false;
            }
        }

        return true;
    };

    // peel blank nodes off the start, then off the end of the body
    foreach (['firstChild', 'lastChild'] as $edge) {
        while ($node = $body->$edge) {
            if ($node->nodeType === XML_TEXT_NODE) {
                $isRemovable = (StringUtil::trim($node->textContent) === '');
            }
            else {
                $isRemovable = ($node->nodeName === 'p' && $isBlankParagraph($node));
            }

            // the first node with actual content ends the trimming of this edge
            if (!$isRemovable) {
                break;
            }

            $body->removeChild($node);
        }
    }

    // strip empty <p></p> (zero content, not even whitespaces)
    foreach (DOMUtil::getElements($this->getDocument(), 'p') as $paragraph) {
        if ($paragraph->childNodes->length === 0) {
            DOMUtil::removeNode($paragraph);
        }
    }

    // trim <p>...</p>: strip whitespace from a leading and a trailing text node
    $edgePatterns = [
        'firstChild' => '/^[\p{Zs}\s]+/u',
        'lastChild' => '/[\p{Zs}\s]+$/u',
    ];

    /** @var \DOMElement $paragraph */
    foreach ($this->getDocument()->getElementsByTagName('p') as $paragraph) {
        DOMUtil::normalize($paragraph);

        foreach ($edgePatterns as $edge => $pattern) {
            $textNode = $paragraph->$edge;
            if (!$textNode || $textNode->nodeType !== XML_TEXT_NODE) {
                continue;
            }

            // swap the text node for a new one carrying the trimmed text
            $trimmedNode = $paragraph->ownerDocument->createTextNode(preg_replace($pattern, '', $textNode->textContent));
            $paragraph->insertBefore($trimmedNode, $textNode);
            $paragraph->removeChild($textNode);
        }
    }

    // true for a <p> whose one and only child is a <br>
    $isLineBreakParagraph = function ($node) {
        if ($node->nodeName !== 'p' || $node->childNodes->length !== 1) {
            return false;
        }

        $child = $node->childNodes->item(0);

        return ($child->nodeType === XML_ELEMENT_NODE && $child->nodeName === 'br');
    };

    // trim quotes
    /** @var \DOMElement $quote */
    foreach ($this->getDocument()->getElementsByTagName('woltlab-quote') as $quote) {
        // leading <p><br></p>; text nodes are skipped without inspecting them
        $obsolete = [];
        for ($node = $quote->firstChild; $node; $node = $node->nextSibling) {
            if ($node->nodeType === XML_TEXT_NODE) {
                continue;
            }

            if (!$isLineBreakParagraph($node)) {
                break;
            }

            $obsolete[] = $node;
        }

        foreach ($obsolete as $lineBreakParagraph) {
            $quote->removeChild($lineBreakParagraph);
        }

        // trailing <p><br></p>, evaluated after the leading ones are gone
        $obsolete = [];
        for ($node = $quote->lastChild; $node; $node = $node->previousSibling) {
            if ($node->nodeType === XML_TEXT_NODE) {
                continue;
            }

            if (!$isLineBreakParagraph($node)) {
                break;
            }

            $obsolete[] = $node;
        }

        foreach ($obsolete as $lineBreakParagraph) {
            $quote->removeChild($lineBreakParagraph);
        }
    }
}
412
/**
 * Checks the input html for disallowed bbcodes and returns any matches.
 *
 * Three sources are checked: the registered node handlers (via `isAllowed()`),
 * elements that represent a bbcode but have no dedicated handler, and inline
 * styles that represent the `color`, `font` and `size` bbcodes. Matches are
 * always reported using the name of the bbcode, e.g. `font` for an inline
 * `font-family` declaration.
 *
 * @return      string[]        list of matched disallowed bbcodes
 */
public function validate() {
    $result = [];

    $this->invokeNodeHandlers('wcf\system\html\input\node\HtmlInputNode', [], function(IHtmlNode $nodeHandler) use (&$result) {
        // a non-empty return value is treated as a list of disallowed bbcodes
        $disallowed = $nodeHandler->isAllowed($this);
        if ($disallowed) {
            $result = array_merge($result, $disallowed);
        }
    });

    // handle custom nodes that have no dedicated handler
    $customTags = [
        'spoiler' => 'woltlab-spoiler',
        'url' => 'a',
    ];

    foreach ($customTags as $bbcode => $tagName) {
        if (BBCodeHandler::getInstance()->isAvailableBBCode($bbcode)) {
            continue;
        }

        if ($this->getDocument()->getElementsByTagName($tagName)->length) {
            $result[] = $bbcode;
        }
    }

    // bbcode name => CSS property, reduced to the bbcodes that are not available
    $inlineStyles = array_filter([
        'color' => 'color',
        'font' => 'font-family',
        'size' => 'font-size',
    ], function($bbcode) {
        return !BBCodeHandler::getInstance()->isAvailableBBCode($bbcode);
    }, ARRAY_FILTER_USE_KEY);

    if (!empty($inlineStyles)) {
        /** @var \DOMElement $element */
        foreach ($this->getXPath()->query('//*[@style]') as $element) {
            foreach (explode(';', $element->getAttribute('style')) as $style) {
                // normalize the property name, declarations are commonly written
                // as `color: red; font-size: 12px` with whitespace after the `;`
                $property = strtolower(trim(explode(':', $style, 2)[0]));
                if ($property === '') {
                    continue;
                }

                // report the bbcode that belongs to the property, not the property itself
                $bbcode = array_search($property, $inlineStyles, true);
                if ($bbcode !== false && !in_array($bbcode, $result, true)) {
                    $result[] = $bbcode;
                }
            }
        }
    }

    return $result;
}
468
/**
 * Returns the trimmed raw text content of the current document.
 *
 * Fires the event `getTextContent` with a deep copy of the `<body>` in the
 * parameter `body`; the text is read from whatever that parameter holds
 * after the event.
 *
 * @return      string          raw text content
 */
public function getTextContent() {
    $originalBody = $this->getDocument()->getElementsByTagName('body')->item(0);

    // event listeners receive a copy, thus their changes to the contents
    // never leak into the actual document
    $parameters = ['body' => $originalBody->cloneNode(true)];
    EventHandler::getInstance()->fireAction($this, 'getTextContent', $parameters);

    $text = $parameters['body']->textContent;

    return StringUtil::trim($text);
}
484
/**
 * Returns true if the message appears to be empty, that is if it has no text
 * content and consists only of elements listed in `$emptyTags`.
 *
 * @return      boolean         true if message appears to be empty
 */
public function appearsToBeEmpty() {
    $isEmpty = ($this->getTextContent() === '');

    if ($isEmpty) {
        /** @var \DOMElement $body */
        $body = $this->getDocument()->getElementsByTagName('body')->item(0);
        $descendants = $body->getElementsByTagName('*');

        // the loop ends at the first element that is capable of producing output
        for ($i = 0, $length = $descendants->length; $isEmpty && $i < $length; $i++) {
            $isEmpty = in_array($descendants->item($i)->nodeName, self::$emptyTags);
        }
    }

    return $isEmpty;
}
507
/**
 * Processes embedded content: discards the previously collected embedded
 * content and collects it anew through `parseEmbeddedContent()`.
 */
public function processEmbeddedContent() {
    $this->embeddedContent = [];

    $this->parseEmbeddedContent();
}
516
/**
 * Returns the embedded content grouped by type, as collected by
 * `parseEmbeddedContent()` and `addEmbeddedContent()`.
 *
 * @return      array
 */
public function getEmbeddedContent() {
    return $this->embeddedContent;
}
525
/**
 * Adds embedded content for the provided type. The data is merged into the
 * content already known for this type (using `array_merge()`), otherwise it
 * becomes the content of the type.
 *
 * @param       string  $type   type name
 * @param       array   $data   embedded content
 */
public function addEmbeddedContent($type, array $data) {
    // first content for this type, nothing to merge with
    if (!isset($this->embeddedContent[$type])) {
        $this->embeddedContent[$type] = $data;

        return;
    }

    $this->embeddedContent[$type] = array_merge($this->embeddedContent[$type], $data);
}
540
/**
 * Parses embedded content contained in metacode elements.
 *
 * The parsed `data-attributes` of every `<woltlab-metacode>` are grouped by the
 * value of `data-name` and replace the current embedded content, afterwards
 * the event `parseEmbeddedContent` is fired.
 */
protected function parseEmbeddedContent() {
    $groupedAttributes = [];

    // handle `woltlab-metacode`
    /** @var \DOMElement $metacode */
    foreach ($this->getDocument()->getElementsByTagName('woltlab-metacode') as $metacode) {
        $name = $metacode->getAttribute('data-name');
        $attributes = $this->parseAttributes($metacode->getAttribute('data-attributes'));

        // appending implicitly creates the list for a name seen for the first time
        $groupedAttributes[$name][] = $attributes;
    }

    $this->embeddedContent = $groupedAttributes;

    EventHandler::getInstance()->fireAction($this, 'parseEmbeddedContent');
}
562
/**
 * Creates a new `<woltlab-metacode>` element contained in the same document
 * as the provided `$node`. The element is only created, it is not inserted
 * into the document.
 *
 * @param       \DOMNode        $node           reference node used to extract the owner document
 * @param       string          $name           metacode name, stored in `data-name`
 * @param       mixed[]         $attributes     list of attributes, stored in `data-attributes`
 * @return      \DOMElement     new metacode element
 */
public function createMetacodeElement(\DOMNode $node, $name, array $attributes) {
    // attributes are serialized as JSON and base64-encoded for the attribute value
    $encodedAttributes = base64_encode(json_encode($attributes));

    $metacode = $node->ownerDocument->createElement('woltlab-metacode');
    $metacode->setAttribute('data-name', $name);
    $metacode->setAttribute('data-attributes', $encodedAttributes);

    return $metacode;
}
579 }