Properly handling bbcodes with content being treated as source
[GitHub/WoltLab/WCF.git] / wcfsetup / install / files / lib / system / bbcode / HtmlBBCodeParser.class.php
1 <?php
2 namespace wcf\system\bbcode;
3 use wcf\system\exception\SystemException;
4 use wcf\util\JSON;
5 use wcf\util\StringUtil;
6
7 /**
8 * Parses bbcodes and transforms them into the custom HTML element
9 * <woltlab-metacode-marker> that can be safely passed through HTMLPurifier's validation mechanism.
10 *
11 * Although not strictly required for every bbcode, the actual output of a bbcode
12 * cannot be foreseen and may conflict with HTMLPurifier's whitelist. Examples
13 * are <iframe> or other embedded media that are allowed as the result of a bbcode,
14 * but must not be provided directly by a user.
15 *
16 * @author Alexander Ebert
17 * @copyright 2001-2016 WoltLab GmbH
18 * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
19 * @package WoltLabSuite\Core\System\Bbcode
20 * @since 3.0
21 */
22 class HtmlBBCodeParser extends BBCodeParser {
23 /**
24 * list of open tags with name and uuid
25 * @var array
26 */
27 protected $openTagIdentifiers = [];
28
29 /**
30 * regex for valid bbcode names
31 * @var string
32 */
33 protected $validBBCodePattern = '~^[a-z](?:[a-z0-9\-_]+)?$~';
34
/**
 * Runs the complete parsing pipeline on the given text and returns the
 * text with bbcodes replaced by their marker representation.
 *
 * @inheritDoc
 */
public function parse($text) {
	$this->setText($text);
	
	// unlike the original implementation, source code bbcodes are collected
	// like any other tag; their content is sorted out further below
	$this->buildTagArray(false);
	
	// unlike the original implementation, unclosed tags are of no concern here:
	// they receive a closing tag that is flagged as invalid and dropped again
	// when building the output, the remaining lonely opening tags are
	// eventually removed within the marker processor
	$this->buildXMLStructure();
	
	// flag everything located inside source code bbcodes for reversal
	$this->handleSourceBBCodes();
	
	// assemble the final string, the result is stored in `$this->parsedText`
	$this->buildParsedString();
	
	return $this->parsedText;
}
55
/**
 * Rebuilds the tag and text arrays into a properly nested structure.
 *
 * Tags that are not allowed, not valid or have no open counterpart are merged
 * into the surrounding text using their source. Tags that are still open when
 * an outer tag closes (or when the input ends) receive a generated closing tag
 * that carries the flag `invalid`, it is never rendered.
 *
 * @inheritDoc
 */
public function buildXMLStructure() {
	// names of the tags that are currently open, innermost last
	$openTagStack = [];
	$newTagArray = [];
	$newTextArray = [];
	
	$i = -1;
	foreach ($this->tagArray as $i => $tag) {
		if ($tag['closing']) {
			// closing tag; names are compared strictly, the lookup below and the
			// loop condition must agree or the loop would never find its target
			if (in_array($tag['name'], $openTagStack, true) && $this->isAllowed($openTagStack, $tag['name'], true)) {
				// close unclosed tags
				while (end($openTagStack) !== $tag['name']) {
					$nextIndex = count($newTagArray);
					
					// mark as invalid and do not flag as opened tag
					$newTag = $this->buildTag('[/'.end($openTagStack).']');
					$newTag['invalid'] = true;
					
					$newTagArray[$nextIndex] = $newTag;
					if (!isset($newTextArray[$nextIndex])) $newTextArray[$nextIndex] = '';
					
					// the text in front of the closing tag belongs in front of the
					// first generated tag, it is cleared to avoid adding it again
					$newTextArray[$nextIndex] .= $this->textArray[$i];
					$this->textArray[$i] = '';
					array_pop($openTagStack);
				}
				
				$nextIndex = count($newTagArray);
				$newTagArray[$nextIndex] = $tag;
				array_pop($openTagStack);
				if (!isset($newTextArray[$nextIndex])) $newTextArray[$nextIndex] = '';
				$newTextArray[$nextIndex] .= $this->textArray[$i];
			}
			else {
				// no such tag open
				// handle as plain text
				$this->textArray[$i] .= $tag['source'];
				$last = count($newTagArray);
				if (!isset($newTextArray[$last])) $newTextArray[$last] = '';
				$newTextArray[$last] .= $this->textArray[$i];
			}
		}
		else {
			// opening tag
			if ($this->isAllowed($openTagStack, $tag['name']) && $this->isValidTag($tag)) {
				$openTagStack[] = $tag['name'];
				$nextIndex = count($newTagArray);
				$newTagArray[$nextIndex] = $tag;
				if (!isset($newTextArray[$nextIndex])) $newTextArray[$nextIndex] = '';
				$newTextArray[$nextIndex] .= $this->textArray[$i];
			}
			else {
				// tag not allowed
				$this->textArray[$i] .= $tag['source'];
				$last = count($newTagArray);
				if (!isset($newTextArray[$last])) $newTextArray[$last] = '';
				$newTextArray[$last] .= $this->textArray[$i];
			}
		}
	}
	
	// text following the last tag
	$last = count($newTagArray);
	if (!isset($newTextArray[$last])) $newTextArray[$last] = '';
	if (isset($this->textArray[$i + 1])) $newTextArray[$last] .= $this->textArray[$i + 1];
	
	// close unclosed open tags; the stack itself is tested rather than its last
	// value, otherwise a tag name that evaluates to false would end the loop early
	while (!empty($openTagStack)) {
		$nextIndex = count($newTagArray);
		
		// mark as invalid
		$newTag = $this->buildTag('[/'.end($openTagStack).']');
		$newTag['invalid'] = true;
		
		$newTagArray[$nextIndex] = $newTag;
		if (!isset($newTextArray[$nextIndex])) $newTextArray[$nextIndex] = '';
		array_pop($openTagStack);
	}
	
	$this->tagArray = $newTagArray;
	$this->textArray = $newTextArray;
}
143
/**
 * Flags bbcodes inside code bbcodes for reversal, turning them back
 * into their source state (= textual representation).
 *
 * A source code bbcode only starts a code block if a valid closing tag of the
 * same name follows. Without one, the opening tag itself is flagged for
 * reversal and the tags after it are processed as usual.
 */
protected function handleSourceBBCodes() {
	$sourceBBCodes = $this->getSourceBBCodes();
	
	// name of the source code bbcode that is currently open, empty string if none
	$inCode = '';
	
	for ($i = 0, $length = count($this->tagArray); $i < $length; $i++) {
		$tag = $this->tagArray[$i];
		
		// generated closing tags of unclosed bbcodes take no part in this
		if (!empty($tag['invalid'])) {
			continue;
		}
		
		$name = $tag['name'];
		
		if ($inCode !== '') {
			if ($tag['closing'] && $inCode === $name) {
				// matches the opening code tag, the code block ends here
				$inCode = '';
			}
			else {
				// any other tag inside the code block is reverted into its source
				$this->tagArray[$i]['inCode'] = true;
			}
			
			continue;
		}
		
		if ($tag['closing'] || !in_array($name, $sourceBBCodes, true)) {
			// only opening tags of source code bbcodes can start a code block
			continue;
		}
		
		// look ahead to see if there is a valid closing tag; the candidate must
		// actually be a closing tag, a second opening tag of the same name does
		// not terminate the block
		$hasClosingTag = false;
		for ($j = $i + 1; $j < $length; $j++) {
			$candidate = $this->tagArray[$j];
			if ($candidate['closing'] && $candidate['name'] === $name && empty($candidate['invalid'])) {
				$hasClosingTag = true;
				break;
			}
		}
		
		if ($hasClosingTag) {
			$inCode = $name;
		}
		else {
			// no closing tag, flag the opening tag for reversal to avoid the
			// entire content afterwards being treated as code
			$this->tagArray[$i]['inCode'] = true;
		}
	}
}
212
/**
 * Assembles the parsed text from the tag and text arrays.
 *
 * Tags flagged `inCode` are written back as their source. Tags that require
 * buffering (decided by `needBuffering()`, which is defined outside of this
 * class) collect their content until their closing tag shows up. Every other
 * tag is emitted right away using the marker builders of this class.
 *
 * Closing tags flagged `invalid` are never rendered. They stand for bbcodes that
 * were not closed, so whatever their opening tag left behind is cleaned up: a
 * buffered opening tag is written back as source followed by its content, and a
 * marker opening tag releases its identifier.
 *
 * @inheritDoc
 */
public function buildParsedString() {
	// reset parsed text
	$this->parsedText = '';
	
	// reset identifiers for open tags
	$this->openTagIdentifiers = [];
	
	// the buffer references the string that currently receives output: either
	// the parsed text or the content of the innermost buffered tag
	$buffer =& $this->parsedText;
	
	// stack of buffered tags
	$bufferedTagStack = [];
	
	// records how the opening tag of each open bbcode was handled ('source',
	// 'buffer' or 'marker'), innermost last; one entry is added per opening tag
	// and removed per closing tag
	$openTagModes = [];
	
	// loop through the tags
	$i = -1;
	foreach ($this->tagArray as $i => $tag) {
		// append text to buffer
		$buffer .= $this->textArray[$i];
		
		if ($tag['closing']) {
			$mode = array_pop($openTagModes);
			
			if (!empty($tag['invalid'])) {
				// generated closing tag of an unclosed bbcode: nothing is rendered
				// for it, but the state of its opening tag must not linger
				if ($mode === 'buffer') {
					$openingTag = end($bufferedTagStack);
					if ($openingTag && $openingTag['name'] === $tag['name']) {
						// without this the opening tag and everything collected
						// for it would silently vanish from the output
						$revertedTag = $openingTag['source'] . $buffer;
						
						// close current buffer
						array_pop($bufferedTagStack);
						
						// open previous buffer
						if (count($bufferedTagStack) > 0) {
							$buffer =& $bufferedTagStack[count($bufferedTagStack) - 1]['buffer'];
						}
						else {
							$buffer =& $this->parsedText;
						}
						
						$buffer .= $revertedTag;
					}
				}
				else if ($mode === 'marker') {
					// release the identifier, otherwise the next regular closing
					// tag would pick it up and fail with a tag mismatch
					$identifier = end($this->openTagIdentifiers);
					if ($identifier && $identifier['name'] === strtolower($tag['name'])) {
						array_pop($this->openTagIdentifiers);
					}
				}
				
				continue;
			}
			else if (!empty($tag['inCode'])) {
				// revert bbcodes inside code
				$buffer .= $tag['source'];
				continue;
			}
			
			// get buffered opening tag
			$openingTag = end($bufferedTagStack);
			
			// closing tag
			if ($openingTag && $openingTag['name'] === $tag['name']) {
				$hideBuffer = false;
				// insert buffered content as attribute value
				foreach ($this->bbcodes[$tag['name']]->getAttributes() as $attribute) {
					if ($attribute->useText && !isset($openingTag['attributes'][$attribute->attributeNo])) {
						$openingTag['attributes'][$attribute->attributeNo] = $buffer;
						$hideBuffer = true;
						break;
					}
				}
				
				// validate tag attributes again
				if ($this->isValidTag($openingTag)) {
					// build tag
					if ($this->bbcodes[$tag['name']]->className) {
						// difference to the original implementation: use the custom HTML element than to process them directly
						$parsedTag = $this->compileTag($openingTag, $buffer, $tag);
					}
					else {
						// build tag
						$parsedTag = $this->buildOpeningTag($openingTag);
						$closingTag = $this->buildClosingTag($tag);
						// NOTE(review): content and closing tag are only appended when the
						// content was also used as attribute value - confirm this is intended
						if (!empty($closingTag) && $hideBuffer) $parsedTag .= $buffer.$closingTag;
					}
				}
				else {
					// attributes turned out to be invalid, keep everything as text
					$parsedTag = $openingTag['source'].$buffer.$tag['source'];
				}
				
				// close current buffer
				array_pop($bufferedTagStack);
				
				// open previous buffer
				if (count($bufferedTagStack) > 0) {
					$buffer =& $bufferedTagStack[count($bufferedTagStack) - 1]['buffer'];
				}
				else {
					$buffer =& $this->parsedText;
				}
				
				// append parsed tag
				$buffer .= $parsedTag;
			}
			else {
				// the opening tag was not buffered
				$buffer .= $this->buildClosingTag($tag);
			}
		}
		else {
			if (!empty($tag['inCode'])) {
				// revert bbcodes inside code
				$openTagModes[] = 'source';
				$buffer .= $tag['source'];
				continue;
			}
			
			// opening tag
			if ($this->needBuffering($tag)) {
				// start buffering
				$openTagModes[] = 'buffer';
				$tag['buffer'] = '';
				$bufferedTagStack[] = $tag;
				$buffer =& $bufferedTagStack[count($bufferedTagStack) - 1]['buffer'];
			}
			else {
				// an identifier is only registered for valid bbcode names, any
				// other tag is returned as source and leaves nothing to clean up
				$identifierCount = count($this->openTagIdentifiers);
				$buffer .= $this->buildOpeningTag($tag);
				$openTagModes[] = (count($this->openTagIdentifiers) > $identifierCount) ? 'marker' : 'source';
			}
		}
	}
	
	// text following the last tag
	if (isset($this->textArray[$i + 1])) $this->parsedText .= $this->textArray[$i + 1];
}
320
/**
 * Builds the bbcode output, the position of the content is represented
 * by the placeholder `<!-- META_CODE_INNER_CONTENT -->`.
 *
 * @param	string		$name		bbcode identifier
 * @param	array		$attributes	list of attributes
 * @param	\DOMElement	$element	element
 * @return	string		parsed bbcode
 */
public function getHtmlOutput($name, array $attributes, \DOMElement $element) {
	// unknown bbcode, output plain tags
	if (!isset($this->bbcodes[$name])) {
		return $this->buildBBCodeTag($name, $attributes);
	}
	
	$bbcode = $this->bbcodes[$name];
	
	// source code bbcodes receive the text content of the element as first attribute
	if ($bbcode->isSourceCode) {
		array_unshift($attributes, $element->textContent);
	}
	
	$innerContent = '<!-- META_CODE_INNER_CONTENT -->';
	$openingTag = ['attributes' => $attributes, 'name' => $name];
	$closingTag = ['name' => $name];
	
	if (!$bbcode->getProcessor()) {
		// no processor, rely on the tag builders of the parent class
		return parent::buildOpeningTag($openingTag) . $innerContent . parent::buildClosingTag($closingTag);
	}
	
	/** @var IBBCode $processor */
	$processor = $bbcode->getProcessor();
	
	return $processor->getParsedTag($openingTag, $innerContent, $closingTag, $this);
}
352
/**
 * Renders a bbcode in its textual form, used for unknown bbcodes. Attributes
 * are wrapped in single quotes, with single quotes inside a value escaped by
 * a backslash, and separated by commas.
 *
 * @param	string		$name			bbcode identifier
 * @param	array		$attributes		list of attributes
 * @param	boolean		$openingTagOnly		only render the opening tag
 * @return	string
 */
public function buildBBCodeTag($name, $attributes, $openingTagOnly = false) {
	$attributeString = '';
	if (!empty($attributes)) {
		$quotedValues = [];
		foreach ($attributes as $value) {
			$quotedValues[] = "'" . addcslashes($value, "'") . "'";
		}
		
		$attributeString = '=' . implode(",", $quotedValues);
	}
	
	$openingTag = '[' . $name . $attributeString . ']';
	if ($openingTagOnly) {
		return $openingTag;
	}
	
	// the placeholder marks the position of the content
	return $openingTag . '<!-- META_CODE_INNER_CONTENT -->[/' . $name . ']';
}
380
/**
 * Returns the identifiers of all bbcodes whose `isBlockElement` flag is set.
 *
 * @return	string[]	list of bbcode block elements
 */
public function getBlockBBCodes() {
	$blockBBCodes = array_filter($this->bbcodes, function($bbcode) {
		return $bbcode->isBlockElement;
	});
	
	// the bbcodes are indexed by their identifier
	return array_keys($blockBBCodes);
}
396
/**
 * Returns the identifiers of all bbcodes whose `isSourceCode` flag is set.
 *
 * @return	string[]	list of bbcode source code elements
 */
public function getSourceBBCodes() {
	$sourceBBCodes = array_filter($this->bbcodes, function($bbcode) {
		return $bbcode->isSourceCode;
	});
	
	// the bbcodes are indexed by their identifier
	return array_keys($sourceBBCodes);
}
412
/**
 * Wraps the content in the markers built for the opening and the closing tag.
 *
 * @param	array		$openingTag	opening tag data
 * @param	string		$content	content between opening and closing tag
 * @param	array		$closingTag	closing tag data
 * @return	string		content enclosed by both markers
 */
protected function compileTag(array $openingTag, $content, array $closingTag) {
	// the order matters: the opening tag registers the identifier
	// that is consumed while building the closing tag
	$openingMarker = $this->buildOpeningTag($openingTag);
	$closingMarker = $this->buildClosingTag($closingTag);
	
	return $openingMarker . $content . $closingMarker;
}
424
/**
 * Builds the marker element for an opening tag and registers its name and uuid
 * as open tag identifier. Tags whose name is not a valid bbcode identifier are
 * returned as source and register nothing.
 *
 * @inheritDoc
 */
protected function buildOpeningTag(array $tag) {
	$name = strtolower($tag['name']);
	if (!$this->isValidBBCodeName($name)) {
		return $tag['source'];
	}
	
	// the uuid ties the opening marker to the closing marker built later
	$uuid = StringUtil::getUUID();
	$this->openTagIdentifiers[] = [
		'name' => $name,
		'uuid' => $uuid
	];
	
	$attributes = '';
	if (!empty($tag['attributes'])) {
		// strip outer quote tags
		$tag['attributes'] = array_map(function($attribute) {
			// the modifier `s` lets `.` match line breaks too, values spanning
			// multiple lines would otherwise keep their surrounding quotes
			if (preg_match('~^([\'"])(?P<content>.*)(\1)$~s', $attribute, $matches)) {
				return $matches['content'];
			}
			
			return $attribute;
		}, $tag['attributes']);
		
		// uses base64 encoding to avoid an "escape" nightmare
		$attributes = ' data-attributes="' . base64_encode(JSON::encode($tag['attributes'])) . '"';
	}
	
	return '<woltlab-metacode-marker data-name="' . $name . '" data-uuid="' . $uuid . '" data-source="' . base64_encode($tag['source']) . '"' . $attributes . ' />';
}
457
/**
 * Builds the marker element for a closing tag using the uuid of the most
 * recently registered open tag identifier.
 *
 * The source of the tag is returned instead if its name is not a valid bbcode
 * identifier, if no opening tag is registered at all, or if a source code
 * bbcode does not match the most recent opening tag.
 *
 * @inheritDoc
 * @throws	SystemException		if a tag other than a source code bbcode does not match the most recent opening tag
 */
protected function buildClosingTag(array $tag) {
	$name = strtolower($tag['name']);
	
	// without any registered opening tag there is nothing this tag could close
	if (!$this->isValidBBCodeName($name) || empty($this->openTagIdentifiers)) {
		return $tag['source'];
	}
	
	$data = array_pop($this->openTagIdentifiers);
	if ($data['name'] !== $name) {
		// check if this is a source code tag as some people
		// love to nest the same source bbcode
		if (in_array($name, $this->getSourceBBCodes(), true)) {
			// the tag is kept as text and closes nothing, the identifier
			// must remain available for its actual closing tag
			$this->openTagIdentifiers[] = $data;
			
			return $tag['source'];
		}
		
		throw new SystemException("Tag mismatch, expected '".$name."', got '".$data['name']."'.");
	}
	
	return '<woltlab-metacode-marker data-uuid="' . $data['uuid'] . '" data-source="' . base64_encode($tag['source']) . '" />';
}
480
/**
 * Checks the given name against the pattern for valid bbcode names.
 *
 * @param	string		$name		bbcode identifier
 * @return	boolean		true if provided name is a valid bbcode identifier
 */
protected function isValidBBCodeName($name) {
	$matchCount = preg_match($this->validBBCodePattern, $name);
	
	// both "no match" (0) and a failed evaluation (false) count as invalid
	return ($matchCount === 1);
}
490 }