From 8996715791bc61bbcd864b9b8e695d0e4a1a3c08 Mon Sep 17 00:00:00 2001 From: Alexander Ebert Date: Sat, 6 May 2023 21:18:36 +0200 Subject: [PATCH] Improve the normalization of legacy messages --- .../output/node/HtmlOutputNodeBr.class.php | 92 ----------- .../node/HtmlOutputNodeNormalizer.class.php | 154 ++++++++++++++++++ .../output/node/HtmlOutputNodeP.class.php | 52 ------ .../node/HtmlOutputNodeProcessor.class.php | 2 + 4 files changed, 156 insertions(+), 144 deletions(-) delete mode 100644 wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php create mode 100644 wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php delete mode 100644 wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php deleted file mode 100644 index 2e8238b03a..0000000000 --- a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php +++ /dev/null @@ -1,92 +0,0 @@ - and strips trailing
.
- *
- * @author Alexander Ebert
- * @copyright 2001-2023 WoltLab GmbH
- * @license GNU Lesser General Public License
- * @since 6.0
- */
-final class HtmlOutputNodeBr extends AbstractHtmlOutputNode
-{
-    /**
-     * @inheritDoc
-     */
-    protected $tagName = 'br';
-
-    /**
-     * @inheritDoc
-     */
-    public function process(array $elements, AbstractHtmlNodeProcessor $htmlNodeProcessor)
-    {
-        /** @var \DOMElement $element */
-        foreach ($elements as $element) {
-            $this->unwrap($element);
-            $this->removeTrailingBr($element);
-        }
-    }
-
-    private function unwrap(DOMElement $br): void
-    {
-        if ($br->previousSibling || $br->nextSibling) {
-            return;
-        }
-
-        $parent = $br;
-        while (($parent = $parent->parentNode) !== null) {
-            switch ($parent->nodeName) {
-                case "b":
-                case "del":
-                case "em":
-                case "i":
-                case "strong":
-                case "sub":
-                case "sup":
-                case "span":
-                case "u":
-                    if ($br->previousSibling || $br->nextSibling) {
-                        return;
-                    }
-
-                    $parent->parentNode->insertBefore($br, $parent);
-                    $parent->parentNode->removeChild($parent);
-                    $parent = $br;
-
-                    break;
-
-                default:
-                    return;
-            }
-        }
-    }
-
-    private function removeTrailingBr(DOMElement $br): void
-    {
-        if ($br->getAttribute("data-cke-filler") === "true") {
-            return;
-        }
-
-        $paragraph = DOMUtil::closest($br, "p");
-        if ($paragraph === null) {
-            return;
-        }
-
-        if (!DOMUtil::isLastNode($br, $paragraph)) {
-            return;
-        }
-
-        if ($paragraph->childNodes->length === 1 && $paragraph->childNodes->item(0) === $br) {
-            $paragraph->parentNode->removeChild($paragraph);
-        } else {
-            $br->remove();
-        }
-    }
-}
diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php
new file mode 100644
index 0000000000..2ba481cc3b
--- /dev/null
+++ b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php
@@ -0,0 +1,154 @@
+ * @since 6.0
+ */
+final class HtmlOutputNodeNormalizer
+{
+    /**
+     * @param \DOMXPath $xpath used to query the `<br>` and `<p>` elements that get normalized
+     */
+    public function __construct(private readonly \DOMXPath $xpath)
+    {
+    }
+
+    /**
+     * Normalizes all `<br>` elements first and afterwards reduces the
+     * paragraphs that consist of nothing but a single `<br>`.
+     */
+    public function normalize(): void
+    {
+        $this->normalizeBr();
+
+        $candidates = $this->getPossibleSpacerParagraphs();
+        $this->reduceSpacerParagraphs($candidates);
+    }
+
+    /**
+     * Collects every `<p>` whose only child node is a `<br>` that is not
+     * flagged with `data-cke-filler="true"`.
+     *
+     * @return list<\DOMElement>
+     */
+    private function getPossibleSpacerParagraphs(): array
+    {
+        $paragraphs = [];
+
+        foreach ($this->xpath->query('//p') as $p) {
+            \assert($p instanceof \DOMElement);
+
+            if ($p->childNodes->length !== 1) {
+                continue;
+            }
+
+            // A real type check instead of `assert()`, because assertions
+            // can be disabled at runtime.
+            $child = $p->childNodes->item(0);
+            if (!($child instanceof \DOMElement) || $child->nodeName !== 'br') {
+                continue;
+            }
+
+            if ($child->getAttribute('data-cke-filler') !== 'true') {
+                $paragraphs[] = $p;
+            }
+        }
+
+        return $paragraphs;
+    }
+
+    /**
+     * Reduces runs of directly adjacent spacer paragraphs. A run of `n`
+     * paragraphs is reduced to `floor(n / 2)` paragraphs, which removes a
+     * spacer paragraph that stands on its own.
+     *
+     * The adjacency check walks the list in its given order, therefore the
+     * paragraphs are expected in document order.
+     *
+     * @param list<\DOMElement> $paragraphs
+     */
+    private function reduceSpacerParagraphs(array $paragraphs): void
+    {
+        for ($i = 0, $length = \count($paragraphs); $i < $length; $i++) {
+            $offset = 0;
+
+            // Searches for adjacent paragraphs. The end of the run must be
+            // tracked, comparing against the first paragraph only would
+            // never look past its direct successor.
+            $lastInRun = $paragraphs[$i];
+            while ($i + $offset + 1 < $length) {
+                $nextCandidate = $paragraphs[$i + $offset + 1];
+                if ($lastInRun->nextElementSibling !== $nextCandidate) {
+                    break;
+                }
+
+                $lastInRun = $nextCandidate;
+                $offset++;
+            }
+
+            // The run consists of `$offset + 1` paragraphs and is reduced by
+            // half, an uneven number requires one additional removal:
+            // 1 -> 0, 2 -> 1, 3 -> 1, 4 -> 2, 5 -> 2
+            //
+            // `intdiv()` keeps the value an integer, `array_slice()` expects
+            // an integer length and `ceil()` would yield a float.
+            $numberOfParagraphsToRemove = \intdiv($offset, 2) + 1;
+
+            foreach (\array_slice($paragraphs, $i, $numberOfParagraphsToRemove) as $paragraph) {
+                $paragraph->remove();
+            }
+
+            // Skip the remaining paragraphs of this run.
+            $i += $offset;
+        }
+    }
+
+    /**
+     * Processes every `<br>` of the document: it is unwrapped from inline
+     * formatting elements first and then checked for being a trailing
+     * `<br>` of a paragraph.
+     */
+    private function normalizeBr(): void
+    {
+        foreach ($this->xpath->query('//br') as $br) {
+            \assert($br instanceof \DOMElement);
+
+            $this->unwrapBr($br);
+            $this->removeTrailingBr($br);
+        }
+    }
+
+    /**
+     * Replaces inline formatting elements that contain nothing but the
+     * given `<br>` with the `<br>` itself. This is repeated for each
+     * ancestor until the `<br>` has a sibling or the parent is not one of
+     * the listed inline elements.
+     */
+    private function unwrapBr(\DOMElement $br): void
+    {
+        $inlineElements = ['b', 'del', 'em', 'i', 'strong', 'sub', 'sup', 'span', 'u'];
+
+        // Only a `<br>` without any sibling can replace its parent.
+        while ($br->previousSibling === null && $br->nextSibling === null) {
+            $parent = $br->parentNode;
+            if ($parent === null || !\in_array($parent->nodeName, $inlineElements, true)) {
+                return;
+            }
+
+            $grandparent = $parent->parentNode;
+            if ($grandparent === null) {
+                return;
+            }
+
+            $grandparent->insertBefore($br, $parent);
+            $grandparent->removeChild($parent);
+        }
+    }
+
+    /**
+     * Removes the `<br>` if `DOMUtil::isLastNode()` reports it as the last
+     * node of the `<p>` resolved through `DOMUtil::closest()`.
+     *
+     * A `<br>` that is the only child node of the paragraph is kept, these
+     * paragraphs are evaluated as spacer paragraphs afterwards.
+     */
+    private function removeTrailingBr(\DOMElement $br): void
+    {
+        $paragraph = DOMUtil::closest($br, "p");
+        if ($paragraph === null) {
+            return;
+        }
+
+        if (!DOMUtil::isLastNode($br, $paragraph)) {
+            return;
+        }
+
+        if ($paragraph->childNodes->length > 1) {
+            $br->remove();
+        }
+    }
+}
diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php
deleted file mode 100644
index ddc772249a..0000000000
--- a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php
+++ /dev/null
@@ -1,52 +0,0 @@
-
- * @since 6.0
- */
-final class HtmlOutputNodeP extends AbstractHtmlOutputNode
-{
-    /**
-     * @inheritDoc
-     */
-    protected $tagName = 'p';
-
-    /**
-     * @inheritDoc
-     */
-    public function process(array $elements, AbstractHtmlNodeProcessor $htmlNodeProcessor)
-    {
-        /** @var \DOMElement $element */
-        foreach ($elements as $element) {
-            if
($element->childElementCount === 1 && $element->firstElementChild) { - $child = $element->firstElementChild; - if ($child->tagName === 'br') { - if ($child->getAttribute('data-cke-filler') === 'true') { - // This is an internal marker used to identify paragraphs - // that are intentionally left blank. - $child->removeAttribute('data-cke-filler'); - - continue; - } - - // This is most likely a legacy paragraph that was inserted - // in earlier versions and is not longer required. We need - // to verify that there is no other text inside the node - // before removing it. - if (StringUtil::trim($element->textContent) === '') { - $element->remove(); - } - } - } - } - } -} diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeProcessor.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeProcessor.class.php index 41b712ba83..bf2b41a81e 100644 --- a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeProcessor.class.php +++ b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeProcessor.class.php @@ -97,6 +97,8 @@ class HtmlOutputNodeProcessor extends AbstractHtmlNodeProcessor $this->invokeHtmlNode(new HtmlOutputUnfurlUrlNode()); } + (new HtmlOutputNodeNormalizer($this->getXPath()))->normalize(); + // dynamic node handlers $this->invokeNodeHandlers('wcf\system\html\output\node\HtmlOutputNode', ['woltlab-metacode']); -- 2.20.1