Improve the normalization of legacy messages
authorAlexander Ebert <ebert@woltlab.com>
Sat, 6 May 2023 19:18:36 +0000 (21:18 +0200)
committerAlexander Ebert <ebert@woltlab.com>
Mon, 8 May 2023 16:34:04 +0000 (18:34 +0200)
wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php [deleted file]
wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php [new file with mode: 0644]
wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php [deleted file]
wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeProcessor.class.php

diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeBr.class.php
deleted file mode 100644 (file)
index 2e8238b..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-<?php
-
-namespace wcf\system\html\output\node;
-
-use DOMElement;
-use wcf\system\html\node\AbstractHtmlNodeProcessor;
-use wcf\util\DOMUtil;
-use wcf\util\StringUtil;
-
-/**
- * Unwraps <br> and strips trailing <br>.
- *
- * @author Alexander Ebert
- * @copyright 2001-2023 WoltLab GmbH
- * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
- * @since 6.0
- */
-final class HtmlOutputNodeBr extends AbstractHtmlOutputNode
-{
-    /**
-     * @inheritDoc
-     */
-    protected $tagName = 'br';
-
-    /**
-     * @inheritDoc
-     */
-    public function process(array $elements, AbstractHtmlNodeProcessor $htmlNodeProcessor)
-    {
-        /** @var \DOMElement $element */
-        foreach ($elements as $element) {
-            $this->unwrap($element);
-            $this->removeTrailingBr($element);
-        }
-    }
-
-    private function unwrap(DOMElement $br): void
-    {
-        if ($br->previousSibling || $br->nextSibling) {
-            return;
-        }
-
-        $parent = $br;
-        while (($parent = $parent->parentNode) !== null) {
-            switch ($parent->nodeName) {
-                case "b":
-                case "del":
-                case "em":
-                case "i":
-                case "strong":
-                case "sub":
-                case "sup":
-                case "span":
-                case "u":
-                    if ($br->previousSibling || $br->nextSibling) {
-                        return;
-                    }
-
-                    $parent->parentNode->insertBefore($br, $parent);
-                    $parent->parentNode->removeChild($parent);
-                    $parent = $br;
-
-                    break;
-
-                default:
-                    return;
-            }
-        }
-    }
-
-    private function removeTrailingBr(DOMElement $br): void
-    {
-        if ($br->getAttribute("data-cke-filler") === "true") {
-            return;
-        }
-
-        $paragraph = DOMUtil::closest($br, "p");
-        if ($paragraph === null) {
-            return;
-        }
-
-        if (!DOMUtil::isLastNode($br, $paragraph)) {
-            return;
-        }
-
-        if ($paragraph->childNodes->length === 1 && $paragraph->childNodes->item(0) === $br) {
-            $paragraph->parentNode->removeChild($paragraph);
-        } else {
-            $br->remove();
-        }
-    }
-}
diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeNormalizer.class.php
new file mode 100644 (file)
index 0000000..2ba481c
--- /dev/null
@@ -0,0 +1,154 @@
+<?php
+
+namespace wcf\system\html\output\node;
+
+use wcf\util\DOMUtil;
+
+/**
+ * Normalizes HTML generated by earlier versions of WoltLab Suite.
+ *
+ * Two kinds of legacy markup are cleaned up:
+ *  - `<br>` elements that are wrapped in otherwise empty inline formatting
+ *    elements or that sit at the very end of a paragraph.
+ *  - Paragraphs that contain nothing but a `<br>` and were used to emulate
+ *    the spacing between paragraphs.
+ *
+ * @author Alexander Ebert
+ * @copyright 2001-2023 WoltLab GmbH
+ * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
+ * @since 6.0
+ */
+final class HtmlOutputNodeNormalizer
+{
+    /**
+     * Inline formatting elements that are discarded when their only content
+     * is a single `<br>`.
+     */
+    private const UNWRAPPABLE_TAG_NAMES = [
+        'b',
+        'del',
+        'em',
+        'i',
+        'strong',
+        'sub',
+        'sup',
+        'span',
+        'u',
+    ];
+
+    /**
+     * @param \DOMXPath $xpath XPath instance bound to the document that is
+     *                         modified in place by `normalize()`
+     */
+    public function __construct(private readonly \DOMXPath $xpath)
+    {
+    }
+
+    /**
+     * Runs all normalization steps against the document.
+     *
+     * The `<br>` elements are processed first because unwrapping them can
+     * turn a paragraph into a spacer paragraph (`<p><br></p>`) that is then
+     * picked up by the second step.
+     */
+    public function normalize(): void
+    {
+        $this->normalizeBr();
+
+        $candidates = $this->getPossibleSpacerParagraphs();
+        $this->reduceSpacerParagraphs($candidates);
+    }
+
+    /**
+     * Collects every paragraph whose only child node is a `<br>` that is not
+     * flagged with `data-cke-filler="true"`.
+     *
+     * @return list<\DOMElement> matching paragraphs in the order returned by the XPath query
+     */
+    private function getPossibleSpacerParagraphs(): array
+    {
+        $paragraphs = [];
+
+        foreach ($this->xpath->query('//p') as $p) {
+            \assert($p instanceof \DOMElement);
+
+            if ($p->childNodes->length !== 1) {
+                continue;
+            }
+
+            $child = $p->childNodes->item(0);
+            if ($child->nodeName !== 'br') {
+                continue;
+            }
+
+            \assert($child instanceof \DOMElement);
+
+            // The filler attribute marks the `<br>` as one that must be kept,
+            // therefore its paragraph is not a removal candidate.
+            if ($child->getAttribute('data-cke-filler') !== 'true') {
+                $paragraphs[] = $p;
+            }
+        }
+
+        return $paragraphs;
+    }
+
+    /**
+     * Thins out runs of adjacent spacer paragraphs.
+     *
+     * A run of `n` directly adjacent spacer paragraphs loses the first
+     * `ceil(n / 2)` of them: 1 -> 0, 2 -> 1, 3 -> 1, 4 -> 2, 5 -> 2.
+     *
+     * @param list<\DOMElement> $paragraphs spacer paragraphs in document order
+     */
+    private function reduceSpacerParagraphs(array $paragraphs): void
+    {
+        for ($i = 0, $length = \count($paragraphs); $i < $length; $i++) {
+            // Determine the length of the run that starts at `$i`. The cursor
+            // must advance along the run, otherwise every later candidate is
+            // compared against the sibling of the first paragraph and runs
+            // longer than two paragraphs are never detected.
+            $lastInRun = $paragraphs[$i];
+            $runLength = 1;
+            while ($i + $runLength < $length) {
+                $nextCandidate = $paragraphs[$i + $runLength];
+                if ($lastInRun->nextElementSibling !== $nextCandidate) {
+                    break;
+                }
+
+                $lastInRun = $nextCandidate;
+                $runLength++;
+            }
+
+            // Integer form of `ceil($runLength / 2)`; a single paragraph is
+            // covered by the same formula and is removed entirely.
+            $numberOfParagraphsToRemove = \intdiv($runLength + 1, 2);
+
+            $removeParagraphs = \array_slice($paragraphs, $i, $numberOfParagraphsToRemove);
+            foreach ($removeParagraphs as $paragraph) {
+                $paragraph->remove();
+            }
+
+            // Skip the remainder of the run, the loop increment moves past
+            // its last paragraph.
+            $i += $runLength - 1;
+        }
+    }
+
+    /**
+     * Unwraps and strips the `<br>` elements of the document.
+     */
+    private function normalizeBr(): void
+    {
+        foreach ($this->xpath->query('//br') as $br) {
+            \assert($br instanceof \DOMElement);
+
+            $this->unwrapBr($br);
+            $this->removeTrailingBr($br);
+        }
+    }
+
+    /**
+     * Replaces inline formatting elements that contain nothing but the given
+     * `<br>` with the `<br>` itself, e.g. `<b><i><br></i></b>` becomes `<br>`.
+     *
+     * The process is repeated for every ancestor for as long as the `<br>`
+     * has no siblings and the parent is one of the unwrappable elements.
+     */
+    private function unwrapBr(\DOMElement $br): void
+    {
+        while ($br->previousSibling === null && $br->nextSibling === null) {
+            $parent = $br->parentNode;
+            if ($parent === null || !\in_array($parent->nodeName, self::UNWRAPPABLE_TAG_NAMES, true)) {
+                return;
+            }
+
+            $container = $parent->parentNode;
+            if ($container === null) {
+                // The wrapper is detached, there is nothing to move the
+                // `<br>` into.
+                return;
+            }
+
+            $container->insertBefore($br, $parent);
+            $container->removeChild($parent);
+        }
+    }
+
+    /**
+     * Removes a `<br>` at the end of a paragraph unless it is the only child
+     * node of that paragraph.
+     *
+     * NOTE(review): relies on `DOMUtil::closest()` yielding the nearest `<p>`
+     * ancestor (or `null`) and `DOMUtil::isLastNode()` reporting whether the
+     * `<br>` is the last node inside it - confirm against `wcf\util\DOMUtil`.
+     */
+    private function removeTrailingBr(\DOMElement $br): void
+    {
+        $paragraph = DOMUtil::closest($br, "p");
+        if ($paragraph === null) {
+            return;
+        }
+
+        if (!DOMUtil::isLastNode($br, $paragraph)) {
+            return;
+        }
+
+        // A paragraph with a single child node is left untouched here; when
+        // that child is the `<br>` itself the paragraph is a spacer paragraph
+        // that is dealt with separately.
+        if ($paragraph->childNodes->length > 1) {
+            $br->remove();
+        }
+    }
+}
diff --git a/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php b/wcfsetup/install/files/lib/system/html/output/node/HtmlOutputNodeP.class.php
deleted file mode 100644 (file)
index ddc7722..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-<?php
-
-namespace wcf\system\html\output\node;
-
-use wcf\system\html\node\AbstractHtmlNodeProcessor;
-use wcf\util\StringUtil;
-
-/**
- * Removes empty paragraphs that were used to emulate paragraphs in earlier versions.
- *
- * @author Alexander Ebert
- * @copyright 2001-2023 WoltLab GmbH
- * @license GNU Lesser General Public License <http://opensource.org/licenses/lgpl-license.php>
- * @since 6.0
- */
-final class HtmlOutputNodeP extends AbstractHtmlOutputNode
-{
-    /**
-     * @inheritDoc
-     */
-    protected $tagName = 'p';
-
-    /**
-     * @inheritDoc
-     */
-    public function process(array $elements, AbstractHtmlNodeProcessor $htmlNodeProcessor)
-    {
-        /** @var \DOMElement $element */
-        foreach ($elements as $element) {
-            if ($element->childElementCount === 1 && $element->firstElementChild) {
-                $child = $element->firstElementChild;
-                if ($child->tagName === 'br') {
-                    if ($child->getAttribute('data-cke-filler') === 'true') {
-                        // This is an internal marker used to identify paragraphs
-                        // that are intentionally left blank.
-                        $child->removeAttribute('data-cke-filler');
-
-                        continue;
-                    }
-
-                    // This is most likely a legacy paragraph that was inserted
-                    // in earlier versions and is not longer required. We need
-                    // to verify that there is no other text inside the node
-                    // before removing it.
-                    if (StringUtil::trim($element->textContent) === '') {
-                        $element->remove();
-                    }
-                }
-            }
-        }
-    }
-}
index 41b712ba835c21966f46d3f1d394bdb8391d1b97..bf2b41a81e2f373011ac7cdee3548eeca3cf311e 100644 (file)
@@ -97,6 +97,8 @@ class HtmlOutputNodeProcessor extends AbstractHtmlNodeProcessor
             $this->invokeHtmlNode(new HtmlOutputUnfurlUrlNode());
         }
 
+        (new HtmlOutputNodeNormalizer($this->getXPath()))->normalize();
+
         // dynamic node handlers
         $this->invokeNodeHandlers('wcf\system\html\output\node\HtmlOutputNode', ['woltlab-metacode']);