From 0bc79023944f35034f153b10e51a69640cb1fa27 Mon Sep 17 00:00:00 2001 From: Alexander Ebert Date: Wed, 29 Mar 2023 18:24:45 +0200 Subject: [PATCH] Force replace some HTML tags before sending messages to the search index Stripping the HTML can cause certain words to be accidentally joined when there is no symbol between them that is recognized by the tokenizer. Inserting whitespace at tag positions that are known to be prone to this joining is a stop-gap solution until we find a more stable replacement strategy. See #4652 and WoltLab/com.woltlab.wcf.elasticSearch#14 --- .../system/search/SearchIndexManager.class.php | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php b/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php index 4c51d7afa1..a9f887422a 100644 --- a/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php +++ b/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php @@ -122,6 +122,24 @@ class SearchIndexManager extends SingletonFactory implements ISearchIndexManager $languageID = null, $metaData = '' ) { + // Force replace certain tags with a whitespace to prevent words from adjacent + // lines to be glued together. + $message = \str_replace([ + '
', + '', + '', + '', + '', + '', + '', + '', + '', + '

', + '', + '', + '', + ], ' ', $message); + // strip html; remove whitespace from beginning and end of the message $message = StringUtil::trim(StringUtil::stripHTML($message)); -- 2.20.1