From: Alexander Ebert Date: Wed, 29 Mar 2023 16:24:45 +0000 (+0200) Subject: Force replace some HTML tags before sending messages to the search index X-Git-Tag: 5.5.11_dev_1~10^2~1 X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=0bc79023944f35034f153b10e51a69640cb1fa27;p=GitHub%2FWoltLab%2FWCF.git Force replace some HTML tags before sending messages to the search index Stripping the HTML can cause certain words to be accidentally joined when there is no symbol between them that is recognized by the tokenizer. Inserting a whitespace at tag positions that are known to be prone to this issue is a stop-gap solution until we find a more stable replacement strategy. See #4652 and WoltLab/com.woltlab.wcf.elasticSearch#14 --- diff --git a/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php b/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php index 4c51d7afa1..a9f887422a 100644 --- a/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php +++ b/wcfsetup/install/files/lib/system/search/SearchIndexManager.class.php @@ -122,6 +122,24 @@ class SearchIndexManager extends SingletonFactory implements ISearchIndexManager $languageID = null, $metaData = '' ) { + // Force replace certain tags with a whitespace to prevent words from adjacent + // lines to be glued together. + $message = \str_replace([ + '
', + '', + '', + '', + '', + '', + '', + '', + '', + '

', + '', + '', + '', + ], ' ', $message); + // strip html; remove whitespace from beginning and end of the message $message = StringUtil::trim(StringUtil::stripHTML($message));