Ignore prefixes that are not separated with a space in MysqlSearchEngine::splitIntoTe...
author Tim Düsterhus <duesterhus@woltlab.com>
Wed, 6 Apr 2022 11:02:02 +0000 (13:02 +0200)
committer Tim Düsterhus <duesterhus@woltlab.com>
Wed, 6 Apr 2022 11:02:37 +0000 (13:02 +0200)
wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php

index 44f4989626859fe67e427da2b59732d5b9a3e70a..5d9ab9ad28640541edecb0cd012a345162429176 100644 (file)
@@ -230,6 +230,18 @@ class MysqlSearchEngine extends AbstractSearchEngine
      * Word: |Apfel|
      * Word: -|Banane|
      *
+     * Query: `Apfel-Banane`
+     * Word: |Apfel|
+     * Word: |Banane|
+     *
+     * Query: `Apfel-+-Banane`
+     * Word: |Apfel|
+     * Word: |Banane|
+     *
+     * Query: `B*-tree`
+     * Word: |B|*
+     * Word: |tree|
+     *
      * Query: ` Apfel `
      * Word: |Apfel|
      *
@@ -437,15 +449,38 @@ class MysqlSearchEngine extends AbstractSearchEngine
                     $i++;
                     continue;
                 } else {
-                    $state = 'finish';
-                    // No increment, we must yield the word and then continue parsing at
-                    // the current position to prevent skipping characters.
+                    $state = 'prefixWithoutSpace';
+                    // No increment, we must check whether the current character is a prefix
+                    // that needs to be dropped.
                     continue;
                 }
             } elseif ($state === 'atSign') {
                 if (\preg_match('/[0-9]/', $char)) {
                     $i++;
                     continue;
+                } else {
+                    $state = 'prefixWithoutSpace';
+                    // No increment, we must check whether the current character is a prefix
+                    // that needs to be dropped.
+                    continue;
+                }
+            } elseif ($state === 'prefixWithoutSpace') {
+                if (
+                    \in_array($char, [
+                        '-',
+                        '+',
+                        '~',
+                        '<',
+                        '>',
+                    ])
+                ) {
+                    // Ignore valid prefixes after a word is fully parsed: The word
+                    // parsing was aborted, because the prefix character was encountered.
+                    // Thus an input such as `compound-word` would see the hyphen as an
+                    // exclusion prefix of `word` instead of a hyphen. By ignoring such
+                    // prefixes (unless they are clearly delimited with a space) the parsed
+                    // result will better match user expectations.
+                    $i++;
                 } else {
                     $state = 'finish';
                     // No increment, we must yield the word and then continue parsing at