Refactor search query parse of MysqlSearchEngine

author Tim Düsterhus <duesterhus@woltlab.com>

Tue, 22 Dec 2020 10:59:00 +0000 (11:59 +0100)

committer Tim Düsterhus <duesterhus@woltlab.com>

Wed, 23 Dec 2020 09:20:50 +0000 (10:20 +0100)
author Tim Düsterhus <duesterhus@woltlab.com>
Tue, 22 Dec 2020 10:59:00 +0000 (11:59 +0100)
committer Tim Düsterhus <duesterhus@woltlab.com>
Wed, 23 Dec 2020 09:20:50 +0000 (10:20 +0100)
diff --git a/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php b/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php

index c0899a49112511f5cc941fd07855906cf9d15344..8aba719429d6d6b0569d1fa6fae243795a075a24 100644 (file)
--- a/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php
+++ b/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php
@@ -126,93 +126,304 @@ class MysqlSearchEngine extends AbstractSearchEngine {
         }
         
         /**
-        * Manipulates the search term (< and > used as quotation marks):
+        * Manipulates the search term by adding prefixes and suffixes.
          * 
-        * - <test foo> becomes <+test* +foo*>
-        * - <test -foo bar> becomes <+test* -foo* +bar*>
-        * - <test "foo bar"> becomes <+test* +"foo bar">
-        * 
-        * @see http://dev.mysql.com/doc/refman/5.5/en/fulltext-boolean.html
-        * 
-        * @param       string          $query
-        * @return      string
+        * - `test foo` becomes `+test* +foo*`
+        * - `test -foo bar` becomes `+test* -foo +bar*`
+        * - `test <foo bar` becomes `+test* <foo* +bar*`
+        * - `test "foo bar"` becomes `+test* +"foo bar"`
          */
         protected function parseSearchQuery($query) {
                 $query = StringUtil::trim($query);
                 
-               // expand search terms with a * unless they're encapsulated with quotes
-               $inQuotes = false;
-               $previousChar = $tmp = '';
-               $controlCharacterOrSpace = false;
-               $chars = ['+', '-', '*'];
-               $ftMinWordLen = $this->getFulltextMinimumWordLength();
-               for ($i = 0, $length = mb_strlen($query); $i < $length; $i++) {
-                       $char = mb_substr($query, $i, 1);
+               $result = [];
+               foreach ($this->splitIntoTerms($query) as $term) {
+                       [$prefix, $word, $suffix] = $term;
                         
-                       if ($inQuotes) {
-                               if ($char == '"') {
-                                       $inQuotes = false;
+                       // Ignore parentheses.
+                       if ($word === '(' || $word === ')') {
+                               continue;
+                       }
+                       
+                       // Add a '+' prefix if no prefix is given.
+                       if (!$prefix) {
+                               $prefix = '+';
+                       }
+                       if (!$suffix) {
+                               // Add a '*' suffix if no suffix is given,
+                               // - the word is not quoted, and
+                               // - the prefix is not '-'.
+                               if ($word[0] !== '"' && $prefix !== '-') {
+                                       $suffix = '*';
                                 }
                         }
-                       else {
-                               if ($char == '"') {
-                                       $inQuotes = true;
+                       
+                       $result[] = $prefix.$word.$suffix;
+               }
+               
+               return implode(' ', $result);
+       }
+       
+       /**
+        * Parses the query into separate search terms.
+        * 
+        * The parser is based off the original InnoDB search query parser with
+        * a small difference: Prefixes are only understood if they stand right
+        * beside the search term. InnoDB allows an arbitrary number of whitespace
+        * after the prefix, leading to unexpected results if the search query
+        * was copied from a sentence that uses the dash as word separator.
+        * 
+        * The resulting terms should not be split by MySQL when concatenated
+        * with spaces and neither should they cause syntax errors.
+        * 
+        * Examples:
+        * 
+        * Query: `Apfel - Banane`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `Apfel -Banane`
+        * Word: |Apfel|
+        * Word: -|Banane|
+        * 
+        * Query: ` Apfel `
+        * Word: |Apfel|
+        * 
+        * Query: ` Apfel Banane `
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `Apfel*`
+        * Word: |Apfel|*
+        * 
+        * Query: `Apfel *`
+        * Word: |Apfel|
+        * 
+        * Query: `Apfel * Banane`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `+-"Apfel Banane"*`
+        * Word: -|"Apfel Banane"|
+        * 
+        * Query: `Äpfel Bananen`
+        * Word: |Äpfel|
+        * Word: |Bananen|
+        * 
+        * Query: `+-*`
+        * 
+        * Query: `"Apfel`
+        * Word: |"Apfel"|
+        * 
+        * Query: `"Apfel Banane" @8`
+        * Word: |"Apfel Banane"|
+        * 
+        * Query: `Apfel Banane @8`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `+((+Apfel -Banane) (-Apfel +Banane)) >Clementine`
+        * Word: +|(|
+        * Word: |(|
+        * Word: +|Apfel|
+        * Word: -|Banane|
+        * Word: |)|
+        * Word: |(|
+        * Word: -|Apfel|
+        * Word: +|Banane|
+        * Word: |)|
+        * Word: |)|
+        * Word: >|Clementine|
+        * 
+        * @see https://dev.mysql.com/doc/refman/8.0/en/fulltext-boolean.html
+        * @see https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0pars.y
+        * @see https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0blex.l
+        */
+       protected function splitIntoTerms($query) {
+               $state = 'beforePrefix';
+               
+               $parentheses = 0;
+               $word = "";
+               $isQuoted = null;
+               $prefix = null;
+               $suffix = null;
+               
+               for ($i = 0, $max = strlen($query); $i < $max;) {
+                       $char = $query[$i];
+                       
+                       // Treat ASCII control characters as spaces.
+                       if (ord($query[$i]) < 0x20 || ord($query[$i]) == 0x7f) {
+                               $char = " ";
+                       }
+                       
+                       if ($state === 'beforePrefix') {
+                               // Skip Whitespace.
+                               if (in_array($char, [
+                                       ' ',
+                                       "\t"
+                               ])) {
+                                       $i++;
+                                       continue;
                                 }
-                               else {
-                                       if ($char == ' ' && !$controlCharacterOrSpace) {
-                                               $controlCharacterOrSpace = true;
-                                               $tmp .= '*';
+                               
+                               // After a word is before a word. Handle the closing parenthesis
+                               // early on to avoid needing through all the states.
+                               if ($char === ')') {
+                                       if ($parentheses > 0) {
+                                               $word = ')';
                                         }
-                                       else if (in_array($char, $chars)) {
-                                               $controlCharacterOrSpace = true;
+                                       $parentheses--;
+                                       $i++;
+                                       $state = 'finish';
+                                       continue;
+                               }
+                               
+                               $state = 'prefix';
+                               
+                               // No increment, we must interpret the current character as a prefix.
+                               continue;
+                       }
+                       else if ($state === 'prefix') {
+                               if (in_array($char, [
+                                       '-',
+                                       '+',
+                                       '~',
+                                       '<',
+                                       '>',
+                               ])) {
+                                       // The last prefix character wins.
+                                       $prefix = $char;
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'word';
+                                       // No increment, we must interpret the current character as a word.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'word') {
+                               // Parentheses might have a prefix, so we handle them
+                               // inside of the 'word' state.
+                               if ($char === '(') {
+                                       $word = '(';
+                                       $parentheses++;
+                                       $i++;
+                                       
+                                       // Immediately go to the finish to allow for parsing the prefix
+                                       // of the first word within the parenthesis.
+                                       $state = 'finish';
+                                       continue;
+                               }
+                               
+                               // Check whether this word is quoted.
+                               if ($isQuoted === null) {
+                                       if ($char === '"') {
+                                               $isQuoted = true;
+                                               $word .= $char;
+                                               $i++;
+                                               continue;
                                         }
                                         else {
-                                               $controlCharacterOrSpace = false;
+                                               $isQuoted = false;
                                         }
                                 }
-                       }
-                       
-                       /*
-                        * prepend a plus sign (logical AND) if ALL these conditions are given:
-                        * 
-                        * 1) previous character:
-                        *   - is empty (start of string)
-                        *   - is a space (MySQL uses spaces to separate words)
-                        * 
-                        * 2) not within quotation marks
-                        * 
-                        * 3) current char:
-                        *   - is NOT +, - or *
-                        */
-                       if (($previousChar == '' || $previousChar == ' ') && !$inQuotes && !in_array($char, $chars)) {
-                               // check if the term is shorter than the minimum fulltext word length
-                               if ($i + $ftMinWordLen <= $length) {
-                                       $term = '';// $char;
-                                       for ($j = $i, $innerLength = $ftMinWordLen + $i; $j < $innerLength; $j++) {
-                                               $currentChar = mb_substr($query, $j, 1);
-                                               if ($currentChar == '"' || $currentChar == ' ' || in_array($currentChar, $chars)) {
-                                                       break;
-                                               }
-                                               
-                                               $term .= $currentChar;
+                               
+                               if ($isQuoted) {
+                                       $word .= $char;
+                                       if ($char === '"') {
+                                               $state = 'suffix';
                                         }
-                                       
-                                       if (mb_strlen($term) == $ftMinWordLen) {
-                                               $tmp .= '+';
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       if (preg_match('/[^" \n*()+\-<>~@%]/', $char)) {
+                                               $word .= $char;
+                                               $i++;
+                                               continue;
+                                       }
+                                       else {
+                                               $state = 'suffix';
+                                               // No increment, we must interpret the current character as a suffix.
+                                               continue;
                                         }
                                 }
                         }
-                       
-                       $tmp .= $char;
-                       $previousChar = $char;
+                       else if ($state === 'suffix') {
+                               if (!$isQuoted && in_array($char, [
+                                       '*'
+                               ])) {
+                                       $suffix = $char;
+                                       $i++;
+                                       continue;
+                               }
+                               else if ($char == '@') {
+                                       $state = 'atSign';
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'finish';
+                                       // No increment, we must yield the word and then continue parsing at
+                                       // the current position to prevent skipping characters.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'atSign') {
+                               if (preg_match('/[0-9]/', $char)) {
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'finish';
+                                       // No increment, we must yield the word and then continue parsing at
+                                       // the current position to prevent skipping characters.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'finish') {
+                               // Yield only if the word is non-empty.
+                               if ($word) {
+                                       yield [$prefix, $word, $suffix];
+                               }
+                               
+                               $state = 'beforePrefix';
+                               $word = "";
+                               $isQuoted = null;
+                               $prefix = null;
+                               $suffix = null;
+                               
+                               // It's a bit unclear what we need to do for the percent sign.
+                               // It may not appear within a word, but it is no legal operator either.
+                               // Just skip it here to prevent infinite loops, due to no state making
+                               // progress at the percent sign.
+                               if ($char === '%') {
+                                       $i++;
+                               }
+                               
+                               // No increment, we must interpret the current character as a prefix.
+                               continue;
+                       }
+                       else {
+                               throw new \Exception('Unreachable');
+                       }
                 }
                 
-               // handle last char
-               if (!$inQuotes && !$controlCharacterOrSpace) {
-                       $tmp .= '*';
+               // Yield only if the word is non-empty.
+               if ($word) {
+                       // Add missing quote.
+                       if ($isQuoted && substr($word, -1) !== '"') {
+                               $word .= '"';
+                       }
+                       
+                       yield [$prefix, $word, $suffix];
                 }
                 
-               return $tmp;
+               // Yield the remaining closing parentheses.
+               while ($parentheses-- > 0) {
+                       yield ['', ')', ''];
+               }
         }
         
         /**
author	Tim Düsterhus <duesterhus@woltlab.com>
	Tue, 22 Dec 2020 10:59:00 +0000 (11:59 +0100)
committer	Tim Düsterhus <duesterhus@woltlab.com>
	Wed, 23 Dec 2020 09:20:50 +0000 (10:20 +0100)