Refactor search query parser of MysqlSearchEngine
author Tim Düsterhus <duesterhus@woltlab.com>
Tue, 22 Dec 2020 10:59:00 +0000 (11:59 +0100)
committer Tim Düsterhus <duesterhus@woltlab.com>
Wed, 23 Dec 2020 09:20:50 +0000 (10:20 +0100)
This new parser was written against MySQL's Yacc / Bison grammar and should be
much more robust, because it also understands MySQL's semantics properly.

This new parser consists of two parts:
1. Split the query into separate terms like MySQL would do.
2. Modify these terms to improve the user experience
   (e.g. by adding the asterisk wildcard).

The result of this change should be that the search engine always generates
queries that are compatible with InnoDB based fulltext indices.

This is related to #3404.

wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php

index c0899a49112511f5cc941fd07855906cf9d15344..8aba719429d6d6b0569d1fa6fae243795a075a24 100644 (file)
@@ -126,93 +126,304 @@ class MysqlSearchEngine extends AbstractSearchEngine {
        }
        
        /**
-        * Manipulates the search term (< and > used as quotation marks):
+        * Manipulates the search term by adding prefixes and suffixes.
         * 
-        * - <test foo> becomes <+test* +foo*>
-        * - <test -foo bar> becomes <+test* -foo* +bar*>
-        * - <test "foo bar"> becomes <+test* +"foo bar">
-        * 
-        * @see http://dev.mysql.com/doc/refman/5.5/en/fulltext-boolean.html
-        * 
-        * @param       string          $query
-        * @return      string
+        * - `test foo` becomes `+test* +foo*`
+        * - `test -foo bar` becomes `+test* -foo +bar*`
+        * - `test <foo bar` becomes `+test* <foo* +bar*`
+        * - `test "foo bar"` becomes `+test* +"foo bar"`
         */
        protected function parseSearchQuery($query) {
                $query = StringUtil::trim($query);
                
-               // expand search terms with a * unless they're encapsulated with quotes
-               $inQuotes = false;
-               $previousChar = $tmp = '';
-               $controlCharacterOrSpace = false;
-               $chars = ['+', '-', '*'];
-               $ftMinWordLen = $this->getFulltextMinimumWordLength();
-               for ($i = 0, $length = mb_strlen($query); $i < $length; $i++) {
-                       $char = mb_substr($query, $i, 1);
+               $result = [];
+               foreach ($this->splitIntoTerms($query) as $term) {
+                       [$prefix, $word, $suffix] = $term;
                        
-                       if ($inQuotes) {
-                               if ($char == '"') {
-                                       $inQuotes = false;
+                       // Ignore parentheses.
+                       if ($word === '(' || $word === ')') {
+                               continue;
+                       }
+                       
+                       // Add a '+' prefix if no prefix is given.
+                       if (!$prefix) {
+                               $prefix = '+';
+                       }
+                       if (!$suffix) {
+                               // Add a '*' suffix if no suffix is given,
+                               // - the word is not quoted, and
+                               // - the prefix is not '-'.
+                               if ($word[0] !== '"' && $prefix !== '-') {
+                                       $suffix = '*';
                                }
                        }
-                       else {
-                               if ($char == '"') {
-                                       $inQuotes = true;
+                       
+                       $result[] = $prefix.$word.$suffix;
+               }
+               
+               return implode(' ', $result);
+       }
+       
+       /**
+        * Parses the query into separate search terms.
+        * 
+        * The parser is based off the original InnoDB search query parser with
+        * a small difference: Prefixes are only understood if they stand right
+        * beside the search term. InnoDB allows an arbitrary amount of whitespace
+        * after the prefix, leading to unexpected results if the search query
+        * was copied from a sentence that uses the dash as word separator.
+        * 
+        * The resulting terms should not be split by MySQL when concatenated
+        * with spaces and neither should they cause syntax errors.
+        * 
+        * Examples:
+        * 
+        * Query: `Apfel - Banane`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `Apfel -Banane`
+        * Word: |Apfel|
+        * Word: -|Banane|
+        * 
+        * Query: ` Apfel `
+        * Word: |Apfel|
+        * 
+        * Query: ` Apfel Banane `
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `Apfel*`
+        * Word: |Apfel|*
+        * 
+        * Query: `Apfel *`
+        * Word: |Apfel|
+        * 
+        * Query: `Apfel * Banane`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `+-"Apfel Banane"*`
+        * Word: -|"Apfel Banane"|
+        * 
+        * Query: `Äpfel Bananen`
+        * Word: |Äpfel|
+        * Word: |Bananen|
+        * 
+        * Query: `+-*`
+        * 
+        * Query: `"Apfel`
+        * Word: |"Apfel"|
+        * 
+        * Query: `"Apfel Banane" @8`
+        * Word: |"Apfel Banane"|
+        * 
+        * Query: `Apfel Banane @8`
+        * Word: |Apfel|
+        * Word: |Banane|
+        * 
+        * Query: `+((+Apfel -Banane) (-Apfel +Banane)) >Clementine`
+        * Word: +|(|
+        * Word: |(|
+        * Word: +|Apfel|
+        * Word: -|Banane|
+        * Word: |)|
+        * Word: |(|
+        * Word: -|Apfel|
+        * Word: +|Banane|
+        * Word: |)|
+        * Word: |)|
+        * Word: >|Clementine|
+        * 
+        * @see https://dev.mysql.com/doc/refman/8.0/en/fulltext-boolean.html
+        * @see https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0pars.y
+        * @see https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0blex.l
+        */
+       protected function splitIntoTerms($query) {
+               $state = 'beforePrefix';
+               
+               $parentheses = 0;
+               $word = "";
+               $isQuoted = null;
+               $prefix = null;
+               $suffix = null;
+               
+               for ($i = 0, $max = strlen($query); $i < $max;) {
+                       $char = $query[$i];
+                       
+                       // Treat ASCII control characters as spaces.
+                       if (ord($query[$i]) < 0x20 || ord($query[$i]) == 0x7f) {
+                               $char = " ";
+                       }
+                       
+                       if ($state === 'beforePrefix') {
+                               // Skip Whitespace.
+                               if (in_array($char, [
+                                       ' ',
+                                       "\t"
+                               ])) {
+                                       $i++;
+                                       continue;
                                }
-                               else {
-                                       if ($char == ' ' && !$controlCharacterOrSpace) {
-                                               $controlCharacterOrSpace = true;
-                                               $tmp .= '*';
+                               
+                               // After a word is before a word. Handle the closing parenthesis
+                               // early on to avoid needing to go through all the states.
+                               if ($char === ')') {
+                                       if ($parentheses > 0) {
+                                               $word = ')';
                                        }
-                                       else if (in_array($char, $chars)) {
-                                               $controlCharacterOrSpace = true;
+                                       $parentheses--;
+                                       $i++;
+                                       $state = 'finish';
+                                       continue;
+                               }
+                               
+                               $state = 'prefix';
+                               
+                               // No increment, we must interpret the current character as a prefix.
+                               continue;
+                       }
+                       else if ($state === 'prefix') {
+                               if (in_array($char, [
+                                       '-',
+                                       '+',
+                                       '~',
+                                       '<',
+                                       '>',
+                               ])) {
+                                       // The last prefix character wins.
+                                       $prefix = $char;
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'word';
+                                       // No increment, we must interpret the current character as a word.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'word') {
+                               // Parentheses might have a prefix, so we handle them
+                               // inside of the 'word' state.
+                               if ($char === '(') {
+                                       $word = '(';
+                                       $parentheses++;
+                                       $i++;
+                                       
+                                       // Immediately go to the finish to allow for parsing the prefix
+                                       // of the first word within the parenthesis.
+                                       $state = 'finish';
+                                       continue;
+                               }
+                               
+                               // Check whether this word is quoted.
+                               if ($isQuoted === null) {
+                                       if ($char === '"') {
+                                               $isQuoted = true;
+                                               $word .= $char;
+                                               $i++;
+                                               continue;
                                        }
                                        else {
-                                               $controlCharacterOrSpace = false;
+                                               $isQuoted = false;
                                        }
                                }
-                       }
-                       
-                       /*
-                        * prepend a plus sign (logical AND) if ALL these conditions are given:
-                        * 
-                        * 1) previous character:
-                        *   - is empty (start of string)
-                        *   - is a space (MySQL uses spaces to separate words)
-                        * 
-                        * 2) not within quotation marks
-                        * 
-                        * 3) current char:
-                        *   - is NOT +, - or *
-                        */
-                       if (($previousChar == '' || $previousChar == ' ') && !$inQuotes && !in_array($char, $chars)) {
-                               // check if the term is shorter than the minimum fulltext word length
-                               if ($i + $ftMinWordLen <= $length) {
-                                       $term = '';// $char;
-                                       for ($j = $i, $innerLength = $ftMinWordLen + $i; $j < $innerLength; $j++) {
-                                               $currentChar = mb_substr($query, $j, 1);
-                                               if ($currentChar == '"' || $currentChar == ' ' || in_array($currentChar, $chars)) {
-                                                       break;
-                                               }
-                                               
-                                               $term .= $currentChar;
+                               
+                               if ($isQuoted) {
+                                       $word .= $char;
+                                       if ($char === '"') {
+                                               $state = 'suffix';
                                        }
-                                       
-                                       if (mb_strlen($term) == $ftMinWordLen) {
-                                               $tmp .= '+';
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       if (preg_match('/[^" \n*()+\-<>~@%]/', $char)) {
+                                               $word .= $char;
+                                               $i++;
+                                               continue;
+                                       }
+                                       else {
+                                               $state = 'suffix';
+                                               // No increment, we must interpret the current character as a suffix.
+                                               continue;
                                        }
                                }
                        }
-                       
-                       $tmp .= $char;
-                       $previousChar = $char;
+                       else if ($state === 'suffix') {
+                               if (!$isQuoted && in_array($char, [
+                                       '*'
+                               ])) {
+                                       $suffix = $char;
+                                       $i++;
+                                       continue;
+                               }
+                               else if ($char == '@') {
+                                       $state = 'atSign';
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'finish';
+                                       // No increment, we must yield the word and then continue parsing at
+                                       // the current position to prevent skipping characters.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'atSign') {
+                               if (preg_match('/[0-9]/', $char)) {
+                                       $i++;
+                                       continue;
+                               }
+                               else {
+                                       $state = 'finish';
+                                       // No increment, we must yield the word and then continue parsing at
+                                       // the current position to prevent skipping characters.
+                                       continue;
+                               }
+                       }
+                       else if ($state === 'finish') {
+                               // Yield only if the word is non-empty.
+                               if ($word) {
+                                       yield [$prefix, $word, $suffix];
+                               }
+                               
+                               $state = 'beforePrefix';
+                               $word = "";
+                               $isQuoted = null;
+                               $prefix = null;
+                               $suffix = null;
+                               
+                               // It's a bit unclear what we need to do for the percent sign.
+                               // It may not appear within a word, but it is no legal operator either.
+                               // Just skip it here to prevent infinite loops, due to no state making
+                               // progress at the percent sign.
+                               if ($char === '%') {
+                                       $i++;
+                               }
+                               
+                               // No increment, we must interpret the current character as a prefix.
+                               continue;
+                       }
+                       else {
+                               throw new \Exception('Unreachable');
+                       }
                }
                
-               // handle last char
-               if (!$inQuotes && !$controlCharacterOrSpace) {
-                       $tmp .= '*';
+               // Yield only if the word is non-empty.
+               if ($word) {
+                       // Add missing quote.
+                       if ($isQuoted && substr($word, -1) !== '"') {
+                               $word .= '"';
+                       }
+                       
+                       yield [$prefix, $word, $suffix];
                }
                
-               return $tmp;
+               // Yield the remaining closing parentheses.
+               while ($parentheses-- > 0) {
+                       yield ['', ')', ''];
+               }
        }
        
        /**