From c36b70769fd366597d463064b29afc87fb2bcefe Mon Sep 17 00:00:00 2001
From: =?utf8?q?Tim=20D=C3=BCsterhus?= <duesterhus@woltlab.com>
Date: Tue, 22 Dec 2020 11:59:00 +0100
Subject: [PATCH] Refactor search query parse of MysqlSearchEngine

This new parser was written against MySQL's Yacc / Bison grammar and should be
much more robust, because it also understand MySQL's semantics properly.

This new parser consists of two parts:
1. Split the query into separate terms like MySQL would do.
2. Modify these terms to improve the user experience
   (e.g. by adding the asterisk wildcard).

The result of this change should be that the search engine always generates
queries that are compatible with InnoDB based fulltext indices.

This is related to #3404.
---
 .../search/mysql/MysqlSearchEngine.class.php  | 341 ++++++++++++++----
 1 file changed, 276 insertions(+), 65 deletions(-)
diff --git a/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php b/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php
index c0899a4911..8aba719429 100644
--- a/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php
+++ b/wcfsetup/install/files/lib/system/search/mysql/MysqlSearchEngine.class.php
@@ -126,93 +126,304 @@ class MysqlSearchEngine extends AbstractSearchEngine {
 	}
 	
 	/**
-	 * Manipulates the search term (< and > used as quotation marks):
+	 * Manipulates the search term by adding prefixes and suffixes.
 	 * 
-	 * - <test foo> becomes <+test* +foo*>
-	 * - <test -foo bar> becomes <+test* -foo* +bar*>
-	 * - <test "foo bar"> becomes <+test* +"foo bar">
-	 * 
-	 * @see	http://dev.mysql.com/doc/refman/5.5/en/fulltext-boolean.html
-	 * 
-	 * @param	string		$query
-	 * @return	string
+	 * - `test foo` becomes `+test* +foo*`
+	 * - `test -foo bar` becomes `+test* -foo +bar*`
+	 * - `test <foo bar` becomes `+test* <foo* +bar*`
+	 * - `test "foo bar"` becomes `+test* +"foo bar"`
 	 */
 	protected function parseSearchQuery($query) {
 		$query = StringUtil::trim($query);
 		
-		// expand search terms with a * unless they're encapsulated with quotes
-		$inQuotes = false;
-		$previousChar = $tmp = '';
-		$controlCharacterOrSpace = false;
-		$chars = ['+', '-', '*'];
-		$ftMinWordLen = $this->getFulltextMinimumWordLength();
-		for ($i = 0, $length = mb_strlen($query); $i < $length; $i++) {
-			$char = mb_substr($query, $i, 1);
+		$result = [];
+		foreach ($this->splitIntoTerms($query) as $term) {
+			[$prefix, $word, $suffix] = $term;
 			
-			if ($inQuotes) {
-				if ($char == '"') {
-					$inQuotes = false;
+			// Ignore parentheses.
+			if ($word === '(' || $word === ')') {
+				continue;
+			}
+			
+			// Add a '+' prefix if no prefix is given.
+			if (!$prefix) {
+				$prefix = '+';
+			}
+			if (!$suffix) {
+				// Add a '*' suffix if no suffix is given,
+				// - the word is not quoted, and
+				// - the prefix is not '-'.
+				if ($word[0] !== '"' && $prefix !== '-') {
+					$suffix = '*';
 				}
 			}
-			else {
-				if ($char == '"') {
-					$inQuotes = true;
+			
+			$result[] = $prefix.$word.$suffix;
+		}
+		
+		return implode(' ', $result);
+	}
+	
+	/**
+	 * Parses the query into separate search terms.
+	 * 
+	 * The parser is based off the original InnoDB search query parser with
+	 * a small difference: Prefixes are only understood if they stand right
+	 * beside the search term. InnoDB allows an arbitrary number of whitespace
+	 * after the prefix, leading to unexpected results if the search query
+	 * was copied from a sentence that uses the dash as word separator.
+	 * 
+	 * The resulting terms should not be split by MySQL when concatenated
+	 * with spaces and neither should they cause syntax errors.
+	 * 
+	 * Examples:
+	 * 
+	 * Query: `Apfel - Banane`
+	 * Word: |Apfel|
+	 * Word: |Banane|
+	 * 
+	 * Query: `Apfel -Banane`
+	 * Word: |Apfel|
+	 * Word: -|Banane|
+	 * 
+	 * Query: ` Apfel `
+	 * Word: |Apfel|
+	 * 
+	 * Query: ` Apfel Banane `
+	 * Word: |Apfel|
+	 * Word: |Banane|
+	 * 
+	 * Query: `Apfel*`
+	 * Word: |Apfel|*
+	 * 
+	 * Query: `Apfel *`
+	 * Word: |Apfel|
+	 * 
+	 * Query: `Apfel * Banane`
+	 * Word: |Apfel|
+	 * Word: |Banane|
+	 * 
+	 * Query: `+-"Apfel Banane"*`
+	 * Word: -|"Apfel Banane"|
+	 * 
+	 * Query: `Ãpfel Bananen`
+	 * Word: |Ãpfel|
+	 * Word: |Bananen|
+	 * 
+	 * Query: `+-*`
+	 * 
+	 * Query: `"Apfel`
+	 * Word: |"Apfel"|
+	 * 
+	 * Query: `"Apfel Banane" @8`
+	 * Word: |"Apfel Banane"|
+	 * 
+	 * Query: `Apfel Banane @8`
+	 * Word: |Apfel|
+	 * Word: |Banane|
+	 * 
+	 * Query: `+((+Apfel -Banane) (-Apfel +Banane)) >Clementine`
+	 * Word: +|(|
+	 * Word: |(|
+	 * Word: +|Apfel|
+	 * Word: -|Banane|
+	 * Word: |)|
+	 * Word: |(|
+	 * Word: -|Apfel|
+	 * Word: +|Banane|
+	 * Word: |)|
+	 * Word: |)|
+	 * Word: >|Clementine|
+	 * 
+	 * @see	https://dev.mysql.com/doc/refman/8.0/en/fulltext-boolean.html
+	 * @see	https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0pars.y
+	 * @see	https://github.com/mysql/mysql-server/blob/ee4455a33b10f1b1886044322e4893f587b319ed/storage/innobase/fts/fts0blex.l
+	 */
+	protected function splitIntoTerms($query) {
+		$state = 'beforePrefix';
+		
+		$parentheses = 0;
+		$word = "";
+		$isQuoted = null;
+		$prefix = null;
+		$suffix = null;
+		
+		for ($i = 0, $max = strlen($query); $i < $max;) {
+			$char = $query[$i];
+			
+			// Treat ASCII control characters as spaces.
+			if (ord($query[$i]) < 0x20 || ord($query[$i]) == 0x7f) {
+				$char = " ";
+			}
+			
+			if ($state === 'beforePrefix') {
+				// Skip Whitespace.
+				if (in_array($char, [
+					' ',
+					"\t"
+				])) {
+					$i++;
+					continue;
 				}
-				else {
-					if ($char == ' ' && !$controlCharacterOrSpace) {
-						$controlCharacterOrSpace = true;
-						$tmp .= '*';
+				
+				// After a word is before a word. Handle the closing parenthesis
+				// early on to avoid needing through all the states.
+				if ($char === ')') {
+					if ($parentheses > 0) {
+						$word = ')';
 					}
-					else if (in_array($char, $chars)) {
-						$controlCharacterOrSpace = true;
+					$parentheses--;
+					$i++;
+					$state = 'finish';
+					continue;
+				}
+				
+				$state = 'prefix';
+				
+				// No increment, we must interpret the current character as a prefix.
+				continue;
+			}
+			else if ($state === 'prefix') {
+				if (in_array($char, [
+					'-',
+					'+',
+					'~',
+					'<',
+					'>',
+				])) {
+					// The last prefix character wins.
+					$prefix = $char;
+					$i++;
+					continue;
+				}
+				else {
+					$state = 'word';
+					// No increment, we must interpret the current character as a word.
+					continue;
+				}
+			}
+			else if ($state === 'word') {
+				// Parentheses might have a prefix, so we handle them
+				// inside of the 'word' state.
+				if ($char === '(') {
+					$word = '(';
+					$parentheses++;
+					$i++;
+					
+					// Immediately go to the finish to allow for parsing the prefix
+					// of the first word within the parenthesis.
+					$state = 'finish';
+					continue;
+				}
+				
+				// Check whether this word is quoted.
+				if ($isQuoted === null) {
+					if ($char === '"') {
+						$isQuoted = true;
+						$word .= $char;
+						$i++;
+						continue;
 					}
 					else {
-						$controlCharacterOrSpace = false;
+						$isQuoted = false;
 					}
 				}
-			}
-			
-			/*
-			 * prepend a plus sign (logical AND) if ALL these conditions are given:
-			 * 
-			 * 1) previous character:
-			 *   - is empty (start of string)
-			 *   - is a space (MySQL uses spaces to separate words)
-			 * 
-			 * 2) not within quotation marks
-			 * 
-			 * 3) current char:
-			 *   - is NOT +, - or *
-			 */
-			if (($previousChar == '' || $previousChar == ' ') && !$inQuotes && !in_array($char, $chars)) {
-				// check if the term is shorter than the minimum fulltext word length
-				if ($i + $ftMinWordLen <= $length) {
-					$term = '';// $char;
-					for ($j = $i, $innerLength = $ftMinWordLen + $i; $j < $innerLength; $j++) {
-						$currentChar = mb_substr($query, $j, 1);
-						if ($currentChar == '"' || $currentChar == ' ' || in_array($currentChar, $chars)) {
-							break;
-						}
-						
-						$term .= $currentChar;
+				
+				if ($isQuoted) {
+					$word .= $char;
+					if ($char === '"') {
+						$state = 'suffix';
 					}
-					
-					if (mb_strlen($term) == $ftMinWordLen) {
-						$tmp .= '+';
+					$i++;
+					continue;
+				}
+				else {
+					if (preg_match('/[^" \n*()+\-<>~@%]/', $char)) {
+						$word .= $char;
+						$i++;
+						continue;
+					}
+					else {
+						$state = 'suffix';
+						// No increment, we must interpret the current character as a suffix.
+						continue;
 					}
 				}
 			}
-			
-			$tmp .= $char;
-			$previousChar = $char;
+			else if ($state === 'suffix') {
+				if (!$isQuoted && in_array($char, [
+					'*'
+				])) {
+					$suffix = $char;
+					$i++;
+					continue;
+				}
+				else if ($char == '@') {
+					$state = 'atSign';
+					$i++;
+					continue;
+				}
+				else {
+					$state = 'finish';
+					// No increment, we must yield the word and then continue parsing at
+					// the current position to prevent skipping characters.
+					continue;
+				}
+			}
+			else if ($state === 'atSign') {
+				if (preg_match('/[0-9]/', $char)) {
+					$i++;
+					continue;
+				}
+				else {
+					$state = 'finish';
+					// No increment, we must yield the word and then continue parsing at
+					// the current position to prevent skipping characters.
+					continue;
+				}
+			}
+			else if ($state === 'finish') {
+				// Yield only if the word is non-empty.
+				if ($word) {
+					yield [$prefix, $word, $suffix];
+				}
+				
+				$state = 'beforePrefix';
+				$word = "";
+				$isQuoted = null;
+				$prefix = null;
+				$suffix = null;
+				
+				// It's a bit unclear what we need to do for the percent sign.
+				// It may not appear within a word, but it is no legal operator either.
+				// Just skip it here to prevent infinite loops, due to no state making
+				// progress at the percent sign.
+				if ($char === '%') {
+					$i++;
+				}
+				
+				// No increment, we must interpret the current character as a prefix.
+				continue;
+			}
+			else {
+				throw new \Exception('Unreachable');
+			}
 		}
 		
-		// handle last char
-		if (!$inQuotes && !$controlCharacterOrSpace) {
-			$tmp .= '*';
+		// Yield only if the word is non-empty.
+		if ($word) {
+			// Add missing quote.
+			if ($isQuoted && substr($word, -1) !== '"') {
+				$word .= '"';
+			}
+			
+			yield [$prefix, $word, $suffix];
 		}
 		
-		return $tmp;
+		// Yield the remaining closing parentheses.
+		while ($parentheses-- > 0) {
+			yield ['', ')', ''];
+		}
 	}
 	
 	/**
-- 
2.20.1