From: Tim Düsterhus Date: Fri, 29 Jul 2016 15:12:59 +0000 (+0200) Subject: Merge branch 'master' into next X-Git-Tag: 3.0.0_Beta_1~911 X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=0777d51f46485a21ca41a447fe0fe56f31244ce9;p=GitHub%2FWoltLab%2FWCF.git Merge branch 'master' into next --- 0777d51f46485a21ca41a447fe0fe56f31244ce9 diff --cc wcfsetup/install/files/lib/system/bbcode/PreParser.class.php index a28801bdf3,de6194ebcf..bd9c31fcb9 --- a/wcfsetup/install/files/lib/system/bbcode/PreParser.class.php +++ b/wcfsetup/install/files/lib/system/bbcode/PreParser.class.php @@@ -166,123 -161,6 +166,127 @@@ class PreParser extends SingletonFactor $this->text = $urlPattern->replace($this->text, $callback); } + /** + * Parses user mentions. + * + * @since 3.0 + */ + protected function parseUserMentions() { + static $userRegex = null; + if ($userRegex === null) { + $userRegex = new Regex(" - (?:^|(?<=\s|\])) # either at start of string, or after whitespace ++ (?:^|(?<=\s|\])) # either at start of string, or after whitespace + @ + ( - ([^',\s][^,\s]{2,})(?:\s[^,\s]+)? # either at most two strings, not containing - # whitespace or the comma, not starting with a single quote - # separated by a single whitespace character ++ ([^',\s][^,\s]{2,})(?:\s[^@,\s][^,\s]*)? # either at most two strings, ++ # not containing the whitespace or the comma, ++ # not starting with a single quote ++ # the second string not starting with the at sign ++ # separated by a single whitespace character + | - '(?:''|[^']){3,}' # or a string delimited by single quotes ++ '(?:''|[^']){3,}' # or a string delimited by single quotes + ) + ", Regex::IGNORE_WHITESPACE); + } + + // cache quotes + // @see \wcf\system\bbcode\BBCodeParser::buildTagArray() + $pattern = '~\[(?:/(?:quote)|(?:quote) + (?:= + (?:\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'|[^,\]]*) + (?:,(?:\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'|[^,\]]*))* + )?)\]~ix'; + preg_match_all($pattern, $this->text, $quoteMatches); + $textArray = preg_split($pattern, $this->text); + $text = $textArray[0]; + + $openQuotes = 0; + $quote = ''; + foreach ($quoteMatches[0] as $i => $quoteTag) { + if (mb_substr($quoteTag, 1, 1) == '/') { + $openQuotes--; + + $quote .= $quoteTag; + if ($openQuotes) { + $quote .= $textArray[$i + 1]; + } + else { + $text .= StringStack::pushToStringStack($quote, 'preParserUserMentions', '@@@').$textArray[$i + 1]; + $quote = ''; + } + } + else { + $openQuotes++; + $quote .= $quoteTag.$textArray[$i + 1]; + } + } + + if ($quote) { + $text .= $quote; + } + + $userRegex->match($text, true, Regex::ORDER_MATCH_BY_SET); + $matches = $userRegex->getMatches(); + + if (!empty($matches)) { + $usernames = []; + foreach ($matches as $match) { + // we don't care about the full match + array_shift($match); + + foreach ($match as $username) { + $username = self::getUsername($username); + if (!in_array($username, $usernames)) $usernames[] = $username; + } + } + + if (!empty($usernames)) { + // fetch users + $userList = new UserList(); + $userList->getConditionBuilder()->add('user_table.username IN (?)', [$usernames]); + $userList->readObjects(); + $users = []; + foreach ($userList as $user) { + $users[mb_strtolower($user->username)] = $user; + } + + $text = $userRegex->replace($text, new Callback(function ($matches) use ($users) { + // containing the full match - $usernames = [$matches[1]]; ++ $usernames = [ ++ 'full' => $matches[1] ++ ]; + + // containing only the part before the first space - if (isset($matches[2])) $usernames[] = $matches[2]; ++ if (isset($matches[2])) $usernames['part'] = $matches[2]; + + $usernames = array_map([PreParser::class, 'getUsername'], $usernames); + - foreach ($usernames as $username) { ++ foreach ($usernames as $type => $username) { + if (!isset($users[$username])) continue; + $link = LinkHandler::getInstance()->getLink('User', [ + 'appendSession' => false, + 'object' => $users[$username] + ]); + + $mention = "[url='".$link."']@".$users[$username]->username.'[/url]'; + + // check if only the part before the first space matched, in that case append the second word - if (isset($matches[2]) && strcasecmp($matches[2], $username) === 0) { - $mention .= mb_substr($matches[1], strlen($matches[2])); ++ if ($type === 'part') { ++ $mention .= mb_substr($matches[1], mb_strlen($matches[2])); + } + + return $mention; + } + + return $matches[0]; + })); + } + } + + // reinsert cached quotes + $this->text = StringStack::reinsertStrings($text, 'preParserUserMentions'); + } + /** * Caches code bbcodes to avoid parsing inside them. */ diff --cc wcfsetup/install/files/lib/system/html/input/node/HtmlInputNodeTextParser.class.php index 80a3e80ce2,0000000000..f7ee2238fc mode 100644,000000..100644 --- a/wcfsetup/install/files/lib/system/html/input/node/HtmlInputNodeTextParser.class.php +++ b/wcfsetup/install/files/lib/system/html/input/node/HtmlInputNodeTextParser.class.php @@@ -1,579 -1,0 +1,581 @@@ + + * @package WoltLabSuite\Core\System\Html\Input\Node + * @since 3.0 + */ +class HtmlInputNodeTextParser { + /** + * list of markers per element that will face a replacement + * @var \DOMElement[][] + */ + protected $elementStack = []; + + /** + * @var HtmlInputNodeProcessor + */ + protected $htmlInputNodeProcessor; + + /** + * list of text nodes that will face a replacement + * @var \DOMText[] + */ + protected $nodeStack = []; + + /** + * list of smilies by smiley code + * @var string[] + */ + protected $smilies = []; + + /** + * @var string[] + */ + protected $sourceBBCodes = []; + + /** + * forbidden characters + * @var string + */ + protected static $illegalChars = '[^\x0-\x2C\x2E\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+'; + + /** + * regex for user mentions + * @var string + */ + protected static $userRegex = "~ + \\B # any non-word character, whitespace, string start is fine + @ + ( - ([^',\\s][^,\\s]{2,})(?:\\s[^,\\s]+)? # either at most two strings, not containing - # whitespace or the comma, not starting with a single quote - # separated by a single whitespace character ++ ([^',\s][^,\s]{2,})(?:\s[^@,\s][^,\s]*)? # either at most two strings, ++ # not containing the whitespace or the comma, ++ # not starting with a single quote ++ # the second string not starting with the at sign ++ # separated by a single whitespace character + | - '(?:''|[^']){3,}' # or a string delimited by single quotes ++ '(?:''|[^']){3,}' # or a string delimited by single quotes + ) + ~x"; + + /** + * HtmlInputNodeTextParser constructor. + * + * @param HtmlInputNodeProcessor $htmlInputNodeProcessor + */ + public function __construct(HtmlInputNodeProcessor $htmlInputNodeProcessor) { + $this->htmlInputNodeProcessor = $htmlInputNodeProcessor; + $this->sourceBBCodes = HtmlBBCodeParser::getInstance()->getSourceBBCodes(); + + if (MODULE_SMILEY) { + // get smilies + $smilies = SmileyCache::getInstance()->getSmilies(); + $categories = SmileyCache::getInstance()->getCategories(); + + foreach ($smilies as $categoryID => $categorySmilies) { + if ($categories[$categoryID ?: null]->isDisabled) continue; + + /** @var Smiley $smiley */ + foreach ($categorySmilies as $smiley) { + foreach ($smiley->smileyCodes as $smileyCode) { + $this->smilies[$smileyCode] = $smiley->getURL(); + } + } + } + + uksort($this->smilies, function($a, $b) { + $lengthA = mb_strlen($a); + $lengthB = mb_strlen($b); + + if ($lengthA < $lengthB) { + return 1; + } + else if ($lengthA === $lengthB) { + return 0; + } + + return -1; + }); + } + } + + /** + * Parses all text nodes searching for possible replacements. + */ + public function parse() { + // get all text nodes + $nodes = []; + /** @var \DOMText $node */ + foreach ($this->htmlInputNodeProcessor->getXPath()->query('//text()') as $node) { + $value = StringUtil::trim($node->textContent); + if (empty($value)) { + // skip empty nodes + continue; + } + + // check if node is within a code element or link + if ($this->hasCodeParent($node) || $this->hasLinkParent($node)) { + continue; + } + + $nodes[] = $node; + } + + // search for mentions, this step is separated to reduce the + // impact of querying the database for many matches + $usernames = []; + for ($i = 0, $length = count($nodes); $i < $length; $i++) { + /** @var \DOMText $node */ + $node = $nodes[$i]; + + $this->detectMention($node, $node->textContent, $usernames); + } + + $users = []; + if (!empty($usernames)) { + $users = $this->lookupUsernames($usernames); + } + + $allowEmail = BBCodeHandler::getInstance()->isAvailableBBCode('email'); + $allowMedia = BBCodeHandler::getInstance()->isAvailableBBCode('media'); + $allowURL = BBCodeHandler::getInstance()->isAvailableBBCode('url'); + + for ($i = 0, $length = count($nodes); $i < $length; $i++) { + /** @var \DOMText $node */ + $node = $nodes[$i]; + $oldValue = $value = $node->textContent; + + if (!empty($users)) { + $value = $this->parseMention($node, $value, $users); + } + + if ($allowURL || $allowMedia) { + $value = $this->parseURL($node, $value, $allowURL, $allowMedia); + } + + if ($allowEmail) { + $value = $this->parseEmail($node, $value); + } + + $value = $this->parseSmiley($node, $value); + + if ($value !== $oldValue) { + $node->textContent = $value; + } + } + + // replace matches + for ($i = 0, $length = count($this->nodeStack); $i < $length; $i++) { + $this->replaceMatches($this->nodeStack[$i], $this->elementStack[$i]); + } + } + + /** + * Detects mentions in text nodes. + * + * @param \DOMText $text text node + * @param string $value node value + * @param string[] $usernames list of already found usernames + */ + protected function detectMention(\DOMText $text, $value, array &$usernames) { + if (mb_strpos($value, '@') === false) { + return; + } + + if (preg_match_all(self::$userRegex, $value, $matches, PREG_PATTERN_ORDER)) { + // $i = 1 to skip the full match + for ($i = 1, $length = count($matches); $i < $length; $i++) { + for ($j = 0, $innerLength = count($matches[$i]); $j < $innerLength; $j++) { + if ($matches[$i][$j] === '') { + continue; + } + + $username = $this->getUsername($matches[$i][$j]); + if (!isset($usernames[$username])) { + $usernames[$username] = $username; + } + } + } + } + } + + /** + * Matches the found usernames agains the user table. + * + * @param string[] $usernames list of found usernames + * @return string[] list of valid usernames + */ + protected function lookupUsernames(array $usernames) { + $exactValues = []; + $likeValues = []; + foreach ($usernames as $username) { + if (mb_strpos($username, ' ') !== false) { + // string contains a whitespace, account for names that + // are built up with more than two words + $likeValues[] = $username; + } + else { + $exactValues[] = $username; + } + } + + $conditions = new PreparedStatementConditionBuilder(true, 'OR'); + + if (!empty($exactValues)) { + $conditions->add('username IN (?)', [$exactValues]); + } + + if (!empty($likeValues)) { + for ($i = 0, $length = count($likeValues); $i < $length; $i++) { + $conditions->add('username LIKE ?', [str_replace('%', '', $likeValues[$i]) . '%']); + } + } + + $sql = "SELECT userID, username + FROM wcf".WCF_N."_user + ".$conditions; + $statement = WCF::getDB()->prepareStatement($sql); + $statement->execute($conditions->getParameters()); + $users = $statement->fetchMap('userID', 'username'); + + // sort usernames with the longest one being first + uasort($users, function($a, $b) { + $lengthA = mb_strlen($a); + $lengthB = mb_strlen($b); + + if ($lengthA < $lengthB) { + return 1; + } + else if ($lengthA === $lengthB) { + return 0; + } + + return -1; + }); + + return $users; + } + + /** + * Parses text nodes and searches for mentions. + * + * @param \DOMText $text text node + * @param string $value node value + * @param string[] $users list of usernames by user id + * @return string modified node value with replacement placeholders + */ + protected function parseMention(\DOMText $text, $value, array $users) { + if (mb_strpos($value, '@') === false) { + return $value; + } + + foreach ($users as $userID => $username) { + do { + $needle = '@' . $username; + $pos = mb_stripos($value, $needle); + + // username not found, maybe it is quoted + if ($pos === false) { + $needle = "@'" . str_ireplace("'", "''", $username) . "'"; + $pos = mb_stripos($value, $needle); + } + + if ($pos !== false) { + $element = $text->ownerDocument->createElement('woltlab-mention'); + $element->setAttribute('data-user-id', $userID); + $element->setAttribute('data-username', $username); + + $marker = $this->addReplacement($text, $element); + + // we use preg_replace() because the username could appear multiple times + // and we need to replace them one by one, also avoiding only replacing + // the non-quoted username even though both variants are present + $value = preg_replace('~' . preg_quote($needle, '~') . '~i', $marker, $value, 1); + } + } + while ($pos); + } + + return $value; + } + + /** + * Parses regular links and media links contained in text nodes. + * + * @param \DOMText $text text node + * @param string $value node value + * @param boolean $allowURL url bbcode is allowed + * @param boolean $allowMedia media bbcode is allowed + * @return string modified node value with replacement placeholders + */ + protected function parseURL(\DOMText $text, $value, $allowURL, $allowMedia) { + static $urlPattern = ''; + if ($urlPattern === '') { + $urlPattern = '~ + (?()\[\]{}\s]* + (?: + [!.,?;(){}]+ [^!.,?;"\'<>()\[\]{}\s]+ + )* + )?~ix'; + } + + return preg_replace_callback($urlPattern, function($matches) use ($text, $allowURL, $allowMedia) { + $link = $matches[0]; + + if (BBCodeMediaProvider::isMediaURL($link)) { + if ($allowMedia) { + $element = $this->htmlInputNodeProcessor->createMetacodeElement($text, 'media', [$link]); + } + else { + return $matches[0]; + } + } + else { + if ($allowURL) { + $element = $text->ownerDocument->createElement('a'); + $element->setAttribute('href', $link); + $element->textContent = $link; + } + else { + return $matches[0]; + } + } + + return $this->addReplacement($text, $element); + }, $value); + } + + /** + * Parses text nodes and replaces email addresses. + * + * @param \DOMText $text text node + * @param string $value node value + * @return string modified node value with replacement placeholders + */ + protected function parseEmail(\DOMText $text, $value) { + if (mb_strpos($value, '@') === false) { + return $value; + } + + static $emailPattern = null; + if ($emailPattern === null) { + $emailPattern = '~ + (?htmlInputNodeProcessor->createMetacodeElement($text, 'email', [$email]); + + return $this->addReplacement($text, $element); + }, $value); + } + + /** + * Parses text nodes and replaces smilies. + * + * @param \DOMText $text text node + * @param string $value node value + * @return string modified node value with replacement placeholders + */ + protected function parseSmiley(\DOMText $text, $value) { + static $smileyPattern = null; + if ($smileyPattern === null) { + foreach ($this->smilies as $smileyCode => $url) { + $smileyCode = preg_quote($smileyCode, '~'); + + if (!preg_match('~^\\\:.+\\\:$~', $smileyCode)) { + $smileyCode = '\B' . $smileyCode . '\B'; + } + + if (!empty($smileyPattern)) $smileyPattern .= '|'; + $smileyPattern .= $smileyCode; + } + + $smileyPattern = '~(' . $smileyPattern . ')~'; + } + + return preg_replace_callback($smileyPattern, function($matches) use ($text) { + $smileyCode = $matches[0]; + + $element = $text->ownerDocument->createElement('img'); + $element->setAttribute('src', $this->smilies[$smileyCode]); + $element->setAttribute('class', 'smiley'); + $element->setAttribute('alt', $smileyCode); + + return $this->addReplacement($text, $element); + }, $value); + } + + /** + * Replaces all found occurences of special text with their new value. + * + * @param \DOMText $text text node + * @param \DOMElement[] $elements elements to be inserted + */ + protected function replaceMatches(\DOMText $text, array $elements) { + $nodes = [$text]; + + foreach ($elements as $marker => $element) { + for ($i = 0, $length = count($nodes); $i < $length; $i++) { + /** @var \DOMText $node */ + $node = $nodes[$i]; + $value = $node->textContent; + + if (($pos = mb_strpos($value, $marker)) !== false) { + // move text in front of the marker into a new text node, + // unless the position is 0 which means there is nothing + if ($pos !== 0) { + $newNode = $node->ownerDocument->createTextNode(mb_substr($value, 0, $pos)); + $node->parentNode->insertBefore($newNode, $node); + + // add new text node to the stack as it may contain other markers + $nodes[] = $newNode; + $length++; + } + + $node->parentNode->insertBefore($element, $node); + + // modify text content of existing text node + $node->textContent = mb_substr($value, $pos + strlen($marker)); + } + } + } + } + + /** + * Returns true if text node is inside a code element, suppresing any + * auto-detection of content. + * + * @param \DOMText $text text node + * @return boolean true if text node is inside a code element + */ + protected function hasCodeParent(\DOMText $text) { + $parent = $text; + /** @var \DOMElement $parent */ + while ($parent = $parent->parentNode) { + $nodeName = $parent->nodeName; + if ($nodeName === 'code' || $nodeName === 'kbd' || $nodeName === 'pre') { + return true; + } + else if ($nodeName === 'woltlab-metacode' && in_array($parent->getAttribute('data-name'), $this->sourceBBCodes)) { + return true; + } + } + + return false; + } + + /** + * Returns true if text node is inside a link, preventing the link content + * being recognized as a link again. + * + * @param \DOMText $text text node + * @return boolean true if text node is inside a link + */ + protected function hasLinkParent(\DOMText $text) { + $parent = $text; + /** @var \DOMElement $parent */ + while ($parent = $parent->parentNode) { + $nodeName = $parent->nodeName; + if ($nodeName === 'a') { + return true; + } + } + + return false; + } + + /** + * Uses string markers to replace the matched text. This process prevents multiple + * detections being applied to the same target and enables us to delay replacement. + * + * Immediately replacing matches would potentially cause a lot of DOM modifications + * and moving of nodes especially if there are multiple matches per text node. + * + * @param \DOMText $text text node + * @param \DOMElement $element element queued for insertion + * @return string replacement marker + */ + public function addReplacement(\DOMText $text, \DOMElement $element) { + $index = array_search($text, $this->nodeStack, true); + if ($index === false) { + $index = count($this->nodeStack); + + $this->nodeStack[$index] = $text; + $this->elementStack[$index] = []; + } + + $marker = $this->getNewMarker(); + $this->elementStack[$index][$marker] = $element; + + return $marker; + } + + /** + * Returns a random string marker for replacement. + * + * @return string random string marker + */ + public function getNewMarker() { + return '@@@' . StringUtil::getUUID() . '@@@'; + } + + /** + * Returns the username for the given regular expression match and takes care + * of any quotes outside the username and certain special characters, such as + * colons, that have been incorrectly matched. + * + * @param string $match matched username + * @return string sanitized username + */ + public function getUsername($match) { + // remove escaped single quotation mark + $match = str_replace("''", "'", $match); + + // remove single quotation marks + if ($match{0} == "'") { + $match = mb_substr($match, 1, -1); + } + else { + // remove characters that might be at the end of our match + // but are not part of the username itself such as a colon + // rtrim() is not binary safe + $match = preg_replace('~[:;,.)]$~', '', $match); + } + + return mb_strtolower($match); + } +}