From 27b071fb01866d543cabd3a80369a3384736101f Mon Sep 17 00:00:00 2001 From: joshuaruesweg Date: Thu, 4 Mar 2021 14:26:57 +0100 Subject: [PATCH] Replace UnfurlUrlUtil with UnfurlResponse --- .../job/UnfurlUrlBackgroundJob.class.php | 250 ++++++++++----- .../message/unfurl/DownloadFailed.class.php | 18 ++ .../message/unfurl/ParsingFailed.class.php | 18 ++ .../message/unfurl/UnfurlResponse.class.php | 294 ++++++++++++++++++ .../message/unfurl/UrlInaccessible.class.php | 18 ++ .../files/lib/util/UnfurlUrlUtil.class.php | 254 --------------- 6 files changed, 512 insertions(+), 340 deletions(-) create mode 100644 wcfsetup/install/files/lib/system/message/unfurl/DownloadFailed.class.php create mode 100644 wcfsetup/install/files/lib/system/message/unfurl/ParsingFailed.class.php create mode 100644 wcfsetup/install/files/lib/system/message/unfurl/UnfurlResponse.class.php create mode 100644 wcfsetup/install/files/lib/system/message/unfurl/UrlInaccessible.class.php delete mode 100644 wcfsetup/install/files/lib/util/UnfurlUrlUtil.class.php diff --git a/wcfsetup/install/files/lib/system/background/job/UnfurlUrlBackgroundJob.class.php b/wcfsetup/install/files/lib/system/background/job/UnfurlUrlBackgroundJob.class.php index 954f6d4f05..c747e160d1 100644 --- a/wcfsetup/install/files/lib/system/background/job/UnfurlUrlBackgroundJob.class.php +++ b/wcfsetup/install/files/lib/system/background/job/UnfurlUrlBackgroundJob.class.php @@ -2,11 +2,16 @@ namespace wcf\system\background\job; +use BadMethodCallException; +use GuzzleHttp\Psr7\Response; use wcf\data\unfurl\url\UnfurlUrl; use wcf\data\unfurl\url\UnfurlUrlAction; +use wcf\system\message\unfurl\DownloadFailed; +use wcf\system\message\unfurl\ParsingFailed; +use wcf\system\message\unfurl\UnfurlResponse; +use wcf\system\message\unfurl\UrlInaccessible; use wcf\util\FileUtil; use wcf\util\StringUtil; -use wcf\util\UnfurlUrlUtil; /** * Represents a background job to get information for an url. 
@@ -17,12 +22,12 @@ use wcf\util\UnfurlUrlUtil; * @package WoltLabSuite\Core\System\Background\Job * @since 5.4 */ -class UnfurlUrlBackgroundJob extends AbstractBackgroundJob +final class UnfurlUrlBackgroundJob extends AbstractBackgroundJob { /** - * @var UnfurlUrl + * @var int */ - private $url; + private $urlID; /** * UnfurlURLJob constructor. @@ -31,7 +36,7 @@ class UnfurlUrlBackgroundJob extends AbstractBackgroundJob */ public function __construct(UnfurlUrl $url) { - $this->url = $url; + $this->urlID = $url->urlID; } /** @@ -57,104 +62,177 @@ class UnfurlUrlBackgroundJob extends AbstractBackgroundJob */ public function perform() { + $unfurlUrl = new UnfurlUrl($this->urlID); + try { - $url = new UnfurlUrlUtil($this->url->url); - - if (empty(StringUtil::trim($url->getTitle()))) { - $urlAction = new UnfurlUrlAction([$this->url], 'update', [ - 'data' => [ - 'title' => '', - 'description' => '', - 'status' => UnfurlUrl::STATUS_REJECTED, - ], - ]); - $urlAction->executeAction(); + $unfurlResponse = UnfurlResponse::fetch($unfurlUrl->url); + + if (empty(StringUtil::trim($unfurlResponse->getTitle()))) { + $this->save(UnfurlUrl::STATUS_REJECTED); } else { - $title = StringUtil::truncate($url->getTitle(), 255); - $description = $url->getDescription(); - $data = [ - 'title' => $title, - 'description' => $description !== null ? 
StringUtil::truncate($description, 500) : '', - 'status' => UnfurlUrl::STATUS_SUCCESSFUL, - ]; - - if ($url->getImageUrl()) { - $image = UnfurlUrlUtil::downloadImageFromUrl($url->getImageUrl()); - - if ($image !== null) { - $imageData = @\getimagesizefromstring($image); - - // filter images which are too large or too small - $isSquared = $imageData[0] === $imageData[1]; - if ( - (!$isSquared && ($imageData[0] < 300 && $imageData[1] < 150)) - || \min($imageData[0], $imageData[1]) < 50 - ) { - $data['imageType'] = UnfurlUrl::IMAGE_NO_IMAGE; - } else { - if ($imageData[0] === $imageData[1]) { - $data['imageUrl'] = $url->getImageUrl(); - $data['imageType'] = UnfurlUrl::IMAGE_SQUARED; - } else { - $data['imageUrl'] = $url->getImageUrl(); - $data['imageType'] = UnfurlUrl::IMAGE_COVER; - } + $title = StringUtil::truncate($unfurlResponse->getTitle(), 255); + if ($unfurlResponse->getDescription() !== null) { + $description = StringUtil::truncate($unfurlResponse->getDescription(), 500); + } else { + $description = ""; + } - // Download image, if there is no image proxy or external source images allowed. + if ($unfurlResponse->getImageUrl()) { + try { + $image = $this->downloadImage($unfurlResponse->getImage()); + $imageData = \getimagesizefromstring($image); + if ($imageData !== false) { + $imageType = $this->validateImage($imageData); if (!(MODULE_IMAGE_PROXY || IMAGE_ALLOW_EXTERNAL_SOURCE)) { - if (isset($data['imageType'])) { - switch ($imageData[2]) { - case \IMAGETYPE_PNG: - $extension = 'png'; - break; - case \IMAGETYPE_GIF: - $extension = 'gif'; - break; - case \IMAGETYPE_JPEG: - $extension = 'jpg'; - break; - default: - throw new \RuntimeException(); - } - - $data['imageHash'] = \sha1($image) . '.' . $extension; - - $path = WCF_DIR . 'images/unfurlUrl/' . \substr($data['imageHash'], 0, 2); - FileUtil::makePath($path); - - $fileLocation = $path . '/' . 
$data['imageHash']; - - \file_put_contents($fileLocation, $image); - - @\touch($fileLocation); - } + $imageHash = $this->saveImage($imageData, $image); + } else { + $imageHash = ""; } + } else { + $imageType = UnfurlUrl::IMAGE_NO_IMAGE; + } + + if ($imageType === UnfurlUrl::IMAGE_NO_IMAGE) { + $imageUrl = $imageHash = ""; + } else { + $imageUrl = $unfurlResponse->getImageUrl(); } + } catch (UrlInaccessible | DownloadFailed $e) { + $imageType = UnfurlUrl::IMAGE_NO_IMAGE; + $imageUrl = $imageHash = ""; } + } else { + $imageType = UnfurlUrl::IMAGE_NO_IMAGE; + $imageUrl = $imageHash = ""; } - $urlAction = new UnfurlUrlAction([$this->url], 'update', [ - 'data' => $data, - ]); - $urlAction->executeAction(); + $this->save( + UnfurlUrl::STATUS_SUCCESSFUL, + $title, + $description, + $imageType, + $imageUrl, + $imageHash + ); + } + } catch (UrlInaccessible | ParsingFailed $e) { + if (\ENABLE_DEBUG_MODE) { + \wcf\functions\exception\logThrowable($e); + } + + $this->save(UnfurlUrl::STATUS_REJECTED); + } + } + + private function downloadImage(Response $imageResponse): string + { + $image = ""; + while (!$imageResponse->getBody()->eof()) { + $image .= $imageResponse->getBody()->read(8192); + + if ($imageResponse->getBody()->tell() >= UnfurlResponse::MAX_IMAGE_SIZE) { + break; } - } catch (\InvalidArgumentException $e) { - \wcf\functions\exception\logThrowable($e); } + $imageResponse->getBody()->close(); + + return $image; } - /** - * @inheritDoc - */ - public function onFinalFailure() + private function validateImage(array $imageData): string { - $urlAction = new UnfurlUrlAction([$this->url], 'update', [ + $isSquared = $imageData[0] === $imageData[1]; + if ( + (!$isSquared && ($imageData[0] < 300 && $imageData[1] < 150)) + || \min($imageData[0], $imageData[1]) < 50 + ) { + return UnfurlUrl::IMAGE_NO_IMAGE; + } else { + if ($isSquared) { + return UnfurlUrl::IMAGE_SQUARED; + } else { + return UnfurlUrl::IMAGE_COVER; + } + } + } + + private function saveImage(array $imageData, 
string $image): string + { + switch ($imageData[2]) { + case \IMAGETYPE_PNG: + $extension = 'png'; + break; + case \IMAGETYPE_GIF: + $extension = 'gif'; + break; + case \IMAGETYPE_JPEG: + $extension = 'jpg'; + break; + + default: + throw new DownloadFailed(); + } + + $imageHash = sha1($image); + + $path = WCF_DIR . 'images/unfurlUrl/' . \substr($imageHash, 0, 2); + FileUtil::makePath($path); + + $fileLocation = $path . '/' . $imageHash . '.' . $extension; + + \file_put_contents($fileLocation, $image); + + @\touch($fileLocation); + + return $imageHash . '.' . $extension; + } + + private function save( + string $status, + string $title = "", + string $description = "", + string $imageType = UnfurlUrl::IMAGE_NO_IMAGE, + string $imageUrl = "", + string $imageHash = "" + ): void { + switch ($status) { + case UnfurlUrl::STATUS_PENDING: + case UnfurlUrl::STATUS_REJECTED: + case UnfurlUrl::STATUS_SUCCESSFUL: + break; + + default: + throw new BadMethodCallException("Invalid status '{$status}' given."); + } + + switch ($imageType) { + case UnfurlUrl::IMAGE_COVER: + case UnfurlUrl::IMAGE_NO_IMAGE: + case UnfurlUrl::IMAGE_SQUARED: + break; + + default: + throw new BadMethodCallException("Invalid imageType '{$imageType}' given."); + } + + $urlAction = new UnfurlUrlAction([$this->urlID], 'update', [ 'data' => [ - 'title' => '', - 'description' => '', - 'status' => 'REJECTED', + 'status' => $status, + 'title' => $title, + 'description' => $description, + 'imageType' => $imageType, + 'imageUrl' => $imageUrl, + 'imageHash' => $imageHash, ], ]); $urlAction->executeAction(); } + + /** + * @inheritDoc + */ + public function onFinalFailure() + { + $this->save(UnfurlUrl::STATUS_REJECTED); + } } diff --git a/wcfsetup/install/files/lib/system/message/unfurl/DownloadFailed.class.php b/wcfsetup/install/files/lib/system/message/unfurl/DownloadFailed.class.php new file mode 100644 index 0000000000..c9e776b297 --- /dev/null +++ 
b/wcfsetup/install/files/lib/system/message/unfurl/DownloadFailed.class.php @@ -0,0 +1,18 @@ + + * @package WoltLabSuite\Core\System\Message\Unfurl + * @since 5.4 + */ +class DownloadFailed extends Exception +{ +} diff --git a/wcfsetup/install/files/lib/system/message/unfurl/ParsingFailed.class.php b/wcfsetup/install/files/lib/system/message/unfurl/ParsingFailed.class.php new file mode 100644 index 0000000000..595288c551 --- /dev/null +++ b/wcfsetup/install/files/lib/system/message/unfurl/ParsingFailed.class.php @@ -0,0 +1,18 @@ + + * @package WoltLabSuite\Core\System\Message\Unfurl + * @since 5.4 + */ +class ParsingFailed extends Exception +{ +} diff --git a/wcfsetup/install/files/lib/system/message/unfurl/UnfurlResponse.class.php b/wcfsetup/install/files/lib/system/message/unfurl/UnfurlResponse.class.php new file mode 100644 index 0000000000..bd7983f660 --- /dev/null +++ b/wcfsetup/install/files/lib/system/message/unfurl/UnfurlResponse.class.php @@ -0,0 +1,294 @@ + + * @package WoltLabSuite\Core\System\Message\Unfurl + * @since 5.4 + */ +final class UnfurlResponse +{ + /** + * 10 Mebibyte + */ + private const MAX_SIZE = (10 * (1 << 20)); + + /** + * 3 Mebibyte + */ + public const MAX_IMAGE_SIZE = (3 * (1 << 20)); + + /** + * @var ClientInterface + */ + private static $httpClient; + + /** + * @var string + */ + private $url; + + /** + * @var Response + */ + private $response; + + /** + * @var \DOMDocument + */ + private $domDocument; + + /** + * Fetches a given Url and returns an UnfurlResponse instance. + * + * @throws ParsingFailed If the body cannot be parsed (e.g. the url is an image). + * @throws DownloadFailed If the url can not be downloaded. This can be a temporary error. + * @throws UrlInaccessible If the url is inaccessible (e.g. sends status code 403). + */ + public static function fetch(string $url): self + { + if (!Url::is($url)) { + throw new \InvalidArgumentException('Given URL "' . $url . 
'" is not a valid URL.'); + } + + try { + $request = new Request('GET', $url, [ + 'range' => \sprintf('bytes=%d-%d', 0, self::MAX_SIZE - 1), + ]); + $response = self::getHttpClient()->send($request); + + return new self($url, $response); + } catch (BadResponseException $e) { + $response = $e->getResponse(); + + switch ($response->getStatusCode()) { + case 400: // Bad Request + case 401: // Unauthorized + case 402: // Payment Required + case 403: // Forbidden + case 404: // Not Found + $message = "Request failed with status code {$response->getStatusCode()}."; + + throw new UrlInaccessible($message, $response->getStatusCode(), $e); + break; + + default: + throw new DownloadFailed("Could not download content.", $response->getStatusCode(), $e); + } + } catch (ClientExceptionInterface $e) { + throw new DownloadFailed("Could not download content.", 0, $e); + } + } + + /** + * @throws ParsingFailed If the body cannot be parsed (e.g. the url is an image). + * @throws DownloadFailed If the url can not be downloaded. This can be a temporary error. + */ + private function __construct(string $url, Response $response) + { + $this->url = $url; + $this->response = $response; + + $this->readBody(); + $this->readDomDocument(); + } + + /** + * Reads the body of the given url and converts the body to utf-8. + */ + private function readBody(): void + { + $this->body = ""; + while (!$this->response->getBody()->eof()) { + $this->body .= $this->response->getBody()->read(8192); + + if ($this->response->getBody()->tell() >= self::MAX_SIZE) { + break; + } + } + $this->response->getBody()->close(); + + if (\mb_detect_encoding($this->body) !== 'UTF-8') { + try { + $this->body = StringUtil::convertEncoding(\mb_detect_encoding($this->body), 'UTF-8', $this->body); + } catch (Exception $e) { + throw new ParsingFailed( + "Could not parse body, due an invalid charset. The Url could be an image.", + 0, + $e + ); + } + } + } + + /** + * Creates the DomDocument. 
+ * + * @throws ParsingFailed If the body cannot be parsed (e.g. the url is a JSON file). + */ + private function readDomDocument(): void + { + \libxml_use_internal_errors(true); + $this->domDocument = new \DOMDocument(); + if (!$this->domDocument->loadHTML('' . $this->body)) { + throw new ParsingFailed("Could not parse body."); + } + } + + /** + * Determines the title of the website. + */ + public function getTitle(): ?string + { + if (!empty($this->body)) { + $metaTags = $this->domDocument->getElementsByTagName('meta'); + + // og + foreach ($metaTags as $metaTag) { + foreach ($metaTag->attributes as $attr) { + if ($attr->nodeName == 'property' && $attr->value == 'og:title') { + foreach ($attr->parentNode->attributes as $attr) { + if ($attr->nodeName == 'content') { + return $attr->value; + } + } + } + } + } + + // title tag + $title = $this->domDocument->getElementsByTagName('title'); + if ($title->length) { + return $title->item(0)->nodeValue; + } + } + + return null; + } + + /** + * Determines the description of the website. + */ + public function getDescription(): ?string + { + if (!empty($this->body)) { + $metaTags = $this->domDocument->getElementsByTagName('meta'); + + // og:description + foreach ($metaTags as $metaTag) { + foreach ($metaTag->attributes as $attr) { + if ($attr->nodeName == 'property' && $attr->value == 'og:description') { + foreach ($attr->parentNode->attributes as $attr) { + if ($attr->nodeName == 'content') { + return $attr->value; + } + } + } + } + } + } + + return null; + } + + /** + * Returns the image url for the current url.
+ */ + public function getImageUrl(): ?string + { + if (!empty($this->body)) { + $metaTags = $this->domDocument->getElementsByTagName('meta'); + + // og:image + foreach ($metaTags as $metaTag) { + foreach ($metaTag->attributes as $attr) { + if ($attr->nodeName == 'property' && $attr->value == 'og:image') { + foreach ($attr->parentNode->attributes as $attr) { + if ($attr->nodeName == 'content') { + return $attr->value; + } + } + } + } + } + } + + return null; + } + + /** + * Returns the Response for the used image. + * + * @throws BadMethodCallException If the url does not have an image. + * @throws DownloadFailed If the url can not be downloaded. This can be a temporary error. + * @throws UrlInaccessible If the url is inaccessible (e.g. sends status code 403). + */ + public function getImage(): Response + { + if (!$this->getImageUrl()) { + throw new BadMethodCallException("This url does not have an image."); + } + + try { + $request = new Request('GET', $this->getImageUrl(), [ + 'accept' => 'image/*', + 'range' => 'bytes=0-' . (self::MAX_IMAGE_SIZE - 1), + ]); + + return self::getHttpClient()->send($request); + } catch (BadResponseException $e) { + $response = $e->getResponse(); + + switch ($response->getStatusCode()) { + case 400: // Bad Request + case 401: // Unauthorized + case 402: // Payment Required + case 403: // Forbidden + case 404: // Not Found + $message = "Request failed with status code {$response->getStatusCode()}."; + + throw new UrlInaccessible($message, $response->getStatusCode(), $e); + break; + + default: + throw new DownloadFailed("Could not download content.", $response->getStatusCode(), $e); + } + } catch (ClientExceptionInterface $e) { + throw new DownloadFailed("Could not download content.", 0, $e); + } + } + + /** + * Returns a "static" instance of the HTTP client to use to allow + * for TCP connection reuse. 
+ */ + private static function getHttpClient(): ClientInterface + { + if (!self::$httpClient) { + self::$httpClient = HttpFactory::makeClient([ + RequestOptions::TIMEOUT => 10, + RequestOptions::STREAM => true, + ]); + } + + return self::$httpClient; + } +} diff --git a/wcfsetup/install/files/lib/system/message/unfurl/UrlInaccessible.class.php b/wcfsetup/install/files/lib/system/message/unfurl/UrlInaccessible.class.php new file mode 100644 index 0000000000..2db18a419e --- /dev/null +++ b/wcfsetup/install/files/lib/system/message/unfurl/UrlInaccessible.class.php @@ -0,0 +1,18 @@ + + * @package WoltLabSuite\Core\System\Message\Unfurl + * @since 5.4 + */ +class UrlInaccessible extends Exception +{ +} diff --git a/wcfsetup/install/files/lib/util/UnfurlUrlUtil.class.php b/wcfsetup/install/files/lib/util/UnfurlUrlUtil.class.php deleted file mode 100644 index faf446d38e..0000000000 --- a/wcfsetup/install/files/lib/util/UnfurlUrlUtil.class.php +++ /dev/null @@ -1,254 +0,0 @@ - - * @package WoltLabSuite\Core\Util - * @since 5.4 - */ -final class UnfurlUrlUtil -{ - /** - * 10 Mebibyte - */ - private const MAX_SIZE = (10 * (1 << 20)); - - /** - * 3 Mebibyte - */ - private const MAX_IMAGE_SIZE = (3 * (1 << 20)); - - /** - * @var string - */ - private $url; - - /** - * @var string - */ - private $body; - - /** - * @var \DOMDocument - */ - private $domDocument; - - public function __construct(string $url) - { - if (!Url::is($url)) { - throw new \InvalidArgumentException('Given URL "' . $url . '" is not a valid URL.'); - } - - $this->url = $url; - - $this->fetchUrl(); - } - - /** - * Fetches the body of the given url and converts the body to utf-8. 
- */ - private function fetchUrl(): void - { - try { - $client = HttpFactory::makeClient([ - RequestOptions::TIMEOUT => 10, - RequestOptions::STREAM => true, - ]); - $request = new Request('GET', $this->url, [ - 'range' => \sprintf('bytes=%d-%d', 0, self::MAX_SIZE - 1), - ]); - $response = $client->send($request); - - $this->body = ""; - while (!$response->getBody()->eof()) { - $this->body .= $response->getBody()->read(8192); - - if ($response->getBody()->tell() >= self::MAX_SIZE) { - break; - } - } - $response->getBody()->close(); - - if (\mb_detect_encoding($this->body) !== 'UTF-8') { - $this->body = StringUtil::convertEncoding(\mb_detect_encoding($this->body), 'UTF-8', $this->body); - } - } catch (TooManyRedirectsException | BadResponseException | TransferException $e) { - // Ignore these exceptions. - } - } - - /** - * Returns the dom document of the website. - */ - private function getDomDocument(): \DOMDocument - { - if ($this->domDocument === null) { - \libxml_use_internal_errors(true); - $this->domDocument = new \DOMDocument(); - $this->domDocument->loadHTML('' . $this->body); - } - - return $this->domDocument; - } - - /** - * Determines the title of the website. - */ - public function getTitle(): ?string - { - if (!empty($this->body)) { - $metaTags = $this->getDomDocument()->getElementsByTagName('meta'); - - // og - foreach ($metaTags as $metaTag) { - foreach ($metaTag->attributes as $attr) { - if ($attr->nodeName == 'property' && $attr->value == 'og:title') { - foreach ($attr->parentNode->attributes as $attr) { - if ($attr->nodeName == 'content') { - return $attr->value; - } - } - } - } - } - - // title tag - $title = $this->getDomDocument()->getElementsByTagName('title'); - if ($title->length) { - return $title->item(0)->nodeValue; - } - } - - return null; - } - - /** - * Determines the description of the website. 
- */ - public function getDescription(): ?string - { - if (!empty($this->body)) { - $metaTags = $this->getDomDocument()->getElementsByTagName('meta'); - - // og:description - foreach ($metaTags as $metaTag) { - foreach ($metaTag->attributes as $attr) { - if ($attr->nodeName == 'property' && $attr->value == 'og:description') { - foreach ($attr->parentNode->attributes as $attr) { - if ($attr->nodeName == 'content') { - return $attr->value; - } - } - } - } - } - } - - return null; - } - - /** - * Returns the image url for the current url. - */ - public function getImageUrl(): ?string - { - if (!empty($this->body)) { - $metaTags = $this->getDomDocument()->getElementsByTagName('meta'); - - // og:image - foreach ($metaTags as $metaTag) { - foreach ($metaTag->attributes as $attr) { - if ($attr->nodeName == 'property' && $attr->value == 'og:image') { - foreach ($attr->parentNode->attributes as $attr) { - if ($attr->nodeName == 'content') { - return $attr->value; - } - } - } - } - } - } - - return null; - } - - /** - * Downloads the image from a url and returns the image body. - */ - public static function downloadImageFromUrl(string $url): ?string - { - try { - // Rewrite schemaless URLs to https. - $scheme = \parse_url($url, \PHP_URL_SCHEME); - if (!$scheme) { - if (StringUtil::startsWith($url, '//')) { - $url = 'https:' . $url; - } else { - throw new \DomainException(); - } - } - - // download image - try { - $client = HttpFactory::makeClient([ - RequestOptions::TIMEOUT => 10, - RequestOptions::STREAM => true, - ]); - $request = new Request('GET', $url, [ - 'via' => '1.1 wsc', - 'accept' => 'image/*', - 'range' => 'bytes=0-' . 
(self::MAX_IMAGE_SIZE - 1), - ]); - $response = $client->send($request); - - $image = ""; - while (!$response->getBody()->eof()) { - $image .= $response->getBody()->read(8192); - - if ($response->getBody()->tell() >= self::MAX_IMAGE_SIZE) { - break; - } - } - $response->getBody()->close(); - } catch (TransferException $e) { - throw new \DomainException('Failed to request', 0, $e); - } - - // check file type - $imageData = @\getimagesizefromstring($image); - if (!$imageData) { - throw new \DomainException(); - } - - switch ($imageData[2]) { - case \IMAGETYPE_PNG: - $extension = 'png'; - break; - case \IMAGETYPE_GIF: - $extension = 'gif'; - break; - case \IMAGETYPE_JPEG: - $extension = 'jpg'; - break; - default: - throw new \DomainException(); - } - - return $image; - } catch (\DomainException $e) { - return null; - } - } -} -- 2.20.1