From f9ce3aa3f8802976be7eac51d95aad9cc304f39a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Katarina=20Mio=C4=8Di=C4=87?= Date: Thu, 25 Apr 2024 16:50:32 +0200 Subject: [PATCH] NGSTACK-843 introducing abstract PageTextIndexer --- lib/Core/Search/Common/PageTextExtractor.php | 277 +---------------- .../PageTextExtractor/PageTextExtractor.php | 282 ++++++++++++++++++ .../common/layouts_page_text_indexing.yaml | 4 +- 3 files changed, 286 insertions(+), 277 deletions(-) create mode 100644 lib/Core/Search/Common/PageTextExtractor/PageTextExtractor.php diff --git a/lib/Core/Search/Common/PageTextExtractor.php b/lib/Core/Search/Common/PageTextExtractor.php index e3d179d5..45bb51d4 100644 --- a/lib/Core/Search/Common/PageTextExtractor.php +++ b/lib/Core/Search/Common/PageTextExtractor.php @@ -1,282 +1,9 @@ |string>>> */ - private array $cache = []; - - private LoggerInterface $logger; - - /** - * @param array $siteRoots - * @param array> $languageAccessibility - * @param array> $pageTextConfig - */ - public function __construct( - private readonly ContentHandler $contentHandler, - private readonly LocationHandler $locationHandler, - private readonly RouterInterface $router, - private readonly array $siteRoots, - private readonly array $languageAccessibility, - private readonly string $pageIndexingHost, - private readonly array $pageTextConfig, - ) { - $this->logger = new NullLogger(); - } - - public function setLogger(LoggerInterface $logger): void - { - $this->logger = $logger; - } - - /** - * @param int $contentId - * @param string $languageCode - * - * @return array|string> - */ - public function extractPageText(int $contentId, string $languageCode): array - { - if (isset($this->cache[$contentId][$languageCode])) { - return $this->cache[$contentId][$languageCode]; - } - - if (count($this->cache) > 10) { - $this->cache = []; - } - - try { - $html = $this->fetchPageSource($contentId, $languageCode); - } catch (IndexPageUnavailableException|RuntimeException $e) { - $this->logger->error($e->getMessage()); - - return []; - } - - $textArray = $this->extractTextArray($html); - - $this->cache[$contentId][$languageCode] = $textArray; - - return $textArray; - } - - /** - * @param string $languageCode - * @param int $contentId - * - * @throws \Ibexa\Contracts\Core\Repository\Exceptions\NotFoundException - * - * @return string - */ - private function generateUrl(string $languageCode, int $contentId): string - { - $contentInfo = $this->contentHandler->loadContentInfo($contentId); - $siteAccess = $this->resolveSiteAccess($contentInfo, $languageCode); - - $relativePath = $this->router->generate( - 'ibexa.url.alias', - [ - 'locationId' => (int) $contentInfo->mainLocationId, - 'siteaccess' => $siteAccess, - ], - UrlGeneratorInterface::RELATIVE_PATH, - ); - - return $this->pageIndexingHost . $relativePath; - } - - private function resolveSiteAccess(ContentInfo $contentInfo, string $languageCode): string - { - try { - $location = $this->locationHandler->load($contentInfo->mainLocationId); - } catch (NotFoundException) { - throw new RuntimeException( - sprintf( - 'Content #%d does not have a location', - $contentInfo->id, - ), - ); - } - - $pathArray = explode('/', $location->pathString); - - foreach ($this->siteRoots as $site => $siteRoot) { - if (!in_array((string) $siteRoot, $pathArray, true)) { - continue; - } - - if (!isset($this->languageAccessibility[$site][$languageCode])) { - throw new RuntimeException( - sprintf( - "Language not supported for matched siteaccess group %s", - $site - ) - ); - } - - return $this->languageAccessibility[$site][$languageCode]; - } - - throw new RuntimeException( - sprintf( - "Failed to match content ID %d to a siteaccess", - $contentInfo->id - ) - ); - } - - /** - * @param \DOMNode $node - * @param array> $textArray - * - * @return array> - */ - private function recursiveExtractTextArray(DOMNode $node, array &$textArray): array - { - if ($node->nodeType === XML_ELEMENT_NODE || $node->nodeType === XML_HTML_DOCUMENT_NODE) { - $fieldLevel = $this->getFieldName($node); - - if ($fieldLevel !== null) { - $textArray[$fieldLevel][] = $node->textContent; - - return $textArray; - } - - foreach ($node->childNodes as $childNode) { - $this->recursiveExtractTextArray($childNode, $textArray); - } - - } - if ($node->nodeType === XML_TEXT_NODE) { - $textContent = trim($node->textContent); - if ($textContent !== '') { - $textArray['other'][] = $textContent; - } - } - - return $textArray; - } - - private function getFieldName(DOMNode $node): null|string - { - foreach ($this->pageTextConfig as $level => $tags) { - foreach ($tags as $tag) { - $tagParts = explode('.', $tag); // Split tag and class if present - $tagName = $tagParts[0]; // Get the tag name - $class = $tagParts[1] ?? null; // Get the class if exists - - if ($node->nodeName !== $tagName) { - continue; - } - - if ($class !== null && !$this->hasClass($node, $class)) { - continue; - } - - return $level; - } - } - - return null; - } - - private function hasClass(DOMNode $node, string $className): bool - { - /** @var \DOMElement $node */ - $classes = explode(' ', $node->getAttribute('class')); - - return in_array($className, $classes, true); - } - - /** - * @throws NotFoundException - * @throws UnauthorizedException - * @throws \RuntimeException - */ - private function fetchPageSource(int $contentId, string $languageCode): string - { - $url = $this->generateUrl($languageCode, $contentId); - - $httpClient = HttpClient::create( - ); - - $response = $httpClient->request( - 'GET', - $url - ); - - $html = $response->getContent(); - - if ($response->getStatusCode() !== 200) { - throw new IndexPageUnavailableException( - sprintf( - 'Could not fetch URL "%s": %s', - $url, - $response->getInfo()['error'], - ), - ); - } - - return $html; - } - - /** - * @param string $html - * - * @return array> - */ - private function extractTextArray(string $html): array - { - $startTag = ''; - $endTag = ''; - - $startPos = mb_strpos($html, $startTag); - $endPos = mb_strpos($html, $endTag); - - $textArray = []; - - if ($startPos !== false && $endPos !== false) { - $startPos += mb_strlen($startTag); - $extractedContent = mb_substr($html, $startPos, $endPos - $startPos); - - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($extractedContent); - libxml_use_internal_errors(false); - $textArray = $this->recursiveExtractTextArray($doc, $textArray); - } + abstract public function extractPageText(int $contentId, string $languageCode); - return $textArray; - } } diff --git a/lib/Core/Search/Common/PageTextExtractor/PageTextExtractor.php b/lib/Core/Search/Common/PageTextExtractor/PageTextExtractor.php new file mode 100644 index 00000000..97de1c19 --- /dev/null +++ b/lib/Core/Search/Common/PageTextExtractor/PageTextExtractor.php @@ -0,0 +1,282 @@ +|string>>> */ + private array $cache = []; + + private LoggerInterface $logger; + + /** + * @param array $siteRoots + * @param array> $languageAccessibility + * @param array> $pageTextConfig + */ + public function __construct( + private readonly ContentHandler $contentHandler, + private readonly LocationHandler $locationHandler, + private readonly RouterInterface $router, + private readonly array $siteRoots, + private readonly array $languageAccessibility, + private readonly string $pageIndexingHost, + private readonly array $pageTextConfig, + ) { + $this->logger = new NullLogger(); + } + + public function setLogger(LoggerInterface $logger): void + { + $this->logger = $logger; + } + + /** + * @param int $contentId + * @param string $languageCode + * + * @return array|string> + */ + public function extractPageText(int $contentId, string $languageCode): array + { + if (isset($this->cache[$contentId][$languageCode])) { + return $this->cache[$contentId][$languageCode]; + } + + if (count($this->cache) > 10) { + $this->cache = []; + } + + try { + $html = $this->fetchPageSource($contentId, $languageCode); + } catch (IndexPageUnavailableException|RuntimeException $e) { + $this->logger->error($e->getMessage()); + + return []; + } + + $textArray = $this->extractTextArray($html); + + $this->cache[$contentId][$languageCode] = $textArray; + + return $textArray; + } + + /** + * @param string $languageCode + * @param int $contentId + * + * @throws \Ibexa\Contracts\Core\Repository\Exceptions\NotFoundException + * + * @return string + */ + private function generateUrl(string $languageCode, int $contentId): string + { + $contentInfo = $this->contentHandler->loadContentInfo($contentId); + $siteAccess = $this->resolveSiteAccess($contentInfo, $languageCode); + + $relativePath = $this->router->generate( + 'ibexa.url.alias', + [ + 'locationId' => (int) $contentInfo->mainLocationId, + 'siteaccess' => $siteAccess, + ], + UrlGeneratorInterface::RELATIVE_PATH, + ); + + return $this->pageIndexingHost . $relativePath; + } + + private function resolveSiteAccess(ContentInfo $contentInfo, string $languageCode): string + { + try { + $location = $this->locationHandler->load($contentInfo->mainLocationId); + } catch (NotFoundException) { + throw new RuntimeException( + sprintf( + 'Content #%d does not have a location', + $contentInfo->id, + ), + ); + } + + $pathArray = explode('/', $location->pathString); + + foreach ($this->siteRoots as $site => $siteRoot) { + if (!in_array((string) $siteRoot, $pathArray, true)) { + continue; + } + + if (!isset($this->languageAccessibility[$site][$languageCode])) { + throw new RuntimeException( + sprintf( + "Language not supported for matched siteaccess group %s", + $site + ) + ); + } + + return $this->languageAccessibility[$site][$languageCode]; + } + + throw new RuntimeException( + sprintf( + "Failed to match content ID %d to a siteaccess", + $contentInfo->id + ) + ); + } + + /** + * @param \DOMNode $node + * @param array> $textArray + * + * @return array> + */ + private function recursiveExtractTextArray(DOMNode $node, array &$textArray): array + { + if ($node->nodeType === XML_ELEMENT_NODE || $node->nodeType === XML_HTML_DOCUMENT_NODE) { + $fieldLevel = $this->getFieldName($node); + + if ($fieldLevel !== null) { + $textArray[$fieldLevel][] = $node->textContent; + + return $textArray; + } + + foreach ($node->childNodes as $childNode) { + $this->recursiveExtractTextArray($childNode, $textArray); + } + + } + if ($node->nodeType === XML_TEXT_NODE) { + $textContent = trim($node->textContent); + if ($textContent !== '') { + $textArray['other'][] = $textContent; + } + } + + return $textArray; + } + + private function getFieldName(DOMNode $node): null|string + { + foreach ($this->pageTextConfig as $level => $tags) { + foreach ($tags as $tag) { + $tagParts = explode('.', $tag); // Split tag and class if present + $tagName = $tagParts[0]; // Get the tag name + $class = $tagParts[1] ?? null; // Get the class if exists + + if ($node->nodeName !== $tagName) { + continue; + } + + if ($class !== null && !$this->hasClass($node, $class)) { + continue; + } + + return $level; + } + } + + return null; + } + + private function hasClass(DOMNode $node, string $className): bool + { + /** @var \DOMElement $node */ + $classes = explode(' ', $node->getAttribute('class')); + + return in_array($className, $classes, true); + } + + /** + * @throws NotFoundException + * @throws UnauthorizedException + * @throws \RuntimeException + */ + private function fetchPageSource(int $contentId, string $languageCode): string + { + $url = $this->generateUrl($languageCode, $contentId); + + $httpClient = HttpClient::create( + ); + + $response = $httpClient->request( + 'GET', + $url + ); + + $html = $response->getContent(); + + if ($response->getStatusCode() !== 200) { + throw new IndexPageUnavailableException( + sprintf( + 'Could not fetch URL "%s": %s', + $url, + $response->getInfo()['error'], + ), + ); + } + + return $html; + } + + /** + * @param string $html + * + * @return array> + */ + private function extractTextArray(string $html): array + { + $startTag = ''; + $endTag = ''; + + $startPos = mb_strpos($html, $startTag); + $endPos = mb_strpos($html, $endTag); + + $textArray = []; + + if ($startPos !== false && $endPos !== false) { + $startPos += mb_strlen($startTag); + $extractedContent = mb_substr($html, $startPos, $endPos - $startPos); + + libxml_use_internal_errors(true); + $doc = new DOMDocument(); + $doc->loadHTML($extractedContent); + libxml_use_internal_errors(false); + $textArray = $this->recursiveExtractTextArray($doc, $textArray); + } + + return $textArray; + } +} diff --git a/lib/Resources/config/search/common/layouts_page_text_indexing.yaml b/lib/Resources/config/search/common/layouts_page_text_indexing.yaml index 07360d64..b99b26b7 100644 --- a/lib/Resources/config/search/common/layouts_page_text_indexing.yaml +++ b/lib/Resources/config/search/common/layouts_page_text_indexing.yaml @@ -1,6 +1,6 @@ services: netgen.ibexa_search_extra.page_indexing.page_text_extractor: - class: Netgen\IbexaSearchExtra\Core\Search\Common\PageTextExtractor + class: Netgen\IbexaSearchExtra\Core\Search\Common\PageTextExtractor\PageTextExtractor arguments: - '@Ibexa\Contracts\Core\Persistence\Content\Handler' - '@Ibexa\Contracts\Core\Persistence\Content\Location\Handler' @@ -20,4 +20,4 @@ services: - '@Ibexa\Contracts\Core\Repository\ContentService' - '@ibexa.spi.search' - '@ibexa.api.persistence_handler' - - '%netgen_ibexa_search_extra.page_indexing.allowed_content_types%' \ No newline at end of file + - '%netgen_ibexa_search_extra.page_indexing.allowed_content_types%'