From 0f127ddb91b7756ebbf369acc31b99f940eedafb Mon Sep 17 00:00:00 2001 From: ghost Date: Thu, 17 Aug 2023 13:28:50 +0300 Subject: [PATCH] upgrade hostPageDom crawler to Symfony\Component\DomCrawler --- src/crontab/crawler.php | 96 ++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/src/crontab/crawler.php b/src/crontab/crawler.php index b422e60..c1b164a 100644 --- a/src/crontab/crawler.php +++ b/src/crontab/crawler.php @@ -31,10 +31,6 @@ require_once(__DIR__ . '/../library/filter.php'); require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . '/../library/helper.php'); require_once(__DIR__ . '/../library/yggstate.php'); - -// @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead -// require_once(__DIR__ . '/../library/vendor/simple_html_dom.php'); - require_once __DIR__ . '/../../vendor/autoload.php'; // Check disk quota @@ -666,6 +662,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ $metaYggoManifestURL = null; // Parse page content + // @TODO refactor to Symfony\Component\DomCrawler\Crawler $dom = new DomDocument(); if ($encoding = mb_detect_encoding($content)) { @@ -738,33 +735,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ $metaKeywords ? Filter::pageKeywords($metaKeywords) : null, time()); - // Collect page DOM elements data on enabled - /* @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead - if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) { - - // Begin selectors extraction - $html = str_get_html($content); - - foreach ((array) explode(';', $hostPageDomSelectors) as $selector) { - - foreach($html->find($selector) as $element) { - - if (!empty($element->innertext)) { - - $db->addHostPageDom($queueHostPage->hostPageId, - time(), - $selector, - trim(Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/', - ' ', - str_replace(['
', '
', '
', 'innertext))) : $element->innertext)); - } - } - } - } - */ - // Skip page links following with meta robots:nofollow attribute foreach (@$dom->getElementsByTagName('meta') as $meta) { @@ -1020,6 +990,70 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ ]; } + // Init DOM crawler + $crawler = new Symfony\Component\DomCrawler\Crawler(); + $crawler->addHtmlContent($content); + + // Process selectors configuration + if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS))) { + + foreach ($hostPageDomSelectors as $selector => $settings) { + + // Extract target selector data + foreach ($crawler->filter($selector) as $data) { + + foreach ($data->childNodes as $node) { + + $value = trim($node->ownerDocument->saveHtml()); + + // Apply selector settings + foreach ($settings as $key => $value) { + + switch ($key) { + + case 'strip_tags': + + if (!isset($value->enabled)) { + + continue; + } + + if (false === $value->enabled) { + + continue; + } + + if (!isset($value->allowed_tags)) { + + continue; + } + + $value = strip_tags($value, $value->allowed_tags); + + break; + } + } + + // Skip empty selector values save + if (empty($value)) { + + continue; + } + + // Save selector value + $db->addHostPageDom( + $queueHostPage->hostPageId, + $selector, + $value, + time() + ); + + $hostPageDomAddedTotal++; + } + } + } + } + // Process links collected foreach ($links as $link) {