From dfbc6132c9cc1d6f4ef9718ba633774b13876fd0 Mon Sep 17 00:00:00 2001 From: ghost Date: Sun, 9 Apr 2023 15:25:15 +0300 Subject: [PATCH] fix robots:noindex condition, add robots:nofollow attribute support --- crontab/crawler.php | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index 9f2c89c..b37bfa0 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET } } + // Update queued page data + $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, + Filter::pageTitle($title->item(0)->nodeValue), + Filter::pageDescription($metaDescription), + Filter::pageKeywords($metaKeywords), + CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); + // Append page with meta robots:noindex value to the robotsPostfix disallow list - if ($metaRobots == 'noindex') { + if (false !== stripos($metaRobots, 'noindex')) { $robots = new Robots($queueHostPage->robots); $robotsPostfix = new Robots($queueHostPage->robotsPostfix); @@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET } } - // Update queued page data - $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, - Filter::pageTitle($title->item(0)->nodeValue), - Filter::pageDescription($metaDescription), - Filter::pageKeywords($metaKeywords), - CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); + // Skip following page links when the robots:nofollow attribute is detected + if (false !== stripos($metaRobots, 'nofollow')) { + + continue; + } // Collect internal links from page content foreach(@$dom->getElementsByTagName('a') as $a) {