From a5f55413957f0125ed44b6d612d0171161ab4abe Mon Sep 17 00:00:00 2001 From: ghost Date: Sat, 29 Apr 2023 08:58:48 +0300 Subject: [PATCH] skip robots:noindex page without extra actions --- crontab/crawler.php | 12 +----------- library/mysql.php | 9 --------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index dbb3263..7085002 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -106,17 +106,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET // Append page with meta robots:noindex value to the robotsPostfix disallow list if (false !== stripos($metaRobots, 'noindex')) { - $robots = new Robots($queueHostPage->robots); - $robotsPostfix = new Robots($queueHostPage->robotsPostfix); - - // Ignore URI if does not match existing rules yet - if ($robotsPostfix->uriAllowed($queueHostPage->uri) && - $robots->uriAllowed($queueHostPage->uri)) { - - $robotsPostfix->append('Disallow:', $queueHostPage->uri); - - $db->updateHostRobotsPostfix($queueHostPage->hostId, $robotsPostfix->getData(), time()); - } + continue; } // Skip page links following by robots:nofollow attribute detected diff --git a/library/mysql.php b/library/mysql.php index 08b5daf..582997d 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -74,15 +74,6 @@ class MySQL { return $query->rowCount(); } - public function updateHostRobotsPostfix(int $hostId, mixed $robotsPostfix, int $timeUpdated) { - - $query = $this->_db->prepare('UPDATE `host` SET `robotsPostfix` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'); - - $query->execute([$robotsPostfix, $timeUpdated, $hostId]); - - return $query->rowCount(); - } - // Pages public function getTotalHostPages(int $hostId) {