Browse Source

skip robots:noindex page without extra actions

main
ghost 2 years ago
parent
commit
a5f5541395
  1. 12
      crontab/crawler.php
  2. 9
      library/mysql.php

12
crontab/crawler.php

@@ -106,17 +106,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
// Page declares meta robots:noindex (stripos makes the match case-insensitive):
// record its URI as a Disallow entry in the host's robotsPostfix rules,
// then skip the remainder of this loop iteration
if (false !== stripos($metaRobots, 'noindex')) {
// NOTE(review): Robots is defined outside this view; presumably a robots.txt rule parser - confirm
$robots = new Robots($queueHostPage->robots);
$robotsPostfix = new Robots($queueHostPage->robotsPostfix);
// Append and persist only when both rule sets still allow the URI,
// i.e. no existing rule already disallows it
if ($robotsPostfix->uriAllowed($queueHostPage->uri) &&
$robots->uriAllowed($queueHostPage->uri)) {
$robotsPostfix->append('Disallow:', $queueHostPage->uri);
$db->updateHostRobotsPostfix($queueHostPage->hostId, $robotsPostfix->getData(), time());
}
// Proceed to the next loop item whether or not the rules were updated
continue;
}
// Skip following page links when the robots:nofollow attribute is detected

9
library/mysql.php

@@ -74,15 +74,6 @@ class MySQL {
return $query->rowCount();
}
/**
 * Write a new robotsPostfix value and update timestamp to one host row.
 *
 * NOTE(review): $this->_db is assumed to be a PDO-compatible connection
 * (prepare / execute / rowCount) - confirm against the constructor.
 *
 * @param int   $hostId        Value matched against the `hostId` column
 * @param mixed $robotsPostfix Value stored in the `robotsPostfix` column
 * @param int   $timeUpdated   Value stored in the `timeUpdated` column
 *
 * @return mixed Whatever the statement's rowCount() reports for this UPDATE
 */
public function updateHostRobotsPostfix(int $hostId, mixed $robotsPostfix, int $timeUpdated) {

    $statement = $this->_db->prepare(
        'UPDATE `host` SET `robotsPostfix` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'
    );

    // Positional placeholders: bind order must follow the SET ... WHERE order above
    $bindings = [$robotsPostfix, $timeUpdated, $hostId];

    $statement->execute($bindings);

    return $statement->rowCount();
}
// Pages
public function getTotalHostPages(int $hostId) {

Loading…
Cancel
Save