Browse Source

fix robots:noindex condition, add robots:nofollow attribute support

main
ghost 2 years ago
parent
commit
dfbc6132c9
  1. 20
      crontab/crawler.php

20
crontab/crawler.php

@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
} }
} }
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Append page with meta robots:noindex value to the robotsPostfix disallow list
if ($metaRobots == 'noindex') { if (false !== stripos($metaRobots, 'noindex')) {
$robots = new Robots($queueHostPage->robots); $robots = new Robots($queueHostPage->robots);
$robotsPostfix = new Robots($queueHostPage->robotsPostfix); $robotsPostfix = new Robots($queueHostPage->robotsPostfix);
@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
} }
} }
// Update queued page data // Skip following page links when the robots:nofollow attribute is detected
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, if (false !== stripos($metaRobots, 'nofollow')) {
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), continue;
Filter::pageKeywords($metaKeywords), }
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Collect internal links from page content // Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) { foreach(@$dom->getElementsByTagName('a') as $a) {

Loading…
Cancel
Save