Browse Source

fix robots:noindex condition, add robots:nofollow attribute support

main
ghost 2 years ago
parent
commit
dfbc6132c9
  1. 20
      crontab/crawler.php

20
crontab/crawler.php

@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET @@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
}
}
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Append page with meta robots:noindex value to the robotsPostfix disallow list
if ($metaRobots == 'noindex') {
if (false !== stripos($metaRobots, 'noindex')) {
$robots = new Robots($queueHostPage->robots);
$robotsPostfix = new Robots($queueHostPage->robotsPostfix);
@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET @@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
}
}
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Skip page links following by robots:nofollow attribute detected
if (false !== stripos($metaRobots, 'nofollow')) {
continue;
}
// Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) {

Loading…
Cancel
Save