mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-10 14:57:56 +00:00
fix robots:noindex condition, add robots:nofollow attribute support
This commit is contained in:
parent
5c8d299a4a
commit
dfbc6132c9
@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update queued page data
|
||||||
|
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||||
|
Filter::pageTitle($title->item(0)->nodeValue),
|
||||||
|
Filter::pageDescription($metaDescription),
|
||||||
|
Filter::pageKeywords($metaKeywords),
|
||||||
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||||
|
|
||||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
||||||
if ($metaRobots == 'noindex') {
|
if (false !== stripos($metaRobots, 'noindex')) {
|
||||||
|
|
||||||
$robots = new Robots($queueHostPage->robots);
|
$robots = new Robots($queueHostPage->robots);
|
||||||
$robotsPostfix = new Robots($queueHostPage->robotsPostfix);
|
$robotsPostfix = new Robots($queueHostPage->robotsPostfix);
|
||||||
@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update queued page data
|
// Skip page links following by robots:nofollow attribute detected
|
||||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
if (false !== stripos($metaRobots, 'nofollow')) {
|
||||||
Filter::pageTitle($title->item(0)->nodeValue),
|
|
||||||
Filter::pageDescription($metaDescription),
|
continue;
|
||||||
Filter::pageKeywords($metaKeywords),
|
}
|
||||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
|
||||||
|
|
||||||
// Collect internal links from page content
|
// Collect internal links from page content
|
||||||
foreach(@$dom->getElementsByTagName('a') as $a) {
|
foreach(@$dom->getElementsByTagName('a') as $a) {
|
||||||
|
Loading…
Reference in New Issue
Block a user