mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
fix robots:noindex condition, add robots:nofollow attribute support
This commit is contained in:
parent
5c8d299a4a
commit
dfbc6132c9
@ -83,8 +83,15 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
||||
}
|
||||
}
|
||||
|
||||
// Update queued page data
|
||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||
|
||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
||||
if ($metaRobots == 'noindex') {
|
||||
if (false !== stripos($metaRobots, 'noindex')) {
|
||||
|
||||
$robots = new Robots($queueHostPage->robots);
|
||||
$robotsPostfix = new Robots($queueHostPage->robotsPostfix);
|
||||
@ -99,12 +106,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
||||
}
|
||||
}
|
||||
|
||||
// Update queued page data
|
||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||
// Skip following page links when the robots:nofollow attribute is detected
|
||||
if (false !== stripos($metaRobots, 'nofollow')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Collect internal links from page content
|
||||
foreach(@$dom->getElementsByTagName('a') as $a) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user