|
|
@ -586,110 +586,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Is HTML document |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Define variables |
|
|
|
|
|
|
|
$metaDescription = null; |
|
|
|
|
|
|
|
$metaKeywords = null; |
|
|
|
|
|
|
|
$metaYggoManifest = null; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Parse page content into a DOM tree
$dom = new DomDocument();

$encoding = mb_detect_encoding($content);

// Encoding could not be detected: flag the page via updateHostPageTimeBanned(),
// commit and move on to the next queued page
if (!$encoding) {

  $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

  $db->commit();

  continue;
}

// Prepend an XML encoding declaration carrying the detected charset before loading the markup
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Skip index page links without title tag
$title = @$dom->getElementsByTagName('title');

// No title element found: flag the page via updateHostPageTimeBanned(),
// commit and move on to the next queued page
if ($title->length == 0) {

  $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

  $db->commit();

  continue;
}

// The first title element supplies the page title
$metaTitle = $title->item(0)->nodeValue;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Get optional page meta data: description, keywords, robots and yggo:manifest
foreach ($dom->getElementsByTagName('meta') as $meta) {

  // Read both attributes once; getAttribute() yields an empty string when the attribute is missing
  $metaName    = $meta->getAttribute('name');
  $metaContent = $meta->getAttribute('content');

  if ($metaName === 'description') {

    $metaDescription = $metaContent;

  } elseif ($metaName === 'keywords') {

    $metaKeywords = $metaContent;

  } elseif ($metaName === 'robots') {

    $metaRobots = $metaContent;

    // Ban page with meta robots:noindex attribute
    if (false !== stripos($metaRobots, 'noindex')) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      // Same as the other ban paths of this loop: commit, then skip the whole page.
      // A bare "continue" here would only advance this inner meta loop and the banned
      // page would still be described and processed below, hence "continue 2".
      $db->commit();

      continue 2;
    }

  } elseif ($metaName === 'yggo:manifest') {

    // Grab meta yggo:manifest link when available
    $metaYggoManifest = Filter::url($metaContent);
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add queued page description if not exists

// Optional meta values are filtered only when present, otherwise passed as null
$hostPageDescriptionText = $metaDescription ? Filter::pageDescription($metaDescription) : null;
$hostPageKeywordsText    = $metaKeywords ? Filter::pageKeywords($metaKeywords) : null;

// Page body is passed base64-encoded, and only when it is non-empty
// and the crawlMetaOnly flag of the queued page is not set
$hostPageContentData = ($content && !$queueHostPage->crawlMetaOnly) ? base64_encode($content) : null;

$db->addHostPageDescription(
  $queueHostPage->hostPageId,
  $metaTitle,
  $hostPageDescriptionText,
  $hostPageKeywordsText,
  $hostPageContentData,
  time()
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Collect page DOM elements data on enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  $html = str_get_html($content);

  // NOTE(review): str_get_html() looks like the simple_html_dom helper, which returns
  // false for empty or oversized input; guard so find() is never called on a non-object
  if ($html) {

    // CRAWL_HOST_PAGE_DOM_SELECTORS is treated as a comma separated list of selectors
    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      foreach ($html->find($selector) as $element) {

        if (!empty($element->innertext)) {

          $elementValue = $element->innertext;

          if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

            // Put a space in place of line breaks and before closing tags so that words
            // of adjacent elements do not glue together, collapse whitespace, drop the tags
            $elementValue = strip_tags(
              preg_replace(
                '/[\s]+/',
                ' ',
                str_replace(
                  ['<br />', '<br/>', '<br>', '</'],
                  [' ', ' ', ' ', ' </'],
                  $elementValue
                )
              )
            );
          }

          $db->addHostPageDom(
            $queueHostPage->hostPageId,
            time(),
            $selector,
            trim($elementValue)
          );
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Begin snaps |
|
|
|
// Begin snaps |
|
|
|
if (SNAP_STORAGE) { |
|
|
|
if (SNAP_STORAGE) { |
|
|
|
|
|
|
|
|
|
|
@ -820,6 +716,110 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Is HTML document |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Define variables |
|
|
|
|
|
|
|
$metaDescription = null; |
|
|
|
|
|
|
|
$metaKeywords = null; |
|
|
|
|
|
|
|
$metaYggoManifest = null; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Parse page content into a DOM tree
$dom = new DomDocument();

$encoding = mb_detect_encoding($content);

// Encoding could not be detected: flag the page via updateHostPageTimeBanned(),
// commit and move on to the next queued page
if (!$encoding) {

  $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

  $db->commit();

  continue;
}

// Prepend an XML encoding declaration carrying the detected charset before loading the markup
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Skip index page links without title tag
$title = @$dom->getElementsByTagName('title');

// No title element found: flag the page via updateHostPageTimeBanned(),
// commit and move on to the next queued page
if ($title->length == 0) {

  $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

  $db->commit();

  continue;
}

// The first title element supplies the page title
$metaTitle = $title->item(0)->nodeValue;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Get optional page meta data: description, keywords, robots and yggo:manifest
foreach ($dom->getElementsByTagName('meta') as $meta) {

  // Read both attributes once; getAttribute() yields an empty string when the attribute is missing
  $metaName    = $meta->getAttribute('name');
  $metaContent = $meta->getAttribute('content');

  if ($metaName === 'description') {

    $metaDescription = $metaContent;

  } elseif ($metaName === 'keywords') {

    $metaKeywords = $metaContent;

  } elseif ($metaName === 'robots') {

    $metaRobots = $metaContent;

    // Ban page with meta robots:noindex attribute
    if (false !== stripos($metaRobots, 'noindex')) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      // Same as the other ban paths of this loop: commit, then skip the whole page.
      // A bare "continue" here would only advance this inner meta loop and the banned
      // page would still be described and processed below, hence "continue 2".
      $db->commit();

      continue 2;
    }

  } elseif ($metaName === 'yggo:manifest') {

    // Grab meta yggo:manifest link when available
    $metaYggoManifest = Filter::url($metaContent);
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add queued page description if not exists

// Optional meta values are filtered only when present, otherwise passed as null
$hostPageDescriptionText = $metaDescription ? Filter::pageDescription($metaDescription) : null;
$hostPageKeywordsText    = $metaKeywords ? Filter::pageKeywords($metaKeywords) : null;

// Page body is passed base64-encoded, and only when it is non-empty
// and the crawlMetaOnly flag of the queued page is not set
$hostPageContentData = ($content && !$queueHostPage->crawlMetaOnly) ? base64_encode($content) : null;

$db->addHostPageDescription(
  $queueHostPage->hostPageId,
  $metaTitle,
  $hostPageDescriptionText,
  $hostPageKeywordsText,
  $hostPageContentData,
  time()
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Collect page DOM elements data on enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  $html = str_get_html($content);

  // NOTE(review): str_get_html() looks like the simple_html_dom helper, which returns
  // false for empty or oversized input; guard so find() is never called on a non-object
  if ($html) {

    // CRAWL_HOST_PAGE_DOM_SELECTORS is treated as a comma separated list of selectors
    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      foreach ($html->find($selector) as $element) {

        if (!empty($element->innertext)) {

          $elementValue = $element->innertext;

          if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

            // Put a space in place of line breaks and before closing tags so that words
            // of adjacent elements do not glue together, collapse whitespace, drop the tags
            $elementValue = strip_tags(
              preg_replace(
                '/[\s]+/',
                ' ',
                str_replace(
                  ['<br />', '<br/>', '<br>', '</'],
                  [' ', ' ', ' ', ' </'],
                  $elementValue
                )
              )
            );
          }

          $db->addHostPageDom(
            $queueHostPage->hostPageId,
            time(),
            $selector,
            trim($elementValue)
          );
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
// Skip page links following with meta robots:nofollow attribute |
|
|
|
// Skip page links following with meta robots:nofollow attribute |
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) { |
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) { |
|
|
|
|
|
|
|
|
|
|
|