|
|
@ -243,6 +243,7 @@ try { |
|
|
|
|
|
|
|
|
|
|
|
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time()); |
|
|
|
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Ban page if not available |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
@ -250,30 +251,32 @@ try { |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Validate MIME |
|
|
|
// Parse MIME |
|
|
|
$hostPageIsDom = false; |
|
|
|
$hostPageIsDom = false; |
|
|
|
$hostPageBanned = true; |
|
|
|
$hostPageInMime = false; |
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { |
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
$mime = trim(strtolower($mime)); |
|
|
|
$mime = Filter::mime($mime); |
|
|
|
|
|
|
|
|
|
|
|
// Check for DOM |
|
|
|
// Check for DOM |
|
|
|
if (false !== strpos('text/html', $mime)) { |
|
|
|
if (false !== strpos('text/html', $mime)) { |
|
|
|
|
|
|
|
|
|
|
|
$hostPageIsDom = true; |
|
|
|
$hostPageIsDom = true; |
|
|
|
$hostPageBanned = false; |
|
|
|
$hostPageInMime = true; |
|
|
|
break; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
if (false !== strpos(strtolower($contentType), $mime)) { |
|
|
|
if (false !== strpos(Filter::mime($contentType), $mime)) { |
|
|
|
|
|
|
|
|
|
|
|
$hostPageBanned = false; |
|
|
|
$hostPageInMime = true; |
|
|
|
break; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if ($hostPageBanned) { |
|
|
|
// Ban page not in MIME list |
|
|
|
|
|
|
|
if (!$hostPageInMime) { |
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
|
@ -295,7 +298,7 @@ try { |
|
|
|
$robots = null; |
|
|
|
$robots = null; |
|
|
|
$yggoManifest = null; |
|
|
|
$yggoManifest = null; |
|
|
|
|
|
|
|
|
|
|
|
// Is DOM |
|
|
|
// Is DOM content |
|
|
|
if ($hostPageIsDom) { |
|
|
|
if ($hostPageIsDom) { |
|
|
|
|
|
|
|
|
|
|
|
// Parse content |
|
|
|
// Parse content |
|
|
@ -332,7 +335,7 @@ try { |
|
|
|
|
|
|
|
|
|
|
|
$robots = @$meta->getAttribute('content'); |
|
|
|
$robots = @$meta->getAttribute('content'); |
|
|
|
|
|
|
|
|
|
|
|
// Append page with meta robots:noindex value to the robotsPostfix disallow list |
|
|
|
// Ban page with meta robots:noindex value |
|
|
|
if (false !== stripos($robots, 'noindex')) { |
|
|
|
if (false !== stripos($robots, 'noindex')) { |
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
@ -340,31 +343,30 @@ try { |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Skip page links following by robots:nofollow attribute detected |
|
|
|
// Skip page with meta robots:nofollow attribute |
|
|
|
if (false !== stripos($robots, 'nofollow')) { |
|
|
|
if (false !== stripos($robots, 'nofollow')) { |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Grab meta yggo:manifest link when available |
|
|
|
if (@$meta->getAttribute('name') == 'yggo:manifest') { |
|
|
|
if (@$meta->getAttribute('name') == 'yggo:manifest') { |
|
|
|
$yggoManifest = Filter::url(@$meta->getAttribute('content')); |
|
|
|
$yggoManifest = Filter::url(@$meta->getAttribute('content')); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Update queued page |
|
|
|
|
|
|
|
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, |
|
|
|
|
|
|
|
Filter::mime($contentType), |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add queued page description if not exists |
|
|
|
// Add queued page description if not exists |
|
|
|
$db->addHostPageDescription($queueHostPage->hostPageId, |
|
|
|
if ($title || $description || $keywords) { |
|
|
|
$title ? Filter::pageTitle($title) : null, |
|
|
|
|
|
|
|
$description ? Filter::pageDescription($description) : null, |
|
|
|
$db->addHostPageDescription($queueHostPage->hostPageId, |
|
|
|
$keywords ? Filter::pageKeywords($keywords) : null, |
|
|
|
$title ? Filter::pageTitle($title) : null, |
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
$description ? Filter::pageDescription($description) : null, |
|
|
|
time()); |
|
|
|
$keywords ? Filter::pageKeywords($keywords) : null, |
|
|
|
|
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Update manifest registry |
|
|
|
// Update manifest registry |
|
|
|
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) { |
|
|
|
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) { |
|
|
@ -381,7 +383,7 @@ try { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Init links registry |
|
|
|
// Begin page links collection |
|
|
|
$links = []; |
|
|
|
$links = []; |
|
|
|
|
|
|
|
|
|
|
|
// Collect image links |
|
|
|
// Collect image links |
|
|
|