Browse Source

fix media crawling

main
ghost 2 years ago
parent
commit
7c5ba050b2
  1. 77
      crontab/crawler.php
  2. 2
      library/filter.php

77
crontab/crawler.php

@ -250,11 +250,23 @@ try {
continue; continue;
} }
// Skip page processing on MIME type not allowed in settings // Validate MIME
$hostPageIsDom = false;
$hostPageBanned = true; $hostPageBanned = true;
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
if (false !== strpos($contentType, trim($mime))) { $mime = trim(strtolower($mime));
// Check for DOM
if (false !== strpos('text/html', $mime)) {
$hostPageIsDom = true;
$hostPageBanned = false;
break;
}
// Ban page on MIME type not allowed in settings
if (false !== strpos(strtolower($contentType), $mime)) {
$hostPageBanned = false; $hostPageBanned = false;
break; break;
@ -276,7 +288,17 @@ try {
continue; continue;
} }
// Grab page content // Define variables
$title = null;
$description = null;
$keywords = null;
$robots = null;
$yggoManifest = null;
// Is DOM
if ($hostPageIsDom) {
// Parse content
$dom = new DomDocument(); $dom = new DomDocument();
@$dom->loadHTML($content); @$dom->loadHTML($content);
@ -289,35 +311,29 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
continue; continue;
} else {
$title = $title->item(0)->nodeValue;
} }
// Get optional page meta data // Get optional page meta data
$metaDescription = '';
$metaKeywords = '';
$metaRobots = '';
$metaYggoManifest = '';
foreach (@$dom->getElementsByTagName('meta') as $meta) { foreach (@$dom->getElementsByTagName('meta') as $meta) {
if (@$meta->getAttribute('name') == 'description') { if (@$meta->getAttribute('name') == 'description') {
$metaDescription = @$meta->getAttribute('content'); $description = @$meta->getAttribute('content');
} }
if (@$meta->getAttribute('name') == 'keywords') { if (@$meta->getAttribute('name') == 'keywords') {
$metaKeywords = @$meta->getAttribute('content'); $keywords = @$meta->getAttribute('content');
} }
if (@$meta->getAttribute('name') == 'robots') { if (@$meta->getAttribute('name') == 'robots') {
$metaRobots = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'yggo:manifest') { $robots = @$meta->getAttribute('content');
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
}
}
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) { if (false !== stripos($robots, 'noindex')) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -325,10 +341,17 @@ try {
} }
// Skip page links following by robots:nofollow attribute detected // Skip page links following by robots:nofollow attribute detected
if (false !== stripos($metaRobots, 'nofollow')) { if (false !== stripos($robots, 'nofollow')) {
continue; continue;
} }
}
if (@$meta->getAttribute('name') == 'yggo:manifest') {
$yggoManifest = Filter::url(@$meta->getAttribute('content'));
}
}
}
// Update queued page // Update queued page
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
@ -337,20 +360,20 @@ try {
// Add queued page description if not exists // Add queued page description if not exists
$db->addHostPageDescription($queueHostPage->hostPageId, $db->addHostPageDescription($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue), $title ? Filter::pageTitle($title) : null,
Filter::pageDescription($metaDescription), $description ? Filter::pageDescription($description) : null,
Filter::pageKeywords($metaKeywords), $keywords ? Filter::pageKeywords($keywords) : null,
$queueHostPage->crawlMetaOnly ? null : base64_encode($content), $content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time()); time());
// Update manifest registry // Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
$metaYggoManifestCRC32 = crc32($metaYggoManifest); $yggoManifestCRC32 = crc32($yggoManifest);
if (!$db->getManifest($metaYggoManifestCRC32)) { if (!$db->getManifest($yggoManifestCRC32)) {
$db->addManifest($metaYggoManifestCRC32, $db->addManifest($yggoManifestCRC32,
$metaYggoManifest, $yggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS, (string) CRAWL_MANIFEST_DEFAULT_STATUS,
time()); time());

2
library/filter.php

@ -18,7 +18,7 @@ class Filter {
$mime = (string) $mime; $mime = (string) $mime;
return trim($mime); return trim(strtolower($mime));
} }
static public function pageTitle(mixed $title) { static public function pageTitle(mixed $title) {

Loading…
Cancel
Save