|
|
@ -71,10 +71,10 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Get optional page meta data |
|
|
|
// Get optional page meta data |
|
|
|
$metaDescription = ''; |
|
|
|
$metaDescription = ''; |
|
|
|
$metaKeywords = ''; |
|
|
|
$metaKeywords = ''; |
|
|
|
$metaRobots = ''; |
|
|
|
$metaRobots = ''; |
|
|
|
$metaYggo = ''; |
|
|
|
$metaYggoManifest = ''; |
|
|
|
|
|
|
|
|
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) { |
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) { |
|
|
|
|
|
|
|
|
|
|
@ -90,8 +90,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET |
|
|
|
$metaRobots = @$meta->getAttribute('content'); |
|
|
|
$metaRobots = @$meta->getAttribute('content'); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (@$meta->getAttribute('name') == 'yggo') { |
|
|
|
if (@$meta->getAttribute('name') == 'yggo:manifest') { |
|
|
|
$metaYggo = Filter::url(@$meta->getAttribute('content')); |
|
|
|
$metaYggoManifest = Filter::url(@$meta->getAttribute('content')); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -103,13 +103,13 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET |
|
|
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); |
|
|
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); |
|
|
|
|
|
|
|
|
|
|
|
// Update manifest registry |
|
|
|
// Update manifest registry |
|
|
|
if (CRAWL_MANIFEST && !empty($metaYggo) && filter_var($metaYggo, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggo)) { |
|
|
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { |
|
|
|
|
|
|
|
|
|
|
|
$metaYggoCRC32url = crc32($metaYggo); |
|
|
|
$metaYggoManifestCRC32 = crc32($metaYggoManifest); |
|
|
|
|
|
|
|
|
|
|
|
if (!$db->getManifest($metaYggoCRC32url)) { |
|
|
|
if (!$db->getManifest($metaYggoManifestCRC32)) { |
|
|
|
$db->addManifest($metaYggoCRC32url, |
|
|
|
$db->addManifest($metaYggoManifestCRC32, |
|
|
|
$metaYggo, |
|
|
|
$metaYggoManifest, |
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS, |
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS, |
|
|
|
time()); |
|
|
|
time()); |
|
|
|
} |
|
|
|
} |
|
|
|