mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 21:44:59 +00:00
fix meta variables overwrite
This commit is contained in:
parent
0949d7f871
commit
d2469e9adc
@ -483,16 +483,14 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define variables
|
|
||||||
$title = null;
|
|
||||||
$description = null;
|
|
||||||
$keywords = null;
|
|
||||||
$robots = null;
|
|
||||||
$yggoManifest = null;
|
|
||||||
|
|
||||||
// Is HTML document
|
// Is HTML document
|
||||||
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
|
||||||
|
|
||||||
|
// Define variables
|
||||||
|
$metaDescription = null;
|
||||||
|
$metaKeywords = null;
|
||||||
|
$metaYggoManifest = null;
|
||||||
|
|
||||||
// Parse content
|
// Parse content
|
||||||
$dom = new DomDocument();
|
$dom = new DomDocument();
|
||||||
|
|
||||||
@ -511,26 +509,26 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
$title = $title->item(0)->nodeValue;
|
$metaTitle = $title->item(0)->nodeValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get optional page meta data
|
// Get optional page meta data
|
||||||
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
||||||
|
|
||||||
if (@$meta->getAttribute('name') == 'description') {
|
if (@$meta->getAttribute('name') == 'description') {
|
||||||
$description = @$meta->getAttribute('content');
|
$metaDescription = @$meta->getAttribute('content');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (@$meta->getAttribute('name') == 'keywords') {
|
if (@$meta->getAttribute('name') == 'keywords') {
|
||||||
$keywords = @$meta->getAttribute('content');
|
$metaKeywords = @$meta->getAttribute('content');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (@$meta->getAttribute('name') == 'robots') {
|
if (@$meta->getAttribute('name') == 'robots') {
|
||||||
|
|
||||||
$robots = @$meta->getAttribute('content');
|
$metaRobots = @$meta->getAttribute('content');
|
||||||
|
|
||||||
// Ban page with meta robots:noindex value
|
// Ban page with meta robots:noindex value
|
||||||
if (false !== stripos($robots, 'noindex')) {
|
if (false !== stripos($metaRobots, 'noindex')) {
|
||||||
|
|
||||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||||
|
|
||||||
@ -546,7 +544,30 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
|
|
||||||
// Grab meta yggo:manifest link when available
|
// Grab meta yggo:manifest link when available
|
||||||
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
||||||
$yggoManifest = Filter::url(@$meta->getAttribute('content'));
|
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add queued page description if not exists
|
||||||
|
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||||
|
$metaTitle,
|
||||||
|
$metaDescription ? Filter::pageDescription($metaDescription) : null,
|
||||||
|
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
|
||||||
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
||||||
|
time());
|
||||||
|
|
||||||
|
// Update manifest registry
|
||||||
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||||
|
|
||||||
|
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
||||||
|
|
||||||
|
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
||||||
|
$db->addManifest($metaYggoManifestCRC32,
|
||||||
|
$metaYggoManifest,
|
||||||
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
||||||
|
time());
|
||||||
|
|
||||||
|
$manifestsAdded++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -569,7 +590,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!$title = @$img->getAttribute('title')) {
|
if (!$title = @$img->getAttribute('title')) {
|
||||||
$title = null;
|
$title = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip encoded content
|
// Skip encoded content
|
||||||
@ -692,7 +713,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
|
|
||||||
// Get title attribute if available
|
// Get title attribute if available
|
||||||
if (!$title = @$a->getAttribute('title')) {
|
if (!$title = @$a->getAttribute('title')) {
|
||||||
$title = null;
|
$title = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip anchor links
|
// Skip anchor links
|
||||||
@ -851,32 +872,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add queued page description if not exists
|
|
||||||
if ($title || $description || $keywords) {
|
|
||||||
|
|
||||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
|
||||||
$title ? Filter::pageTitle($title) : null,
|
|
||||||
$description ? Filter::pageDescription($description) : null,
|
|
||||||
$keywords ? Filter::pageKeywords($keywords) : null,
|
|
||||||
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
|
||||||
time());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update manifest registry
|
|
||||||
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
|
|
||||||
|
|
||||||
$yggoManifestCRC32 = crc32($yggoManifest);
|
|
||||||
|
|
||||||
if (!$db->getManifest($yggoManifestCRC32)) {
|
|
||||||
$db->addManifest($yggoManifestCRC32,
|
|
||||||
$yggoManifest,
|
|
||||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
|
||||||
time());
|
|
||||||
|
|
||||||
$manifestsAdded++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Begin snaps
|
// Begin snaps
|
||||||
$snapLocal = false;
|
$snapLocal = false;
|
||||||
$snapMega = false;
|
$snapMega = false;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user