Add page description only when title, description, or keywords are not empty; remove deprecated constructions

This commit is contained in:
ghost 2023-05-10 19:35:01 +03:00
parent 7c5ba050b2
commit 307ebcf0b1
2 changed files with 24 additions and 31 deletions

View File

@ -243,6 +243,7 @@ try {
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time()); $db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
// Ban page if not available
} else { } else {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -250,30 +251,32 @@ try {
continue; continue;
} }
// Validate MIME // Parse MIME
$hostPageIsDom = false; $hostPageIsDom = false;
$hostPageBanned = true; $hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
$mime = trim(strtolower($mime)); $mime = Filter::mime($mime);
// Check for DOM // Check for DOM
if (false !== strpos('text/html', $mime)) { if (false !== strpos('text/html', $mime)) {
$hostPageIsDom = true; $hostPageIsDom = true;
$hostPageBanned = false; $hostPageInMime = true;
break; break;
} }
// Ban page on MIME type not allowed in settings // Ban page on MIME type not allowed in settings
if (false !== strpos(strtolower($contentType), $mime)) { if (false !== strpos(Filter::mime($contentType), $mime)) {
$hostPageBanned = false; $hostPageInMime = true;
break; break;
} }
} }
if ($hostPageBanned) { // Ban page not in MIME list
if (!$hostPageInMime) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -295,7 +298,7 @@ try {
$robots = null; $robots = null;
$yggoManifest = null; $yggoManifest = null;
// Is DOM // Is DOM content
if ($hostPageIsDom) { if ($hostPageIsDom) {
// Parse content // Parse content
@ -332,7 +335,7 @@ try {
$robots = @$meta->getAttribute('content'); $robots = @$meta->getAttribute('content');
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Ban page with meta robots:noindex value
if (false !== stripos($robots, 'noindex')) { if (false !== stripos($robots, 'noindex')) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -340,31 +343,30 @@ try {
continue; continue;
} }
// Skip page links following by robots:nofollow attribute detected // Skip page with meta robots:nofollow attribute
if (false !== stripos($robots, 'nofollow')) { if (false !== stripos($robots, 'nofollow')) {
continue; continue;
} }
} }
// Grab meta yggo:manifest link when available
if (@$meta->getAttribute('name') == 'yggo:manifest') { if (@$meta->getAttribute('name') == 'yggo:manifest') {
$yggoManifest = Filter::url(@$meta->getAttribute('content')); $yggoManifest = Filter::url(@$meta->getAttribute('content'));
} }
} }
} }
// Update queued page
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::mime($contentType),
time());
// Add queued page description if not exists // Add queued page description if not exists
if ($title || $description || $keywords) {
$db->addHostPageDescription($queueHostPage->hostPageId, $db->addHostPageDescription($queueHostPage->hostPageId,
$title ? Filter::pageTitle($title) : null, $title ? Filter::pageTitle($title) : null,
$description ? Filter::pageDescription($description) : null, $description ? Filter::pageDescription($description) : null,
$keywords ? Filter::pageKeywords($keywords) : null, $keywords ? Filter::pageKeywords($keywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, $content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time()); time());
}
// Update manifest registry // Update manifest registry
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) { if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
@ -381,7 +383,7 @@ try {
} }
} }
// Init links registry // Begin page links collection
$links = []; $links = [];
// Collect image links // Collect image links

View File

@ -258,15 +258,6 @@ class MySQL {
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $mime, $hostPageId]);
return $query->rowCount();
}
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) { public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1'); $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');