mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-08 22:07:56 +00:00
add page description on title | description | keywords not empty, remove deprecated constructions
This commit is contained in:
parent
7c5ba050b2
commit
307ebcf0b1
@ -243,6 +243,7 @@ try {
|
|||||||
|
|
||||||
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
|
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
|
||||||
|
|
||||||
|
// Ban page if not available
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||||
@ -250,30 +251,32 @@ try {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate MIME
|
// Parse MIME
|
||||||
$hostPageIsDom = false;
|
$hostPageIsDom = false;
|
||||||
$hostPageBanned = true;
|
$hostPageInMime = false;
|
||||||
|
|
||||||
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
|
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
|
||||||
|
|
||||||
$mime = trim(strtolower($mime));
|
$mime = Filter::mime($mime);
|
||||||
|
|
||||||
// Check for DOM
|
// Check for DOM
|
||||||
if (false !== strpos('text/html', $mime)) {
|
if (false !== strpos('text/html', $mime)) {
|
||||||
|
|
||||||
$hostPageIsDom = true;
|
$hostPageIsDom = true;
|
||||||
$hostPageBanned = false;
|
$hostPageInMime = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ban page on MIME type not allowed in settings
|
// Ban page on MIME type not allowed in settings
|
||||||
if (false !== strpos(strtolower($contentType), $mime)) {
|
if (false !== strpos(Filter::mime($contentType), $mime)) {
|
||||||
|
|
||||||
$hostPageBanned = false;
|
$hostPageInMime = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($hostPageBanned) {
|
// Ban page not in MIME list
|
||||||
|
if (!$hostPageInMime) {
|
||||||
|
|
||||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||||
|
|
||||||
@ -295,7 +298,7 @@ try {
|
|||||||
$robots = null;
|
$robots = null;
|
||||||
$yggoManifest = null;
|
$yggoManifest = null;
|
||||||
|
|
||||||
// Is DOM
|
// Is DOM content
|
||||||
if ($hostPageIsDom) {
|
if ($hostPageIsDom) {
|
||||||
|
|
||||||
// Parse content
|
// Parse content
|
||||||
@ -332,7 +335,7 @@ try {
|
|||||||
|
|
||||||
$robots = @$meta->getAttribute('content');
|
$robots = @$meta->getAttribute('content');
|
||||||
|
|
||||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
// Ban page with meta robots:noindex value
|
||||||
if (false !== stripos($robots, 'noindex')) {
|
if (false !== stripos($robots, 'noindex')) {
|
||||||
|
|
||||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||||
@ -340,31 +343,30 @@ try {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip page links following by robots:nofollow attribute detected
|
// Skip page with meta robots:nofollow attribute
|
||||||
if (false !== stripos($robots, 'nofollow')) {
|
if (false !== stripos($robots, 'nofollow')) {
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Grab meta yggo:manifest link when available
|
||||||
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
||||||
$yggoManifest = Filter::url(@$meta->getAttribute('content'));
|
$yggoManifest = Filter::url(@$meta->getAttribute('content'));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update queued page
|
|
||||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
|
||||||
Filter::mime($contentType),
|
|
||||||
time());
|
|
||||||
|
|
||||||
// Add queued page description if not exists
|
// Add queued page description if not exists
|
||||||
|
if ($title || $description || $keywords) {
|
||||||
|
|
||||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||||
$title ? Filter::pageTitle($title) : null,
|
$title ? Filter::pageTitle($title) : null,
|
||||||
$description ? Filter::pageDescription($description) : null,
|
$description ? Filter::pageDescription($description) : null,
|
||||||
$keywords ? Filter::pageKeywords($keywords) : null,
|
$keywords ? Filter::pageKeywords($keywords) : null,
|
||||||
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
||||||
time());
|
time());
|
||||||
|
}
|
||||||
|
|
||||||
// Update manifest registry
|
// Update manifest registry
|
||||||
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
|
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
|
||||||
@ -381,7 +383,7 @@ try {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init links registry
|
// Begin page links collection
|
||||||
$links = [];
|
$links = [];
|
||||||
|
|
||||||
// Collect image links
|
// Collect image links
|
||||||
|
@ -258,15 +258,6 @@ class MySQL {
|
|||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
|
|
||||||
|
|
||||||
$query->execute([$timeUpdated, $mime, $hostPageId]);
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
|
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||||
|
Loading…
Reference in New Issue
Block a user