update page / image description models, implement history snap crawling

This commit is contained in:
ghost 2023-05-09 08:19:49 +03:00
parent 77bd25f587
commit 23ead4e12c
8 changed files with 147 additions and 74 deletions

View File

@ -255,7 +255,7 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* Index only meta tags to prevent disk overuse
* or false to save meta tags + overall plain text page content
*
* Custom rule for specified host could be provided in the DB `host`.`crawlPageMetaOnly` field
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* This option can change search results relevance
* This option enables image data caching in base64
@ -367,6 +367,12 @@ define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
*/
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove page description history older than the following time offset, in seconds
*
*/
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
/*
* Remove image ban after following time
*
@ -376,6 +382,12 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove image description history older than the following time offset, in seconds
*
*/
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
// API settings
/*

View File

@ -21,7 +21,7 @@ source hostPage : common
hostPageDescription.metaKeywords) \
FROM hostPageDescription \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
ORDER BY hostPageDescription.timeAdded DESC \
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
LIMIT 1) AS pageDescription \
FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \
@ -37,8 +37,8 @@ source hostImage : common
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId \
ORDER BY hostImageDescription.timeAdded \
DESC LIMIT 1) AS imageDescription \
ORDER BY hostImageDescription.timeUpdated DESC, hostImageDescription.timeAdded DESC \
LIMIT 1) AS imageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \

View File

@ -21,22 +21,24 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug
$timeStart = microtime(true);
$httpRequestsTotal = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$httpRequestsTotal = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostImagesDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0;
$hostImagesDeleted = 0;
$hostImageDescriptionsDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
$logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0;
$logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0;
// Begin update
$db->beginTransaction();
@ -202,9 +204,15 @@ try {
// Reset banned pages
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Reset banned images
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
// Delete image description history
$hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
// Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
@ -228,8 +236,10 @@ if (CLEAN_LOG_ENABLED) {
$hostsTotal,
$hostsUpdated,
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
@ -252,7 +262,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;

View File

@ -301,7 +301,7 @@ try {
}
// Convert remote image data to base64 string
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
if (!$queueHostImage->crawlMetaOnly) {
// Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) {
@ -327,14 +327,22 @@ try {
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
} else {
// Set host image description
// During link collection only the meta was known, not the data;
// this step takes the latest description slice and inserts the data received by the curl request
if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
$hostImageData = null;
$db->setHostImageDescription($queueHostImage->hostImageId,
crc32($hostImageData),
$lastHostImageDescription->alt,
$lastHostImageDescription->title,
$hostImageData,
time());
}
}
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
Filter::mime($hostImageContentType),
$hostImageData,
time());
}
@ -465,17 +473,13 @@ try {
$content = Filter::pageData($content);
// Add queued page description if not exists
$crc32data = crc32($content);
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
$db->addHostPageDescription($queueHostPage->hostPageId,
$crc32data,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
time());
}
$db->setHostPageDescription($queueHostPage->hostPageId,
crc32($content),
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
$queueHostPage->crawlMetaOnly ? null : $content,
time());
// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
@ -547,7 +551,7 @@ try {
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
@ -610,20 +614,16 @@ try {
}
}
// Host image exists or created new one
if ($hostImageId) {
// Add/update host image description
$db->setHostImageDescription($hostImageId,
null, // no data, download it in the crawler queue
Filter::imageAlt($imageAlt),
Filter::imageTitle($imageTitle),
null,
time());
// Add/update host image description
$db->setHostImageDescription($hostImageId,
crc32(md5((string) $imageAlt . (string) $imageTitle)),
Filter::imageAlt($imageAlt),
Filter::imageTitle($imageTitle),
time(),
time());
// Relate host image with host page was found
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1);
}
// Relate host image with host page was found
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
// Increase image rank when link does not match the current host
if ($hostImageURL->scheme . '://' .

Binary file not shown.

View File

@ -102,11 +102,11 @@ class MySQL {
return $query->fetch()->total;
}
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]);
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $robots, $robotsPostfix]);
return $this->_db->lastInsertId();
}
@ -241,13 +241,12 @@ class MySQL {
public function updateHostImage(int $hostImageId,
string $mime,
mixed $data,
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $data, $timeUpdated, $timeBanned, $hostImageId]);
$query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]);
return $query->rowCount();
}
@ -261,10 +260,15 @@ class MySQL {
return $query->rowCount();
}
public function setHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded, int $timeUpdated) {
public function setHostImageDescription(int $hostImageId,
mixed $crc32data,
string $alt,
string $title,
mixed $data,
int $time) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`crc32data`,
`alt`,
`title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)
@ -273,7 +277,7 @@ class MySQL {
`title` = ?,
`timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
$query->execute([$hostImageId, $crc32data, $alt, $title, $time, $alt, $title, $time]);
return $this->_db->lastInsertId();
}
@ -287,6 +291,15 @@ class MySQL {
return $query->rowCount();
}
/**
 * Fetch the latest description row stored for a host image.
 *
 * Rows are sorted by `timeUpdated` and then by `timeAdded`, both descending,
 * and only the first row of that ordering is read.
 *
 * @param int $hostImageId value matched against `hostImageDescription`.`hostImageId`
 * @return mixed whatever the executed statement's fetch() call returns
 */
public function getLastHostImageDescription(int $hostImageId) {

    $sql = 'SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1';

    $statement = $this->_db->prepare($sql);

    $statement->execute([$hostImageId]);

    return $statement->fetch();
}
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
@ -312,7 +325,7 @@ class MySQL {
return $query->fetch()->total;
}
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`,
@ -323,7 +336,7 @@ class MySQL {
ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
`quantity` = `quantity` + ' . (int) $quantity);
$query->execute([$hostImageId, $hostPageId, $timeAdded, null, $quantity, $timeUpdated]);
$query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]);
return $query->rowCount(); // no primary key
}
@ -402,7 +415,7 @@ class MySQL {
public function getLastPageDescription(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
$query->execute([$hostPageId]);
@ -438,11 +451,7 @@ class MySQL {
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
(SELECT GROUP_CONCAT(CONCAT_WS(" ", `hostImageDescription`.`alt`, `hostImageDescription`.`title`))
FROM `hostImageDescription`
WHERE `hostImageDescription`.`hostImageId` = `hostImage`.`hostImageId`) AS `description`
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
@ -560,13 +569,13 @@ class MySQL {
return $query->rowCount();
}
public function addHostPageDescription(int $hostPageId,
public function setHostPageDescription(int $hostPageId,
int $crc32data,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
mixed $data,
int $timeAdded) {
int $time) {
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
`crc32data`,
@ -575,7 +584,9 @@ class MySQL {
`metaKeywords`,
`data`,
`timeAdded`
) VALUES (?, ?, ?, ?, ?, ?, ?)');
) VALUES (?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
$query->execute([
$hostPageId,
@ -584,7 +595,8 @@ class MySQL {
$metaDescription,
$metaKeywords,
$data,
$timeAdded
$time,
$time
]);
return $query->rowCount();
@ -615,6 +627,15 @@ class MySQL {
return $query->rowCount();
}
/**
 * Delete page description rows that were added before the given moment.
 *
 * @param int $timeOffset rows with `timeAdded` lower than this value are removed
 * @return mixed the statement's rowCount() after the DELETE was executed
 */
public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) {

    // The threshold is embedded in the SQL text, so it is forced to an integer first
    $threshold = (int) $timeOffset;

    $statement = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . $threshold);

    $statement->execute();

    return $statement->rowCount();
}
public function resetBannedHostImages(int $timeOffset) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
@ -624,12 +645,23 @@ class MySQL {
return $query->rowCount();
}
/**
 * Delete image description rows that were added before the given moment.
 *
 * @param int $timeOffset rows with `timeAdded` lower than this value are removed
 * @return mixed the statement's rowCount() after the DELETE was executed
 */
public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {

    // The threshold is embedded in the SQL text, so it is forced to an integer first
    $threshold = (int) $timeOffset;

    $statement = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . $threshold);

    $statement->execute();

    return $statement->rowCount();
}
public function addCleanerLog(int $timeAdded,
int $hostsTotal,
int $hostsUpdated,
int $hostPagesDeleted,
int $hostPageDescriptionsDeleted,
int $hostPagesBansRemoved,
int $hostImagesDeleted,
int $hostImageDescriptionsDeleted,
int $hostImagesBansRemoved,
int $manifestsTotal,
int $manifestsDeleted,
@ -645,8 +677,10 @@ class MySQL {
`hostsTotal`,
`hostsUpdated`,
`hostPagesDeleted`,
`hostPageDescriptionsDeleted`,
`hostPagesBansRemoved`,
`hostImagesDeleted`,
`hostImageDescriptionsDeleted`,
`hostImagesBansRemoved`,
`manifestsTotal`,
`manifestsDeleted`,
@ -656,15 +690,17 @@ class MySQL {
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
$hostsTotal,
$hostsUpdated,
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
@ -700,7 +736,7 @@ class MySQL {
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`crawlPageMetaOnly`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`
@ -735,7 +771,8 @@ class MySQL {
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
`host`.`port`,
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 126 KiB

After

Width:  |  Height:  |  Size: 135 KiB

View File

@ -418,9 +418,21 @@ if (!empty($q)) {
// Save image content on data settings enabled
$db->updateHostImage($hostImage->hostImageId,
Filter::mime($hostImageContentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : $hostImageURLencoded,
time());
// Set host image description
// During link collection only the meta was known, not the data;
// this step takes the latest description slice and inserts the data received by the curl request
if ($lastHostImageDescription = $db->getLastHostImageDescription($hostImage->hostImageId)) {
$db->setHostImageDescription($hostImage->hostImageId,
crc32($hostImageData),
$lastHostImageDescription->alt,
$lastHostImageDescription->title,
$hostImage->crawlMetaOnly ? null : $hostImageData,
time());
}
// Local image data exists
} else {
@ -439,8 +451,8 @@ if (!empty($q)) {
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
<?php } ?>
<?php if (!empty($hostImage->description)) { ?>
<span><?php echo $hostImage->description ?></span>
<?php if ($lastHostImageDescription = $db->getLastHostImageDescription($result->id)) { ?>
<span><?php echo $lastHostImageDescription->title ?> <?php echo $lastHostImageDescription->alt ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />