mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
update page / image description models, implement history snap crawling
This commit is contained in:
parent
77bd25f587
commit
23ead4e12c
@ -255,7 +255,7 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
|
|||||||
* Index only meta tags to prevent disk overuse
|
* Index only meta tags to prevent disk overuse
|
||||||
* or false to save meta tags + overall plain text page content
|
* or false to save meta tags + overall plain text page content
|
||||||
*
|
*
|
||||||
* Custom rule for specified host could be provided in the DB `host`.`crawlPageMetaOnly` field
|
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
|
||||||
*
|
*
|
||||||
* This option can change search result relevance
|
* This option can change search result relevance
|
||||||
* This option enables image data caching in base64
|
* This option enables image data caching in base64
|
||||||
@ -367,6 +367,12 @@ define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
|
|||||||
*/
|
*/
|
||||||
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove page description history records older than the following offset (in seconds)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Remove image ban after following time
|
* Remove image ban after following time
|
||||||
*
|
*
|
||||||
@ -376,6 +382,12 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
|||||||
*/
|
*/
|
||||||
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove image description history records older than the following offset (in seconds)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
||||||
|
|
||||||
// API settings
|
// API settings
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -21,7 +21,7 @@ source hostPage : common
|
|||||||
hostPageDescription.metaKeywords) \
|
hostPageDescription.metaKeywords) \
|
||||||
FROM hostPageDescription \
|
FROM hostPageDescription \
|
||||||
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
|
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
|
||||||
ORDER BY hostPageDescription.timeAdded DESC \
|
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
|
||||||
LIMIT 1) AS pageDescription \
|
LIMIT 1) AS pageDescription \
|
||||||
FROM hostPage \
|
FROM hostPage \
|
||||||
JOIN host ON (host.hostId = hostPage.hostId) \
|
JOIN host ON (host.hostId = hostPage.hostId) \
|
||||||
@ -37,8 +37,8 @@ source hostImage : common
|
|||||||
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
|
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
|
||||||
FROM hostImageDescription \
|
FROM hostImageDescription \
|
||||||
WHERE hostImageDescription.hostImageId = hostImage.hostImageId \
|
WHERE hostImageDescription.hostImageId = hostImage.hostImageId \
|
||||||
ORDER BY hostImageDescription.timeAdded \
|
ORDER BY hostImageDescription.timeUpdated DESC, hostImageDescription.timeAdded DESC \
|
||||||
DESC LIMIT 1) AS imageDescription \
|
LIMIT 1) AS imageDescription \
|
||||||
FROM hostImage \
|
FROM hostImage \
|
||||||
JOIN host ON (host.hostId = hostImage.hostId) \
|
JOIN host ON (host.hostId = hostImage.hostId) \
|
||||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
|
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
|
||||||
|
@ -21,22 +21,24 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
|||||||
// Debug
|
// Debug
|
||||||
$timeStart = microtime(true);
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
$httpRequestsTotal = 0;
|
$httpRequestsTotal = 0;
|
||||||
$httpRequestsSizeTotal = 0;
|
$httpRequestsSizeTotal = 0;
|
||||||
$httpDownloadSizeTotal = 0;
|
$httpDownloadSizeTotal = 0;
|
||||||
$httpRequestsTimeTotal = 0;
|
$httpRequestsTimeTotal = 0;
|
||||||
|
|
||||||
$hostsTotal = $db->getTotalHosts();
|
$hostsTotal = $db->getTotalHosts();
|
||||||
$manifestsTotal = $db->getTotalManifests();
|
$manifestsTotal = $db->getTotalManifests();
|
||||||
$hostsUpdated = 0;
|
$hostsUpdated = 0;
|
||||||
$hostPagesDeleted = 0;
|
$hostPagesDeleted = 0;
|
||||||
$hostImagesDeleted = 0;
|
$hostPageDescriptionsDeleted = 0;
|
||||||
$manifestsDeleted = 0;
|
$hostImagesDeleted = 0;
|
||||||
$hostPagesBansRemoved = 0;
|
$hostImageDescriptionsDeleted = 0;
|
||||||
$hostImagesBansRemoved = 0;
|
$manifestsDeleted = 0;
|
||||||
|
$hostPagesBansRemoved = 0;
|
||||||
|
$hostImagesBansRemoved = 0;
|
||||||
|
|
||||||
$logsCleanerDeleted = 0;
|
$logsCleanerDeleted = 0;
|
||||||
$logsCrawlerDeleted = 0;
|
$logsCrawlerDeleted = 0;
|
||||||
|
|
||||||
// Begin update
|
// Begin update
|
||||||
$db->beginTransaction();
|
$db->beginTransaction();
|
||||||
@ -202,9 +204,15 @@ try {
|
|||||||
// Reset banned pages
|
// Reset banned pages
|
||||||
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
||||||
|
|
||||||
|
// Delete page description history
|
||||||
|
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
||||||
|
|
||||||
// Reset banned images
|
// Reset banned images
|
||||||
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
||||||
|
|
||||||
|
// Delete image description history
|
||||||
|
$hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
|
||||||
|
|
||||||
// Delete deprecated logs
|
// Delete deprecated logs
|
||||||
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
||||||
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
||||||
@ -228,8 +236,10 @@ if (CLEAN_LOG_ENABLED) {
|
|||||||
$hostsTotal,
|
$hostsTotal,
|
||||||
$hostsUpdated,
|
$hostsUpdated,
|
||||||
$hostPagesDeleted,
|
$hostPagesDeleted,
|
||||||
|
$hostPageDescriptionsDeleted,
|
||||||
$hostPagesBansRemoved,
|
$hostPagesBansRemoved,
|
||||||
$hostImagesDeleted,
|
$hostImagesDeleted,
|
||||||
|
$hostImageDescriptionsDeleted,
|
||||||
$hostImagesBansRemoved,
|
$hostImagesBansRemoved,
|
||||||
$manifestsTotal,
|
$manifestsTotal,
|
||||||
$manifestsDeleted,
|
$manifestsDeleted,
|
||||||
@ -252,7 +262,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
|||||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||||
|
|
||||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||||
|
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
|
||||||
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
||||||
|
echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
|
||||||
|
|
||||||
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
||||||
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
||||||
|
@ -301,7 +301,7 @@ try {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Convert remote image data to base64 string
|
// Convert remote image data to base64 string
|
||||||
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
|
if (!$queueHostImage->crawlMetaOnly) {
|
||||||
|
|
||||||
// Skip image processing without returned content
|
// Skip image processing without returned content
|
||||||
if (!$hostImageContent = $curl->getContent()) {
|
if (!$hostImageContent = $curl->getContent()) {
|
||||||
@ -327,14 +327,22 @@ try {
|
|||||||
|
|
||||||
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
|
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
|
||||||
|
|
||||||
} else {
|
// Set host image description
|
||||||
|
// During link collection only the meta (alt / title) is known, not the image data;
|
||||||
|
// this step reuses the latest description record and passes along the data received by the curl request
|
||||||
|
if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
|
||||||
|
|
||||||
$hostImageData = null;
|
$db->setHostImageDescription($queueHostImage->hostImageId,
|
||||||
|
crc32($hostImageData),
|
||||||
|
$lastHostImageDescription->alt,
|
||||||
|
$lastHostImageDescription->title,
|
||||||
|
$hostImageData,
|
||||||
|
time());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
|
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
|
||||||
Filter::mime($hostImageContentType),
|
Filter::mime($hostImageContentType),
|
||||||
$hostImageData,
|
|
||||||
time());
|
time());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -465,17 +473,13 @@ try {
|
|||||||
$content = Filter::pageData($content);
|
$content = Filter::pageData($content);
|
||||||
|
|
||||||
// Add queued page description if not exists
|
// Add queued page description if not exists
|
||||||
$crc32data = crc32($content);
|
$db->setHostPageDescription($queueHostPage->hostPageId,
|
||||||
|
crc32($content),
|
||||||
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
|
Filter::pageTitle($title->item(0)->nodeValue),
|
||||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
Filter::pageDescription($metaDescription),
|
||||||
$crc32data,
|
Filter::pageKeywords($metaKeywords),
|
||||||
Filter::pageTitle($title->item(0)->nodeValue),
|
$queueHostPage->crawlMetaOnly ? null : $content,
|
||||||
Filter::pageDescription($metaDescription),
|
time());
|
||||||
Filter::pageKeywords($metaKeywords),
|
|
||||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
|
|
||||||
time());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update manifest registry
|
// Update manifest registry
|
||||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||||
@ -547,7 +551,7 @@ try {
|
|||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
@ -610,20 +614,16 @@ try {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Host image exists or created new one
|
// Add/update host image description
|
||||||
if ($hostImageId) {
|
$db->setHostImageDescription($hostImageId,
|
||||||
|
null, // no data, download it in the crawler queue
|
||||||
|
Filter::imageAlt($imageAlt),
|
||||||
|
Filter::imageTitle($imageTitle),
|
||||||
|
null,
|
||||||
|
time());
|
||||||
|
|
||||||
// Add/update host image description
|
// Relate host image with host page was found
|
||||||
$db->setHostImageDescription($hostImageId,
|
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
|
||||||
crc32(md5((string) $imageAlt . (string) $imageTitle)),
|
|
||||||
Filter::imageAlt($imageAlt),
|
|
||||||
Filter::imageTitle($imageTitle),
|
|
||||||
time(),
|
|
||||||
time());
|
|
||||||
|
|
||||||
// Relate host image with host page was found
|
|
||||||
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Increase image rank when link does not match the current host
|
// Increase image rank when link does not match the current host
|
||||||
if ($hostImageURL->scheme . '://' .
|
if ($hostImageURL->scheme . '://' .
|
||||||
|
Binary file not shown.
@ -102,11 +102,11 @@ class MySQL {
|
|||||||
return $query->fetch()->total;
|
return $query->fetch()->total;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
|
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]);
|
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $robots, $robotsPostfix]);
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
@ -241,13 +241,12 @@ class MySQL {
|
|||||||
|
|
||||||
public function updateHostImage(int $hostImageId,
|
public function updateHostImage(int $hostImageId,
|
||||||
string $mime,
|
string $mime,
|
||||||
mixed $data,
|
|
||||||
int $timeUpdated,
|
int $timeUpdated,
|
||||||
mixed $timeBanned = null) {
|
mixed $timeBanned = null) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
|
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
|
||||||
|
|
||||||
$query->execute([$mime, $data, $timeUpdated, $timeBanned, $hostImageId]);
|
$query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]);
|
||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
@ -261,10 +260,15 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function setHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded, int $timeUpdated) {
|
public function setHostImageDescription(int $hostImageId,
|
||||||
|
mixed $crc32data,
|
||||||
|
string $alt,
|
||||||
|
string $title,
|
||||||
|
mixed $data,
|
||||||
|
int $time) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
|
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
|
||||||
`crc32id`,
|
`crc32data`,
|
||||||
`alt`,
|
`alt`,
|
||||||
`title`,
|
`title`,
|
||||||
`timeAdded`) VALUES (?, ?, ?, ?, ?)
|
`timeAdded`) VALUES (?, ?, ?, ?, ?)
|
||||||
@ -273,7 +277,7 @@ class MySQL {
|
|||||||
`title` = ?,
|
`title` = ?,
|
||||||
`timeUpdated` = ?');
|
`timeUpdated` = ?');
|
||||||
|
|
||||||
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
|
$query->execute([$hostImageId, $crc32data, $alt, $title, $time, $alt, $title, $time]);
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
@ -287,6 +291,15 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function getLastHostImageDescription(int $hostImageId) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
|
||||||
|
|
||||||
|
$query->execute([$hostImageId]);
|
||||||
|
|
||||||
|
return $query->fetch();
|
||||||
|
}
|
||||||
|
|
||||||
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
|
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
|
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
|
||||||
@ -312,7 +325,7 @@ class MySQL {
|
|||||||
return $query->fetch()->total;
|
return $query->fetch()->total;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {
|
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
|
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
|
||||||
`hostPageId`,
|
`hostPageId`,
|
||||||
@ -323,7 +336,7 @@ class MySQL {
|
|||||||
ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
|
ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
|
||||||
`quantity` = `quantity` + ' . (int) $quantity);
|
`quantity` = `quantity` + ' . (int) $quantity);
|
||||||
|
|
||||||
$query->execute([$hostImageId, $hostPageId, $timeAdded, null, $quantity, $timeUpdated]);
|
$query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]);
|
||||||
|
|
||||||
return $query->rowCount(); // no primary key
|
return $query->rowCount(); // no primary key
|
||||||
}
|
}
|
||||||
@ -402,7 +415,7 @@ class MySQL {
|
|||||||
|
|
||||||
public function getLastPageDescription(int $hostPageId) {
|
public function getLastPageDescription(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
|
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
|
||||||
|
|
||||||
$query->execute([$hostPageId]);
|
$query->execute([$hostPageId]);
|
||||||
|
|
||||||
@ -438,11 +451,7 @@ class MySQL {
|
|||||||
`host`.`scheme`,
|
`host`.`scheme`,
|
||||||
`host`.`name`,
|
`host`.`name`,
|
||||||
`host`.`port`,
|
`host`.`port`,
|
||||||
|
`host`.`crawlMetaOnly`
|
||||||
(SELECT GROUP_CONCAT(CONCAT_WS(" ", `hostImageDescription`.`alt`, `hostImageDescription`.`title`))
|
|
||||||
|
|
||||||
FROM `hostImageDescription`
|
|
||||||
WHERE `hostImageDescription`.`hostImageId` = `hostImage`.`hostImageId`) AS `description`
|
|
||||||
|
|
||||||
FROM `hostImage`
|
FROM `hostImage`
|
||||||
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
|
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
|
||||||
@ -560,13 +569,13 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function addHostPageDescription(int $hostPageId,
|
public function setHostPageDescription(int $hostPageId,
|
||||||
int $crc32data,
|
int $crc32data,
|
||||||
mixed $metaTitle,
|
mixed $metaTitle,
|
||||||
mixed $metaDescription,
|
mixed $metaDescription,
|
||||||
mixed $metaKeywords,
|
mixed $metaKeywords,
|
||||||
mixed $data,
|
mixed $data,
|
||||||
int $timeAdded) {
|
int $time) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
|
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
|
||||||
`crc32data`,
|
`crc32data`,
|
||||||
@ -575,7 +584,9 @@ class MySQL {
|
|||||||
`metaKeywords`,
|
`metaKeywords`,
|
||||||
`data`,
|
`data`,
|
||||||
`timeAdded`
|
`timeAdded`
|
||||||
) VALUES (?, ?, ?, ?, ?, ?, ?)');
|
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
|
||||||
|
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
|
||||||
|
|
||||||
$query->execute([
|
$query->execute([
|
||||||
$hostPageId,
|
$hostPageId,
|
||||||
@ -584,7 +595,8 @@ class MySQL {
|
|||||||
$metaDescription,
|
$metaDescription,
|
||||||
$metaKeywords,
|
$metaKeywords,
|
||||||
$data,
|
$data,
|
||||||
$timeAdded
|
$time,
|
||||||
|
$time
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
@ -615,6 +627,15 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
|
$query->execute();
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
|
|
||||||
public function resetBannedHostImages(int $timeOffset) {
|
public function resetBannedHostImages(int $timeOffset) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
||||||
@ -624,12 +645,23 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
|
$query->execute();
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
|
|
||||||
public function addCleanerLog(int $timeAdded,
|
public function addCleanerLog(int $timeAdded,
|
||||||
int $hostsTotal,
|
int $hostsTotal,
|
||||||
int $hostsUpdated,
|
int $hostsUpdated,
|
||||||
int $hostPagesDeleted,
|
int $hostPagesDeleted,
|
||||||
|
int $hostPageDescriptionsDeleted,
|
||||||
int $hostPagesBansRemoved,
|
int $hostPagesBansRemoved,
|
||||||
int $hostImagesDeleted,
|
int $hostImagesDeleted,
|
||||||
|
int $hostImageDescriptionsDeleted,
|
||||||
int $hostImagesBansRemoved,
|
int $hostImagesBansRemoved,
|
||||||
int $manifestsTotal,
|
int $manifestsTotal,
|
||||||
int $manifestsDeleted,
|
int $manifestsDeleted,
|
||||||
@ -645,8 +677,10 @@ class MySQL {
|
|||||||
`hostsTotal`,
|
`hostsTotal`,
|
||||||
`hostsUpdated`,
|
`hostsUpdated`,
|
||||||
`hostPagesDeleted`,
|
`hostPagesDeleted`,
|
||||||
|
`hostPageDescriptionsDeleted`,
|
||||||
`hostPagesBansRemoved`,
|
`hostPagesBansRemoved`,
|
||||||
`hostImagesDeleted`,
|
`hostImagesDeleted`,
|
||||||
|
`hostImageDescriptionsDeleted`,
|
||||||
`hostImagesBansRemoved`,
|
`hostImagesBansRemoved`,
|
||||||
`manifestsTotal`,
|
`manifestsTotal`,
|
||||||
`manifestsDeleted`,
|
`manifestsDeleted`,
|
||||||
@ -656,15 +690,17 @@ class MySQL {
|
|||||||
`httpRequestsSizeTotal`,
|
`httpRequestsSizeTotal`,
|
||||||
`httpDownloadSizeTotal`,
|
`httpDownloadSizeTotal`,
|
||||||
`httpRequestsTimeTotal`,
|
`httpRequestsTimeTotal`,
|
||||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([
|
$query->execute([
|
||||||
$timeAdded,
|
$timeAdded,
|
||||||
$hostsTotal,
|
$hostsTotal,
|
||||||
$hostsUpdated,
|
$hostsUpdated,
|
||||||
$hostPagesDeleted,
|
$hostPagesDeleted,
|
||||||
|
$hostPageDescriptionsDeleted,
|
||||||
$hostPagesBansRemoved,
|
$hostPagesBansRemoved,
|
||||||
$hostImagesDeleted,
|
$hostImagesDeleted,
|
||||||
|
$hostImageDescriptionsDeleted,
|
||||||
$hostImagesBansRemoved,
|
$hostImagesBansRemoved,
|
||||||
$manifestsTotal,
|
$manifestsTotal,
|
||||||
$manifestsDeleted,
|
$manifestsDeleted,
|
||||||
@ -700,7 +736,7 @@ class MySQL {
|
|||||||
`host`.`port`,
|
`host`.`port`,
|
||||||
`host`.`crawlPageLimit`,
|
`host`.`crawlPageLimit`,
|
||||||
`host`.`crawlImageLimit`,
|
`host`.`crawlImageLimit`,
|
||||||
`host`.`crawlPageMetaOnly`,
|
`host`.`crawlMetaOnly`,
|
||||||
`host`.`robots`,
|
`host`.`robots`,
|
||||||
`host`.`robotsPostfix`
|
`host`.`robotsPostfix`
|
||||||
|
|
||||||
@ -735,7 +771,8 @@ class MySQL {
|
|||||||
`hostImage`.`uri`,
|
`hostImage`.`uri`,
|
||||||
`host`.`scheme`,
|
`host`.`scheme`,
|
||||||
`host`.`name`,
|
`host`.`name`,
|
||||||
`host`.`port`
|
`host`.`port`,
|
||||||
|
`host`.`crawlMetaOnly`
|
||||||
|
|
||||||
FROM `hostImage`
|
FROM `hostImage`
|
||||||
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
|
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 126 KiB After Width: | Height: | Size: 135 KiB |
@ -418,9 +418,21 @@ if (!empty($q)) {
|
|||||||
// Save image content on data settings enabled
|
// Save image content on data settings enabled
|
||||||
$db->updateHostImage($hostImage->hostImageId,
|
$db->updateHostImage($hostImage->hostImageId,
|
||||||
Filter::mime($hostImageContentType),
|
Filter::mime($hostImageContentType),
|
||||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : $hostImageURLencoded,
|
|
||||||
time());
|
time());
|
||||||
|
|
||||||
|
// Set host image description
|
||||||
|
// During link collection only the meta (alt / title) is known, not the image data;
|
||||||
|
// this step reuses the latest description record and passes along the data received by the curl request
|
||||||
|
if ($lastHostImageDescription = $db->getLastHostImageDescription($hostImage->hostImageId)) {
|
||||||
|
|
||||||
|
$db->setHostImageDescription($hostImage->hostImageId,
|
||||||
|
crc32($hostImageData),
|
||||||
|
$lastHostImageDescription->alt,
|
||||||
|
$lastHostImageDescription->title,
|
||||||
|
$hostImage->crawlMetaOnly ? null : $hostImageData,
|
||||||
|
time());
|
||||||
|
}
|
||||||
|
|
||||||
// Local image data exists
|
// Local image data exists
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -439,8 +451,8 @@ if (!empty($q)) {
|
|||||||
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||||
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
|
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<?php if (!empty($hostImage->description)) { ?>
|
<?php if ($lastHostImageDescription = $db->getLastHostImageDescription($result->id)) { ?>
|
||||||
<span><?php echo $hostImage->description ?></span>
|
<span><?php echo $lastHostImageDescription->title ?> <?php echo $lastHostImageDescription->alt ?></span>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<a href="<?php echo $hostPageURL ?>">
|
<a href="<?php echo $hostPageURL ?>">
|
||||||
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
||||||
|
Loading…
x
Reference in New Issue
Block a user