add crawler/cleaner logs

This commit is contained in:
ghost 2023-05-08 11:04:59 +03:00
parent dcdc2c50ad
commit 25b6bce2ec
7 changed files with 316 additions and 82 deletions

View File

@ -169,19 +169,35 @@ GET m=SphinxQL
##### Crawler
* [x] Auto crawl links by regular expression rules
+ [x] Pages
+ [x] Images
+ [x] Manifests
* [x] Robots.txt / robots meta tags support (#2)
* [x] Specific rules configuration for every host
* [x] Deprecated index auto cleaner
* [x] Auto stop crawling on disk quota reached
* [x] Transactions support to prevent data loss on queue failures
* [x] Distributed index crawling between YGGo nodes through manifest API
* [x] MIME Content-type crawler settings
* [x] Ban links that do not match the crawl conditions to prevent extra requests
* [x] Debug log
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter
* [ ] Crawl queue balancer that depends on available CPU
##### Cleaner
* [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages
+ [x] Images
+ [x] Manifests
+ [x] Logs
+ [x] Crawler
+ [x] Cleaner
* [x] Banned resources reset by timeout
+ [x] Pages
+ [x] Images
* [x] Debug log
##### Other
* [ ] Administrative panel for useful index moderation

View File

@ -98,6 +98,18 @@ define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGver
// Crawl settings
/*
* Save crawler debug to `logCrawler` table
*
*/
define('CRAWL_LOG_ENABLED', true);
/*
* Auto clean `logCrawler` items older than this offset, in seconds
*
*/
define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
/*
* Crawler / Bot User Agent name
*
@ -311,6 +323,18 @@ define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
// Cleaner settings
/*
* Save cleaner debug to `logCleaner` table
*
*/
define('CLEAN_LOG_ENABLED', true);
/*
* Auto clean `logCleaner` items older than this offset, in seconds
*
*/
define('CLEAN_LOG_SECONDS_OFFSET', 60*60*24*30);
/*
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
*

View File

@ -21,20 +21,23 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug
$timeStart = microtime(true);
$requestsTotal = 0;
$requestSizeTotal = 0;
$downloadSizeTotal = 0;
$requestsTotalTime = 0;
$httpRequestsTotal = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostsPagesDeleted = 0;
$hostsImagesDeleted = 0;
$hostPagesDeleted = 0;
$hostImagesDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
$logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0;
// Begin update
$db->beginTransaction();
@ -50,10 +53,10 @@ try {
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
@ -76,7 +79,7 @@ try {
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
@ -91,7 +94,7 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
@ -107,7 +110,7 @@ try {
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
@ -119,7 +122,7 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
@ -131,7 +134,7 @@ try {
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
@ -143,10 +146,10 @@ try {
$curl = new Curl($manifest->url);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Skip processing non 200 code
if (200 != $curl->getCode()) {
@ -198,6 +201,10 @@ try {
// Reset banned images
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
// Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
$db->commit();
} catch(Exception $e){
@ -208,10 +215,34 @@ try {
}
// Debug
$executionTimeTotal = microtime(true) - $timeStart;
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
if (CLEAN_LOG_ENABLED) {
$db->addCleanerLog( time(),
$hostsTotal,
$hostsUpdated,
$hostPagesDeleted,
$hostPagesBansRemoved,
$hostImagesDeleted,
$hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
$logsCleanerDeleted,
$logsCrawlerDeleted,
$httpRequestsTotal,
$httpRequestsSizeTotal,
$httpDownloadSizeTotal,
$httpRequestsTimeTotal,
$executionTimeTotal);
}
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
@ -219,9 +250,12 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'HTTP Requests total: ' . $requestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $requestSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $downloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $requestsTotalTime / 1000000 . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;

View File

@ -27,22 +27,22 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
// Debug
$timeStart = microtime(true);
$requestsTotal = 0;
$requestSizeTotal = 0;
$downloadSizeTotal = 0;
$requestsTotalTime = 0;
$httpRequestsTotal = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$manifestsIndexed = 0;
$hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
$hostImagesBanned = 0;
$hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
$hostImagesBanned = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -57,10 +57,10 @@ try {
$curl = new Curl($queueManifest->url);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update manifest index anyway, with the current time and http code
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
@ -120,10 +120,10 @@ try {
$curl = new Curl($remoteManifest->result->api->hosts);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Skip processing non 200 code
if (200 != $curl->getCode()) {
@ -184,10 +184,10 @@ try {
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
@ -254,10 +254,10 @@ try {
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@ -334,10 +334,10 @@ try {
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
@ -447,6 +447,8 @@ try {
$metaYggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
$manifestsAdded++;
}
}
@ -504,10 +506,10 @@ try {
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
@ -666,10 +668,10 @@ try {
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
@ -745,24 +747,49 @@ try {
}
// Debug
$executionTimeTotal = microtime(true) - $timeStart;
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
if (CRAWL_LOG_ENABLED) {
$db->addCrawlerLog(time(),
$hostsAdded,
$hostPagesProcessed,
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesBanned,
$hostImagesIndexed,
$hostImagesProcessed,
$hostImagesAdded,
$hostImagesBanned,
$manifestsProcessed,
$manifestsAdded,
$httpRequestsTotal,
$httpRequestsSizeTotal,
$httpDownloadSizeTotal,
$httpRequestsTimeTotal,
$executionTimeTotal);
}
// Debug output
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Hosts pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Hosts images banned: ' . $hostImagesBanned . PHP_EOL;
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
echo 'HTTP Requests total: ' . $requestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $requestSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $downloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $requestsTotalTime / 1000000 . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;

Binary file not shown.

View File

@ -564,7 +564,7 @@ class MySQL {
public function resetBannedHostPages(int $timeOffset) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` > ' . (int) $timeOffset);
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute();
@ -573,7 +573,72 @@ class MySQL {
public function resetBannedHostImages(int $timeOffset) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` > ' . (int) $timeOffset);
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute();
return $query->rowCount();
}
/**
 * Insert a single row into the `logCleaner` table.
 *
 * Each argument is bound positionally to the column carrying the same name,
 * so the order of the values below must stay in sync with the column list
 * inside the SQL statement.
 *
 * @return mixed Whatever `lastInsertId()` of the underlying connection reports
 */
public function addCleanerLog(
    int $timeAdded,
    int $hostsTotal,
    int $hostsUpdated,
    int $hostPagesDeleted,
    int $hostPagesBansRemoved,
    int $hostImagesDeleted,
    int $hostImagesBansRemoved,
    int $manifestsTotal,
    int $manifestsDeleted,
    int $logsCleanerDeleted,
    int $logsCrawlerDeleted,
    int $httpRequestsTotal,
    int $httpRequestsSizeTotal,
    int $httpDownloadSizeTotal,
    float $httpRequestsTimeTotal,
    float $executionTimeTotal
) {
    // The literal is kept exactly as is (including its line breaks)
    $sql = 'INSERT INTO `logCleaner` (`timeAdded`,
`hostsTotal`,
`hostsUpdated`,
`hostPagesDeleted`,
`hostPagesBansRemoved`,
`hostImagesDeleted`,
`hostImagesBansRemoved`,
`manifestsTotal`,
`manifestsDeleted`,
`logsCleanerDeleted`,
`logsCrawlerDeleted`,
`httpRequestsTotal`,
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';

    // One value per placeholder, same order as the column list
    $values = [
        $timeAdded,
        $hostsTotal,
        $hostsUpdated,
        $hostPagesDeleted,
        $hostPagesBansRemoved,
        $hostImagesDeleted,
        $hostImagesBansRemoved,
        $manifestsTotal,
        $manifestsDeleted,
        $logsCleanerDeleted,
        $logsCrawlerDeleted,
        $httpRequestsTotal,
        $httpRequestsSizeTotal,
        $httpDownloadSizeTotal,
        $httpRequestsTimeTotal,
        $executionTimeTotal,
    ];

    $statement = $this->_db->prepare($sql);
    $statement->execute($values);

    return $this->_db->lastInsertId();
}
public function deleteLogCleaner(int $timeOffset) {
$query = $this->_db->prepare('DELETE FROM `logCleaner` WHERE `timeAdded` < ' . (int) $timeOffset);
$query->execute();
@ -675,4 +740,72 @@ class MySQL {
return $query->rowCount();
}
/**
 * Insert a single row into the `logCrawler` table.
 *
 * Each argument is bound positionally to the column carrying the same name.
 * Note that for images the "indexed" counter precedes the "processed" one,
 * while for pages "processed" comes first; callers pass arguments by position.
 *
 * @return mixed Whatever `lastInsertId()` of the underlying connection reports
 */
public function addCrawlerLog(
    int $timeAdded,
    int $hostsAdded,
    int $hostPagesProcessed,
    int $hostPagesIndexed,
    int $hostPagesAdded,
    int $hostPagesBanned,
    int $hostImagesIndexed,
    int $hostImagesProcessed,
    int $hostImagesAdded,
    int $hostImagesBanned,
    int $manifestsProcessed,
    int $manifestsAdded,
    int $httpRequestsTotal,
    int $httpRequestsSizeTotal,
    int $httpDownloadSizeTotal,
    float $httpRequestsTimeTotal,
    float $executionTimeTotal
) {
    // The literal is kept exactly as is (including its line breaks)
    $sql = 'INSERT INTO `logCrawler` (`timeAdded`,
`hostsAdded`,
`hostPagesProcessed`,
`hostPagesIndexed`,
`hostPagesAdded`,
`hostPagesBanned`,
`hostImagesIndexed`,
`hostImagesProcessed`,
`hostImagesAdded`,
`hostImagesBanned`,
`manifestsProcessed`,
`manifestsAdded`,
`httpRequestsTotal`,
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';

    // One value per placeholder, same order as the column list
    $values = [
        $timeAdded,
        $hostsAdded,
        $hostPagesProcessed,
        $hostPagesIndexed,
        $hostPagesAdded,
        $hostPagesBanned,
        $hostImagesIndexed,
        $hostImagesProcessed,
        $hostImagesAdded,
        $hostImagesBanned,
        $manifestsProcessed,
        $manifestsAdded,
        $httpRequestsTotal,
        $httpRequestsSizeTotal,
        $httpDownloadSizeTotal,
        $httpRequestsTimeTotal,
        $executionTimeTotal,
    ];

    $statement = $this->_db->prepare($sql);
    $statement->execute($values);

    return $this->_db->lastInsertId();
}
/**
 * Delete `logCrawler` rows whose `timeAdded` is lower than the given value.
 *
 * @param int $timeOffset Exclusive upper bound compared against `timeAdded`
 * @return mixed Row count reported by the executed statement
 */
public function deleteLogCrawler(int $timeOffset) {
    $sql = 'DELETE FROM `logCrawler` WHERE `timeAdded` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 110 KiB

After

Width:  |  Height:  |  Size: 126 KiB