mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-08 22:07:56 +00:00
add crawler/cleaner logs
This commit is contained in:
parent
dcdc2c50ad
commit
25b6bce2ec
18
README.md
18
README.md
@ -169,19 +169,35 @@ GET m=SphinxQL
|
|||||||
##### Crawler
|
##### Crawler
|
||||||
|
|
||||||
* [x] Auto crawl links by regular expression rules
|
* [x] Auto crawl links by regular expression rules
|
||||||
|
+ [x] Pages
|
||||||
|
+ [x] Images
|
||||||
|
+ [x] Manifests
|
||||||
* [x] Robots.txt / robots meta tags support (#2)
|
* [x] Robots.txt / robots meta tags support (#2)
|
||||||
* [x] Specific rules configuration for every host
|
* [x] Specific rules configuration for every host
|
||||||
* [x] Deprecated index auto cleaner
|
|
||||||
* [x] Auto stop crawling on disk quota reached
|
* [x] Auto stop crawling on disk quota reached
|
||||||
* [x] Transactions support to prevent data loss on queue failures
|
* [x] Transactions support to prevent data loss on queue failures
|
||||||
* [x] Distributed index crawling between YGGo nodes through the manifest API
|
* [x] Distributed index crawling between YGGo nodes through the manifest API
|
||||||
* [x] MIME Content-type crawler settings
|
* [x] MIME Content-type crawler settings
|
||||||
* [x] Ban non-condition links to prevent extra requests
|
* [x] Ban non-condition links to prevent extra requests
|
||||||
|
* [x] Debug log
|
||||||
* [ ] Indexing new sites homepage in higher priority
|
* [ ] Indexing new sites homepage in higher priority
|
||||||
* [ ] Redirect codes extended processing
|
* [ ] Redirect codes extended processing
|
||||||
* [ ] Palette image index / filter
|
* [ ] Palette image index / filter
|
||||||
* [ ] Crawl queue balancer that depends on available CPU
|
* [ ] Crawl queue balancer that depends on available CPU
|
||||||
|
|
||||||
|
##### Cleaner
|
||||||
|
* [x] Deprecated DB items auto deletion / host settings update
|
||||||
|
+ [x] Pages
|
||||||
|
+ [x] Images
|
||||||
|
+ [x] Manifests
|
||||||
|
+ [x] Logs
|
||||||
|
+ [x] Crawler
|
||||||
|
+ [x] Cleaner
|
||||||
|
* [x] Banned resources reset by timeout
|
||||||
|
+ [x] Pages
|
||||||
|
+ [x] Images
|
||||||
|
* [x] Debug log
|
||||||
|
|
||||||
##### Other
|
##### Other
|
||||||
|
|
||||||
* [ ] Administrative panel for useful index moderation
|
* [ ] Administrative panel for useful index moderation
|
||||||
|
@ -98,6 +98,18 @@ define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGver
|
|||||||
|
|
||||||
// Crawl settings
|
// Crawl settings
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Save crawler debug to `logCrawler` table
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_LOG_ENABLED', true);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Auto clean `logCrawler` items older than this offset, in seconds
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Crawler / Bot User Agent name
|
* Crawler / Bot User Agent name
|
||||||
*
|
*
|
||||||
@ -311,6 +323,18 @@ define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
|
|||||||
|
|
||||||
// Cleaner settings
|
// Cleaner settings
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Save cleaner debug to `logCleaner` table
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CLEAN_LOG_ENABLED', true);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Auto clean `logCleaner` items older than this offset, in seconds
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CLEAN_LOG_SECONDS_OFFSET', 60*60*24*30);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
|
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
|
||||||
*
|
*
|
||||||
|
@ -21,20 +21,23 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
|||||||
// Debug
|
// Debug
|
||||||
$timeStart = microtime(true);
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
$requestsTotal = 0;
|
$httpRequestsTotal = 0;
|
||||||
$requestSizeTotal = 0;
|
$httpRequestsSizeTotal = 0;
|
||||||
$downloadSizeTotal = 0;
|
$httpDownloadSizeTotal = 0;
|
||||||
$requestsTotalTime = 0;
|
$httpRequestsTimeTotal = 0;
|
||||||
|
|
||||||
$hostsTotal = $db->getTotalHosts();
|
$hostsTotal = $db->getTotalHosts();
|
||||||
$manifestsTotal = $db->getTotalManifests();
|
$manifestsTotal = $db->getTotalManifests();
|
||||||
$hostsUpdated = 0;
|
$hostsUpdated = 0;
|
||||||
$hostsPagesDeleted = 0;
|
$hostPagesDeleted = 0;
|
||||||
$hostsImagesDeleted = 0;
|
$hostImagesDeleted = 0;
|
||||||
$manifestsDeleted = 0;
|
$manifestsDeleted = 0;
|
||||||
$hostPagesBansRemoved = 0;
|
$hostPagesBansRemoved = 0;
|
||||||
$hostImagesBansRemoved = 0;
|
$hostImagesBansRemoved = 0;
|
||||||
|
|
||||||
|
$logsCleanerDeleted = 0;
|
||||||
|
$logsCrawlerDeleted = 0;
|
||||||
|
|
||||||
// Begin update
|
// Begin update
|
||||||
$db->beginTransaction();
|
$db->beginTransaction();
|
||||||
|
|
||||||
@ -50,10 +53,10 @@ try {
|
|||||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
$hostRobots = $curl->getContent();
|
$hostRobots = $curl->getContent();
|
||||||
@ -76,7 +79,7 @@ try {
|
|||||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
// Delete host image
|
// Delete host image
|
||||||
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -91,7 +94,7 @@ try {
|
|||||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page
|
// Delete host page
|
||||||
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,7 +110,7 @@ try {
|
|||||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
// Delete host image
|
// Delete host image
|
||||||
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -119,7 +122,7 @@ try {
|
|||||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page
|
// Delete host page
|
||||||
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -131,7 +134,7 @@ try {
|
|||||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
// Delete host image
|
// Delete host image
|
||||||
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,10 +146,10 @@ try {
|
|||||||
$curl = new Curl($manifest->url);
|
$curl = new Curl($manifest->url);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Skip processing non 200 code
|
||||||
if (200 != $curl->getCode()) {
|
if (200 != $curl->getCode()) {
|
||||||
@ -198,6 +201,10 @@ try {
|
|||||||
// Reset banned images
|
// Reset banned images
|
||||||
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
||||||
|
|
||||||
|
// Delete deprecated logs
|
||||||
|
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
||||||
|
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
} catch(Exception $e){
|
} catch(Exception $e){
|
||||||
@ -208,10 +215,34 @@ try {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Debug
|
// Debug
|
||||||
|
$executionTimeTotal = microtime(true) - $timeStart;
|
||||||
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
||||||
|
|
||||||
|
if (CLEAN_LOG_ENABLED) {
|
||||||
|
|
||||||
|
$db->addCleanerLog( time(),
|
||||||
|
$hostsTotal,
|
||||||
|
$hostsUpdated,
|
||||||
|
$hostPagesDeleted,
|
||||||
|
$hostPagesBansRemoved,
|
||||||
|
$hostImagesDeleted,
|
||||||
|
$hostImagesBansRemoved,
|
||||||
|
$manifestsTotal,
|
||||||
|
$manifestsDeleted,
|
||||||
|
$logsCleanerDeleted,
|
||||||
|
$logsCrawlerDeleted,
|
||||||
|
$httpRequestsTotal,
|
||||||
|
$httpRequestsSizeTotal,
|
||||||
|
$httpDownloadSizeTotal,
|
||||||
|
$httpRequestsTimeTotal,
|
||||||
|
$executionTimeTotal);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
||||||
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
||||||
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
|
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
|
||||||
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
|
echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
|
||||||
|
|
||||||
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
||||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||||
@ -219,9 +250,12 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
|||||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||||
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
||||||
|
|
||||||
echo 'HTTP Requests total: ' . $requestsTotal . PHP_EOL;
|
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
||||||
echo 'HTTP Requests total size: ' . $requestSizeTotal . PHP_EOL;
|
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
||||||
echo 'HTTP Download total size: ' . $downloadSizeTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Requests total time: ' . $requestsTotalTime / 1000000 . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
||||||
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
||||||
|
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
|
||||||
|
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
|
||||||
|
|
||||||
|
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
|
@ -27,17 +27,17 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
|
|||||||
// Debug
|
// Debug
|
||||||
$timeStart = microtime(true);
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
$requestsTotal = 0;
|
$httpRequestsTotal = 0;
|
||||||
$requestSizeTotal = 0;
|
$httpRequestsSizeTotal = 0;
|
||||||
$downloadSizeTotal = 0;
|
$httpDownloadSizeTotal = 0;
|
||||||
$requestsTotalTime = 0;
|
$httpRequestsTimeTotal = 0;
|
||||||
|
|
||||||
$hostPagesProcessed = 0;
|
$hostPagesProcessed = 0;
|
||||||
$hostImagesProcessed = 0;
|
$hostImagesProcessed = 0;
|
||||||
$manifestsProcessed = 0;
|
$manifestsProcessed = 0;
|
||||||
$hostPagesIndexed = 0;
|
$hostPagesIndexed = 0;
|
||||||
$hostImagesIndexed = 0;
|
$hostImagesIndexed = 0;
|
||||||
$manifestsIndexed = 0;
|
$manifestsAdded = 0;
|
||||||
$hostPagesAdded = 0;
|
$hostPagesAdded = 0;
|
||||||
$hostImagesAdded = 0;
|
$hostImagesAdded = 0;
|
||||||
$hostsAdded = 0;
|
$hostsAdded = 0;
|
||||||
@ -57,10 +57,10 @@ try {
|
|||||||
$curl = new Curl($queueManifest->url);
|
$curl = new Curl($queueManifest->url);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update manifest index anyway, with the current time and http code
|
// Update manifest index anyway, with the current time and http code
|
||||||
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
|
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
|
||||||
@ -120,10 +120,10 @@ try {
|
|||||||
$curl = new Curl($remoteManifest->result->api->hosts);
|
$curl = new Curl($remoteManifest->result->api->hosts);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Skip processing non 200 code
|
||||||
if (200 != $curl->getCode()) {
|
if (200 != $curl->getCode()) {
|
||||||
@ -184,10 +184,10 @@ try {
|
|||||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
$hostRobots = $curl->getContent();
|
$hostRobots = $curl->getContent();
|
||||||
@ -254,10 +254,10 @@ try {
|
|||||||
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update image index anyway, with the current time and http code
|
// Update image index anyway, with the current time and http code
|
||||||
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
|
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
|
||||||
@ -334,10 +334,10 @@ try {
|
|||||||
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update page index anyway, with the current time and http code
|
// Update page index anyway, with the current time and http code
|
||||||
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
|
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
|
||||||
@ -447,6 +447,8 @@ try {
|
|||||||
$metaYggoManifest,
|
$metaYggoManifest,
|
||||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
||||||
time());
|
time());
|
||||||
|
|
||||||
|
$manifestsAdded++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -504,10 +506,10 @@ try {
|
|||||||
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
$hostRobots = $curl->getContent();
|
$hostRobots = $curl->getContent();
|
||||||
@ -666,10 +668,10 @@ try {
|
|||||||
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$requestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$requestSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$downloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$requestsTotalTime += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
$hostRobots = $curl->getContent();
|
$hostRobots = $curl->getContent();
|
||||||
@ -745,24 +747,49 @@ try {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Debug
|
// Debug
|
||||||
|
$executionTimeTotal = microtime(true) - $timeStart;
|
||||||
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
||||||
|
|
||||||
|
if (CRAWL_LOG_ENABLED) {
|
||||||
|
|
||||||
|
$db->addCrawlerLog(time(),
|
||||||
|
$hostsAdded,
|
||||||
|
$hostPagesProcessed,
|
||||||
|
$hostPagesIndexed,
|
||||||
|
$hostPagesAdded,
|
||||||
|
$hostPagesBanned,
|
||||||
|
$hostImagesIndexed,
|
||||||
|
$hostImagesProcessed,
|
||||||
|
$hostImagesAdded,
|
||||||
|
$hostImagesBanned,
|
||||||
|
$manifestsProcessed,
|
||||||
|
$manifestsAdded,
|
||||||
|
$httpRequestsTotal,
|
||||||
|
$httpRequestsSizeTotal,
|
||||||
|
$httpDownloadSizeTotal,
|
||||||
|
$httpRequestsTimeTotal,
|
||||||
|
$executionTimeTotal);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Debug output
|
||||||
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||||
|
|
||||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||||
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||||
|
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
||||||
|
|
||||||
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
|
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
|
||||||
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
|
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
|
||||||
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
|
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
|
||||||
|
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
|
||||||
|
|
||||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
||||||
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
|
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
|
||||||
|
|
||||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
||||||
echo 'Hosts pages banned: ' . $hostPagesBanned . PHP_EOL;
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
||||||
echo 'Hosts images banned: ' . $hostImagesBanned . PHP_EOL;
|
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
|
||||||
|
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
|
||||||
|
|
||||||
echo 'HTTP Requests total: ' . $requestsTotal . PHP_EOL;
|
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
|
||||||
echo 'HTTP Requests total size: ' . $requestSizeTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Download total size: ' . $downloadSizeTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Requests total time: ' . $requestsTotalTime / 1000000 . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
|
|
||||||
|
Binary file not shown.
@ -564,7 +564,7 @@ class MySQL {
|
|||||||
|
|
||||||
public function resetBannedHostPages(int $timeOffset) {
|
public function resetBannedHostPages(int $timeOffset) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` > ' . (int) $timeOffset);
|
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
$query->execute();
|
$query->execute();
|
||||||
|
|
||||||
@ -573,7 +573,72 @@ class MySQL {
|
|||||||
|
|
||||||
public function resetBannedHostImages(int $timeOffset) {
|
public function resetBannedHostImages(int $timeOffset) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` > ' . (int) $timeOffset);
|
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
|
$query->execute();
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function addCleanerLog(int $timeAdded,
|
||||||
|
int $hostsTotal,
|
||||||
|
int $hostsUpdated,
|
||||||
|
int $hostPagesDeleted,
|
||||||
|
int $hostPagesBansRemoved,
|
||||||
|
int $hostImagesDeleted,
|
||||||
|
int $hostImagesBansRemoved,
|
||||||
|
int $manifestsTotal,
|
||||||
|
int $manifestsDeleted,
|
||||||
|
int $logsCleanerDeleted,
|
||||||
|
int $logsCrawlerDeleted,
|
||||||
|
int $httpRequestsTotal,
|
||||||
|
int $httpRequestsSizeTotal,
|
||||||
|
int $httpDownloadSizeTotal,
|
||||||
|
float $httpRequestsTimeTotal,
|
||||||
|
float $executionTimeTotal) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('INSERT INTO `logCleaner` (`timeAdded`,
|
||||||
|
`hostsTotal`,
|
||||||
|
`hostsUpdated`,
|
||||||
|
`hostPagesDeleted`,
|
||||||
|
`hostPagesBansRemoved`,
|
||||||
|
`hostImagesDeleted`,
|
||||||
|
`hostImagesBansRemoved`,
|
||||||
|
`manifestsTotal`,
|
||||||
|
`manifestsDeleted`,
|
||||||
|
`logsCleanerDeleted`,
|
||||||
|
`logsCrawlerDeleted`,
|
||||||
|
`httpRequestsTotal`,
|
||||||
|
`httpRequestsSizeTotal`,
|
||||||
|
`httpDownloadSizeTotal`,
|
||||||
|
`httpRequestsTimeTotal`,
|
||||||
|
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
|
$query->execute([
|
||||||
|
$timeAdded,
|
||||||
|
$hostsTotal,
|
||||||
|
$hostsUpdated,
|
||||||
|
$hostPagesDeleted,
|
||||||
|
$hostPagesBansRemoved,
|
||||||
|
$hostImagesDeleted,
|
||||||
|
$hostImagesBansRemoved,
|
||||||
|
$manifestsTotal,
|
||||||
|
$manifestsDeleted,
|
||||||
|
$logsCleanerDeleted,
|
||||||
|
$logsCrawlerDeleted,
|
||||||
|
$httpRequestsTotal,
|
||||||
|
$httpRequestsSizeTotal,
|
||||||
|
$httpDownloadSizeTotal,
|
||||||
|
$httpRequestsTimeTotal,
|
||||||
|
$executionTimeTotal
|
||||||
|
]);
|
||||||
|
|
||||||
|
return $this->_db->lastInsertId();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function deleteLogCleaner(int $timeOffset) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('DELETE FROM `logCleaner` WHERE `timeAdded` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
$query->execute();
|
$query->execute();
|
||||||
|
|
||||||
@ -675,4 +740,72 @@ class MySQL {
|
|||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function addCrawlerLog(int $timeAdded,
|
||||||
|
int $hostsAdded,
|
||||||
|
int $hostPagesProcessed,
|
||||||
|
int $hostPagesIndexed,
|
||||||
|
int $hostPagesAdded,
|
||||||
|
int $hostPagesBanned,
|
||||||
|
int $hostImagesIndexed,
|
||||||
|
int $hostImagesProcessed,
|
||||||
|
int $hostImagesAdded,
|
||||||
|
int $hostImagesBanned,
|
||||||
|
int $manifestsProcessed,
|
||||||
|
int $manifestsAdded,
|
||||||
|
int $httpRequestsTotal,
|
||||||
|
int $httpRequestsSizeTotal,
|
||||||
|
int $httpDownloadSizeTotal,
|
||||||
|
float $httpRequestsTimeTotal,
|
||||||
|
float $executionTimeTotal) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('INSERT INTO `logCrawler` (`timeAdded`,
|
||||||
|
`hostsAdded`,
|
||||||
|
`hostPagesProcessed`,
|
||||||
|
`hostPagesIndexed`,
|
||||||
|
`hostPagesAdded`,
|
||||||
|
`hostPagesBanned`,
|
||||||
|
`hostImagesIndexed`,
|
||||||
|
`hostImagesProcessed`,
|
||||||
|
`hostImagesAdded`,
|
||||||
|
`hostImagesBanned`,
|
||||||
|
`manifestsProcessed`,
|
||||||
|
`manifestsAdded`,
|
||||||
|
`httpRequestsTotal`,
|
||||||
|
`httpRequestsSizeTotal`,
|
||||||
|
`httpDownloadSizeTotal`,
|
||||||
|
`httpRequestsTimeTotal`,
|
||||||
|
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
|
$query->execute([
|
||||||
|
$timeAdded,
|
||||||
|
$hostsAdded,
|
||||||
|
$hostPagesProcessed,
|
||||||
|
$hostPagesIndexed,
|
||||||
|
$hostPagesAdded,
|
||||||
|
$hostPagesBanned,
|
||||||
|
$hostImagesIndexed,
|
||||||
|
$hostImagesProcessed,
|
||||||
|
$hostImagesAdded,
|
||||||
|
$hostImagesBanned,
|
||||||
|
$manifestsProcessed,
|
||||||
|
$manifestsAdded,
|
||||||
|
$httpRequestsTotal,
|
||||||
|
$httpRequestsSizeTotal,
|
||||||
|
$httpDownloadSizeTotal,
|
||||||
|
$httpRequestsTimeTotal,
|
||||||
|
$executionTimeTotal
|
||||||
|
]);
|
||||||
|
|
||||||
|
return $this->_db->lastInsertId();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function deleteLogCrawler(int $timeOffset) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('DELETE FROM `logCrawler` WHERE `timeAdded` < ' . (int) $timeOffset);
|
||||||
|
|
||||||
|
$query->execute();
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 110 KiB After Width: | Height: | Size: 126 KiB |
Loading…
Reference in New Issue
Block a user