mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
refactor cleaner, delegate tasks to crawler, init hostSetting table
This commit is contained in:
parent
fd90e2d517
commit
b24d31f360
@ -145,18 +145,6 @@ define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGver
|
|||||||
|
|
||||||
// Crawl settings
|
// Crawl settings
|
||||||
|
|
||||||
/*
|
|
||||||
* Save crawler debug to `logCrawler` table
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CRAWL_LOG_ENABLED', true);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Auto clean `logCrawler` items older seconds offset
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Crawler / Bot User Agent name
|
* Crawler / Bot User Agent name
|
||||||
*
|
*
|
||||||
@ -411,41 +399,6 @@ define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
|
|||||||
|
|
||||||
// Cleaner settings
|
// Cleaner settings
|
||||||
|
|
||||||
/*
|
|
||||||
* Save cleaner debug to `logCleaner` table
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_LOG_ENABLED', true);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Auto clean `logCleaner` items older seconds offset
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_LOG_SECONDS_OFFSET', 60*60*24*30);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
|
|
||||||
*
|
|
||||||
* This option works with CLEAN_HOST_SECONDS_OFFSET
|
|
||||||
*
|
|
||||||
* The value depends of CPU resources available
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_HOST_LIMIT', 20);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Apply cleaning rules to page older than value provided
|
|
||||||
*
|
|
||||||
* This option works with CLEAN_HOST_LIMIT step queue
|
|
||||||
*
|
|
||||||
* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
|
|
||||||
* must have enough value to process all pages in the DB index
|
|
||||||
*
|
|
||||||
* or the cleaner can stuck in queue
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Remove page ban after following time
|
* Remove page ban after following time
|
||||||
*
|
*
|
||||||
@ -455,18 +408,6 @@ define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
|
|||||||
*/
|
*/
|
||||||
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||||
|
|
||||||
/*
|
|
||||||
* Remove page description history after following time
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Remove page DOM history after following time
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CLEAN_PAGE_DOM_OFFSET', 60*60*24*30*12*10);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Database tables optimization
|
* Database tables optimization
|
||||||
*
|
*
|
||||||
|
@ -4,4 +4,4 @@
|
|||||||
30 0 * * * indexer --all --rotate
|
30 0 * * * indexer --all --rotate
|
||||||
|
|
||||||
0 0 * * * /usr/bin/php /{PATH}/YGGo/crontab/cleaner.php >> /{PATH}/cleaner.log 2>&1
|
0 0 * * * /usr/bin/php /{PATH}/YGGo/crontab/cleaner.php >> /{PATH}/cleaner.log 2>&1
|
||||||
* 1-23 * * * /usr/bin/php /{PATH}/YGGo/crontab/crawler.php >> /{PATH}/crawler.log 2>&1
|
* * * * * /usr/bin/php /{PATH}/YGGo/crontab/crawler.php >> /{PATH}/crawler.log 2>&1
|
@ -18,287 +18,18 @@ if (false === sem_acquire($semaphore, true)) {
|
|||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Define variables
|
||||||
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
// Load system dependencies
|
// Load system dependencies
|
||||||
require_once(__DIR__ . '/../config/app.php');
|
require_once(__DIR__ . '/../config/app.php');
|
||||||
require_once(__DIR__ . '/../library/curl.php');
|
|
||||||
require_once(__DIR__ . '/../library/robots.php');
|
|
||||||
require_once(__DIR__ . '/../library/mysql.php');
|
require_once(__DIR__ . '/../library/mysql.php');
|
||||||
require_once(__DIR__ . '/../library/ftp.php');
|
|
||||||
|
|
||||||
// Connect database
|
// Connect database
|
||||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||||
|
|
||||||
// Debug
|
// Reset banned pages
|
||||||
$timeStart = microtime(true);
|
$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
||||||
|
|
||||||
$httpRequestsTotal = 0;
|
|
||||||
$httpRequestsSizeTotal = 0;
|
|
||||||
$httpDownloadSizeTotal = 0;
|
|
||||||
$httpRequestsTimeTotal = 0;
|
|
||||||
|
|
||||||
$hostsTotal = $db->getTotalHosts();
|
|
||||||
$manifestsTotal = $db->getTotalManifests();
|
|
||||||
$hostsUpdated = 0;
|
|
||||||
$hostPagesDeleted = 0;
|
|
||||||
$hostPagesDescriptionsDeleted = 0;
|
|
||||||
$hostPagesDomsDeleted = 0;
|
|
||||||
$hostPagesSnapDeleted = 0;
|
|
||||||
$hostPagesToHostPageDeleted = 0;
|
|
||||||
$manifestsDeleted = 0;
|
|
||||||
$hostPagesBansRemoved = 0;
|
|
||||||
|
|
||||||
$logsCleanerDeleted = 0;
|
|
||||||
$logsCrawlerDeleted = 0;
|
|
||||||
|
|
||||||
// Begin update
|
|
||||||
$db->beginTransaction();
|
|
||||||
|
|
||||||
try {
|
|
||||||
|
|
||||||
// Get cleaner queue
|
|
||||||
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
|
|
||||||
|
|
||||||
// Get robots.txt if exists
|
|
||||||
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
|
||||||
|
|
||||||
// Update curl stats
|
|
||||||
$httpRequestsTotal++;
|
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
|
||||||
$hostRobots = $curl->getContent();
|
|
||||||
} else {
|
|
||||||
$hostRobots = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update host data
|
|
||||||
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
|
|
||||||
|
|
||||||
// Apply host pages limits
|
|
||||||
$totalHostPages = $db->getTotalHostPages($host->hostId);
|
|
||||||
|
|
||||||
if ($totalHostPages > $host->crawlPageLimit) {
|
|
||||||
|
|
||||||
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
|
|
||||||
|
|
||||||
if ($hostPage->uri != '/') {
|
|
||||||
|
|
||||||
// Delete host page descriptions
|
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page DOMs
|
|
||||||
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page refs data
|
|
||||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page snaps
|
|
||||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
|
|
||||||
|
|
||||||
// Prepare filenames
|
|
||||||
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
|
|
||||||
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
|
|
||||||
|
|
||||||
// Delete snap files
|
|
||||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
|
||||||
|
|
||||||
foreach ($storages as $location => $storage) {
|
|
||||||
|
|
||||||
switch ($node) {
|
|
||||||
|
|
||||||
case 'localhost':
|
|
||||||
|
|
||||||
if (file_exists($storage->directory . $hostPageSnapFile)) {
|
|
||||||
|
|
||||||
unlink($storage->directory . $hostPageSnapFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
case 'ftp':
|
|
||||||
|
|
||||||
$ftp = new Ftp();
|
|
||||||
|
|
||||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
|
||||||
|
|
||||||
$ftp->delete($hostPageSnapFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean up DB registry
|
|
||||||
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
|
|
||||||
|
|
||||||
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
|
|
||||||
}
|
|
||||||
|
|
||||||
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
|
|
||||||
|
|
||||||
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete host page
|
|
||||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply new robots.txt rules
|
|
||||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
|
||||||
|
|
||||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
|
||||||
|
|
||||||
if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {
|
|
||||||
|
|
||||||
// Delete host page descriptions
|
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page DOMs
|
|
||||||
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page refs data
|
|
||||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
|
||||||
|
|
||||||
// Delete host page snaps
|
|
||||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
|
|
||||||
|
|
||||||
// Prepare filenames
|
|
||||||
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
|
|
||||||
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
|
|
||||||
|
|
||||||
// Delete snap files
|
|
||||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
|
||||||
|
|
||||||
foreach ($storages as $location => $storage) {
|
|
||||||
|
|
||||||
switch ($node) {
|
|
||||||
|
|
||||||
case 'localhost':
|
|
||||||
|
|
||||||
if (file_exists($storage->directory . $hostPageSnapFile)) {
|
|
||||||
|
|
||||||
unlink($storage->directory . $hostPageSnapFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
case 'ftp':
|
|
||||||
|
|
||||||
$ftp = new Ftp();
|
|
||||||
|
|
||||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
|
||||||
|
|
||||||
$ftp->delete($hostPageSnapFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean up DB registry
|
|
||||||
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
|
|
||||||
|
|
||||||
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
|
|
||||||
}
|
|
||||||
|
|
||||||
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
|
|
||||||
|
|
||||||
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete host page
|
|
||||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean up deprecated manifests
|
|
||||||
foreach ($db->getManifests() as $manifest) {
|
|
||||||
|
|
||||||
$delete = false;
|
|
||||||
|
|
||||||
$curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);
|
|
||||||
|
|
||||||
// Update curl stats
|
|
||||||
$httpRequestsTotal++;
|
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
||||||
|
|
||||||
// Skip processing non 200 code
|
|
||||||
if (200 != $curl->getCode()) {
|
|
||||||
|
|
||||||
continue; // Wait for reconnect
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip processing without returned data
|
|
||||||
if (!$remoteManifest = $curl->getContent()) {
|
|
||||||
|
|
||||||
$delete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip processing on json encoding error
|
|
||||||
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
|
||||||
|
|
||||||
$delete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip processing on required fields missed
|
|
||||||
if (empty($remoteManifest->status) ||
|
|
||||||
empty($remoteManifest->result->config->crawlUrlRegexp) ||
|
|
||||||
empty($remoteManifest->result->api->version)) {
|
|
||||||
|
|
||||||
$delete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip processing on API version not compatible
|
|
||||||
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
|
||||||
|
|
||||||
$delete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
|
|
||||||
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
|
||||||
|
|
||||||
$delete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($delete) {
|
|
||||||
|
|
||||||
$manifestsDeleted += $db->deleteManifest($manifest->manifestId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset banned pages
|
|
||||||
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
|
||||||
|
|
||||||
// Delete page description history
|
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
|
||||||
|
|
||||||
// Delete page dom history
|
|
||||||
$hostPagesDomsDeleted += $db->deleteHostPageDomsByTimeAdded(time() - CLEAN_PAGE_DOM_OFFSET);
|
|
||||||
|
|
||||||
// Delete deprecated logs
|
|
||||||
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
|
||||||
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
|
||||||
|
|
||||||
// Delete failed snap files
|
|
||||||
// @TODO
|
|
||||||
|
|
||||||
// Commit results
|
|
||||||
$db->commit();
|
|
||||||
|
|
||||||
} catch (Exception $e) {
|
|
||||||
|
|
||||||
$db->rollBack();
|
|
||||||
|
|
||||||
var_dump($e);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Optimize tables
|
// Optimize tables
|
||||||
if (CLEAN_DB_TABLES_OPTIMIZATION) {
|
if (CLEAN_DB_TABLES_OPTIMIZATION) {
|
||||||
@ -314,51 +45,6 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Debug
|
// Debug
|
||||||
$executionTimeTotal = microtime(true) - $timeStart;
|
|
||||||
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
|
||||||
|
|
||||||
if (CLEAN_LOG_ENABLED) {
|
|
||||||
|
|
||||||
$db->addCleanerLog( time(),
|
|
||||||
$hostsTotal,
|
|
||||||
$hostsUpdated,
|
|
||||||
$hostPagesDeleted,
|
|
||||||
$hostPagesDescriptionsDeleted,
|
|
||||||
$hostPagesDomsDeleted,
|
|
||||||
$hostPagesSnapDeleted,
|
|
||||||
$hostPagesToHostPageDeleted,
|
|
||||||
$hostPagesBansRemoved,
|
|
||||||
$manifestsTotal,
|
|
||||||
$manifestsDeleted,
|
|
||||||
$logsCleanerDeleted,
|
|
||||||
$logsCrawlerDeleted,
|
|
||||||
$httpRequestsTotal,
|
|
||||||
$httpRequestsSizeTotal,
|
|
||||||
$httpDownloadSizeTotal,
|
|
||||||
$httpRequestsTimeTotal,
|
|
||||||
$executionTimeTotal);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
|
||||||
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
|
||||||
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
|
||||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||||
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
|
|
||||||
echo 'Host page doms deleted: ' . $hostPagesDomsDeleted . PHP_EOL;
|
|
||||||
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
|
|
||||||
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
|
||||||
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
|
|
||||||
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
|
|
@ -1227,23 +1227,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$executionTimeTotal = microtime(true) - $timeStart;
|
$executionTimeTotal = microtime(true) - $timeStart;
|
||||||
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
||||||
|
|
||||||
if (CRAWL_LOG_ENABLED) {
|
|
||||||
|
|
||||||
$db->addCrawlerLog(time(),
|
|
||||||
$hostsAdded,
|
|
||||||
$hostPagesProcessed,
|
|
||||||
$hostPagesAdded,
|
|
||||||
$hostPagesSnapAdded,
|
|
||||||
$hostPagesBanned,
|
|
||||||
$manifestsProcessed,
|
|
||||||
$manifestsAdded,
|
|
||||||
$httpRequestsTotal,
|
|
||||||
$httpRequestsSizeTotal,
|
|
||||||
$httpDownloadSizeTotal,
|
|
||||||
$httpRequestsTimeTotal,
|
|
||||||
$executionTimeTotal);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Debug output
|
// Debug output
|
||||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||||
|
|
||||||
|
Binary file not shown.
@ -559,39 +559,6 @@ class MySQL {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Cleaner tools
|
// Cleaner tools
|
||||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
|
||||||
|
|
||||||
$result = [];
|
|
||||||
|
|
||||||
// Get ID (to prevent memory over usage)
|
|
||||||
$query = $this->_db->prepare("SELECT `hostId`
|
|
||||||
|
|
||||||
FROM `host`
|
|
||||||
|
|
||||||
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ?
|
|
||||||
|
|
||||||
ORDER BY `hostId`
|
|
||||||
|
|
||||||
LIMIT " . (int) $limit);
|
|
||||||
|
|
||||||
$query->execute([$timeFrom, 0]);
|
|
||||||
|
|
||||||
// Get required page details
|
|
||||||
foreach ($query->fetchAll() as $host) {
|
|
||||||
|
|
||||||
$result[] = $this->getHost($host->hostId);
|
|
||||||
}
|
|
||||||
|
|
||||||
return (object) $result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getHostPagesBanned() {
|
|
||||||
|
|
||||||
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeBanned` IS NOT NULL');
|
|
||||||
|
|
||||||
return $query->fetchAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function resetBannedHostPages(int $timeOffset) {
|
public function resetBannedHostPages(int $timeOffset) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
|
||||||
@ -601,87 +568,7 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) {
|
// Crawler tools
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
|
|
||||||
|
|
||||||
$query->execute();
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function addCleanerLog(int $timeAdded,
|
|
||||||
int $hostsTotal,
|
|
||||||
int $hostsUpdated,
|
|
||||||
int $hostPagesDeleted,
|
|
||||||
int $hostPagesDescriptionsDeleted,
|
|
||||||
int $hostPagesDomsDeleted,
|
|
||||||
int $hostPagesSnapDeleted,
|
|
||||||
int $hostPagesToHostPageDeleted,
|
|
||||||
int $hostPagesBansRemoved,
|
|
||||||
int $manifestsTotal,
|
|
||||||
int $manifestsDeleted,
|
|
||||||
int $logsCleanerDeleted,
|
|
||||||
int $logsCrawlerDeleted,
|
|
||||||
int $httpRequestsTotal,
|
|
||||||
int $httpRequestsSizeTotal,
|
|
||||||
int $httpDownloadSizeTotal,
|
|
||||||
float $httpRequestsTimeTotal,
|
|
||||||
float $executionTimeTotal) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `logCleaner` (`timeAdded`,
|
|
||||||
`hostsTotal`,
|
|
||||||
`hostsUpdated`,
|
|
||||||
`hostPagesDeleted`,
|
|
||||||
`hostPagesDescriptionsDeleted`,
|
|
||||||
`hostPagesDomsDeleted`,
|
|
||||||
`hostPagesSnapDeleted`,
|
|
||||||
`hostPagesToHostPageDeleted`,
|
|
||||||
`hostPagesBansRemoved`,
|
|
||||||
`manifestsTotal`,
|
|
||||||
`manifestsDeleted`,
|
|
||||||
`logsCleanerDeleted`,
|
|
||||||
`logsCrawlerDeleted`,
|
|
||||||
`httpRequestsTotal`,
|
|
||||||
`httpRequestsSizeTotal`,
|
|
||||||
`httpDownloadSizeTotal`,
|
|
||||||
`httpRequestsTimeTotal`,
|
|
||||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([
|
|
||||||
$timeAdded,
|
|
||||||
$hostsTotal,
|
|
||||||
$hostsUpdated,
|
|
||||||
$hostPagesDeleted,
|
|
||||||
$hostPagesDescriptionsDeleted,
|
|
||||||
$hostPagesDomsDeleted,
|
|
||||||
$hostPagesSnapDeleted,
|
|
||||||
$hostPagesToHostPageDeleted,
|
|
||||||
$hostPagesBansRemoved,
|
|
||||||
$manifestsTotal,
|
|
||||||
$manifestsDeleted,
|
|
||||||
$logsCleanerDeleted,
|
|
||||||
$logsCrawlerDeleted,
|
|
||||||
$httpRequestsTotal,
|
|
||||||
$httpRequestsSizeTotal,
|
|
||||||
$httpDownloadSizeTotal,
|
|
||||||
$httpRequestsTimeTotal,
|
|
||||||
$executionTimeTotal
|
|
||||||
]);
|
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function deleteLogCleaner(int $timeOffset) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `logCleaner` WHERE `timeAdded` < ' . (int) $timeOffset);
|
|
||||||
|
|
||||||
$query->execute();
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Crawl tools
|
|
||||||
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
|
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
|
||||||
|
|
||||||
$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
|
$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
|
||||||
@ -833,62 +720,6 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function addCrawlerLog(int $timeAdded,
|
|
||||||
int $hostsAdded,
|
|
||||||
int $hostPagesProcessed,
|
|
||||||
int $hostPagesAdded,
|
|
||||||
int $hostPagesSnapAdded,
|
|
||||||
int $hostPagesBanned,
|
|
||||||
int $manifestsProcessed,
|
|
||||||
int $manifestsAdded,
|
|
||||||
int $httpRequestsTotal,
|
|
||||||
int $httpRequestsSizeTotal,
|
|
||||||
int $httpDownloadSizeTotal,
|
|
||||||
float $httpRequestsTimeTotal,
|
|
||||||
float $executionTimeTotal) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `logCrawler` (`timeAdded`,
|
|
||||||
`hostsAdded`,
|
|
||||||
`hostPagesProcessed`,
|
|
||||||
`hostPagesAdded`,
|
|
||||||
`hostPagesSnapAdded`,
|
|
||||||
`hostPagesBanned`,
|
|
||||||
`manifestsProcessed`,
|
|
||||||
`manifestsAdded`,
|
|
||||||
`httpRequestsTotal`,
|
|
||||||
`httpRequestsSizeTotal`,
|
|
||||||
`httpDownloadSizeTotal`,
|
|
||||||
`httpRequestsTimeTotal`,
|
|
||||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([
|
|
||||||
$timeAdded,
|
|
||||||
$hostsAdded,
|
|
||||||
$hostPagesProcessed,
|
|
||||||
$hostPagesAdded,
|
|
||||||
$hostPagesSnapAdded,
|
|
||||||
$hostPagesBanned,
|
|
||||||
$manifestsProcessed,
|
|
||||||
$manifestsAdded,
|
|
||||||
$httpRequestsTotal,
|
|
||||||
$httpRequestsSizeTotal,
|
|
||||||
$httpDownloadSizeTotal,
|
|
||||||
$httpRequestsTimeTotal,
|
|
||||||
$executionTimeTotal
|
|
||||||
]);
|
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function deleteLogCrawler(int $timeOffset) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `logCrawler` WHERE `timeAdded` < ' . (int) $timeOffset);
|
|
||||||
|
|
||||||
$query->execute();
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function optimize() {
|
public function optimize() {
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `host`');
|
$this->_db->query('OPTIMIZE TABLE `host`');
|
||||||
@ -900,9 +731,6 @@ class MySQL {
|
|||||||
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `logCleaner`');
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `logCrawler`');
|
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `manifest`');
|
$this->_db->query('OPTIMIZE TABLE `manifest`');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 159 KiB After Width: | Height: | Size: 162 KiB |
Loading…
x
Reference in New Issue
Block a user