diff --git a/config/app.php.txt b/config/app.php.txt index 212072e..e6f2ee3 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -145,18 +145,6 @@ define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGver // Crawl settings -/* - * Save crawler debug to `logCrawler` table - * - */ -define('CRAWL_LOG_ENABLED', true); - -/* - * Auto clean `logCrawler` items older seconds offset - * - */ -define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30); - /* * Crawler / Bot User Agent name * @@ -411,41 +399,6 @@ define('CRAWL_MANIFEST_DEFAULT_STATUS', true); // Cleaner settings -/* - * Save cleaner debug to `logCleaner` table - * - */ -define('CLEAN_LOG_ENABLED', true); - -/* - * Auto clean `logCleaner` items older seconds offset - * - */ -define('CLEAN_LOG_SECONDS_OFFSET', 60*60*24*30); - -/* - * Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab) - * - * This option works with CLEAN_HOST_SECONDS_OFFSET - * - * The value depends of CPU resources available - * - */ -define('CLEAN_HOST_LIMIT', 20); - -/* - * Apply cleaning rules to page older than value provided - * - * This option works with CLEAN_HOST_LIMIT step queue - * - * Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair - * must have enough value to process all pages in the DB index - * - * or the cleaner can stuck in queue - * - */ -define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30); - /* * Remove page ban after following time * @@ -455,18 +408,6 @@ define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30); */ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30); -/* - * Remove page description history after following time - * - */ -define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10); - -/* - * Remove page DOM history after following time - * - */ -define('CLEAN_PAGE_DOM_OFFSET', 60*60*24*30*12*10); - /* * Database tables optimization * diff --git a/config/crontab.txt b/config/crontab.txt index 9e35161..0583b22 100644 --- a/config/crontab.txt +++ 
b/config/crontab.txt @@ -4,4 +4,4 @@ 30 0 * * * indexer --all --rotate 0 0 * * * /usr/bin/php /{PATH}/YGGo/crontab/cleaner.php >> /{PATH}/cleaner.log 2>&1 -* 1-23 * * * /usr/bin/php /{PATH}/YGGo/crontab/crawler.php >> /{PATH}/crawler.log 2>&1 \ No newline at end of file +* * * * * /usr/bin/php /{PATH}/YGGo/crontab/crawler.php >> /{PATH}/crawler.log 2>&1 \ No newline at end of file diff --git a/crontab/cleaner.php b/crontab/cleaner.php index ddb0cd7..a827285 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -18,287 +18,18 @@ if (false === sem_acquire($semaphore, true)) { exit; } +// Define variables +$timeStart = microtime(true); // Load system dependencies require_once(__DIR__ . '/../config/app.php'); -require_once(__DIR__ . '/../library/curl.php'); -require_once(__DIR__ . '/../library/robots.php'); require_once(__DIR__ . '/../library/mysql.php'); -require_once(__DIR__ . '/../library/ftp.php'); // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); -// Debug -$timeStart = microtime(true); - -$httpRequestsTotal = 0; -$httpRequestsSizeTotal = 0; -$httpDownloadSizeTotal = 0; -$httpRequestsTimeTotal = 0; - -$hostsTotal = $db->getTotalHosts(); -$manifestsTotal = $db->getTotalManifests(); -$hostsUpdated = 0; -$hostPagesDeleted = 0; -$hostPagesDescriptionsDeleted = 0; -$hostPagesDomsDeleted = 0; -$hostPagesSnapDeleted = 0; -$hostPagesToHostPageDeleted = 0; -$manifestsDeleted = 0; -$hostPagesBansRemoved = 0; - -$logsCleanerDeleted = 0; -$logsCrawlerDeleted = 0; - -// Begin update -$db->beginTransaction(); - -try { - - // Get cleaner queue - foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { - - // Get robots.txt if exists - $curl = new Curl($host->url . 
'/robots.txt', CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = null; - } - - // Update host data - $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); - - // Apply host pages limits - $totalHostPages = $db->getTotalHostPages($host->hostId); - - if ($totalHostPages > $host->crawlPageLimit) { - - foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { - - if ($hostPage->uri != '/') { - - // Delete host page descriptions - $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); - - // Delete host page DOMs - $hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId); - - // Delete host page refs data - $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); - - // Delete host page snaps - foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { - - // Prepare filenames - $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1); - $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip'; - - // Delete snap files - foreach (json_decode(SNAP_STORAGE) as $node => $storages) { - - foreach ($storages as $location => $storage) { - - switch ($node) { - - case 'localhost': - - if (file_exists($storage->directory . $hostPageSnapFile)) { - - unlink($storage->directory . 
$hostPageSnapFile); - } - - break; - case 'ftp': - - $ftp = new Ftp(); - - if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { - - $ftp->delete($hostPageSnapFile); - } - - break; - } - - // Clean up DB registry - foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) { - - $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId); - } - - $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId); - - $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); - } - } - } - - // Delete host page - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); - } - } - } - - // Apply new robots.txt rules - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - foreach ($db->getHostPages($host->hostId) as $hostPage) { - - if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) { - - // Delete host page descriptions - $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); - - // Delete host page DOMs - $hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId); - - // Delete host page refs data - $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); - - // Delete host page snaps - foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { - - // Prepare filenames - $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1); - $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . 
'.zip'; - - // Delete snap files - foreach (json_decode(SNAP_STORAGE) as $node => $storages) { - - foreach ($storages as $location => $storage) { - - switch ($node) { - - case 'localhost': - - if (file_exists($storage->directory . $hostPageSnapFile)) { - - unlink($storage->directory . $hostPageSnapFile); - } - - break; - case 'ftp': - - $ftp = new Ftp(); - - if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { - - $ftp->delete($hostPageSnapFile); - } - - break; - } - - // Clean up DB registry - foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) { - - $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId); - } - - $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId); - - $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); - } - } - } - - // Delete host page - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); - } - } - } - - // Clean up deprecated manifests - foreach ($db->getManifests() as $manifest) { - - $delete = false; - - $curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - // Skip processing non 200 code - if (200 != $curl->getCode()) { - - continue; // Wait for reconnect - } - - // Skip processing without returned data - if (!$remoteManifest = $curl->getContent()) { - - $delete = true; - } - - // Skip processing on json encoding error - if (!$remoteManifest = @json_decode($remoteManifest)) { - - $delete = true; - } - - // Skip processing on required fields missed - if (empty($remoteManifest->status) || - empty($remoteManifest->result->config->crawlUrlRegexp) || - empty($remoteManifest->result->api->version)) { - - $delete = true; - 
} - - // Skip processing on API version not compatible - if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { - - $delete = true; - } - - // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition - if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { - - $delete = true; - } - - if ($delete) { - - $manifestsDeleted += $db->deleteManifest($manifest->manifestId); - } - } - - // Reset banned pages - $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); - - // Delete page description history - $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); - - // Delete page dom history - $hostPagesDomsDeleted += $db->deleteHostPageDomsByTimeAdded(time() - CLEAN_PAGE_DOM_OFFSET); - - // Delete deprecated logs - $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); - $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET); - - // Delete failed snap files - // @TODO - - // Commit results - $db->commit(); - -} catch (Exception $e) { - - $db->rollBack(); - - var_dump($e); -} +// Reset banned pages +$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); // Optimize tables if (CLEAN_DB_TABLES_OPTIMIZATION) { @@ -314,51 +45,6 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) { } // Debug -$executionTimeTotal = microtime(true) - $timeStart; -$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000; - -if (CLEAN_LOG_ENABLED) { - - $db->addCleanerLog( time(), - $hostsTotal, - $hostsUpdated, - $hostPagesDeleted, - $hostPagesDescriptionsDeleted, - $hostPagesDomsDeleted, - $hostPagesSnapDeleted, - $hostPagesToHostPageDeleted, - $hostPagesBansRemoved, - $manifestsTotal, - $manifestsDeleted, - $logsCleanerDeleted, - $logsCrawlerDeleted, - $httpRequestsTotal, - $httpRequestsSizeTotal, - $httpDownloadSizeTotal, - $httpRequestsTimeTotal, - 
$executionTimeTotal); - -} - -echo 'Hosts total: ' . $hostsTotal . PHP_EOL; -echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; -echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL; - -echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; -echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; - echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; -echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL; -echo 'Host page doms deleted: ' . $hostPagesDomsDeleted . PHP_EOL; -echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL; -echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL; - -echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; -echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL; - -echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL; -echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL; -echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL; -echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL; -echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL; \ No newline at end of file +echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL; \ No newline at end of file diff --git a/crontab/crawler.php b/crontab/crawler.php index db553a2..f81022d 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -1227,23 +1227,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $executionTimeTotal = microtime(true) - $timeStart; $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000; -if (CRAWL_LOG_ENABLED) { - - $db->addCrawlerLog(time(), - $hostsAdded, - $hostPagesProcessed, - $hostPagesAdded, - $hostPagesSnapAdded, - $hostPagesBanned, - $manifestsProcessed, - $manifestsAdded, - $httpRequestsTotal, - $httpRequestsSizeTotal, - $httpDownloadSizeTotal, - $httpRequestsTimeTotal, - $executionTimeTotal); -} - // Debug output echo 'Hosts added: ' . 
$hostsAdded . PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index 4a90b91..eb968ee 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/mysql.php b/library/mysql.php index f7e1d15..c2cbbda 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -559,39 +559,6 @@ class MySQL { } // Cleaner tools - public function getCleanerQueue(int $limit, int $timeFrom) { - - $result = []; - - // Get ID (to prevent memory over usage) - $query = $this->_db->prepare("SELECT `hostId` - - FROM `host` - - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? - - ORDER BY `hostId` - - LIMIT " . (int) $limit); - - $query->execute([$timeFrom, 0]); - - // Get required page details - foreach ($query->fetchAll() as $host) { - - $result[] = $this->getHost($host->hostId); - } - - return (object) $result; - } - - public function getHostPagesBanned() { - - $query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeBanned` IS NOT NULL'); - - return $query->fetchAll(); - } - public function resetBannedHostPages(int $timeOffset) { $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset); @@ -601,87 +568,7 @@ class MySQL { return $query->rowCount(); } - public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) { - - $query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . 
(int) $timeOffset); - - $query->execute(); - - return $query->rowCount(); - } - - public function addCleanerLog(int $timeAdded, - int $hostsTotal, - int $hostsUpdated, - int $hostPagesDeleted, - int $hostPagesDescriptionsDeleted, - int $hostPagesDomsDeleted, - int $hostPagesSnapDeleted, - int $hostPagesToHostPageDeleted, - int $hostPagesBansRemoved, - int $manifestsTotal, - int $manifestsDeleted, - int $logsCleanerDeleted, - int $logsCrawlerDeleted, - int $httpRequestsTotal, - int $httpRequestsSizeTotal, - int $httpDownloadSizeTotal, - float $httpRequestsTimeTotal, - float $executionTimeTotal) { - - $query = $this->_db->prepare('INSERT INTO `logCleaner` (`timeAdded`, - `hostsTotal`, - `hostsUpdated`, - `hostPagesDeleted`, - `hostPagesDescriptionsDeleted`, - `hostPagesDomsDeleted`, - `hostPagesSnapDeleted`, - `hostPagesToHostPageDeleted`, - `hostPagesBansRemoved`, - `manifestsTotal`, - `manifestsDeleted`, - `logsCleanerDeleted`, - `logsCrawlerDeleted`, - `httpRequestsTotal`, - `httpRequestsSizeTotal`, - `httpDownloadSizeTotal`, - `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); - - $query->execute([ - $timeAdded, - $hostsTotal, - $hostsUpdated, - $hostPagesDeleted, - $hostPagesDescriptionsDeleted, - $hostPagesDomsDeleted, - $hostPagesSnapDeleted, - $hostPagesToHostPageDeleted, - $hostPagesBansRemoved, - $manifestsTotal, - $manifestsDeleted, - $logsCleanerDeleted, - $logsCrawlerDeleted, - $httpRequestsTotal, - $httpRequestsSizeTotal, - $httpDownloadSizeTotal, - $httpRequestsTimeTotal, - $executionTimeTotal - ]); - - return $this->_db->lastInsertId(); - } - - public function deleteLogCleaner(int $timeOffset) { - - $query = $this->_db->prepare('DELETE FROM `logCleaner` WHERE `timeAdded` < ' . 
(int) $timeOffset); - - $query->execute(); - - return $query->rowCount(); - } - - // Crawl tools + // Crawler tools public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { $query = $this->_db->prepare("SELECT COUNT(*) AS `total` @@ -833,62 +720,6 @@ class MySQL { return $query->rowCount(); } - public function addCrawlerLog(int $timeAdded, - int $hostsAdded, - int $hostPagesProcessed, - int $hostPagesAdded, - int $hostPagesSnapAdded, - int $hostPagesBanned, - int $manifestsProcessed, - int $manifestsAdded, - int $httpRequestsTotal, - int $httpRequestsSizeTotal, - int $httpDownloadSizeTotal, - float $httpRequestsTimeTotal, - float $executionTimeTotal) { - - $query = $this->_db->prepare('INSERT INTO `logCrawler` (`timeAdded`, - `hostsAdded`, - `hostPagesProcessed`, - `hostPagesAdded`, - `hostPagesSnapAdded`, - `hostPagesBanned`, - `manifestsProcessed`, - `manifestsAdded`, - `httpRequestsTotal`, - `httpRequestsSizeTotal`, - `httpDownloadSizeTotal`, - `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); - - $query->execute([ - $timeAdded, - $hostsAdded, - $hostPagesProcessed, - $hostPagesAdded, - $hostPagesSnapAdded, - $hostPagesBanned, - $manifestsProcessed, - $manifestsAdded, - $httpRequestsTotal, - $httpRequestsSizeTotal, - $httpDownloadSizeTotal, - $httpRequestsTimeTotal, - $executionTimeTotal - ]); - - return $this->_db->lastInsertId(); - } - - public function deleteLogCrawler(int $timeOffset) { - - $query = $this->_db->prepare('DELETE FROM `logCrawler` WHERE `timeAdded` < ' . 
(int) $timeOffset); - - $query->execute(); - - return $query->rowCount(); - } - public function optimize() { $this->_db->query('OPTIMIZE TABLE `host`'); @@ -900,9 +731,6 @@ class MySQL { $this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`'); $this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`'); - $this->_db->query('OPTIMIZE TABLE `logCleaner`'); - $this->_db->query('OPTIMIZE TABLE `logCrawler`'); - $this->_db->query('OPTIMIZE TABLE `manifest`'); } } diff --git a/media/db-prototype.png b/media/db-prototype.png index 7de3f9f..4416693 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ