From 8e8d89db0eb5642c4ae04426dd22c6f74ed1011c Mon Sep 17 00:00:00 2001 From: ghost Date: Sun, 9 Apr 2023 00:06:28 +0300 Subject: [PATCH] implement database cleaner --- config/app.php.txt | 8 +++-- crontab/cleaner.php | 84 +++++++++++++++++++++++++++++++++++++++++++++ library/mysql.php | 61 ++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 2 deletions(-) create mode 100644 crontab/cleaner.php diff --git a/config/app.php.txt b/config/app.php.txt index 3a04b40..7261932 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -21,7 +21,7 @@ define('DB_PASSWORD', ''); define('SPHINX_HOST', '127.0.0.1'); define('SPHINX_PORT', 9306); -// Crawl settings +// Crawler settings define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_SECONDS_OFFSET', 3600); @@ -49,4 +49,8 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null * yggdrasil: /database/yggdrasil/host.robotsPostfix.md * */ -define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null \ No newline at end of file +define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null + +// Cleaner settings +define('CLEAN_HOST_LIMIT', 20); +define('CLEAN_HOST_SECONDS_OFFSET', 3600); \ No newline at end of file diff --git a/crontab/cleaner.php b/crontab/cleaner.php new file mode 100644 index 0000000..a7e5674 --- /dev/null +++ b/crontab/cleaner.php @@ -0,0 +1,84 @@ +getTotalHosts(); +$hostsUpdated = 0; +$hostsPagesDeleted = 0; + +// Get host queue +foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { + + // Parse host info + $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false); + + // Get robots.txt if exists + $curl = new Curl($hostURL . 
'/robots.txt');

  // Cache the robots.txt body only when the response looks like a real robots file
  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
    $hostRobots = $curl->getContent();
  } else {
    $hostRobots = null;
  }

  // Begin update
  $db->beginTransaction();

  try {

    // Update host data
    $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());

    // Apply host pages limits
    $totalHostPages = $db->getTotalHostPages($host->hostId);

    if ($totalHostPages > $host->crawlPageLimit) {

      // Drop the pages above the per-host limit
      $hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit);
    }

    // Apply new robots.txt rules:
    // prefer the robots.txt fetched from the host, fall back to the default rules.
    // NOTE: the previous ternary was inverted (!$hostRobots ? $hostRobots : DEFAULT),
    // which discarded the fetched rules and passed an empty string when none existed;
    // the postfix ternary below shows the intended, non-negated form.
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    foreach ($db->getHostPages($host->hostId) as $hostPage) {

      // Remove pages no longer allowed by the effective robots rules
      if (!$robots->uriAllowed($hostPage->uri)) {

        $hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
      }
    }

    $db->commit();

  } catch(Exception $e){

    var_dump($e);

    // Roll back every change for this host on any failure
    $db->rollBack();
  }
}

// Debug
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
// Parenthesized: before PHP 8.0, "." and "-" shared precedence (left-assoc), so the
// unparenthesized form evaluated as ('Execution time: ' . microtime(true)) - $timeStart
echo 'Execution time: ' . (microtime(true) - $timeStart) . 
PHP_EOL; \ No newline at end of file diff --git a/library/mysql.php b/library/mysql.php index 08e0c20..74bf14c 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -38,6 +38,15 @@ class MySQL { return $query->fetch(); } + public function getTotalHosts() { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`'); + + $query->execute(); + + return $query->fetch()->total; + } + public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) { $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); @@ -47,6 +56,15 @@ class MySQL { return $this->_db->lastInsertId(); } + public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) { + + $query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'); + + $query->execute([$robots, $timeUpdated, $hostId]); + + return $query->rowCount(); + } + // Pages public function getTotalHostPages(int $hostId) { @@ -92,6 +110,15 @@ class MySQL { return $query->fetch(); } + public function getHostPages(int $hostId) { + + $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?'); + + $query->execute([$hostId]); + + return $query->fetchAll(); + } + public function getFoundHostPage(int $hostPageId) { $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, @@ -159,6 +186,40 @@ class MySQL { return $query->rowCount(); } + public function deleteHostPage(int $hostPageId) { + + $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? 
LIMIT 1');

    $query->execute([$hostPageId]);

    return $query->rowCount();
  }

  /**
   * Delete up to $limit pages of a host, newest first (highest `hostPageId`)
   *
   * @return int number of rows deleted
   */
  public function deleteHostPages(int $hostId, int $limit) {

    // LIMIT cannot be bound as a placeholder; the (int) cast keeps the
    // concatenation injection-safe. Backticks added around `hostPageId`
    // for consistency with every other identifier in this file.
    $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);

    $query->execute([$hostId]);

    return $query->rowCount();
  }

  // Cleaner tools
  /**
   * Select active hosts (`status` <> 0) never updated or updated before $timeFrom,
   * ordered by `hostId`, at most $limit rows
   *
   * @return array host rows
   */
  public function getCleanerQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `host`

                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> 0

                                  ORDER BY `hostId`

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom]);

    return $query->fetchAll();
  }

  // Crawl tools
  public function getCrawlQueue(int $limit, int $timeFrom) {