From 8dbb4a06af129cb832f8c71bfb96e9afb43f5eff Mon Sep 17 00:00:00 2001
From: ghost
Date: Sun, 23 Apr 2023 04:05:00 +0300
Subject: [PATCH] add disk quota validation

---
 config/app.php.txt  |  6 ++++++
 crontab/crawler.php |  7 +++++++
 public/search.php   | 50 ++++++++++++++++++++++++---------------------
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/config/app.php.txt b/config/app.php.txt
index 34f6e90..27da7ea 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -40,6 +40,12 @@ define('SPHINX_PORT', 9306);
 
 // Crawler settings
 
+/*
+ * Stop crawler on disk quota reached (Mb)
+ *
+ */
+define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
+
 /*
  * Pages (URI) processing limit in the crawler.php queue
  *
diff --git a/crontab/crawler.php b/crontab/crawler.php
index b37bfa0..632763c 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -9,6 +9,13 @@ if (false === sem_acquire($semaphore, true)) {
   exit;
 }
 
+// Check disk quota
+if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
+
+  echo 'Disk quota reached.' . PHP_EOL;
+  exit;
+}
+
 // Load system dependencies
 require_once('../config/app.php');
 require_once('../library/curl.php');
diff --git a/public/search.php b/public/search.php
index d600d3f..60a8530 100644
--- a/public/search.php
+++ b/public/search.php
@@ -49,30 +49,34 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
     // Register new host
     } else {
 
-      // Get robots.txt if exists
-      $curl = new Curl($hostURL->string . '/robots.txt');
-
-      if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-        $hostRobots = $curl->getContent();
-      } else {
-        $hostRobots = null;
+      // Disk quota not reached
+      if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
+
+        // Get robots.txt if exists
+        $curl = new Curl($hostURL->string . '/robots.txt');
+
+        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+          $hostRobots = $curl->getContent();
+        } else {
+          $hostRobots = null;
+        }
+
+        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+
+        $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+        $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+        $hostId = $db->addHost($hostURL->scheme,
+                               $hostURL->name,
+                               $hostURL->port,
+                               crc32($hostURL->string),
+                               time(),
+                               null,
+                               $hostPageLimit,
+                               (string) CRAWL_HOST_DEFAULT_META_ONLY,
+                               (string) $hostStatus,
+                               $hostRobots,
+                               $hostRobotsPostfix);
       }
-
-      $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-
-      $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
-      $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-      $hostId = $db->addHost($hostURL->scheme,
-                             $hostURL->name,
-                             $hostURL->port,
-                             crc32($hostURL->string),
-                             time(),
-                             null,
-                             $hostPageLimit,
-                             (string) CRAWL_HOST_DEFAULT_META_ONLY,
-                             (string) $hostStatus,
-                             $hostRobots,
-                             $hostRobotsPostfix);
     }
 
     // Parse page URI
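
Note (not part of the patch above): the same free-space comparison is now
duplicated in crontab/crawler.php and public/search.php. Below is a minimal
sketch of how it could be factored into one shared helper; the function name
diskQuotaReached() and its guard for an undefined constant are assumptions
for illustration only (the check added to crawler.php appears to run before
config/app.php is required, so the constant has to be defined by that point).

<?php

/*
 * Hypothetical shared helper (sketch, not part of this patch):
 * true when free space on $path drops below CRAWL_STOP_DISK_QUOTA_MB_LEFT
 */
function diskQuotaReached(string $path = '/'): bool
{
  // Constant comes from config/app.php; treat a missing define as "no limit"
  if (!defined('CRAWL_STOP_DISK_QUOTA_MB_LEFT')) {
    return false;
  }

  $freeBytes = disk_free_space($path);

  // disk_free_space() returns false on failure; do not block the crawler then
  if (false === $freeBytes) {
    return false;
  }

  // Same unit conversion as the patch: 1 Mb = 1 000 000 bytes
  return ($freeBytes / 1000000) < CRAWL_STOP_DISK_QUOTA_MB_LEFT;
}

// Usage, mirroring the check added to crontab/crawler.php
if (diskQuotaReached('/')) {

  echo 'Disk quota reached.' . PHP_EOL;
  exit;
}

The divisor 1000000 follows the patch (decimal megabytes); dividing by
1048576 instead would interpret the limit as binary megabytes (MiB).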