add disk quota validation

This commit is contained in:
ghost 2023-04-23 04:05:00 +03:00
parent 7bee0ebb4d
commit 8dbb4a06af
3 changed files with 39 additions and 22 deletions

View File

@ -40,6 +40,12 @@ define('SPHINX_PORT', 9306);
// Crawler settings
/*
* Stop the crawler when the disk quota is reached (MB)
*
* Free-space threshold in megabytes: the value is compared against
* disk_free_space('/') / 1000000, and crawling stops once less free
* space than this is left on the root filesystem.
*
*/
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
/*
* Pages (URI) processing limit in the crawler.php queue
*

View File

@ -9,6 +9,13 @@ if (false === sem_acquire($semaphore, true)) {
exit; exit;
} }
// Check disk quota
//
// CRAWL_STOP_DISK_QUOTA_MB_LEFT has to be defined before it is read; on PHP 8
// an undefined constant raises an Error. The constant is presumably provided by
// ../config/app.php (TODO confirm), which the script only loads further down,
// so load it here first. The later require_once of the same file is then a no-op.
require_once('../config/app.php');

$diskFreeBytes = disk_free_space('/');

// disk_free_space() returns false on failure. An unknown amount of free space
// is handled like an exhausted quota, which is also what the plain arithmetic
// comparison did implicitly (false / 1000000 evaluates to 0).
if (false === $diskFreeBytes || CRAWL_STOP_DISK_QUOTA_MB_LEFT > $diskFreeBytes / 1000000) {

  echo 'Disk quota reached.' . PHP_EOL;
  exit;
}
// Load system dependencies
// Each file is included once; require_once skips a file that is already loaded.
require_once('../config/app.php');
require_once('../library/curl.php');

View File

@ -49,6 +49,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
// Register new host // Register new host
} else { } else {
// Disk quota not reached
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
// Get robots.txt if exists // Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt'); $curl = new Curl($hostURL->string . '/robots.txt');
@ -74,6 +77,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostRobots, $hostRobots,
$hostRobotsPostfix); $hostRobotsPostfix);
} }
}
// Parse page URI // Parse page URI
$hostPageURI = Parser::uri($q); $hostPageURI = Parser::uri($q);