
add disk quota validation

Branch: main
Author: ghost, 2 years ago
Commit: 8dbb4a06af
3 changed files:
  1. config/app.php.txt (+6 −0)
  2. crontab/crawler.php (+7 −0)
  3. public/search.php (+27 −23)

config/app.php.txt (+6 −0)

@@ -40,6 +40,12 @@ define('SPHINX_PORT', 9306);
 // Crawler settings
 
+/*
+ * Stop crawler on disk quota reached (Mb)
+ *
+ */
+define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
+
 /*
  * Pages (URI) processing limit in the crawler.php queue
  *

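The new constant is the free-space floor in megabytes. PHP's disk_free_space() reports bytes, so both call sites in this commit divide by 1,000,000 before comparing. A minimal standalone sketch of that conversion (the $freeMb variable is illustrative, not part of this commit):

    <?php
    // Sketch only: convert disk_free_space() bytes to decimal megabytes
    // before comparing against the configured floor. disk_free_space()
    // returns false on failure, which this sketch does not handle.
    define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);

    $freeMb = disk_free_space('/') / 1000000; // bytes -> MB (decimal)

    if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > $freeMb) {
        echo 'Disk quota reached.' . PHP_EOL;
    }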
crontab/crawler.php (+7 −0)

@@ -9,6 +9,13 @@ if (false === sem_acquire($semaphore, true)) {
 	exit;
 }
 
+// Check disk quota
+if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
+
+	echo 'Disk quota reached.' . PHP_EOL;
+	exit;
+}
+
 // Load system dependencies
 require_once('../config/app.php');
 require_once('../library/curl.php');

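Run standalone, the guard above behaves like the sketch below. Note that in the hunk as shown, config/app.php (which defines CRAWL_STOP_DISK_QUOTA_MB_LEFT) is required only after the check, so this sketch assumes the constant is already defined when the comparison runs:

    <?php
    // Sketch of the crawler guard, assuming CRAWL_STOP_DISK_QUOTA_MB_LEFT
    // is defined before this point (see config/app.php.txt above).
    require_once('../config/app.php');

    // Abort this crawl run when less than the configured number of
    // megabytes remains free on the root filesystem.
    if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
        echo 'Disk quota reached.' . PHP_EOL;
        exit;
    }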
public/search.php (+27 −23)

@@ -49,30 +49,34 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
 		// Register new host
 		} else {
 
-			// Get robots.txt if exists
-			$curl = new Curl($hostURL->string . '/robots.txt');
-			if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-				$hostRobots = $curl->getContent();
-			} else {
-				$hostRobots = null;
-			}
-
-			$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-			$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
-			$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-
-			$hostId = $db->addHost($hostURL->scheme,
-								   $hostURL->name,
-								   $hostURL->port,
-								   crc32($hostURL->string),
-								   time(),
-								   null,
-								   $hostPageLimit,
-								   (string) CRAWL_HOST_DEFAULT_META_ONLY,
-								   (string) $hostStatus,
-								   $hostRobots,
-								   $hostRobotsPostfix);
+			// Disk quota not reached
+			if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
+
+				// Get robots.txt if exists
+				$curl = new Curl($hostURL->string . '/robots.txt');
+				if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+					$hostRobots = $curl->getContent();
+				} else {
+					$hostRobots = null;
+				}
+
+				$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+				$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+				$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+
+				$hostId = $db->addHost($hostURL->scheme,
+									   $hostURL->name,
+									   $hostURL->port,
+									   crc32($hostURL->string),
+									   time(),
+									   null,
+									   $hostPageLimit,
+									   (string) CRAWL_HOST_DEFAULT_META_ONLY,
+									   (string) $hostStatus,
+									   $hostRobots,
+									   $hostRobotsPostfix);
+			}
 		}
 
 	// Parse page URI

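Both call sites repeat the same comparison with mirrored operators: crawler.php stops when free space drops below the floor, while search.php only registers a new host while space remains above it. A hypothetical shared helper (diskQuotaReached() is not part of this commit) would keep the threshold logic in one place:

    <?php
    // Hypothetical helper, not part of this commit: returns true when
    // free space on $path has fallen below CRAWL_STOP_DISK_QUOTA_MB_LEFT,
    // assuming config/app.php has been loaded first.
    function diskQuotaReached(string $path = '/'): bool
    {
        return CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space($path) / 1000000;
    }

    // crawler.php: if (diskQuotaReached()) { echo 'Disk quota reached.' . PHP_EOL; exit; }
    // search.php:  if (!diskQuotaReached()) { /* register the new host */ }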