|
|
@@ -49,6 +49,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   // Register new host
   } else {
 
+    // Disk quota not reached
+    if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
+
     // Get robots.txt if exists
     $curl = new Curl($hostURL->string . '/robots.txt');
 
@@ -74,6 +77,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
                         $hostRobots,
                         $hostRobotsPostfix);
   }
+  }
 
 // Parse page URI
 $hostPageURI = Parser::uri($q);
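For context, a minimal standalone sketch of the guard this patch introduces, assuming CRAWL_STOP_DISK_QUOTA_MB_LEFT holds the number of megabytes that must stay free on the crawler volume; the diskQuotaReached() helper and the 500 MB value are illustrative, only the disk_free_space('/') / 1000000 comparison is taken from the patch itself:

<?php

// Illustrative value: stop registering new hosts once less than
// 500 MB remain free (the real value lives in the project config)
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);

// Hypothetical helper wrapping the check added in the patch
function diskQuotaReached(string $volume = '/'): bool
{
    // disk_free_space() returns the remaining space in bytes;
    // dividing by 1000000 converts it to megabytes
    $freeMb = disk_free_space($volume) / 1000000;

    return $freeMb <= CRAWL_STOP_DISK_QUOTA_MB_LEFT;
}

if (!diskQuotaReached()) {

  // Disk quota not reached: register the host and fetch its robots.txt

} else {

  // Quota reached: skip registration so the crawler cannot fill the disk

}

With the second closing brace added in the lower hunk, the robots.txt fetch and host registration now run only while this condition holds; once free space drops below the threshold, a queried URL pointing at an unknown host is simply not registered.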
|
|
|