|
|
@ -44,11 +44,12 @@ $httpRequestsSizeTotal = 0; |
|
|
|
$httpDownloadSizeTotal = 0; |
|
|
|
$httpDownloadSizeTotal = 0; |
|
|
|
$httpRequestsTimeTotal = 0; |
|
|
|
$httpRequestsTimeTotal = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostsProcessed = 0; |
|
|
|
$hostsAdded = 0; |
|
|
|
$hostsAdded = 0; |
|
|
|
$hostPagesBanned = 0; |
|
|
|
|
|
|
|
$hostPagesSnapAdded = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesProcessed = 0; |
|
|
|
$hostPagesProcessed = 0; |
|
|
|
|
|
|
|
$hostPagesBanned = 0; |
|
|
|
|
|
|
|
$hostPagesSnapAdded = 0; |
|
|
|
$hostPagesAdded = 0; |
|
|
|
$hostPagesAdded = 0; |
|
|
|
|
|
|
|
|
|
|
|
$manifestsProcessed = 0; |
|
|
|
$manifestsProcessed = 0; |
|
|
@ -67,8 +68,18 @@ try { |
|
|
|
exit; |
|
|
|
exit; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Process robots crawl queue |
|
|
|
// Process hosts crawl queue |
|
|
|
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { |
|
|
|
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update host crawl queue |
|
|
|
|
|
|
|
$hostsProcessed += $db->updateHostCrawlQueue($host->hostId); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Crawl robots.txt |
|
|
|
|
|
|
|
if (CRAWL_ROBOTS) { |
|
|
|
|
|
|
|
|
|
|
|
// Update robots |
|
|
|
// Update robots |
|
|
|
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); |
|
|
|
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); |
|
|
@ -91,6 +102,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ |
|
|
|
|
|
|
|
|
|
|
|
// Update host index |
|
|
|
// Update host index |
|
|
|
$db->updateHostRobots($host->hostId, $hostRobots, time()); |
|
|
|
$db->updateHostRobots($host->hostId, $hostRobots, time()); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Process sitemaps when enabled |
|
|
|
// Process sitemaps when enabled |
|
|
|
if (CRAWL_SITEMAPS) { |
|
|
|
if (CRAWL_SITEMAPS) { |
|
|
@ -138,7 +150,8 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Update manifest if available for this host |
|
|
|
// Update manifests |
|
|
|
|
|
|
|
if (CRAWL_MANIFEST) { |
|
|
|
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) { |
|
|
|
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) { |
|
|
|
|
|
|
|
|
|
|
|
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); |
|
|
|
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); |
|
|
@ -323,6 +336,21 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Process update errors |
|
|
|
|
|
|
|
} catch (Exception $e) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Debug std |
|
|
|
|
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Skip item |
|
|
|
|
|
|
|
$db->rollBack(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Process pages crawl queue |
|
|
|
// Process pages crawl queue |
|
|
@ -1207,20 +1235,21 @@ $executionTimeTotal = microtime(true) - $timeStart; |
|
|
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000; |
|
|
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000; |
|
|
|
|
|
|
|
|
|
|
|
// Debug output |
|
|
|
// Debug output |
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; |
|
|
|
echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL; |
|
|
|
|
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL; |
|
|
|
|
|
|
|
|
|
|
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; |
|
|
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; |
|
|
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; |
|
|
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; |
|
|
|
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL; |
|
|
|
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL; |
|
|
|
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; |
|
|
|
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL . PHP_EOL; |
|
|
|
|
|
|
|
|
|
|
|
echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL; |
|
|
|
echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL . PHP_EOL; |
|
|
|
|
|
|
|
|
|
|
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; |
|
|
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL . PHP_EOL; |
|
|
|
|
|
|
|
|
|
|
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL; |
|
|
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL; |
|
|
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL; |
|
|
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL; |
|
|
|
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL; |
|
|
|
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL; |
|
|
|
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL; |
|
|
|
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL . PHP_EOL; |
|
|
|
|
|
|
|
|
|
|
|
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL; |
|
|
|
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL; |
|
|
|