
show sitemaps processed debug

main
ghost · 1 year ago
commit 6ee5e53ef4
1 changed file: crontab/crawler.php (40 changed lines)

crontab/crawler.php

@@ -44,13 +44,16 @@ $httpRequestsSizeTotal = 0;
 $httpDownloadSizeTotal = 0;
 $httpRequestsTimeTotal = 0;

-$hostPagesProcessed = 0;
-$manifestsProcessed = 0;
-$hostPagesAdded = 0;
 $hostsAdded = 0;
 $hostPagesBanned = 0;
 $hostPagesSnapAdded = 0;
+$hostPagesProcessed = 0;
+$hostPagesAdded = 0;
+$manifestsProcessed = 0;
+$sitemapsProcessed = 0;

 // Connect database
 try {
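The new $sitemapsProcessed counter follows the same lifecycle as the counters around it: reset to zero at the start of each cron run, incremented during the crawl, and printed in the end-of-run summary. A minimal sketch of that lifecycle, using only names visible in this diff:

<?php
// Minimal sketch of the per-run counter lifecycle (names taken from this diff).
$sitemapsProcessed = 0; // reset at the start of the cron run

// ... inside the crawl loop, once per sitemap that returned links:
$sitemapsProcessed++;

// ... in the end-of-run summary:
echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;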
@@ -111,21 +114,26 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
       // Init sitemap data
       $sitemap = new Sitemap($hostSitemapPath);

-      // Process collected sitemap links
-      foreach ($sitemap->getLinks() as $link => $attributes) {
-
-        // Parse formatted link
-        $linkURI = Parser::uri($link);
-        $linkHostURL = Parser::hostURL($link);
-
-        // Add host page
-        if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
-            $linkHostURL->string == $host->url && // this host links only
-            $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
-            $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
-            !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
-
-          $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+      if ($sitemapLinks = $sitemap->getLinks()) {
+
+        $sitemapsProcessed++;
+
+        // Process collected sitemap links
+        foreach ($sitemapLinks as $link => $attributes) {
+
+          // Parse formatted link
+          $linkURI = Parser::uri($link);
+          $linkHostURL = Parser::hostURL($link);
+
+          // Add host page
+          if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+              $linkHostURL->string == $host->url && // this host links only
+              $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+              $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
+              !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
+
+            $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+          }
         }
       }
     }
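The core of the change: instead of iterating $sitemap->getLinks() directly, the result is first assigned inside an if condition. An empty array is falsy in PHP, so the loop body runs and the new counter increments only for sitemaps that actually yielded links. A standalone sketch of the pattern; the getLinks() stub below is a hypothetical stand-in for the project's Sitemap::getLinks():

<?php
// getLinks() is a hypothetical stand-in for Sitemap::getLinks().
function getLinks(): array
{
    return ['https://example.com/page' => []]; // link => attributes
}

$sitemapsProcessed = 0;

// Assignment inside the condition: an empty array evaluates to false,
// so the counter increments only when the sitemap returned links.
if ($sitemapLinks = getLinks()) {

    $sitemapsProcessed++;

    foreach ($sitemapLinks as $link => $attributes) {
        // validate and queue $link here
    }
}

echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;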
@@ -1206,6 +1214,8 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
 echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
+echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
 echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
 echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
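With the extra echo in place, the end-of-run report gains one line. Illustrative output only (made-up values, showing just the lines visible in this diff):

Pages snaps added: 2
Pages banned: 0
Sitemaps processed: 1
Manifests processed: 0
HTTP Requests total: 28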
