Mirror of https://github.com/YGGverse/YGGo.git
Synced 2025-01-24 21:44:59 +00:00

show sitemaps processed debug

parent 71724ae33f
commit 6ee5e53ef4
@@ -44,13 +44,16 @@ $httpRequestsSizeTotal = 0;
 $httpDownloadSizeTotal = 0;
 $httpRequestsTimeTotal = 0;
 
-$hostPagesProcessed = 0;
-$manifestsProcessed = 0;
-$hostPagesAdded = 0;
 $hostsAdded = 0;
 $hostPagesBanned = 0;
 $hostPagesSnapAdded = 0;
 
+$hostPagesProcessed = 0;
+$hostPagesAdded = 0;
+
+$manifestsProcessed = 0;
+$sitemapsProcessed = 0;
+
 // Connect database
 try {
 
@@ -111,21 +114,26 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
       // Init sitemap data
       $sitemap = new Sitemap($hostSitemapPath);
 
-      // Process collected sitemap links
-      foreach ($sitemap->getLinks() as $link => $attributes) {
-
-        // Parse formatted link
-        $linkURI     = Parser::uri($link);
-        $linkHostURL = Parser::hostURL($link);
-
-        // Add host page
-        if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
-            $linkHostURL->string == $host->url && // this host links only
-            $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
-            $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
-            !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
-
-          $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+      if ($sitemapLinks = $sitemap->getLinks()) {
+
+        $sitemapsProcessed++;
+
+        // Process collected sitemap links
+        foreach ($sitemapLinks as $link => $attributes) {
+
+          // Parse formatted link
+          $linkURI     = Parser::uri($link);
+          $linkHostURL = Parser::hostURL($link);
+
+          // Add host page
+          if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+              $linkHostURL->string == $host->url && // this host links only
+              $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+              $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
+              !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
+
+            $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+          }
         }
       }
     }
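
Note on the hunk above: the rewritten loop counts a sitemap only when it actually returned links. The assignment inside the if() captures the link array, and PHP treats an empty array as falsy, so hosts whose sitemap yielded nothing never bump the counter. A minimal standalone sketch of that counting pattern follows; the host URLs and link arrays are made-up placeholders for what Sitemap::getLinks() would return, not real crawler data.

<?php

$sitemapsProcessed = 0;
$hostPagesAdded    = 0;

// Placeholder results standing in for Sitemap::getLinks() per host
$sitemapLinksPerHost = [
  'http://host-with-sitemap.example'  => ['http://host-with-sitemap.example/page' => []],
  'http://host-empty-sitemap.example' => [],
];

foreach ($sitemapLinksPerHost as $hostURL => $links) {

  // An empty array is falsy, so hosts without sitemap links are skipped
  if ($sitemapLinks = $links) {

    $sitemapsProcessed++;

    foreach ($sitemapLinks as $link => $attributes) {

      // The real crawler validates $link here and calls $db->addHostPage()
      $hostPagesAdded++;
    }
  }
}

echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL; // prints 1
echo 'Pages added: '        . $hostPagesAdded    . PHP_EOL; // prints 1
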
@@ -1206,6 +1214,8 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
 echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
 
+echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
+
 echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
 
 echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
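
With the extra echo in place, the end-of-run debug summary reports the new counter between the page counters and the manifest counter. Assuming no other output interleaves at that point, a run would end with lines in this order; the counts shown are purely illustrative:

Pages added: 4
Pages snaps added: 4
Pages banned: 0
Sitemaps processed: 2
Manifests processed: 1
HTTP Requests total: 57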