skip xmpp links

This commit is contained in:
ghost 2023-08-02 11:57:54 +03:00
parent 06c136f05c
commit 1655ec63b2

View File

@ -362,13 +362,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
// This page has on 200 code
// This page is not available
if (200 != $curl->getCode()) {
// Ban this page
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available
// Try to receive the target page location by following the redirect, if one is available
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats
@ -1028,6 +1028,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
continue;
}
// Skip xmpp links.
// Match the "xmpp:" scheme only at the start of the href (case-insensitive,
// ignoring leading whitespace), so that regular http(s) links which merely
// contain "xmpp:" somewhere in their path or query string are still processed.
if (0 === stripos(ltrim($href), 'xmpp:')) {
continue;
}
// Skip x-raw-image links
/*
if (false !== stripos($href, 'x-raw-image:')) {