Browse Source

skip xmpp links

main
ghost 1 year ago
parent
commit
1655ec63b2
  1. 10
      crontab/crawler.php

10
crontab/crawler.php

@ -362,13 +362,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Update page index anyway, with the current time and http code // Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload()); $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
// This page has on 200 code // This page not available
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
// Ban this page // Ban this page
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available // Try to receive target page location on page redirect available by following location
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true); $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats // Update curl stats
@ -1028,6 +1028,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
continue; continue;
} }
// Skip xmpp links
if (false !== stripos($href, 'xmpp:')) {
continue;
}
// Skip x-raw-image links // Skip x-raw-image links
/* /*
if (false !== stripos($href, 'x-raw-image:')) { if (false !== stripos($href, 'x-raw-image:')) {

Loading…
Cancel
Save