From 1655ec63b2fae81fe734b68516b0904dfdedc486 Mon Sep 17 00:00:00 2001 From: ghost Date: Wed, 2 Aug 2023 11:57:54 +0300 Subject: [PATCH] skip xmpp links --- crontab/crawler.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index 135da2b..80441e5 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -362,13 +362,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Update page index anyway, with the current time and http code $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload()); - // This page has on 200 code + // This page is not available (response code is not 200) if (200 != $curl->getCode()) { // Ban this page $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); - // Try to receive target page location on page redirect available + // Try to resolve the target page location by following the redirect, if one is available $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true); // Update curl stats @@ -1028,6 +1028,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND continue; } + // Skip xmpp links + if (false !== stripos($href, 'xmpp:')) { + + continue; + } + // Skip x-raw-image links /* if (false !== stripos($href, 'x-raw-image:')) {