From bf1eeb332ceb79c68060985fd6e845f8130ccc9b Mon Sep 17 00:00:00 2001
From: ghost
Date: Mon, 8 May 2023 12:10:57 +0300
Subject: [PATCH] fix page/image mime content type detection

---
Review notes (text between "---" and the first "diff --git" is ignored by git am):

* The original hunks dropped the "continue;" that followed the ban update, so
  entries with a disallowed MIME type were marked as banned and then still
  processed. Both ban branches now end with "continue;" again; hunk lengths,
  later hunk offsets and the diffstat are adjusted for the 4 added lines.
* The MIME check now tests whether the response content type contains one of
  the comma-separated allowed types, so values carrying parameters such as
  "text/html; charset=utf-8" are accepted.
* This copy was rebuilt from a whitespace-collapsed paste: indentation is
  reconstructed and the whitespace-only change on the getSizeRequest() lines
  is no longer visible. Regenerate with git format-patch before applying; the
  "index" blob hash below is stale as well.

 crontab/crawler.php | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/crontab/crawler.php b/crontab/crawler.php
index 41db96f..342144a 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -58,7 +58,7 @@ try {
 
     // Update curl stats
     $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
+    $httpRequestsSizeTotal += $curl->getSizeRequest();
     $httpDownloadSizeTotal += $curl->getSizeDownload();
     $httpRequestsTimeTotal += $curl->getTotalTime();
 
@@ -255,7 +255,7 @@ try {
 
     // Update curl stats
     $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
+    $httpRequestsSizeTotal += $curl->getSizeRequest();
     $httpDownloadSizeTotal += $curl->getSizeDownload();
     $httpRequestsTimeTotal += $curl->getTotalTime();
 
@@ -279,11 +279,21 @@ try {
     }
 
     // Skip image processing on MIME type not allowed in settings
-    if (false === strpos(CRAWL_IMAGE_MIME, $hostImageContentType)) {
+    $hostImageBanned = true;
+    foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
 
-      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
+      if (false !== strpos($hostImageContentType, trim($mime))) {
 
-      continue;
+        $hostImageBanned = false;
+        break;
+      }
+    }
+
+    if ($hostImageBanned) {
+
+      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
+
+      continue;
     }
 
     // Convert remote image data to base64 string
@@ -335,7 +345,7 @@ try {
 
     // Update curl stats
     $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
+    $httpRequestsSizeTotal += $curl->getSizeRequest();
     $httpDownloadSizeTotal += $curl->getSizeDownload();
     $httpRequestsTimeTotal += $curl->getTotalTime();
 
@@ -359,11 +369,21 @@ try {
     }
 
     // Skip page processing on MIME type not allowed in settings
-    if (false === strpos(CRAWL_PAGE_MIME, $contentType)) {
+    $hostPageBanned = true;
+    foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
 
-      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
+      if (false !== strpos($contentType, trim($mime))) {
 
-      continue;
+        $hostPageBanned = false;
+        break;
+      }
+    }
+
+    if ($hostPageBanned) {
+
+      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
+
+      continue;
     }
 
     // Skip page processing without returned data
@@ -669,7 +689,7 @@ try {
 
     // Update curl stats
     $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
+    $httpRequestsSizeTotal += $curl->getSizeRequest();
     $httpDownloadSizeTotal += $curl->getSizeDownload();
     $httpRequestsTimeTotal += $curl->getTotalTime();
 