Browse Source

fix page/image mime content type detection

main
ghost 2 years ago
parent
commit
bf1eeb332c
  1. 36
      crontab/crawler.php

36
crontab/crawler.php

@@ -58,7 +58,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@@ -255,7 +255,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@@ -279,11 +279,19 @@ try {
 }
 // Skip image processing on MIME type not allowed in settings
-if (false === strpos(CRAWL_IMAGE_MIME, $hostImageContentType)) {
-$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-continue;
+$hostImageBanned = true;
+foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
+if (false !== strpos($hostImageContentType, trim($mime))) {
+$hostImageBanned = false;
+break;
+}
+}
+if ($hostImageBanned) {
+$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
 }
 // Convert remote image data to base64 string
@@ -335,7 +343,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@@ -359,11 +367,19 @@ try {
 }
 // Skip page processing on MIME type not allowed in settings
-if (false === strpos(CRAWL_PAGE_MIME, $contentType)) {
-$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
-continue;
+$hostPageBanned = true;
+foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
+if (false !== strpos($contentType, trim($mime))) {
+$hostPageBanned = false;
+break;
+}
+}
+if ($hostPageBanned) {
+$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
 }
 // Skip page processing without returned data
@@ -669,7 +685,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();

Loading…
Cancel
Save