Browse Source

Fix host page MIME detection

main
ghost 2 years ago
parent
commit
93c6067fd9
  1. 28
      crontab/crawler.php

28
crontab/crawler.php

@ -449,25 +449,21 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
continue; continue;
} }
// Parse index MIME // Parse MIME
$hostPageIsDom = false; $hostPageIsHtml = false;
$hostPageInMime = false; $hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
$mime = Filter::mime($mime); // Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
// Check for DOM // Check for HTML page
if (false !== stripos('text/html', $mime)) { if (false !== stripos(Filter::mime($contentType), 'text/html')) {
$hostPageIsDom = true; $hostPageIsHtml = true;
$hostPageInMime = true;
break;
} }
// Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) {
$hostPageInMime = true; $hostPageInMime = true;
break; break;
} }
@ -503,7 +499,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$yggoManifest = null; $yggoManifest = null;
// Is DOM content // Is DOM content
if ($hostPageIsDom) { if ($hostPageIsHtml) {
// Parse content // Parse content
$dom = new DomDocument(); $dom = new DomDocument();
@ -598,10 +594,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
$mime = Filter::mime($mime);
// MIME type allowed in settings // MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) { if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapLocal = true; $snapLocal = true;
break; break;
@ -614,10 +608,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
$mime = Filter::mime($mime);
// MIME type allowed in settings // MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) { if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMega = true; $snapMega = true;
break; break;

Loading…
Cancel
Save