From 93c6067fd9ee8cda70a6da2f30de536863f22f13 Mon Sep 17 00:00:00 2001 From: ghost Date: Tue, 13 Jun 2023 22:29:28 +0300 Subject: [PATCH] fix host page mime detection --- crontab/crawler.php | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index ad1f142..aace477 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -449,24 +449,20 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND continue; } - // Parse index MIME - $hostPageIsDom = false; + // Parse MIME + $hostPageIsHtml = false; $hostPageInMime = false; foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { - $mime = Filter::mime($mime); - - // Check for DOM - if (false !== stripos('text/html', $mime)) { + // Ban page on MIME type not allowed in settings + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { - $hostPageIsDom = true; - $hostPageInMime = true; - break; - } + // Check for HTML page + if (false !== stripos(Filter::mime($contentType), 'text/html')) { - // Ban page on MIME type not allowed in settings - if (false !== stripos(Filter::mime($contentType), $mime)) { + $hostPageIsHtml = true; + } $hostPageInMime = true; break; @@ -503,7 +499,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $yggoManifest = null; // Is DOM content - if ($hostPageIsDom) { + if ($hostPageIsHtml) { // Parse content $dom = new DomDocument(); @@ -598,10 +594,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { - $mime = Filter::mime($mime); - // MIME type allowed in settings - if (false !== stripos(Filter::mime($contentType), $mime)) { + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { $snapLocal = true; break; @@ -614,10 +608,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { - $mime = Filter::mime($mime); - // MIME type allowed in settings - if (false !== stripos(Filter::mime($contentType), $mime)) { + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { $snapMega = true; break; @@ -647,10 +639,10 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Insert compressed snap data into the tmp storage if (true === $zip->addFromString('DATA', $content) && true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . - sprintf('CRC32: %s', $crc32data . PHP_EOL . - sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . - sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . - sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { + sprintf('CRC32: %s', $crc32data . PHP_EOL . + sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . + sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . + sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { // Done $zip->close();