From f88d2ee9ff9457bdb1960cb82a1a6aa8a580f995 Mon Sep 17 00:00:00 2001
From: ghost
Date: Fri, 5 May 2023 21:25:57 +0300
Subject: [PATCH] implement MIME content-type crawler filter

---
Review notes (free-form section between "---" and the first "diff --git"):

 * The allowed-type check now tests every comma separated entry of
   CRAWL_PAGE_MIME_TYPE / CRAWL_IMAGE_MIME_TYPE on its own. The previous
   strpos() call used the whole list as one needle, so a multi-value
   setting (such as the image default) could never match a Content-Type.
 * image/jpeg, image/x-icon and image/vnd.microsoft.icon were added to the
   image defaults; the former image/jpg and image/ico entries are kept as
   aliases for servers that send them.
 * NOTE(review): indentation was reconstructed from a whitespace-collapsed
   copy of this patch and the "index" hashes still describe the original
   blobs - apply with `git apply --ignore-whitespace` and confirm the
   result against the working tree.

 config/app.php.txt  | 16 ++++++++++++++++
 crontab/cleaner.php |  4 ++--
 crontab/crawler.php | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 library/curl.php    | 10 ++++++++++
 4 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/config/app.php.txt b/config/app.php.txt
index 102caf6..4a14369 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Index only pages whose Content-Type header matches one of these MIME types
+ *
+ * comma separated, every entry is matched case-insensitively as a substring
+ *
+ */
+define('CRAWL_PAGE_MIME_TYPE', 'text/html');
+
+/*
+ * Index only images whose Content-Type header matches one of these MIME types
+ *
+ * comma separated, every entry is matched case-insensitively as a substring
+ *
+ */
+define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpeg,image/jpg,image/x-icon,image/vnd.microsoft.icon,image/ico');
+
 /*
  * Renew image index by timing offset provided
  *
diff --git a/crontab/cleaner.php b/crontab/cleaner.php
index 7b14b2a..90b2399 100644
--- a/crontab/cleaner.php
+++ b/crontab/cleaner.php
@@ -85,7 +85,7 @@ try {
     // Apply new robots.txt rules
     $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
 
-    foreach ($db->getHostImages($host->hostId) as $hostImage) {
+    foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
 
       if (!$robots->uriAllowed($hostImage->uri)) {
 
@@ -98,7 +98,7 @@ try {
       }
     }
 
-    foreach ($db->getHostPages($host->hostId) as $hostPage) {
+    foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
 
       if (!$robots->uriAllowed($hostPage->uri)) {
 
diff --git a/crontab/crawler.php b/crontab/crawler.php
index aca05ff..cf0f787 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -230,7 +230,7 @@ try {
     // Update image index anyway, with the current time and http code
     $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
 
-    // Skip next image processing non 200 code
+    // Skip image processing non 200 code
     if (200 != $curl->getCode()) {
 
       continue;
@@ -239,7 +239,31 @@ try {
     // Save image content on data settings enabled
     if (!CRAWL_HOST_DEFAULT_META_ONLY) {
 
-      // Skip next image processing images without returned data
+      // Skip image processing on MIME type not provided
+      if (!$contentType = $curl->getContentType()) {
+
+        continue;
+      }
+
+      // Skip image processing on MIME type not allowed in settings (comma separated list)
+      $mimeTypeAllowed = false;
+
+      foreach (explode(',', CRAWL_IMAGE_MIME_TYPE) as $mimeType) {
+
+        // Ignore empty entries (e.g. trailing comma): an empty needle matches any header on PHP 8
+        if ('' !== trim($mimeType) && false !== stripos($contentType, trim($mimeType))) {
+
+          $mimeTypeAllowed = true;
+          break;
+        }
+      }
+
+      if (!$mimeTypeAllowed) {
+
+        continue;
+      }
+
+      // Skip image processing images without returned content
       if (!$content = $curl->getContent()) {
 
         continue;
@@ -271,13 +295,37 @@ try {
     // Update page index anyway, with the current time and http code
     $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
-    // Skip next page processing non 200 code
+    // Skip page processing non 200 code
     if (200 != $curl->getCode()) {
 
       continue;
     }
 
-    // Skip next page processing pages without returned data
+    // Skip page processing on MIME type not provided
+    if (!$contentType = $curl->getContentType()) {
+
+      continue;
+    }
+
+    // Skip page processing on MIME type not allowed in settings (comma separated list)
+    $mimeTypeAllowed = false;
+
+    foreach (explode(',', CRAWL_PAGE_MIME_TYPE) as $mimeType) {
+
+      // Ignore empty entries (e.g. trailing comma): an empty needle matches any header on PHP 8
+      if ('' !== trim($mimeType) && false !== stripos($contentType, trim($mimeType))) {
+
+        $mimeTypeAllowed = true;
+        break;
+      }
+    }
+
+    if (!$mimeTypeAllowed) {
+
+      continue;
+    }
+
+    // Skip page processing pages without returned data
     if (!$content = $curl->getContent()) {
 
       continue;
diff --git a/library/curl.php b/library/curl.php
index 41a03b5..e75bbdf 100644
--- a/library/curl.php
+++ b/library/curl.php
@@ -42,6 +42,16 @@ class Curl {
 
   }
 
+  /**
+   * Get the Content-Type reported by cURL for this connection
+   *
+   * @return mixed curl_getinfo() result for CURLINFO_CONTENT_TYPE; falsy when the server sent no valid header
+   */
+  public function getContentType() {
+
+    return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
+  }
+
   public function getContent() {
 
     return $this->_response;