
implement MIME content-type crawler filter

main
ghost · 2 years ago
commit f88d2ee9ff
  1. config/app.php.txt (16 lines changed)
  2. crontab/cleaner.php (4 lines changed)
  3. crontab/crawler.php (32 lines changed)
  4. library/curl.php (5 lines changed)

config/app.php.txt (16 lines changed)

@@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
+/*
+ * Index pages match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_PAGE_MIME_TYPE', 'text/html');
+/*
+ * Index images match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
 /*
  * Renew image index by timing offset provided
  *
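
Note (not part of the commit): both constants are plain comma separated strings, so matching a response Content-Type header against them needs a split step. A minimal sketch under that assumption, with mimeTypeAllowed() as a hypothetical helper name:

<?php

// Hypothetical helper: split a comma separated allowlist like the constants
// above and test the MIME part of a Content-Type header against it
function mimeTypeAllowed(string $contentType, string $allowList): bool {

  // "text/html; charset=UTF-8" -> "text/html"
  $mime = strtolower(trim(explode(';', $contentType)[0]));

  return in_array($mime, array_map('trim', explode(',', strtolower($allowList))), true);
}

var_dump(mimeTypeAllowed('image/png', 'image/webp,image/png,image/gif,image/jpg,image/ico')); // bool(true)
var_dump(mimeTypeAllowed('text/plain', 'text/html'));                                         // bool(false)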

crontab/cleaner.php (4 lines changed)

@@ -85,7 +85,7 @@ try {
 // Apply new robots.txt rules
 $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-foreach ($db->getHostImages($host->hostId) as $hostImage) {
+foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
   if (!$robots->uriAllowed($hostImage->uri)) {
@@ -98,7 +98,7 @@ try {
   }
 }
-foreach ($db->getHostPages($host->hostId) as $hostPage) {
+foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
   if (!$robots->uriAllowed($hostPage->uri)) {
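
The two @TODO markers above note that the cleaner does not yet re-check already indexed images and pages against the new MIME allowlists. A hedged sketch of that follow-up, reusing the mimeTypeAllowed() sketch from the config section; $hostImage->mime and $db->deleteHostImage() are assumed names for illustration and are not shown anywhere in this diff:

// Sketch only: drop indexed images whose stored MIME type is no longer allowed
foreach ($db->getHostImages($host->hostId) as $hostImage) {

  // $hostImage->mime and deleteHostImage() are hypothetical
  if (!mimeTypeAllowed($hostImage->mime, CRAWL_IMAGE_MIME_TYPE)) {

    $db->deleteHostImage($hostImage->hostImageId);

    continue;
  }

  // ... existing robots.txt checks would follow here
}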

crontab/crawler.php (32 lines changed)

@@ -230,7 +230,7 @@ try {
 // Update image index anyway, with the current time and http code
 $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
-// Skip next image processing non 200 code
+// Skip image processing non 200 code
 if (200 != $curl->getCode()) {
   continue;
@@ -239,7 +239,19 @@ try {
 // Save image content on data settings enabled
 if (!CRAWL_HOST_DEFAULT_META_ONLY) {
-  // Skip next image processing images without returned data
+  // Skip image processing on MIME type not provided
+  if (!$contentType = $curl->getContentType()) {
+    continue;
+  }
+  // Skip image processing on MIME type not allowed in settings
+  if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
+    continue;
+  }
+  // Skip image processing images without returned content
   if (!$content = $curl->getContent()) {
     continue;
@@ -271,13 +283,25 @@ try {
 // Update page index anyway, with the current time and http code
 $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
-// Skip next page processing non 200 code
+// Skip page processing non 200 code
 if (200 != $curl->getCode()) {
   continue;
 }
-// Skip next page processing pages without returned data
+// Skip page processing on MIME type not provided
+if (!$contentType = $curl->getContentType()) {
+  continue;
+}
+// Skip page processing on MIME type not allowed in settings
+if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
+  continue;
+}
+// Skip page processing pages without returned data
 if (!$content = $curl->getContent()) {
   continue;
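
Worth noting: strpos($contentType, CRAWL_IMAGE_MIME_TYPE) searches for the whole comma separated constant inside the header value, so a multi-entry list such as the new CRAWL_IMAGE_MIME_TYPE default cannot match a single header like image/png (the single-entry CRAWL_PAGE_MIME_TYPE case is unaffected). A hedged sketch of a per-entry check, assuming the $curl wrapper from the surrounding loop and the mimeTypeAllowed() helper sketched after the config hunk; this is not the committed code:

// Sketch only, inside the image branch of the crawl queue loop
$contentType = (string) $curl->getContentType();

// true when the header matches one entry of the comma separated allowlist
$allowed = '' !== $contentType && mimeTypeAllowed($contentType, CRAWL_IMAGE_MIME_TYPE);

// when $allowed is false, the committed hunks would `continue` to the next queue item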

library/curl.php (5 lines changed)

@@ -42,6 +42,11 @@ class Curl {
   }
+  public function getContentType() {
+    return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
+  }
   public function getContent() {
     return $this->_response;
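
The new getContentType() proxies curl_getinfo() with CURLINFO_CONTENT_TYPE, which returns the response Content-Type header value, or NULL when the server did not send one. A small usage sketch; the wrapper's constructor signature is not shown in this diff, so the URL argument is an assumption:

// Usage sketch for the extended Curl wrapper from library/curl.php
$curl = new Curl('https://example.org/'); // constructor argument assumed

var_dump($curl->getCode());        // e.g. int(200)
var_dump($curl->getContentType()); // e.g. string(24) "text/html; charset=UTF-8" or NULL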
