implement sitemap support

commit 2e2501b437, parent a905499926
mirror of https://github.com/YGGverse/YGGo.git
@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media

```
php8^
php-dom
php-xml
php-pdo
php-curl
php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL

* [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host
* [ ] XML Feeds support
  + [ ] Sitemap
  + [x] Sitemap
  + [ ] RSS
  + [ ] Atom
* [ ] Palette image index / filter
@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);

 */
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
 * Collect the sitemap index when available
 *
 * At this moment, this works only with the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options enabled
 *
 * When no sitemap path is provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Renew the robots.txt index by the time offset provided (60*60*24*7 = one week)
 *
 */
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);

/*
 * Limit of hosts whose robots.txt is processed per crawler.php queue iteration
 *
 * Set 0 to disable
 *
 */
define('CRAWL_ROBOTS_LIMIT', 1);

/*
 * Default robots.txt rules applied when the remote file does not exist
 * The crawler is able to overwrite these rules
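To make the fallback described above concrete: with CRAWL_SITEMAPS enabled the crawler follows a Sitemap directive advertised in robots.txt, and only requests the default /sitemap.xml when no such directive exists. A standalone sketch, not code from the repository (host name, paths and variable names are made up):

<?php

// Hypothetical robots.txt body served by a crawled host
$robotsTxt = "User-agent: *\n" .
             "Disallow: /admin\n" .
             "Sitemap: https://example.com/sitemap-index.xml\n";

$hostURL = 'https://example.com';

// Directive present -> https://example.com/sitemap-index.xml
// Directive missing -> https://example.com/sitemap.xml
$sitemapPath = preg_match('!^sitemap:\s?(.*)!mi', $robotsTxt, $matches)
  ? trim($matches[1])
  : sprintf('%s/sitemap.xml', $hostURL);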
@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');

require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/sitemap.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES

  }
}

// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // Build web root URL
  $hostURL = $host->scheme . '://' .
             $host->name .
             ($host->port ? ':' . $host->port : '');

  // Get robots.txt
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Update the stored robots.txt rules when the remote file was fetched successfully,
  // otherwise keep the rules already known for this host
  if (200 == $curl->getCode()) {

    $hostRobots = $curl->getContent();

  } else {

    $hostRobots = $host->robots;
  }

  // Update host index
  $db->updateHostRobots($host->hostId, $hostRobots, time());

  // Process sitemaps when enabled
  if (CRAWL_SITEMAPS) {

    // Look for a custom sitemap URL advertised in robots.txt
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    if ($hostSitemapPath = $robots->getSitemap()) {

      // Normalize relative and absolute sitemap paths to an absolute URL on this host
      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
      $hostSitemapPath = sprintf('%s/%s', $hostURL, ltrim($hostSitemapPath, '/'));

    // Fall back to the default path when robots.txt provides no Sitemap directive
    } else {

      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
    }

    // Init sitemap data
    $sitemap = new Sitemap($hostSitemapPath);

    // Process collected sitemap links
    foreach ($sitemap->getLinks() as $link => $attributes) {

      // Parse formatted link
      $linkURI = Parser::uri($link);
      $linkHostURL = Parser::hostURL($link);

      // Add host page
      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
          $linkHostURL->string == $hostURL &&                                              // this host links only
          $robots->uriAllowed($linkURI->string) &&                                         // page allowed by robots.txt rules
          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&                 // host page limit not reached yet
          !$db->getHostPage($host->hostId, crc32($linkURI->string))) {                     // page does not exist yet

        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
      }
    }
  }
}

// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
@@ -728,6 +728,21 @@ class MySQL {

    return $query->rowCount();
  }

  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `host`

                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?

                                  ORDER BY RAND()

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom, 0]);

    return $query->fetchAll();
  }

  public function getManifestCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `manifest`
@@ -2,8 +2,9 @@

class Robots {

  private $_rule = [];
  private $_sitemap = null;
  private $_data = null;

  public function __construct(mixed $data) {

@@ -15,6 +16,15 @@ class Robots {

      $row = strtolower(trim($row));

      // Parse sitemap address
      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {

        if (!empty($matches[1])) {

          $this->_sitemap = urldecode(trim($matches[1]));
        }
      }

      // User-agent * begin
      if (preg_match('!^user-agent:\s?\*!', $row)) {

        $read = true;

@@ -63,6 +73,7 @@ class Robots {

    return $result;
  }

  /* @TODO not in use
  public function append(string $key, string $value) {

    if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {

@@ -75,12 +86,18 @@

      $this->_data .= PHP_EOL . $key . ' ' . $value;
    }
  }
  */

  public function getData() {

    return $this->_data;
  }

  public function getSitemap() {

    return $this->_sitemap;
  }

  private function _regex(string $string) {

    return str_replace(
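A quick usage sketch of the new sitemap accessor (the robots.txt content and require path are made up). Note that rows are lowercased before the sitemap regex is applied, so a mixed-case sitemap URL is stored in lowercase:

<?php

require_once 'library/robots.php';

$robots = new Robots(
  "User-agent: *\n" .
  "Disallow: /tmp\n" .
  "Sitemap: https://example.com/sitemap.xml"
);

var_dump($robots->getSitemap()); // expected: string "https://example.com/sitemap.xml"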
library/sitemap.php (new file, 59 lines)

@@ -0,0 +1,59 @@
<?php

class Sitemap {

  private $_files = [];
  private $_links = [];

  public function __construct(string $filename) {

    $this->_scanFiles($filename);
    $this->_scanLinks();
  }

  // Collect target sitemap files, following nested sitemap index documents recursively
  private function _scanFiles(string $filename) {

    if ($data = @simplexml_load_file($filename)) {

      if (!empty($data->sitemap)) { // sitemap index

        foreach ($data->sitemap as $value) {

          if (!empty($value->loc)) {

            $this->_scanFiles(trim(urldecode($value->loc)));
          }
        }

      } else if (!empty($data->url)) { // target urlset file

        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
      }
    }
  }

  // Collect page links from the urlset files found by _scanFiles()
  private function _scanLinks() {

    foreach ($this->_files as $filename => $attributes) {

      if ($data = @simplexml_load_file($filename)) {

        if (!empty($data->url)) {

          foreach ($data->url as $value) {

            if (!empty($value->loc)) {

              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
            }
          }
        }
      }
    }
  }

  public function getLinks() {

    return $this->_links;
  }
}
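A minimal usage sketch of the new class (the URL and require path are placeholders). It accepts either a plain urlset document or a sitemap index whose <sitemap><loc> entries point to child files, which _scanFiles() follows recursively before _scanLinks() collects the page URLs:

<?php

require_once 'library/sitemap.php';

$sitemap = new Sitemap('https://example.com/sitemap.xml');

foreach ($sitemap->getLinks() as $link => $attributes) {

  echo $link . PHP_EOL; // $attributes is reserved (@TODO) and currently an empty array
}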