mirror of https://github.com/YGGverse/YGGo.git (synced 2025-01-12 07:48:34 +00:00)

implement sitemap support

commit 2e2501b437, parent a905499926
README.md

````diff
@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media
 ```
 php8^
 php-dom
+php-xml
 php-pdo
 php-curl
 php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL
 * [ ] Host page DOM elements collecting by CSS selectors
 * [ ] Custom settings for each host
 * [ ] XML Feeds support
-  + [ ] Sitemap
+  + [x] Sitemap
   + [ ] RSS
   + [ ] Atom
 * [ ] Palette image index / filter
````
config/app.php

```diff
@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);
  */
 define('CRAWL_HOST_DEFAULT_NSFW', false);
 
+/*
+ * Collect the sitemap index when available
+ *
+ * At this moment, works only with the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options enabled
+ *
+ * When no sitemap path is provided in robots.txt, the crawler scans the default /sitemap.xml
+ *
+ * true|false
+ *
+ */
+define('CRAWL_SITEMAPS', true);
+
+/*
+ * Renew the robots.txt index by the timing offset provided
+ *
+ */
+define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+
+/*
+ * Host robots.txt processing limit in the crawler.php queue
+ *
+ * Set 0 to disable
+ *
+ */
+define('CRAWL_ROBOTS_LIMIT', 1);
+
 /*
  * Default robots.txt rules when the remote file does not exist
  * The crawler is able to overwrite these rules
```
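Note: sitemap scanning runs inside the robots.txt crawl queue (see the crawler.php hunk below), so the three new options interact. A hedged sketch of alternative values one might set in a local config/app.php; these are illustrative assumptions, not the project defaults added above:

```php
// Illustrative config/app.php values (assumptions, not the defaults added above):
// either of the first two lines effectively disables sitemap collection,
// since it only runs while the robots.txt queue is processed.
define('CRAWL_SITEMAPS', false);                  // skip sitemap collection entirely
define('CRAWL_ROBOTS_LIMIT', 0);                  // or: disable the robots.txt queue itself
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24);  // renew robots.txt daily instead of weekly
```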
crawler.php

```diff
@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');
 require_once(__DIR__ . '/../library/ftp.php');
 require_once(__DIR__ . '/../library/curl.php');
 require_once(__DIR__ . '/../library/robots.php');
+require_once(__DIR__ . '/../library/sitemap.php');
 require_once(__DIR__ . '/../library/filter.php');
 require_once(__DIR__ . '/../library/parser.php');
 require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
   }
 }
 
+// Process robots crawl queue
+foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+
+  // Build web root URL
+  $hostURL = $host->scheme . '://' .
+             $host->name .
+            ($host->port ? ':' . $host->port : '');
+
+  // Get robots.txt
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+  // Update curl stats
+  $httpRequestsTotal++;
+  $httpRequestsSizeTotal += $curl->getSizeRequest();
+  $httpDownloadSizeTotal += $curl->getSizeDownload();
+  $httpRequestsTimeTotal += $curl->getTotalTime();
+
+  // Take the downloaded robots.txt when available, otherwise keep the stored copy
+  if (200 == $curl->getCode()) {
+
+    $hostRobots = $curl->getContent();
+
+  } else {
+
+    $hostRobots = $host->robots;
+  }
+
+  // Update host index
+  $db->updateHostRobots($host->hostId, $hostRobots, time());
+
+  // Process sitemaps when enabled
+  if (CRAWL_SITEMAPS) {
+
+    // Look for custom sitemap URL served in robots.txt
+    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+    if ($hostSitemapPath = $robots->getSitemap()) {
+
+      // Replace relative paths
+      $hostSitemapPath = trim($hostSitemapPath, '/');
+      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
+      $hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath);
+
+    // Set default path when not provided
+    } else {
+
+      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
+    }
+
+    // Init sitemap data
+    $sitemap = new Sitemap($hostSitemapPath);
+
+    // Process collected sitemap links
+    foreach ($sitemap->getLinks() as $link => $attributes) {
+
+      // Parse formatted link
+      $linkURI     = Parser::uri($link);
+      $linkHostURL = Parser::hostURL($link);
+
+      // Add host page
+      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+          $linkHostURL->string == $hostURL && // this host links only
+          $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
+         !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exist yet
+
+        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+      }
+    }
+  }
+}
+
 // Process pages crawl queue
 foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
```
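The three-line path normalization above is terse, so here is a standalone sketch of what it does for an absolute Sitemap address taken from robots.txt; the host and URL are illustrative assumptions:

```php
<?php
// Worked example of the sitemap path normalization used in crawler.php above
// (values are assumptions): a robots.txt Sitemap address matching the host's
// web root is reduced to its URI path and re-anchored to that root.
$hostURL         = 'https://example.com';
$hostSitemapPath = 'https://example.com/sitemap/index.xml'; // as announced in robots.txt

$hostSitemapPath = trim($hostSitemapPath, '/');                 // no wrapping slashes to strip here
$hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath); // '/sitemap/index.xml'
$hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath); // 'https://example.com/sitemap/index.xml'

echo $hostSitemapPath, PHP_EOL;
```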
library/mysql.php

```diff
@@ -728,6 +728,21 @@ class MySQL {
     return $query->rowCount();
   }
 
+  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT * FROM `host`
+
+                                   WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
+
+                                   ORDER BY RAND()
+
+                                   LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom, 0]);
+
+    return $query->fetchAll();
+  }
+
   public function getManifestCrawlQueue(int $limit, int $timeFrom) {
 
     $query = $this->_db->prepare('SELECT * FROM `manifest`
```
library/robots.php

```diff
@@ -2,8 +2,9 @@
 
 class Robots {
 
   private $_rule = [];
+  private $_sitemap = null;
   private $_data = null;
 
   public function __construct(mixed $data) {
 
@@ -15,6 +16,15 @@ class Robots {
 
       $row = strtolower(trim($row));
 
+      // Parse sitemap address
+      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {
+
+        if (!empty($matches[1])) {
+
+          $this->_sitemap = urldecode(trim($matches[1]));
+        }
+      }
+
       // User-agent * begin
       if (preg_match('!^user-agent:\s?\*!', $row)) {
         $read = true;
@@ -63,6 +73,7 @@ class Robots {
     return $result;
   }
 
+  /* @TODO not in use
   public function append(string $key, string $value) {
 
     if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {
@@ -75,12 +86,18 @@ class Robots {
       $this->_data .= PHP_EOL . $key . ' ' . $value;
     }
   }
+  */
 
   public function getData() {
 
     return $this->_data;
   }
 
+  public function getSitemap() {
+
+    return $this->_sitemap;
+  }
+
   private function _regex(string $string) {
 
     return str_replace(
```
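For reference, a minimal usage sketch of the new accessor; the robots.txt body and include path are assumptions. Note that rows are lowercased before the `sitemap:` match, so the returned address comes back lowercase:

```php
<?php
// Minimal sketch of Robots::getSitemap() (robots.txt body below is an example).
require_once __DIR__ . '/library/robots.php'; // path is an assumption

$robots = new Robots(
  "User-agent: *" . PHP_EOL .
  "Disallow: /private/" . PHP_EOL .
  "Sitemap: https://example.com/sitemap.xml"
);

echo $robots->getSitemap(), PHP_EOL; // https://example.com/sitemap.xml
```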
library/sitemap.php (new file, 59 lines)

```diff
@@ -0,0 +1,59 @@
+<?php
+
+class Sitemap {
+
+  private $_files = [];
+  private $_links = [];
+
+  public function __construct(string $filename) {
+
+    $this->_scanFiles($filename);
+    $this->_scanLinks();
+  }
+
+  private function _scanFiles(string $filename) {
+
+    if ($data = @simplexml_load_file($filename)) {
+
+      if (!empty($data->sitemap)) { // sitemaps index
+
+        foreach ($data->sitemap as $value) {
+
+          if (!empty($value->loc)) {
+
+            $this->_scanFiles(trim(urldecode($value->loc)));
+          }
+        }
+
+      } else if (!empty($data->url)) { // target file
+
+        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
+      }
+    }
+  }
+
+  private function _scanLinks() {
+
+    foreach ($this->_files as $filename => $attributes) {
+
+      if ($data = @simplexml_load_file($filename)) {
+
+        if (!empty($data->url)) {
+
+          foreach ($data->url as $value) {
+
+            if (!empty($value->loc)) {
+
+              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
+            }
+          }
+        }
+      }
+    }
+  }
+
+  public function getLinks() {
+
+    return $this->_links;
+  }
+}
```
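A minimal usage sketch of the class above (URL and include path are assumptions): the constructor follows `<sitemap>` index entries recursively, then `getLinks()` returns the collected page locations, with the attribute arrays left empty for now (see the `@TODO` markers):

```php
<?php
// Minimal sketch of the Sitemap class usage (URL and path are assumptions).
require_once __DIR__ . '/library/sitemap.php';

$sitemap = new Sitemap('https://example.com/sitemap.xml');

foreach ($sitemap->getLinks() as $link => $attributes) {
  echo $link, PHP_EOL; // $attributes is an empty array for now (@TODO in the class)
}
```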