From 345c59b5f435c5e744b3203ea9507a806e90f35b Mon Sep 17 00:00:00 2001
From: ghost
Date: Sun, 4 Jun 2023 14:58:33 +0300
Subject: [PATCH] Collect target location links when a page redirect is available

---
 README.md           |   2 +-
 crontab/crawler.php | 144 ++++++++++++++++++++++++++++++++++++++++++--
 library/curl.php    |  16 ++++-
 3 files changed, 154 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index bc48fbb..8f4044d 100644
--- a/README.md
+++ b/README.md
@@ -190,7 +190,7 @@ GET m=SphinxQL
 * [x] Ban non-condition links to prevent extra requests
 * [x] Debug log
 * [x] Index homepages and shorter URI with higher priority
-* [ ] Redirect codes extended processing
+* [x] Collect target location links when a page redirect is available
 * [ ] Palette image index / filter
 * [ ] Crawl queue balancer, that depends of CPU available

diff --git a/crontab/crawler.php b/crontab/crawler.php
index a4e614c..1ebf33a 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -232,11 +232,147 @@ try {
       // Update page index anyway, with the current time and http code
       $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
-      // Skip page processing non 200 code
+      // This page responded with a non-200 code
       if (200 != $curl->getCode()) {
 
+        // Ban this page
         $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
 
+        // Try to receive the target location when a page redirect is available
+        $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true);
+
+        // Update curl stats
+        $httpRequestsTotal++;
+        $httpRequestsSizeTotal += $curl->getSizeRequest();
+        $httpDownloadSizeTotal += $curl->getSizeDownload();
+        $httpRequestsTimeTotal += $curl->getTotalTime();
+
+        if (200 == $curl->getCode()) {
+
+          if (preg_match_all('~Location:\s*(.*)~i', $curl->getContent(), $matches)) {
+
+            // On a multi-hop redirect the last Location header points to the target page
+            $location = end($matches[1]);
+
+            if (empty($location)) {
+
+              continue;
+            }
+
+            $url = trim($location);
+
+            // Make relative links absolute
+            if (!parse_url($url, PHP_URL_HOST)) { // @TODO case probably not in use
+
+              $url = $queueHostPage->scheme . '://' .
+                     $queueHostPage->name .
+                     ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
+                     '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
+            }
+
+            // Validate formatted link
+            if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
+
+              // Parse formatted link
+              $hostURL     = Parser::hostURL($url);
+              $hostPageURI = Parser::uri($url);
+
+              // Host exists
+              if ($host = $db->getHost(crc32($hostURL->string))) {
+
+                $hostStatus        = $host->status;
+                $hostNsfw          = $host->nsfw;
+                $hostPageLimit     = $host->crawlPageLimit;
+                $hostMetaOnly      = $host->crawlMetaOnly;
+                $hostId            = $host->hostId;
+                $hostRobots        = $host->robots;
+                $hostRobotsPostfix = $host->robotsPostfix;
+
+              // Register new host
+              } else {
+
+                // Get robots.txt if exists
+                $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+                // Update curl stats
+                $httpRequestsTotal++;
+                $httpRequestsSizeTotal += $curl->getSizeRequest();
+                $httpDownloadSizeTotal += $curl->getSizeDownload();
+                $httpRequestsTimeTotal += $curl->getTotalTime();
+
+                if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+                  $hostRobots = $curl->getContent();
+                } else {
+                  $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
+                }
+
+                $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+                $hostStatus        = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+                $hostNsfw          = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+                $hostMetaOnly      = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
+                $hostPageLimit     = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+
+                $hostId = $db->addHost($hostURL->scheme,
+                                       $hostURL->name,
+                                       $hostURL->port,
+                                       crc32($hostURL->string),
+                                       time(),
+                                       null,
+                                       $hostPageLimit,
+                                       (string) $hostMetaOnly,
+                                       (string) $hostStatus,
+                                       (string) $hostNsfw,
+                                       $hostRobots,
+                                       $hostRobotsPostfix);
+
+                // Add web root host page to make host visible in the crawl queue
+                $db->addHostPage($hostId, crc32('/'), '/', time());
+
+                // Increase counters
+                $hostPagesAdded++;
+                $hostsAdded++;
+
+                // When the target page is the web root, it is already registered above
+                if ($hostPageURI->string == '/') {
+
+                  continue;
+                }
+              }
+
+              // Init robots parser
+              $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+              // Save page info
+              if ($hostStatus && // host enabled
+                  $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
+                  $hostPageLimit > $db->getTotalHostPages($hostId)) { // page quantity has not reached the host limit
+
+                if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+
+                  $hostPageId = $hostPage->hostPageId;
+
+                } else {
+
+                  $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
+
+                  // Apply the referrer meta description to the target page before indexing it
+                  if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
+
+                    $db->addHostPageDescription($hostPageId,
+                                                $lastHostPageDescription->title,
+                                                $lastHostPageDescription->description,
+                                                $lastHostPageDescription->keywords,
+                                                $hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
+                                                time());
+                  }
+
+                  $hostPagesAdded++;
+                }
+
+                // Link the redirecting page to its target page
+                $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
+              }
+            }
+          }
+        }
+
+        // Skip any other processing for this page
         continue;
       }
 
@@ -781,11 +917,7 @@ try {
                                         $link['description'],
                                         $link['keywords'],
                                         $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
-                                        time(),
-                                        null,
-                                        null,
-                                        null,
-                                        $link['mime']);
+                                        time());
 
             $hostPagesAdded++;
           }
diff --git a/library/curl.php b/library/curl.php
index 5c8445a..ebaa7e9 100644
--- a/library/curl.php
+++ b/library/curl.php
@@ -5,10 +5,24 @@ class Curl {
   private $_connection;
   private $_response;
 
-  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
+  public function __construct(string $url,
+                              mixed $userAgent = false,
+                              int $connectTimeout = 3,
+                              bool $header = false,
+                              bool $followLocation = false,
+                              int $maxRedirects = 3) {
 
     $this->_connection = curl_init($url);
 
+    // Include response headers in the output so redirect Location headers can be parsed
+    if ($header) {
+      curl_setopt($this->_connection, CURLOPT_HEADER, true);
+    }
+
+    // Follow redirects up to the given hop limit
+    if ($followLocation) {
+      curl_setopt($this->_connection, CURLOPT_FOLLOWLOCATION, true);
+      curl_setopt($this->_connection, CURLOPT_MAXREDIRS, $maxRedirects);
+    }
+
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
     curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
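
Usage sketch for the extended Curl constructor, assuming library/curl.php is
included and CRAWL_CURLOPT_USERAGENT is defined in the project config; the
example URL is hypothetical:

  <?php

  require_once 'library/curl.php';

  // Request headers together with the body, follow up to 3 redirect hops
  $curl = new Curl('https://example.com/moved-page', CRAWL_CURLOPT_USERAGENT, 3, true, true, 3);

  if (200 == $curl->getCode()) {

    // With CURLOPT_HEADER enabled, getContent() returns the headers of every
    // hop before the final body, so the last Location header belongs to the
    // hop that produced the 200 response
    if (preg_match_all('~Location:\s*(.*)~i', $curl->getContent(), $matches)) {

      echo 'Target location: ' . trim(end($matches[1])) . PHP_EOL;
    }
  }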
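
Sketch of the relative-to-absolute normalization applied to the Location value,
assuming $queueHostPage carries the scheme / name / port fields used in the
crawler; the sample values are hypothetical:

  <?php

  $url = '/about';

  // A Location value without a host part is a relative redirect target
  if (!parse_url($url, PHP_URL_HOST)) {

    $scheme = 'https';       // $queueHostPage->scheme
    $name   = 'example.com'; // $queueHostPage->name
    $port   = null;          // $queueHostPage->port

    // Strip ./ and ../ segments, then prepend the origin of the redirecting page
    $url = $scheme . '://' .
           $name .
           ($port ? ':' . $port : '') .
           '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
  }

  echo $url . PHP_EOL; // https://example.com/about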