Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-01-24 13:34:25 +00:00)

Commit 345c59b5f4 (parent 5d7f2bf68c): collect target location links on page redirect available
@@ -190,7 +190,7 @@ GET m=SphinxQL
 * [x] Ban non-condition links to prevent extra requests
 * [x] Debug log
 * [x] Index homepages and shorter URI with higher priority
 * [ ] Redirect codes extended processing
-* [ ] Collect target location links on page redirect available
+* [x] Collect target location links on page redirect available
 * [ ] Palette image index / filter
 * [ ] Crawl queue balancer, that depends of CPU available

@@ -232,11 +232,147 @@ try {

     // Update page index anyway, with the current time and http code
     $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

-    // Skip page processing non 200 code
+    // This page has non 200 code
     if (200 != $curl->getCode()) {

       // Ban this page
       $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

+      // Try to receive target page location on page redirect available
+      $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true);
+
+      // Update curl stats
+      $httpRequestsTotal++;
+      $httpRequestsSizeTotal += $curl->getSizeRequest();
+      $httpDownloadSizeTotal += $curl->getSizeDownload();
+      $httpRequestsTimeTotal += $curl->getTotalTime();
+
+      if (200 == $curl->getCode()) {
+
+        if (preg_match('~Location: (.*)~i', $curl->getContent(), $match)) {
+
+          if (empty($match[1])) {
+
+            continue;
+          }
+
+          $url = trim($match[1]);
+
+          // Make relative links absolute
+          if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
+
+            $url = $queueHostPage->scheme . '://' .
+                   $queueHostPage->name .
+                   ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
+                   '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
+          }
+
+          // Validate formatted link
+          if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
+
+            // Parse formatted link
+            $hostURL     = Parser::hostURL($url);
+            $hostPageURI = Parser::uri($url);
+
+            // Host exists
+            if ($host = $db->getHost(crc32($hostURL->string))) {
+
+              $hostStatus        = $host->status;
+              $hostNsfw          = $host->nsfw;
+              $hostPageLimit     = $host->crawlPageLimit;
+              $hostMetaOnly      = $host->crawlMetaOnly;
+              $hostId            = $host->hostId;
+              $hostRobots        = $host->robots;
+              $hostRobotsPostfix = $host->robotsPostfix;
+
+            // Register new host
+            } else {
+
+              // Get robots.txt if exists
+              $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+              // Update curl stats
+              $httpRequestsTotal++;
+              $httpRequestsSizeTotal += $curl->getSizeRequest();
+              $httpDownloadSizeTotal += $curl->getSizeDownload();
+              $httpRequestsTimeTotal += $curl->getTotalTime();
+
+              if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+                $hostRobots = $curl->getContent();
+              } else {
+                $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
+              }
+
+              $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+              $hostStatus        = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+              $hostNsfw          = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+              $hostMetaOnly      = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
+              $hostPageLimit     = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+
+              $hostId = $db->addHost($hostURL->scheme,
+                                     $hostURL->name,
+                                     $hostURL->port,
+                                     crc32($hostURL->string),
+                                     time(),
+                                     null,
+                                     $hostPageLimit,
+                                     (string) $hostMetaOnly,
+                                     (string) $hostStatus,
+                                     (string) $hostNsfw,
+                                     $hostRobots,
+                                     $hostRobotsPostfix);
+
+              // Add web root host page to make host visible in the crawl queue
+              $db->addHostPage($hostId, crc32('/'), '/', time());
+
+              // Increase counters
+              $hostPagesAdded++;
+              $hostsAdded++;
+
+              // When page is root, skip next operations
+              if ($hostPageURI->string == '/') {
+
+                continue;
+              }
+            }
+
+            // Init robots parser
+            $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+            // Save page info
+            if ($hostStatus && // host enabled
+                $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
+                $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
+
+              if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+
+                $hostPageId = $hostPage->hostPageId;
+
+              } else {
+
+                $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
+
+                // Apply referer meta description to the target page before indexing it
+                if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
+
+                  $db->addHostPageDescription($hostPageId,
+                                              $lastHostPageDescription->title,
+                                              $lastHostPageDescription->description,
+                                              $lastHostPageDescription->keywords,
+                                              $hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
+                                              time());
+                }
+
+                $hostPagesAdded++;
+              }
+
+              $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
+            }
+          }
+        }
+      }
+
+      // Skip other actions for this page
       continue;
     }

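Note on the mechanics: the retry request above is constructed with $header = true and $followLocation = true, so getContent() returns the raw response headers (including the Location: headers of any resolved redirect chain) along with the body, and getCode() reports the final status. A minimal sketch of the Location extraction step, where $raw is a hypothetical stand-in for $curl->getContent():

<?php

// Hypothetical raw response, standing in for $curl->getContent()
// when the Curl object is created with $header = true
$raw = "HTTP/1.1 301 Moved Permanently\r\n"
     . "Location: https://example.com/target\r\n\r\n";

// Same pattern as the crawler uses: '.' stops at the end of the line,
// so the capture may keep a trailing "\r" that trim() removes
if (preg_match('~Location: (.*)~i', $raw, $match)) {

  $target = trim($match[1]); // "https://example.com/target"
}
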
@@ -781,11 +917,7 @@ try {
                                             $link['description'],
                                             $link['keywords'],
                                             $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
-                                            time(),
-                                            null,
-                                            null,
-                                            null,
-                                            $link['mime']);
+                                            time());

             $hostPagesAdded++;
           }

@@ -5,10 +5,24 @@ class Curl {

   private $_connection;
   private $_response;

-  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
+  public function __construct(string $url,
+                              mixed $userAgent = false,
+                              int $connectTimeout = 3,
+                              bool $header = false,
+                              bool $followLocation = false,
+                              int $maxRedirects = 3) {

     $this->_connection = curl_init($url);

+    if ($header) {
+      curl_setopt($this->_connection, CURLOPT_HEADER, true);
+    }
+
+    if ($followLocation) {
+      curl_setopt($this->_connection, CURLOPT_FOLLOWLOCATION, true);
+      curl_setopt($this->_connection, CURLOPT_MAXREDIRS, $maxRedirects);
+    }
+
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
     curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
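A usage sketch of the extended constructor (assuming the project's Curl class and the CRAWL_CURLOPT_USERAGENT constant are loaded; the URL is a placeholder):

<?php

// Existing call sites keep working unchanged:
// headers off, no redirect following
$curl = new Curl('http://example.com/', CRAWL_CURLOPT_USERAGENT);

// The new crawler call on non-200 responses: include response headers
// in getContent() and follow up to 3 redirects (the $maxRedirects default)
$curl = new Curl('http://example.com/', CRAWL_CURLOPT_USERAGENT, 3, true, true);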