add crawler / proxy user agent settings

ghost 2023-05-04 07:38:22 +03:00
parent 73f212e3d7
commit 79878d17fe
5 changed files with 31 additions and 9 deletions

View File

@@ -74,7 +74,25 @@ define('DB_PASSWORD', '');
 define('SPHINX_HOST', '127.0.0.1');
 define('SPHINX_PORT', 9306);
 
-// Crawler settings
+// Proxy settings
+
+/*
+ * Search proxy User Agent name
+ *
+ * Shared to other hosts through CURL requests by search proxy
+ *
+ */
+define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
+
+// Crawl settings
+
+/*
+ * Crawler / Bot User Agent name
+ *
+ * Shared to other hosts through CURL requests by crawler
+ *
+ */
+define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
 
 /*
  * Stop crawler on disk quota reached (Mb)
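
For operators running their own instance, these two constants are the values the Curl wrapper (updated later in this commit) passes to CURLOPT_USERAGENT. A hedged override sketch; the contact URL is a placeholder and not part of this commit:

// Hypothetical per-instance override (placeholder URL):
// keeping a reachable project or contact URL in the UA string lets remote
// host admins identify who is crawling or proxying their content.
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://example.com/about )');
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://example.com/about )');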

View File

@@ -33,7 +33,7 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
   $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
 
   // Get robots.txt if exists
-  $curl = new Curl($hostURL . '/robots.txt');
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
   if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
     $hostRobots = $curl->getContent();

View File

@@ -44,7 +44,7 @@ foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SEC
   // Build URL from the DB
   $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
 
-  $curl = new Curl($queueHostImageURL);
+  $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update image index anyway, with the current time and http code
   $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@@ -85,7 +85,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   // Build URL from the DB
   $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
 
-  $curl = new Curl($queueHostPageURL);
+  $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update page index anyway, with the current time and http code
   $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
@@ -226,7 +226,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   } else {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostImageURL->string . '/robots.txt');
+    $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
@@ -391,7 +391,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   } else {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostURL->string . '/robots.txt');
+    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
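
The cleaner and crawler hunks above repeat one pattern: fetch robots.txt with the crawler identity, then accept the body only when a 200 response actually contains a user-agent: directive (which filters out HTML error pages served with status 200). A standalone sketch of that pattern; the helper name is hypothetical, Curl is this repository's wrapper class:

// Hypothetical helper distilled from the hunks above
function fetchHostRobots(string $hostURL): ?string {

  // Identify as the crawler / bot when requesting robots.txt
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Require a 200 response that contains a user-agent: directive
  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {

    return $curl->getContent();
  }

  return null; // no usable robots.txt for this host
}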

View File

@@ -5,13 +5,17 @@ class Curl {
   private $_connection;
   private $_response;
 
-  public function __construct(string $url, int $connectTimeout = 5) {
+  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
 
     $this->_connection = curl_init($url);
 
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
 
+    if ($userAgent) {
+      curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
+    }
+
     $this->_response = curl_exec($this->_connection);
   }
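
With this change the user agent becomes an optional second argument: it defaults to false, so existing call sites that pass nothing send no UA header at all, and the default connect timeout drops from 5 to 3 seconds. A usage sketch with hypothetical URLs:

// Crawler-side request: identifies as the bot configured above
$page = new Curl('http://example.ygg/', CRAWL_CURLOPT_USERAGENT);

// Proxy-side request: identifies as the search proxy
$image = new Curl('http://example.ygg/logo.png', PROXY_CURLOPT_USERAGENT);

// Backward-compatible call: $userAgent stays false, CURLOPT_USERAGENT is never set
$plain = new Curl('http://example.ygg/robots.txt');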

View File

@@ -55,7 +55,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostURL->string . '/robots.txt');
+    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
@@ -323,7 +323,7 @@ if (!empty($q)) {
   // Get remote image data
   if (empty($hostImage->data)) {
 
-    $hostImageCurl = new Curl($hostImageURL);
+    $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
 
     // Skip item render on timeout
     $hostImageHttpCode = $hostImageCurl->getCode();
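
On the proxy side the remote image is fetched with the proxy identity rather than the crawler's, and the comment above indicates items whose fetch fails are skipped instead of rendered. A condensed sketch; the surrounding results loop and the non-200 check are assumptions, not code from this commit:

foreach ($results as $hostImage) { // hypothetical loop over search results

  if (empty($hostImage->data)) {

    // Fetch the remote image as the search proxy
    $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);

    // Skip item render on timeout or other failed response (assumption)
    if (200 != $hostImageCurl->getCode()) {

      continue;
    }
  }
}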