diff --git a/config/app.php.txt b/config/app.php.txt
index a97ba81..17905e6 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -135,6 +135,8 @@ define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.co
 /*
  * Skip curl download on response data size reached
  *
+ * See also: CURLOPT_TIMEOUT (library/curl.php)
+ *
  */
 define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
 
diff --git a/crontab/crawler.php b/crontab/crawler.php
index 1ebf33a..a3d8265 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -239,7 +239,7 @@ try {
       $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
 
       // Try to receive target page location on page redirect available
-      $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true);
+      $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
 
       // Update curl stats
       $httpRequestsTotal++;
diff --git a/library/curl.php b/library/curl.php
index ebaa7e9..4440897 100644
--- a/library/curl.php
+++ b/library/curl.php
@@ -7,10 +7,10 @@ class Curl {
 
   public function __construct(string $url,
                               mixed $userAgent = false,
-                              int $connectTimeout = 3,
+                              int $connectTimeout = 10,
                               bool $header = false,
                               bool $followLocation = false,
-                              int $maxRedirects = 3) {
+                              int $maxRedirects = 10) {
 
     $this->_connection = curl_init($url);
 
@@ -24,7 +24,8 @@ class Curl {
     }
 
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
-    curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
+    curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout); // skip resources with long response times
+    curl_setopt($this->_connection, CURLOPT_TIMEOUT, $connectTimeout); // prevent infinite connections on detected streaming resources @TODO
     curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
     curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function(
       $downloadSize, $downloaded, $uploadSize, $uploaded