Browse Source

prevent infinite connections when streaming resources are detected

main
ghost 2 years ago
parent
commit
4fa33afe40
  1. 2
      config/app.php.txt
  2. 2
      crontab/crawler.php
  3. 7
      library/curl.php

2
config/app.php.txt

@ -135,6 +135,8 @@ define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.co
/* /*
* Skip curl download on response data size reached * Skip curl download on response data size reached
* *
* See also: CURLOPT_TIMEOUT (library/curl.php)
*
*/ */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760); define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);

2
crontab/crawler.php

@ -239,7 +239,7 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available // Try to receive target page location on page redirect available
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true); $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;

7
library/curl.php

@ -7,10 +7,10 @@ class Curl {
public function __construct(string $url, public function __construct(string $url,
mixed $userAgent = false, mixed $userAgent = false,
int $connectTimeout = 3, int $connectTimeout = 10,
bool $header = false, bool $header = false,
bool $followLocation = false, bool $followLocation = false,
int $maxRedirects = 3) { int $maxRedirects = 10) {
$this->_connection = curl_init($url); $this->_connection = curl_init($url);
@ -24,7 +24,8 @@ class Curl {
} }
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true); curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout); curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout); // skip resources with long time response
curl_setopt($this->_connection, CURLOPT_TIMEOUT, $connectTimeout); // prevent infinitive connection on streaming resources detected @TODO
curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false); curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function( curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function(
$downloadSize, $downloaded, $uploadSize, $uploaded $downloadSize, $downloaded, $uploadSize, $uploaded

Loading…
Cancel
Save