Browse Source

prevent infinite connections when streaming resources are detected

main
ghost 2 years ago
parent
commit
4fa33afe40
  1. 2
      config/app.php.txt
  2. 2
      crontab/crawler.php
  3. 7
      library/curl.php

2
config/app.php.txt

@ -135,6 +135,8 @@ define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.co @@ -135,6 +135,8 @@ define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.co
/*
* Abort the curl download once the response data size limit is reached
*
* See also: CURLOPT_TIMEOUT (library/curl.php)
*
*/
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);

2
crontab/crawler.php

@ -239,7 +239,7 @@ try { @@ -239,7 +239,7 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true);
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats
$httpRequestsTotal++;

7
library/curl.php

@ -7,10 +7,10 @@ class Curl { @@ -7,10 +7,10 @@ class Curl {
public function __construct(string $url,
mixed $userAgent = false,
int $connectTimeout = 3,
int $connectTimeout = 10,
bool $header = false,
bool $followLocation = false,
int $maxRedirects = 3) {
int $maxRedirects = 10) {
$this->_connection = curl_init($url);
@ -24,7 +24,8 @@ class Curl { @@ -24,7 +24,8 @@ class Curl {
}
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout); // skip resources with long time response
curl_setopt($this->_connection, CURLOPT_TIMEOUT, $connectTimeout); // prevent infinitive connection on streaming resources detected @TODO
curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function(
$downloadSize, $downloaded, $uploadSize, $uploaded

Loading…
Cancel
Save