Abort curl download when the response size limit is reached

This commit is contained in:
ghost 2023-05-09 10:21:37 +03:00
parent d7a5f7ef84
commit d186fff48f
5 changed files with 16 additions and 3 deletions

View File

@ -26,6 +26,7 @@ php-dom
php-pdo php-pdo
php-curl php-curl
php-gd php-gd
php-mbstring
php-mysql php-mysql
sphinxsearch sphinxsearch
``` ```

View File

@ -118,6 +118,12 @@ define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
*/ */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )'); define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
/*
 * Maximum response size, in bytes, that curl is allowed to download.
 * The transfer is aborted once this limit is exceeded (10 MiB).
 *
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10 * 1024 * 1024);
/* /*
* Stop crawler on disk quota reached (Mb) * Stop crawler on disk quota reached (Mb)
* *

View File

@ -149,7 +149,7 @@ try {
$delete = false; $delete = false;
$curl = new Curl($manifest->url); $curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;

View File

@ -54,7 +54,7 @@ try {
// Process manifests crawl queue // Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$curl = new Curl($queueManifest->url); $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
@ -117,7 +117,7 @@ try {
} }
// Begin hosts collection // Begin hosts collection
$curl = new Curl($remoteManifest->result->api->hosts); $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;

View File

@ -11,6 +11,12 @@ class Curl {
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true); curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout); curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
// cURL only invokes the progress callback when CURLOPT_NOPROGRESS is disabled
curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);

// Abort the transfer once the response exceeds the configured size limit.
//
// Since PHP 5.5 the progress callback receives the cURL handle as its FIRST
// argument, followed by: expected download total, bytes downloaded so far,
// expected upload total, bytes uploaded so far. Omitting the handle shifts
// every value by one position, so the limit would be compared against the
// wrong counter.
curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, static function (
    $handle, $downloadSize, $downloaded, $uploadSize, $uploaded
) {
    $limit = CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT;

    // $downloadSize is the total announced by the server (0 when unknown, e.g.
    // chunked responses): checking it stops oversized downloads early, while
    // $downloaded catches responses that do not announce their size.
    // Any non-zero return value makes cURL abort the transfer.
    return ($downloadSize > $limit || $downloaded > $limit) ? 1 : 0;
});
if ($userAgent) { if ($userAgent) {
curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent); curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);