mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-09 14:27:55 +00:00
skip curl download on response data size reached
This commit is contained in:
parent
d7a5f7ef84
commit
d186fff48f
@ -26,6 +26,7 @@ php-dom
|
|||||||
php-pdo
|
php-pdo
|
||||||
php-curl
|
php-curl
|
||||||
php-gd
|
php-gd
|
||||||
|
php-mbstring
|
||||||
php-mysql
|
php-mysql
|
||||||
sphinxsearch
|
sphinxsearch
|
||||||
```
|
```
|
||||||
|
@ -118,6 +118,12 @@ define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
|
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Abort curl download once the response data size exceeds this limit (bytes)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Stop crawler on disk quota reached (Mb)
|
* Stop crawler on disk quota reached (Mb)
|
||||||
*
|
*
|
||||||
|
@ -149,7 +149,7 @@ try {
|
|||||||
|
|
||||||
$delete = false;
|
$delete = false;
|
||||||
|
|
||||||
$curl = new Curl($manifest->url);
|
$curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
|
@ -54,7 +54,7 @@ try {
|
|||||||
// Process manifests crawl queue
|
// Process manifests crawl queue
|
||||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
||||||
|
|
||||||
$curl = new Curl($queueManifest->url);
|
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
@ -117,7 +117,7 @@ try {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Begin hosts collection
|
// Begin hosts collection
|
||||||
$curl = new Curl($remoteManifest->result->api->hosts);
|
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
|
@ -11,6 +11,12 @@ class Curl {
|
|||||||
|
|
||||||
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
|
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
|
||||||
|
// Progress reporting must be enabled, otherwise cURL never invokes the callback below.
curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);

// Abort the transfer once the response exceeds the configured size limit.
//
// PHP invokes the progress callback with FIVE arguments and the cURL handle
// comes first. Declaring only the four size arguments shifts every value by
// one position, so the "downloaded" slot would receive the expected total
// size (0 when the server sends no Content-Length) and the limit would never
// trigger for chunked / unknown-length responses.
curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, static function (
    $handle, $downloadSize, $downloaded, $uploadSize, $uploaded
) {
    $limit = CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT;

    // $downloaded   - bytes received so far
    // $downloadSize - total bytes announced by the server, 0 when unknown;
    //                 checking it too lets oversized responses be rejected
    //                 before their body is fetched.
    // A non-zero return value tells cURL to abort the transfer.
    return ($downloaded > $limit || $downloadSize > $limit) ? 1 : 0;
});
|
||||||
|
|
||||||
if ($userAgent) {
|
if ($userAgent) {
|
||||||
curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
|
curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
|
||||||
|
Loading…
Reference in New Issue
Block a user