Browse Source

skip curl download on response data size reached

main
ghost 2 years ago
parent
commit
d186fff48f
  1. 1
      README.md
  2. 6
      config/app.php.txt
  3. 2
      crontab/cleaner.php
  4. 4
      crontab/crawler.php
  5. 6
      library/curl.php

1
README.md

@ -26,6 +26,7 @@ php-dom @@ -26,6 +26,7 @@ php-dom
php-pdo
php-curl
php-gd
php-mbstring
php-mysql
sphinxsearch
```

6
config/app.php.txt

@ -118,6 +118,12 @@ define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30); @@ -118,6 +118,12 @@ define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
*/
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
/*
* Abort the curl download once the response size exceeds this limit (bytes)
*
*/
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
/*
* Stop the crawler when the disk quota is reached (MB)
*

2
crontab/cleaner.php

@ -149,7 +149,7 @@ try { @@ -149,7 +149,7 @@ try {
$delete = false;
$curl = new Curl($manifest->url);
$curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;

4
crontab/crawler.php

@ -54,7 +54,7 @@ try { @@ -54,7 +54,7 @@ try {
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$curl = new Curl($queueManifest->url);
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
@ -117,7 +117,7 @@ try { @@ -117,7 +117,7 @@ try {
}
// Begin hosts collection
$curl = new Curl($remoteManifest->result->api->hosts);
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;

6
library/curl.php

@ -11,6 +11,12 @@ class Curl { @@ -11,6 +11,12 @@ class Curl {
curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
// Abort the transfer once the response grows past the configured size limit.
//
// PHP invokes the progress callback with the cURL handle as the FIRST argument,
// followed by: expected download total, bytes downloaded so far, expected upload
// total, bytes uploaded so far. Without the leading handle parameter every value
// is shifted by one position and the limit is compared against the wrong counter.
//
// Returning a non-zero value tells cURL to abort the transfer.
curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function(
    $resource, $downloadSize, $downloaded, $uploadSize, $uploaded
){
    // Bytes actually received so far already exceed the limit
    if ($downloaded > CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT) {
        return 1;
    }

    // Server announced a total size above the limit (0 means the size is unknown),
    // so stop early instead of downloading up to the limit first
    if ($downloadSize > CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT) {
        return 1;
    }

    return 0;
});
if ($userAgent) {
curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);

Loading…
Cancel
Save