diff --git a/README.md b/README.md
index e6f03bc..6c460d8 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ php-dom
 php-pdo
 php-curl
 php-gd
+php-mbstring
 php-mysql
 sphinxsearch
 ```
diff --git a/config/app.php.txt b/config/app.php.txt
index 4f637d8..c5af62a 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -118,6 +118,12 @@ define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
  */
 define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
 
+/*
+ * Skip curl download on response data size reached
+ *
+ */
+define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
+
 /*
  * Stop crawler on disk quota reached (Mb)
  *
diff --git a/crontab/cleaner.php b/crontab/cleaner.php
index 46f1789..8e30731 100644
--- a/crontab/cleaner.php
+++ b/crontab/cleaner.php
@@ -149,7 +149,7 @@ try {
 
       $delete = false;
 
-      $curl = new Curl($manifest->url);
+      $curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);
 
       // Update curl stats
       $httpRequestsTotal++;
diff --git a/crontab/crawler.php b/crontab/crawler.php
index ee46562..7970c56 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -54,7 +54,7 @@ try {
   // Process manifests crawl queue
   foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
 
-    $curl = new Curl($queueManifest->url);
+    $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
 
     // Update curl stats
     $httpRequestsTotal++;
@@ -117,7 +117,7 @@ try {
       }
 
       // Begin hosts collection
-      $curl = new Curl($remoteManifest->result->api->hosts);
+      $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
 
       // Update curl stats
       $httpRequestsTotal++;
diff --git a/library/curl.php b/library/curl.php
index 6f3f674..5c8445a 100644
--- a/library/curl.php
+++ b/library/curl.php
@@ -11,6 +11,12 @@ class Curl {
 
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
+    curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
+    curl_setopt($this->_connection, CURLOPT_PROGRESSFUNCTION, function(
+      $downloadSize, $downloaded, $uploadSize, $uploaded
+    ){
+      return ($downloaded > CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT) ? 1 : 0;
+    });
 
     if ($userAgent) {
       curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
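
Below is a minimal, standalone sketch of the download-size-limit technique used in the library/curl.php hunk above: with CURLOPT_NOPROGRESS disabled, returning a non-zero value from the CURLOPT_PROGRESSFUNCTION callback makes libcurl abort the transfer once a threshold is exceeded. The URL, the 1 MiB limit, and the variable names are illustrative placeholders, not values from the YGGo configuration; note also that since PHP 5.5 the progress callback receives the cURL handle as its first argument.

```php
<?php

// Illustrative limit only (1 MiB); not taken from config/app.php.txt
$downloadSizeLimit = 1048576;

// Placeholder URL for demonstration purposes
$connection = curl_init('http://example.com/large-file');

curl_setopt($connection, CURLOPT_RETURNTRANSFER, true);

// The progress callback only fires while CURLOPT_NOPROGRESS is disabled
curl_setopt($connection, CURLOPT_NOPROGRESS, false);

// Since PHP 5.5 the callback signature is:
// (handle, expected download size, bytes downloaded, expected upload size, bytes uploaded).
// Returning a non-zero value aborts the transfer.
curl_setopt($connection, CURLOPT_PROGRESSFUNCTION, function (
  $handle, $downloadSize, $downloaded, $uploadSize, $uploaded
) use ($downloadSizeLimit) {

  return ($downloaded > $downloadSizeLimit) ? 1 : 0;
});

$response = curl_exec($connection);

// When the callback aborts the transfer, curl_exec() returns false
// and the error code is CURLE_ABORTED_BY_CALLBACK (42)
if (false === $response && curl_errno($connection) == CURLE_ABORTED_BY_CALLBACK) {

  echo 'Download skipped: response size limit reached' . PHP_EOL;
}

curl_close($connection);
```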