Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-01-31 08:54:17 +00:00)

Commit 79878d17fe (parent: 73f212e3d7)

    add crawler / proxy user agent settings
@@ -74,7 +74,25 @@ define('DB_PASSWORD', '');
 define('SPHINX_HOST', '127.0.0.1');
 define('SPHINX_PORT', 9306);
 
-// Crawler settings
+// Proxy settings
+
+/*
+ * Search proxy User Agent name
+ *
+ * Shared to other hosts through CURL requests by search proxy
+ *
+ */
+define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
+
+// Crawl settings
+
+/*
+ * Crawler / Bot User Agent name
+ *
+ * Shared to other hosts through CURL requests by crawler
+ *
+ */
+define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
 
 /*
  * Stop crawler on disk quota reached (Mb)
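Context note (not part of the diff): PROXY_CURLOPT_USERAGENT identifies the search proxy when it fetches remote content on behalf of users, while CRAWL_CURLOPT_USERAGENT identifies the bot during indexing, so remote hosts can tell the two roles apart. A minimal sketch of consuming the crawler constant, assuming the defines above and the Curl wrapper are already loaded (include paths and the URL are hypothetical):

    require_once 'config.php';       // hypothetical path to the defines above
    require_once 'library/curl.php'; // hypothetical path to the Curl wrapper

    // Fetch robots.txt while identifying as the YGGo crawler
    $curl = new Curl('http://example.ygg/robots.txt', CRAWL_CURLOPT_USERAGENT);

    if (200 == $curl->getCode()) {
      echo $curl->getContent();
    }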
@@ -33,7 +33,7 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
   $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
 
   // Get robots.txt if exists
-  $curl = new Curl($hostURL . '/robots.txt');
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
   if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
     $hostRobots = $curl->getContent();
@@ -44,7 +44,7 @@ foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SEC
   // Build URL from the DB
   $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
 
-  $curl = new Curl($queueHostImageURL);
+  $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update image index anyway, with the current time and http code
   $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@@ -85,7 +85,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   // Build URL from the DB
   $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
 
-  $curl = new Curl($queueHostPageURL);
+  $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update page index anyway, with the current time and http code
   $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
@@ -226,7 +226,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
     } else {
 
       // Get robots.txt if exists
-      $curl = new Curl($hostImageURL->string . '/robots.txt');
+      $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
       if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
         $hostRobots = $curl->getContent();
@@ -391,7 +391,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
     } else {
 
       // Get robots.txt if exists
-      $curl = new Curl($hostURL->string . '/robots.txt');
+      $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
       if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
         $hostRobots = $curl->getContent();
@@ -5,13 +5,17 @@ class Curl {
   private $_connection;
   private $_response;
 
-  public function __construct(string $url, int $connectTimeout = 5) {
+  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
 
     $this->_connection = curl_init($url);
 
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
 
+    if ($userAgent) {
+      curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
+    }
+
     $this->_response = curl_exec($this->_connection);
   }
 
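For reference, a usage sketch of the updated constructor (URLs hypothetical, not part of the commit). Note that `mixed` as a parameter type requires PHP 8.0+, that the default connect timeout drops from 5 to 3 seconds, and that the timeout moved from the second to the third position, so any caller that previously passed a timeout positionally must be updated to the new argument order, as the hunks above do:

    $a = new Curl('http://host.ygg/');                          // no UA header sent (default false)
    $b = new Curl('http://host.ygg/', CRAWL_CURLOPT_USERAGENT); // identifies as the crawler
    $c = new Curl('http://host.ygg/', false, 10);               // no UA, 10-second connect timeout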
@@ -55,7 +55,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostURL->string . '/robots.txt');
+    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
@@ -323,7 +323,7 @@ if (!empty($q)) {
     // Get remote image data
     if (empty($hostImage->data)) {
 
-      $hostImageCurl = new Curl($hostImageURL);
+      $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
 
       // Skip item render on timeout
       $hostImageHttpCode = $hostImageCurl->getCode();
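This proxy-side fetch identifies with PROXY_CURLOPT_USERAGENT rather than the crawler UA, so remote hosts can rate-limit or block indexing separately from on-demand proxying. A sketch of the timeout-skip pattern referenced above, assuming getCode() surfaces CURLINFO_HTTP_CODE (which is 0 when the connection timed out before any response arrived):

    $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);

    // Skip item render on timeout or any non-OK response
    if (200 != $hostImageCurl->getCode()) {
      continue; // assumes this runs inside the search results loop
    }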