add crawler / proxy user agent settings

ghost 2023-05-04 07:38:22 +03:00
parent 73f212e3d7
commit 79878d17fe
5 changed files with 31 additions and 9 deletions

View File

@@ -74,7 +74,25 @@ define('DB_PASSWORD', '');
 define('SPHINX_HOST', '127.0.0.1');
 define('SPHINX_PORT', 9306);
 
-// Crawler settings
+// Proxy settings
+
+/*
+ * Search proxy User Agent name
+ *
+ * Shared to other hosts through CURL requests by search proxy
+ *
+ */
+define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
+
+// Crawl settings
+
+/*
+ * Crawler / Bot User Agent name
+ *
+ * Shared to other hosts through CURL requests by crawler
+ *
+ */
+define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
 
 /*
  * Stop crawler on disk quota reached (Mb)
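
For operators running their own instance, these two constants are the values the Curl wrapper (updated later in this commit) passes to CURLOPT_USERAGENT. A hedged override sketch; the contact URL is a placeholder and not part of this commit:

// Hypothetical per-instance override (placeholder URL):
// keeping a reachable project or contact URL in the UA string lets remote
// host admins identify who is crawling or proxying their content.
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://example.com/about )');
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://example.com/about )');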

View File

@@ -33,7 +33,7 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
   $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
 
   // Get robots.txt if exists
-  $curl = new Curl($hostURL . '/robots.txt');
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
   if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
     $hostRobots = $curl->getContent();

View File

@@ -44,7 +44,7 @@ foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SEC
   // Build URL from the DB
   $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
 
-  $curl = new Curl($queueHostImageURL);
+  $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update image index anyway, with the current time and http code
   $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@@ -85,7 +85,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   // Build URL from the DB
   $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
 
-  $curl = new Curl($queueHostPageURL);
+  $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
 
   // Update page index anyway, with the current time and http code
   $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
@@ -226,7 +226,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   } else {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostImageURL->string . '/robots.txt');
+    $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
@@ -391,7 +391,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
   } else {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostURL->string . '/robots.txt');
+    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
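
The cleaner and crawler hunks above repeat one pattern: fetch robots.txt with the crawler identity, then accept the body only when a 200 response actually contains a user-agent: directive (which filters out HTML error pages served with status 200). A standalone sketch of that pattern; the helper name is hypothetical, Curl is this repository's wrapper class:

// Hypothetical helper distilled from the hunks above
function fetchHostRobots(string $hostURL): ?string {

  // Identify as the crawler / bot when requesting robots.txt
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Require a 200 response that contains a user-agent: directive
  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {

    return $curl->getContent();
  }

  return null; // no usable robots.txt for this host
}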

View File

@@ -5,13 +5,17 @@ class Curl {
   private $_connection;
   private $_response;
 
-  public function __construct(string $url, int $connectTimeout = 5) {
+  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
 
     $this->_connection = curl_init($url);
 
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
 
+    if ($userAgent) {
+      curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
+    }
+
     $this->_response = curl_exec($this->_connection);
   }
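
With this change the user agent becomes an optional second argument: it defaults to false, so existing call sites that pass nothing send no UA header at all, and the default connect timeout drops from 5 to 3 seconds. A usage sketch with hypothetical URLs:

// Crawler-side request: identifies as the bot configured above
$page = new Curl('http://example.ygg/', CRAWL_CURLOPT_USERAGENT);

// Proxy-side request: identifies as the search proxy
$image = new Curl('http://example.ygg/logo.png', PROXY_CURLOPT_USERAGENT);

// Backward-compatible call: $userAgent stays false, CURLOPT_USERAGENT is never set
$plain = new Curl('http://example.ygg/robots.txt');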

View File

@@ -55,7 +55,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
 
     // Get robots.txt if exists
-    $curl = new Curl($hostURL->string . '/robots.txt');
+    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
     if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
       $hostRobots = $curl->getContent();
@@ -323,7 +323,7 @@ if (!empty($q)) {
   // Get remote image data
   if (empty($hostImage->data)) {
 
-    $hostImageCurl = new Curl($hostImageURL);
+    $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
 
     // Skip item render on timeout
     $hostImageHttpCode = $hostImageCurl->getCode();
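
On the proxy side the remote image is fetched with the proxy identity rather than the crawler's, and the comment above indicates items whose fetch fails are skipped instead of rendered. A condensed sketch; the surrounding results loop and the non-200 check are assumptions, not code from this commit:

foreach ($results as $hostImage) { // hypothetical loop over search results

  if (empty($hostImage->data)) {

    // Fetch the remote image as the search proxy
    $hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);

    // Skip item render on timeout or other failed response (assumption)
    if (200 != $hostImageCurl->getCode()) {

      continue;
    }
  }
}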