mirror of https://github.com/YGGverse/YGGo.git
synced 2025-02-07 12:24:28 +00:00

add crawler / proxy user agent settings

This commit is contained in:
parent 73f212e3d7
commit 79878d17fe
@@ -74,7 +74,25 @@ define('DB_PASSWORD', '');
 define('SPHINX_HOST', '127.0.0.1');
 define('SPHINX_PORT', 9306);
 
-// Crawler settings
+// Proxy settings
 
+/*
+ * Search proxy User Agent name
+ *
+ * Shared to other hosts through CURL requests by search proxy
+ *
+ */
+define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
+
+// Crawl settings
+
+/*
+ * Crawler / Bot User Agent name
+ *
+ * Shared to other hosts through CURL requests by crawler
+ *
+ */
+define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
+
 /*
  * Stop crawler on disk quota reached (Mb)
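Both defines feed the optional $userAgent argument added to the Curl wrapper later in this commit. A minimal usage sketch, not part of the diff itself ($url is a placeholder):

// Sketch only: $url stands in for any target URL
$curl = new Curl($url, CRAWL_CURLOPT_USERAGENT); // crawler identifies itself as the bot
$curl = new Curl($url, PROXY_CURLOPT_USERAGENT); // search proxy identifies itself as the proxy
$curl = new Curl($url);                          // $userAgent defaults to false: no User-Agent header is sent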
@@ -33,7 +33,7 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
 $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
 
 // Get robots.txt if exists
-$curl = new Curl($hostURL . '/robots.txt');
+$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
 if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
 $hostRobots = $curl->getContent();
@@ -44,7 +44,7 @@ foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SEC
 // Build URL from the DB
 $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
 
-$curl = new Curl($queueHostImageURL);
+$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
 
 // Update image index anyway, with the current time and http code
 $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@@ -85,7 +85,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
 // Build URL from the DB
 $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
 
-$curl = new Curl($queueHostPageURL);
+$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
 
 // Update page index anyway, with the current time and http code
 $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
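Both queue hunks rebuild the target URL with the idiom ($x->port ? ':' . $x->port : false): in PHP string concatenation, false coerces to an empty string, so a missing port simply vanishes from the URL. For illustration (values made up):

// Port idiom from the hunks above; $port value is made up
$port = null;
echo 'http://hostname' . ($port ? ':' . $port : false) . '/index'; // prints http://hostname/index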
@@ -226,7 +226,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
 } else {
 
 // Get robots.txt if exists
-$curl = new Curl($hostImageURL->string . '/robots.txt');
+$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
 if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
 $hostRobots = $curl->getContent();
@@ -391,7 +391,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
 } else {
 
 // Get robots.txt if exists
-$curl = new Curl($hostURL->string . '/robots.txt');
+$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
 if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
 $hostRobots = $curl->getContent();
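The same three-step robots.txt probe (fetch /robots.txt, require HTTP 200, require a 'user-agent:' field in the body) now carries the crawler UA at five call sites. A hypothetical helper, not part of this commit, could factor the repetition:

// Hypothetical helper built from the probe repeated above; not in the actual codebase
function fetchHostRobots(string $hostURL): ?string {
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
    return $curl->getContent();
  }
  return null;
}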
@@ -5,13 +5,17 @@ class Curl {
 private $_connection;
 private $_response;
 
-public function __construct(string $url, int $connectTimeout = 5) {
+public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
 
 $this->_connection = curl_init($url);
 
 curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
 
+if ($userAgent) {
+curl_setopt($this->_connection, CURLOPT_USERAGENT, (string) $userAgent);
+}
+
 $this->_response = curl_exec($this->_connection);
 }
 
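The $userAgent parameter defaults to false, so call sites that pass nothing keep the old behavior (no User-Agent header), while the default connect timeout drops from 5 to 3 seconds; the mixed type hint requires PHP 8.0+. Example calls against the new signature (URLs are placeholders):

// Placeholders for illustration; not part of this commit
$a = new Curl('http://host/robots.txt', CRAWL_CURLOPT_USERAGENT);     // UA set, 3 s connect timeout
$b = new Curl('http://host/robots.txt', CRAWL_CURLOPT_USERAGENT, 10); // UA set, 10 s connect timeout
$c = new Curl('http://host/robots.txt');                              // no User-Agent header sent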
@@ -55,7 +55,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
 if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
 
 // Get robots.txt if exists
-$curl = new Curl($hostURL->string . '/robots.txt');
+$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
 
 if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
 $hostRobots = $curl->getContent();
@@ -323,7 +323,7 @@ if (!empty($q)) {
 // Get remote image data
 if (empty($hostImage->data)) {
 
-$hostImageCurl = new Curl($hostImageURL);
+$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
 
 // Skip item render on timeout
 $hostImageHttpCode = $hostImageCurl->getCode();
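This last hunk is the only call site using the proxy identity: remote hosts can now distinguish background indexing from images fetched on demand while rendering search results. Side by side, with the variable names from the hunks above:

// Contrast of the two identities introduced by this commit
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);      // crawler: queue-driven indexing
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT); // proxy: fetched at render time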