mirror of https://github.com/YGGverse/YGGo.git
synced 2025-01-24 21:44:59 +00:00

collect target location links on page redirect available

parent 5d7f2bf68c
commit 345c59b5f4
@@ -190,7 +190,7 @@ GET m=SphinxQL
 * [x] Ban non-condition links to prevent extra requests
 * [x] Debug log
 * [x] Index homepages and shorter URI with higher priority
-* [ ] Redirect codes extended processing
+* [x] Collect target location links on page redirect available
 * [ ] Palette image index / filter
 * [ ] Crawl queue balancer, that depends of CPU available
 
@@ -232,11 +232,147 @@ try {
       // Update page index anyway, with the current time and http code
       $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
-      // Skip page processing non 200 code
+      // This page has on 200 code
       if (200 != $curl->getCode()) {
 
+        // Ban this page
         $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
 
+        // Try to receive target page location on page redirect available
+        $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true);
+
+        // Update curl stats
+        $httpRequestsTotal++;
+        $httpRequestsSizeTotal += $curl->getSizeRequest();
+        $httpDownloadSizeTotal += $curl->getSizeDownload();
+        $httpRequestsTimeTotal += $curl->getTotalTime();
+
+        if (200 == $curl->getCode()) {
+
+          if (preg_match('~Location: (.*)~i', $curl->getContent(), $match)) {
+
+            if (empty($match[1])) {
+
+              continue;
+            }
+
+            $url = trim($match[1]);
+
+            //Make relative links absolute
+            if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
+
+              $url = $queueHostPage->scheme . '://' .
+                     $queueHostPage->name .
+                     ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
+                     '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
+            }
+
+            // Validate formatted link
+            if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
+
+              // Parse formatted link
+              $hostURL = Parser::hostURL($url);
+              $hostPageURI = Parser::uri($url);
+
+              // Host exists
+              if ($host = $db->getHost(crc32($hostURL->string))) {
+
+                $hostStatus = $host->status;
+                $hostNsfw = $host->nsfw;
+                $hostPageLimit = $host->crawlPageLimit;
+                $hostMetaOnly = $host->crawlMetaOnly;
+                $hostId = $host->hostId;
+                $hostRobots = $host->robots;
+                $hostRobotsPostfix = $host->robotsPostfix;
+
+              // Register new host
+              } else {
+
+                // Get robots.txt if exists
+                $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+                // Update curl stats
+                $httpRequestsTotal++;
+                $httpRequestsSizeTotal += $curl->getSizeRequest();
+                $httpDownloadSizeTotal += $curl->getSizeDownload();
+                $httpRequestsTimeTotal += $curl->getTotalTime();
+
+                if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+                  $hostRobots = $curl->getContent();
+                } else {
+                  $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
+                }
+
+                $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+                $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+                $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+                $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
+                $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+
+                $hostId = $db->addHost( $hostURL->scheme,
+                                        $hostURL->name,
+                                        $hostURL->port,
+                                        crc32($hostURL->string),
+                                        time(),
+                                        null,
+                                        $hostPageLimit,
+                                        (string) $hostMetaOnly,
+                                        (string) $hostStatus,
+                                        (string) $hostNsfw,
+                                        $hostRobots,
+                                        $hostRobotsPostfix);
+
+                // Add web root host page to make host visible in the crawl queue
+                $db->addHostPage($hostId, crc32('/'), '/', time());
+
+                // Increase counters
+                $hostPagesAdded++;
+                $hostsAdded++;
+
+                // When page is root, skip next operations
+                if ($hostPageURI->string == '/') {
+
+                  continue;
+                }
+              }
+
+              // Init robots parser
+              $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+              // Save page info
+              if ($hostStatus && // host enabled
+                  $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
+                  $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
+
+                if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+
+                  $hostPageId = $hostPage->hostPageId;
+
+                } else {
+
+                  $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
+
+                  // Apply referer meta description to the target page before indexing it
+                  if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
+
+                    $db->addHostPageDescription($hostPageId,
+                                                $lastHostPageDescription->title,
+                                                $lastHostPageDescription->description,
+                                                $lastHostPageDescription->keywords,
+                                                $hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
+                                                time());
+                  }
+
+                  $hostPagesAdded++;
+                }
+
+                $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
+              }
+            }
+          }
+        }
+
+        // Skip other this page actions
         continue;
       }
 
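For reference, a standalone sketch (not from the commit) of the redirect-target resolution the branch above adds. The function name and the $scheme/$name/$port parameters are hypothetical stand-ins for the $queueHostPage fields, and the project-specific CRAWL_URL_REGEXP check is omitted:

<?php

// Resolve a Location header against the referring page, mirroring the
// regex and string handling in the diff above.
function resolveRedirectTarget(string $rawResponse, string $scheme, string $name, ?int $port): ?string
{
    // Same pattern the crawler uses to pull the redirect target
    if (!preg_match('~Location: (.*)~i', $rawResponse, $match) || empty($match[1])) {
        return null;
    }

    $url = trim($match[1]);

    // Make relative links absolute, as the commit does
    if (!parse_url($url, PHP_URL_HOST)) {
        $url = $scheme . '://' .
               $name .
               ($port ? ':' . $port : '') .
               '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
    }

    // Keep only values that pass the same validity filter
    return filter_var($url, FILTER_VALIDATE_URL) ? $url : null;
}

var_dump(resolveRedirectTarget("HTTP/1.1 301 Moved Permanently\r\nLocation: /new-page\r\n\r\n", 'https', 'example.com', null));
// string(28) "https://example.com/new-page"

Note that because the crawler enables both header capture and redirect following, getContent() carries the header block of every hop, so this pattern matches the first hop's Location header.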
@@ -781,11 +917,7 @@ try {
                                           $link['description'],
                                           $link['keywords'],
                                           $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
-                                          time(),
-                                          null,
-                                          null,
-                                          null,
-                                          $link['mime']);
+                                          time());
 
             $hostPagesAdded++;
           }
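This hunk trims the addHostPageDescription() call in the link-collection loop to the same six-argument shape the new redirect branch uses. A minimal sketch of that call shape; the DbStub class and its values are hypothetical, only the argument order comes from the diff:

<?php

// Stand-in for the project's database layer, matching the six-argument form.
class DbStub {

  public function addHostPageDescription(int $hostPageId,
                                         ?string $title,
                                         ?string $description,
                                         ?string $keywords,
                                         ?string $data,
                                         int $time): void {

    printf("page %d described at %d: %s\n", $hostPageId, $time, $title);
  }
}

$db = new DbStub();

// The old call continued with null, null, null, $link['mime']); the new one stops at time()
$db->addHostPageDescription(1, 'Example title', 'Example description', 'example, keywords', null, time());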
@@ -5,10 +5,24 @@ class Curl {
 
   private $_connection;
   private $_response;
 
-  public function __construct(string $url, mixed $userAgent = false, int $connectTimeout = 3) {
+  public function __construct(string $url,
+                              mixed $userAgent = false,
+                              int $connectTimeout = 3,
+                              bool $header = false,
+                              bool $followLocation = false,
+                              int $maxRedirects = 3) {
 
     $this->_connection = curl_init($url);
 
+    if ($header) {
+
+      curl_setopt($this->_connection, CURLOPT_HEADER, true);
+    }
+
+    if ($followLocation) {
+
+      curl_setopt($this->_connection, CURLOPT_FOLLOWLOCATION, true);
+      curl_setopt($this->_connection, CURLOPT_MAXREDIRS, $maxRedirects);
+    }
+
     curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($this->_connection, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
     curl_setopt($this->_connection, CURLOPT_NOPROGRESS, false);
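A short usage sketch of the extended constructor, matching how the crawler now calls it; the URL, user-agent string, and require path are placeholder values:

<?php

require_once 'library/curl.php'; // hypothetical path to the Curl class above

// Keep response headers and follow up to 3 redirects, as the crawler does with
// new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 3, true, true)
$curl = new Curl('https://example.com/moved', 'YGGo Search Crawler', 3, true, true);

if (200 == $curl->getCode()) {

  // With CURLOPT_HEADER enabled, getContent() includes the header block of
  // every hop before the final body, so intermediate Location headers survive
  if (preg_match('~Location: (.*)~i', $curl->getContent(), $match)) {

    echo 'redirect target: ' . trim($match[1]) . PHP_EOL;
  }
}

Passing $header = true is what makes the Location lookup in the crawler hunk possible; CURLOPT_RETURNTRANSFER alone would return only the final response body.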