implement sitemap support

commit 2e2501b437, parent a905499926
mirror of https://github.com/YGGverse/YGGo.git
@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media

```
php8^
php-dom
php-xml
php-pdo
php-curl
php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL

* [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host
* [ ] XML Feeds support
  + [ ] Sitemap
  + [x] Sitemap
  + [ ] RSS
  + [ ] Atom
* [ ] Palette image index / filter
@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);

 */
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
 * Collect the sitemap index when available
 *
 * At this moment, this works only with the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options enabled
 *
 * When no sitemap path is provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Renew the robots.txt index by the time offset provided (60*60*24*7 = one week)
 *
 */
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);

/*
 * Limit of hosts whose robots.txt is processed per crawler.php queue iteration
 *
 * Set 0 to disable
 *
 */
define('CRAWL_ROBOTS_LIMIT', 1);

/*
 * Default robots.txt rules applied when the remote file does not exist
 * The crawler is able to overwrite these rules
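To make the fallback described above concrete: with CRAWL_SITEMAPS enabled the crawler follows a Sitemap directive advertised in robots.txt, and only requests the default /sitemap.xml when no such directive exists. A standalone sketch, not code from the repository (host name, paths and variable names are made up):

<?php

// Hypothetical robots.txt body served by a crawled host
$robotsTxt = "User-agent: *\n" .
             "Disallow: /admin\n" .
             "Sitemap: https://example.com/sitemap-index.xml\n";

$hostURL = 'https://example.com';

// Directive present -> https://example.com/sitemap-index.xml
// Directive missing -> https://example.com/sitemap.xml
$sitemapPath = preg_match('!^sitemap:\s?(.*)!mi', $robotsTxt, $matches)
  ? trim($matches[1])
  : sprintf('%s/sitemap.xml', $hostURL);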
@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');

require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/sitemap.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES

  }
}

// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // Build web root URL
  $hostURL = $host->scheme . '://' .
             $host->name .
             ($host->port ? ':' . $host->port : '');

  // Get robots.txt
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Update the stored robots.txt rules when the remote file was fetched successfully,
  // otherwise keep the rules already known for this host
  if (200 == $curl->getCode()) {

    $hostRobots = $curl->getContent();

  } else {

    $hostRobots = $host->robots;
  }

  // Update host index
  $db->updateHostRobots($host->hostId, $hostRobots, time());

  // Process sitemaps when enabled
  if (CRAWL_SITEMAPS) {

    // Look for a custom sitemap URL advertised in robots.txt
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    if ($hostSitemapPath = $robots->getSitemap()) {

      // Normalize relative and absolute sitemap paths to an absolute URL on this host
      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
      $hostSitemapPath = sprintf('%s/%s', $hostURL, ltrim($hostSitemapPath, '/'));

    // Fall back to the default path when robots.txt provides no Sitemap directive
    } else {

      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
    }

    // Init sitemap data
    $sitemap = new Sitemap($hostSitemapPath);

    // Process collected sitemap links
    foreach ($sitemap->getLinks() as $link => $attributes) {

      // Parse formatted link
      $linkURI = Parser::uri($link);
      $linkHostURL = Parser::hostURL($link);

      // Add host page
      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
          $linkHostURL->string == $hostURL &&                                              // this host links only
          $robots->uriAllowed($linkURI->string) &&                                         // page allowed by robots.txt rules
          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&                 // host page limit not reached yet
          !$db->getHostPage($host->hostId, crc32($linkURI->string))) {                     // page does not exist yet

        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
      }
    }
  }
}

// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
@@ -728,6 +728,21 @@ class MySQL {

    return $query->rowCount();
  }

  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `host`

                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?

                                  ORDER BY RAND()

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom, 0]);

    return $query->fetchAll();
  }

  public function getManifestCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `manifest`
@@ -2,8 +2,9 @@

class Robots {

  private $_rule = [];
  private $_sitemap = null;
  private $_data = null;

  public function __construct(mixed $data) {

@@ -15,6 +16,15 @@ class Robots {

      $row = strtolower(trim($row));

      // Parse sitemap address
      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {

        if (!empty($matches[1])) {

          $this->_sitemap = urldecode(trim($matches[1]));
        }
      }

      // User-agent * begin
      if (preg_match('!^user-agent:\s?\*!', $row)) {

        $read = true;

@@ -63,6 +73,7 @@ class Robots {

    return $result;
  }

  /* @TODO not in use
  public function append(string $key, string $value) {

    if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {

@@ -75,12 +86,18 @@

      $this->_data .= PHP_EOL . $key . ' ' . $value;
    }
  }
  */

  public function getData() {

    return $this->_data;
  }

  public function getSitemap() {

    return $this->_sitemap;
  }

  private function _regex(string $string) {

    return str_replace(
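A quick usage sketch of the new sitemap accessor (the robots.txt content and require path are made up). Note that rows are lowercased before the sitemap regex is applied, so a mixed-case sitemap URL is stored in lowercase:

<?php

require_once 'library/robots.php';

$robots = new Robots(
  "User-agent: *\n" .
  "Disallow: /tmp\n" .
  "Sitemap: https://example.com/sitemap.xml"
);

var_dump($robots->getSitemap()); // expected: string "https://example.com/sitemap.xml"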
library/sitemap.php (new file, 59 lines)

@@ -0,0 +1,59 @@
<?php

class Sitemap {

  private $_files = [];
  private $_links = [];

  public function __construct(string $filename) {

    $this->_scanFiles($filename);
    $this->_scanLinks();
  }

  // Collect target sitemap files, following nested sitemap index documents recursively
  private function _scanFiles(string $filename) {

    if ($data = @simplexml_load_file($filename)) {

      if (!empty($data->sitemap)) { // sitemap index

        foreach ($data->sitemap as $value) {

          if (!empty($value->loc)) {

            $this->_scanFiles(trim(urldecode($value->loc)));
          }
        }

      } else if (!empty($data->url)) { // target urlset file

        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
      }
    }
  }

  // Collect page links from the urlset files found by _scanFiles()
  private function _scanLinks() {

    foreach ($this->_files as $filename => $attributes) {

      if ($data = @simplexml_load_file($filename)) {

        if (!empty($data->url)) {

          foreach ($data->url as $value) {

            if (!empty($value->loc)) {

              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
            }
          }
        }
      }
    }
  }

  public function getLinks() {

    return $this->_links;
  }
}
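A minimal usage sketch of the new class (the URL and require path are placeholders). It accepts either a plain urlset document or a sitemap index whose <sitemap><loc> entries point to child files, which _scanFiles() follows recursively before _scanLinks() collects the page URLs:

<?php

require_once 'library/sitemap.php';

$sitemap = new Sitemap('https://example.com/sitemap.xml');

foreach ($sitemap->getLinks() as $link => $attributes) {

  echo $link . PHP_EOL; // $attributes is reserved (@TODO) and currently an empty array
}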