mirror of https://github.com/YGGverse/YGGo.git (synced 2025-01-12 07:48:34 +00:00)

implement sitemap support

commit 2e2501b437, parent a905499926
README.md

````diff
@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media
 ```
 php8^
 php-dom
+php-xml
 php-pdo
 php-curl
 php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL
 * [ ] Host page DOM elements collecting by CSS selectors
 * [ ] Custom settings for each host
 * [ ] XML Feeds support
-  + [ ] Sitemap
+  + [x] Sitemap
   + [ ] RSS
   + [ ] Atom
 * [ ] Palette image index / filter
````
config/app.php

```diff
@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);
  */
 define('CRAWL_HOST_DEFAULT_NSFW', false);
 
+/*
+ * Collect the sitemap index when available
+ *
+ * At this moment, works only with the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options enabled
+ *
+ * When no sitemap path is provided in robots.txt, the crawler scans the default /sitemap.xml
+ *
+ * true|false
+ *
+ */
+define('CRAWL_SITEMAPS', true);
+
+/*
+ * Renew the robots.txt index by the timing offset provided
+ *
+ */
+define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+
+/*
+ * Host robots.txt processing limit in the crawler.php queue
+ *
+ * Set 0 to disable
+ *
+ */
+define('CRAWL_ROBOTS_LIMIT', 1);
+
 /*
  * Default robots.txt rules when the remote file does not exist
  * The crawler is able to overwrite these rules
```
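Note: sitemap scanning runs inside the robots.txt crawl queue (see the crawler.php hunk below), so the three new options interact. A hedged sketch of alternative values one might set in a local config/app.php; these are illustrative assumptions, not the project defaults added above:

```php
// Illustrative config/app.php values (assumptions, not the defaults added above):
// either of the first two lines effectively disables sitemap collection,
// since it only runs while the robots.txt queue is processed.
define('CRAWL_SITEMAPS', false);                  // skip sitemap collection entirely
define('CRAWL_ROBOTS_LIMIT', 0);                  // or: disable the robots.txt queue itself
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24);  // renew robots.txt daily instead of weekly
```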
crawler.php

```diff
@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');
 require_once(__DIR__ . '/../library/ftp.php');
 require_once(__DIR__ . '/../library/curl.php');
 require_once(__DIR__ . '/../library/robots.php');
+require_once(__DIR__ . '/../library/sitemap.php');
 require_once(__DIR__ . '/../library/filter.php');
 require_once(__DIR__ . '/../library/parser.php');
 require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
   }
 }
 
+// Process robots crawl queue
+foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+
+  // Build web root URL
+  $hostURL = $host->scheme . '://' .
+             $host->name .
+            ($host->port ? ':' . $host->port : '');
+
+  // Get robots.txt
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+  // Update curl stats
+  $httpRequestsTotal++;
+  $httpRequestsSizeTotal += $curl->getSizeRequest();
+  $httpDownloadSizeTotal += $curl->getSizeDownload();
+  $httpRequestsTimeTotal += $curl->getTotalTime();
+
+  // Take the downloaded robots.txt when available, otherwise keep the stored copy
+  if (200 == $curl->getCode()) {
+
+    $hostRobots = $curl->getContent();
+
+  } else {
+
+    $hostRobots = $host->robots;
+  }
+
+  // Update host index
+  $db->updateHostRobots($host->hostId, $hostRobots, time());
+
+  // Process sitemaps when enabled
+  if (CRAWL_SITEMAPS) {
+
+    // Look for custom sitemap URL served in robots.txt
+    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+    if ($hostSitemapPath = $robots->getSitemap()) {
+
+      // Replace relative paths
+      $hostSitemapPath = trim($hostSitemapPath, '/');
+      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
+      $hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath);
+
+    // Set default path when not provided
+    } else {
+
+      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
+    }
+
+    // Init sitemap data
+    $sitemap = new Sitemap($hostSitemapPath);
+
+    // Process collected sitemap links
+    foreach ($sitemap->getLinks() as $link => $attributes) {
+
+      // Parse formatted link
+      $linkURI     = Parser::uri($link);
+      $linkHostURL = Parser::hostURL($link);
+
+      // Add host page
+      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+          $linkHostURL->string == $hostURL && // this host links only
+          $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
+         !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exist yet
+
+        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+      }
+    }
+  }
+}
+
 // Process pages crawl queue
 foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
```
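The three-line path normalization above is terse, so here is a standalone sketch of what it does for an absolute Sitemap address taken from robots.txt; the host and URL are illustrative assumptions:

```php
<?php
// Worked example of the sitemap path normalization used in crawler.php above
// (values are assumptions): a robots.txt Sitemap address matching the host's
// web root is reduced to its URI path and re-anchored to that root.
$hostURL         = 'https://example.com';
$hostSitemapPath = 'https://example.com/sitemap/index.xml'; // as announced in robots.txt

$hostSitemapPath = trim($hostSitemapPath, '/');                 // no wrapping slashes to strip here
$hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath); // '/sitemap/index.xml'
$hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath); // 'https://example.com/sitemap/index.xml'

echo $hostSitemapPath, PHP_EOL;
```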
library/mysql.php

```diff
@@ -728,6 +728,21 @@ class MySQL {
     return $query->rowCount();
   }
 
+  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT * FROM `host`
+
+                                   WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
+
+                                   ORDER BY RAND()
+
+                                   LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom, 0]);
+
+    return $query->fetchAll();
+  }
+
   public function getManifestCrawlQueue(int $limit, int $timeFrom) {
 
     $query = $this->_db->prepare('SELECT * FROM `manifest`
```
library/robots.php

```diff
@@ -2,8 +2,9 @@
 
 class Robots {
 
   private $_rule = [];
+  private $_sitemap = null;
   private $_data = null;
 
   public function __construct(mixed $data) {
 
@@ -15,6 +16,15 @@ class Robots {
 
       $row = strtolower(trim($row));
 
+      // Parse sitemap address
+      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {
+
+        if (!empty($matches[1])) {
+
+          $this->_sitemap = urldecode(trim($matches[1]));
+        }
+      }
+
       // User-agent * begin
       if (preg_match('!^user-agent:\s?\*!', $row)) {
         $read = true;
@@ -63,6 +73,7 @@ class Robots {
     return $result;
   }
 
+  /* @TODO not in use
   public function append(string $key, string $value) {
 
     if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {
@@ -75,12 +86,18 @@ class Robots {
       $this->_data .= PHP_EOL . $key . ' ' . $value;
     }
   }
+  */
 
   public function getData() {
 
     return $this->_data;
   }
 
+  public function getSitemap() {
+
+    return $this->_sitemap;
+  }
+
   private function _regex(string $string) {
 
     return str_replace(
```
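For reference, a minimal usage sketch of the new accessor; the robots.txt body and include path are assumptions. Note that rows are lowercased before the `sitemap:` match, so the returned address comes back lowercase:

```php
<?php
// Minimal sketch of Robots::getSitemap() (robots.txt body below is an example).
require_once __DIR__ . '/library/robots.php'; // path is an assumption

$robots = new Robots(
  "User-agent: *" . PHP_EOL .
  "Disallow: /private/" . PHP_EOL .
  "Sitemap: https://example.com/sitemap.xml"
);

echo $robots->getSitemap(), PHP_EOL; // https://example.com/sitemap.xml
```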
library/sitemap.php (new file, 59 lines)

```diff
@@ -0,0 +1,59 @@
+<?php
+
+class Sitemap {
+
+  private $_files = [];
+  private $_links = [];
+
+  public function __construct(string $filename) {
+
+    $this->_scanFiles($filename);
+    $this->_scanLinks();
+  }
+
+  private function _scanFiles(string $filename) {
+
+    if ($data = @simplexml_load_file($filename)) {
+
+      if (!empty($data->sitemap)) { // sitemaps index
+
+        foreach ($data->sitemap as $value) {
+
+          if (!empty($value->loc)) {
+
+            $this->_scanFiles(trim(urldecode($value->loc)));
+          }
+        }
+
+      } else if (!empty($data->url)) { // target file
+
+        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
+      }
+    }
+  }
+
+  private function _scanLinks() {
+
+    foreach ($this->_files as $filename => $attributes) {
+
+      if ($data = @simplexml_load_file($filename)) {
+
+        if (!empty($data->url)) {
+
+          foreach ($data->url as $value) {
+
+            if (!empty($value->loc)) {
+
+              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
+            }
+          }
+        }
+      }
+    }
+  }
+
+  public function getLinks() {
+
+    return $this->_links;
+  }
+}
```
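A minimal usage sketch of the class above (URL and include path are assumptions): the constructor follows `<sitemap>` index entries recursively, then `getLinks()` returns the collected page locations, with the attribute arrays left empty for now (see the `@TODO` markers):

```php
<?php
// Minimal sketch of the Sitemap class usage (URL and path are assumptions).
require_once __DIR__ . '/library/sitemap.php';

$sitemap = new Sitemap('https://example.com/sitemap.xml');

foreach ($sitemap->getLinks() as $link => $attributes) {
  echo $link, PHP_EOL; // $attributes is an empty array for now (@TODO in the class)
}
```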