
implement sitemap support

Branch: main
Author: ghost (1 year ago)
Commit: 2e2501b437
6 changed files, 197 lines changed:

  1. README.md (3)
  2. config/app.php.txt (26)
  3. crontab/crawler.php (73)
  4. library/mysql.php (15)
  5. library/robots.php (21)
  6. library/sitemap.php (59)

README.md

@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media
```
php8^
php-dom
php-xml
php-pdo
php-curl
php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL
* [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host
* [ ] XML Feeds support
- + [ ] Sitemap
+ + [x] Sitemap
+ [ ] RSS
+ [ ] Atom
* [ ] Palette image index / filter

config/app.php.txt

@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);
 */
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
 * Collect the sitemap index when available
 *
 * At the moment, this works only with the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options enabled
 *
 * When a sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Renew the robots.txt index after the time offset provided
 *
 */
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);

/*
 * Limit of hosts whose robots.txt gets processed per crawler.php queue run
 *
 * Set 0 to disable
 *
 */
define('CRAWL_ROBOTS_LIMIT', 1);

/*
 * Default robots.txt rules when the remote file does not exist
 * The crawler is able to overwrite these rules
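
A condensed sketch of how these options are consumed by the crawler pass added in crontab/crawler.php below; the loop body is elided here and the names come straight from the diff:

```php
// Sketch only (condensed from the crontab/crawler.php hunk below):
// pick at most CRAWL_ROBOTS_LIMIT random hosts whose robots.txt is missing
// or older than CRAWL_ROBOTS_SECONDS_OFFSET, re-fetch robots.txt, then
// collect sitemap links only when CRAWL_SITEMAPS is enabled.
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // ... re-download robots.txt and update the host index ...

  if (CRAWL_SITEMAPS) {

    // ... resolve the sitemap URL and queue the pages it lists ...
  }
}
```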

crontab/crawler.php

@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/sitemap.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
}
}
// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // Build web root URL
  $hostURL = $host->scheme . '://' .
             $host->name .
            ($host->port ? ':' . $host->port : '');

  // Get robots.txt
  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Use the remote robots.txt when available, otherwise keep the indexed copy
  if (200 == $curl->getCode()) {

    $hostRobots = $curl->getContent();

  } else {

    $hostRobots = $host->robots;
  }

  // Update host index
  $db->updateHostRobots($host->hostId, $hostRobots, time());

  // Process sitemaps when enabled
  if (CRAWL_SITEMAPS) {

    // Look for a custom sitemap URL served in robots.txt
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL .
                         ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    if ($hostSitemapPath = $robots->getSitemap()) {

      // Normalize relative and absolute sitemap paths against the host URL
      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
      $hostSitemapPath = trim($hostSitemapPath, '/');
      $hostSitemapPath = sprintf('%s/%s', $hostURL, $hostSitemapPath);

    // Fall back to the default path when robots.txt does not provide one
    } else {

      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
    }

    // Init sitemap data
    $sitemap = new Sitemap($hostSitemapPath);

    // Process collected sitemap links
    foreach ($sitemap->getLinks() as $link => $attributes) {

      // Parse formatted link
      $linkURI     = Parser::uri($link);
      $linkHostURL = Parser::hostURL($link);

      // Add host page
      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
          $linkHostURL->string == $hostURL &&                                  // this host links only
          $robots->uriAllowed($linkURI->string) &&                             // page allowed by robots.txt rules
          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&     // host page limit not reached
          !$db->getHostPage($host->hostId, crc32($linkURI->string))) {         // page does not exist yet

        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
      }
    }
  }
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {

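The sitemap path normalization above accepts both absolute and host-relative Sitemap: values from robots.txt, and falls back to /sitemap.xml otherwise. A standalone illustration with hypothetical values (example.com is not from the commit):

```php
// Illustration only: hypothetical host and sitemap values
$hostURL = 'https://example.com';

foreach (['https://example.com/sitemap_index.xml', '/sitemap_index.xml', null] as $hostSitemapPath) {

  if ($hostSitemapPath) {

    // Same normalization as in the hunk above
    $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
    $hostSitemapPath = trim($hostSitemapPath, '/');
    $hostSitemapPath = sprintf('%s/%s', $hostURL, $hostSitemapPath);

  } else {

    $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
  }

  echo $hostSitemapPath . PHP_EOL; // https://example.com/sitemap_index.xml (twice), then https://example.com/sitemap.xml
}
```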
library/mysql.php

@@ -728,6 +728,21 @@ class MySQL {
    return $query->rowCount();
  }

  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `host`
                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?
                                  ORDER BY RAND()
                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom, 0]);

    return $query->fetchAll();
  }

  public function getManifestCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `manifest`

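For context, the new queue method is consumed once per crawler.php run (see the hunk above): $limit caps how many hosts are picked, $timeFrom is the oldest acceptable timeUpdated, and hosts with status = 0 are skipped (presumably disabled hosts). A short usage sketch:

```php
// Usage as wired up in crontab/crawler.php above: at most CRAWL_ROBOTS_LIMIT random
// hosts whose robots.txt was never fetched or is older than the offset
$hosts = $db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET);

foreach ($hosts as $host) {

  // Each row is a full `host` record; the crawler reads hostId, scheme, name, port,
  // robots and robotsPostfix from it (see the crawler.php hunk above)
}
```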
library/robots.php

@@ -2,8 +2,9 @@
class Robots {

- private $_rule = [];
- private $_data = null;
+ private $_rule = [];
+ private $_sitemap = null;
+ private $_data = null;

  public function __construct(mixed $data) {
@@ -15,6 +16,15 @@ class Robots {
      $row = strtolower(trim($row));

      // Parse sitemap address
      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {

        if (!empty($matches[1])) {

          $this->_sitemap = urldecode(trim($matches[1]));
        }
      }

      // User-agent * begin
      if (preg_match('!^user-agent:\s?\*!', $row)) {

        $read = true;
@@ -63,6 +73,7 @@ class Robots {
    return $result;
  }

  /* @TODO not in use
  public function append(string $key, string $value) {

    if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {
@@ -75,12 +86,18 @@ class Robots {
      $this->_data .= PHP_EOL . $key . ' ' . $value;
    }
  }
  */

  public function getData() {

    return $this->_data;
  }

  public function getSitemap() {

    return $this->_sitemap;
  }

  private function _regex(string $string) {

    return str_replace(

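A minimal usage sketch for the new Sitemap: parsing, with a hypothetical robots.txt body (not taken from the commit). Note that each row is lowercased before matching, so the value returned by getSitemap() comes back lowercased as well.

```php
// Hypothetical robots.txt content for illustration
$robots = new Robots(<<<EOT
User-agent: *
Disallow: /private/
Sitemap: https://example.com/sitemap_index.xml
EOT);

echo $robots->getSitemap(); // https://example.com/sitemap_index.xml
echo $robots->uriAllowed('/page-1') ? 'allowed' : 'disallowed'; // expected: allowed, /page-1 is not disallowed above
```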
library/sitemap.php

@@ -0,0 +1,59 @@
<?php

class Sitemap {

  private $_files = [];
  private $_links = [];

  public function __construct(string $filename) {

    $this->_scanFiles($filename);
    $this->_scanLinks();
  }

  private function _scanFiles(string $filename) {

    if ($data = @simplexml_load_file($filename)) {

      if (!empty($data->sitemap)) { // sitemap index

        foreach ($data->sitemap as $value) {

          if (!empty($value->loc)) {

            $this->_scanFiles(trim(urldecode($value->loc)));
          }
        }

      } else if (!empty($data->url)) { // target file

        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
      }
    }
  }

  private function _scanLinks() {

    foreach ($this->_files as $filename => $attributes) {

      if ($data = @simplexml_load_file($filename)) {

        if (!empty($data->url)) {

          foreach ($data->url as $value) {

            if (!empty($value->loc)) {

              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
            }
          }
        }
      }
    }
  }

  public function getLinks() {

    return $this->_links;
  }
}
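
A short usage sketch for the class (hypothetical URL). The constructor accepts either a sitemap index, whose <sitemap><loc> entries are scanned recursively, or a plain urlset; getLinks() returns the collected page URLs as array keys (attribute support is still a @TODO).

```php
// Hypothetical sitemap location, for illustration only
$sitemap = new Sitemap('https://example.com/sitemap.xml');

foreach ($sitemap->getLinks() as $link => $attributes) {

  echo $link . PHP_EOL; // decoded <loc> value; $attributes is an empty array for now
}
```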