From 2e2501b4373979865c26b11294d5bf91504aa3ed Mon Sep 17 00:00:00 2001
From: ghost
Date: Thu, 27 Jul 2023 11:44:42 +0300
Subject: [PATCH] implement sitemap support

---
 README.md           |  3 +-
 config/app.php.txt  | 26 ++++++++++++++++
 crontab/crawler.php | 73 +++++++++++++++++++++++++++++++++++++++++++++
 library/mysql.php   | 15 ++++++++++
 library/robots.php  | 21 +++++++++++--
 library/sitemap.php | 59 ++++++++++++++++++++++++++++++++++++
 6 files changed, 194 insertions(+), 3 deletions(-)
 create mode 100644 library/sitemap.php

diff --git a/README.md b/README.md
index a6e439e..7f1c843 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ https://github.com/YGGverse/YGGo/tree/main/media
 ```
 php8^
 php-dom
+php-xml
 php-pdo
 php-curl
 php-gd
@@ -199,7 +200,7 @@ GET m=SphinxQL
 * [ ] Host page DOM elements collecting by CSS selectors
 * [ ] Custom settings for each host
 * [ ] XML Feeds support
-  + [ ] Sitemap
+  + [x] Sitemap
   + [ ] RSS
   + [ ] Atom
 * [ ] Palette image index / filter
diff --git a/config/app.php.txt b/config/app.php.txt
index 76d2588..89371f7 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -301,6 +301,32 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', true);
  */
 define('CRAWL_HOST_DEFAULT_NSFW', false);
 
+/*
+ * Collect the sitemap index when available
+ *
+ * Currently works only when the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options are enabled
+ *
+ * When robots.txt does not provide a sitemap path, the crawler falls back to the default /sitemap.xml
+ *
+ * true|false
+ *
+ */
+define('CRAWL_SITEMAPS', true);
+
+/*
+ * Renew the robots.txt index after the time offset provided, in seconds
+ *
+ */
+define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+
+/*
+ * Limit of hosts whose robots.txt gets processed per crawler.php queue run
+ *
+ * Set 0 to disable
+ *
+ */
+define('CRAWL_ROBOTS_LIMIT', 1);
+
 /*
  * Default robots.txt rules on remote file not exists
  * The crawler able to overwrite these rules
diff --git a/crontab/crawler.php b/crontab/crawler.php
index cda0dce..0ce1827 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -14,6 +14,7 @@ require_once(__DIR__ . '/../config/app.php');
 require_once(__DIR__ . '/../library/ftp.php');
 require_once(__DIR__ . '/../library/curl.php');
 require_once(__DIR__ . '/../library/robots.php');
+require_once(__DIR__ . '/../library/sitemap.php');
 require_once(__DIR__ . '/../library/filter.php');
 require_once(__DIR__ . '/../library/parser.php');
 require_once(__DIR__ . '/../library/mysql.php');
@@ -263,6 +264,78 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
   }
 }
 
+// Process robots crawl queue
+foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+
+  // Build web root URL
+  $hostURL = $host->scheme . '://' .
+             $host->name .
+            ($host->port ? ':' . $host->port : '');
+
+  // Get robots.txt
+  $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+  // Update curl stats
+  $httpRequestsTotal++;
+  $httpRequestsSizeTotal += $curl->getSizeRequest();
+  $httpDownloadSizeTotal += $curl->getSizeDownload();
+  $httpRequestsTimeTotal += $curl->getTotalTime();
+
+  // Use the remote robots.txt content when available, otherwise keep the stored copy
+  if (200 == $curl->getCode()) {
+
+    $hostRobots = $curl->getContent();
+
+  } else {
+
+    $hostRobots = $host->robots;
+  }
+
+  // Update host index
+  $db->updateHostRobots($host->hostId, $hostRobots, time());
+
+  // Process sitemaps when enabled
+  if (CRAWL_SITEMAPS) {
+
+    // Look for custom sitemap URL served in robots.txt
+    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+
+    if ($hostSitemapPath = $robots->getSitemap()) {
+
+      // Normalize the sitemap address to an absolute URL on this host
+      $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
+      $hostSitemapPath = ltrim($hostSitemapPath, '/');
+      $hostSitemapPath = sprintf('%s/%s', $hostURL, $hostSitemapPath);
+
+    // Fall back to the default path when robots.txt does not provide one
+    } else {
+
+      $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
+    }
+
+    // Init sitemap data
+    $sitemap = new Sitemap($hostSitemapPath);
+
+    // Process collected sitemap links
+    foreach ($sitemap->getLinks() as $link => $attributes) {
+
+      // Parse formatted link
+      $linkURI     = Parser::uri($link);
+      $linkHostURL = Parser::hostURL($link);
+
+      // Add host page
+      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+          $linkHostURL->string == $hostURL && // this host links only
+          $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // host page limit not reached
+          !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exist yet
+
+        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+      }
+    }
+  }
+}
+
 // Process pages crawl queue
 foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
diff --git a/library/mysql.php b/library/mysql.php
index 61e287e..643499a 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -728,6 +728,21 @@ class MySQL {
     return $query->rowCount();
   }
 
+  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT * FROM `host`
+
+                                    WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
+
+                                    ORDER BY RAND()
+
+                                    LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom, 0]);
+
+    return $query->fetchAll();
+  }
+
   public function getManifestCrawlQueue(int $limit, int $timeFrom) {
 
     $query = $this->_db->prepare('SELECT * FROM `manifest`
diff --git a/library/robots.php b/library/robots.php
index c36c593..3cbf198 100644
--- a/library/robots.php
+++ b/library/robots.php
@@ -2,8 +2,9 @@
 
 class Robots {
 
-  private $_rule = [];
-  private $_data = null;
+  private $_rule    = [];
+  private $_sitemap = null;
+  private $_data    = null;
 
   public function __construct(mixed $data) {
 
@@ -15,6 +16,15 @@ class Robots {
 
       $row = strtolower(trim($row));
 
+      // Parse sitemap address
+      if (preg_match('!^sitemap:\s?(.*)!', $row, $matches)) {
+
+        if (!empty($matches[1])) {
+
+          $this->_sitemap = urldecode(trim($matches[1]));
+        }
+      }
+
       // User-agent * begin
       if (preg_match('!^user-agent:\s?\*!', $row)) {
         $read = true;
@@ -63,6 +73,7 @@ class Robots {
     return $result;
   }
 
+  /* @TODO not in use
   public function append(string $key, string $value) {
 
     if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($this->_data)))) {
@@ -75,12 +86,18 @@ class Robots {
       $this->_data .= PHP_EOL . $key . ' ' . $value;
     }
   }
+  */
 
   public function getData() {
 
     return $this->_data;
   }
 
+  public function getSitemap() {
+
+    return $this->_sitemap;
+  }
+
   private function _regex(string $string) {
 
     return str_replace(
diff --git a/library/sitemap.php b/library/sitemap.php
new file mode 100644
index 0000000..a29ef38
--- /dev/null
+++ b/library/sitemap.php
@@ -0,0 +1,59 @@
+<?php
+
+class Sitemap {
+
+  private $_files = [];
+  private $_links = [];
+
+  public function __construct(string $filename) {
+
+    $this->_scanFiles($filename);
+    $this->_scanLinks();
+  }
+
+  private function _scanFiles(string $filename) {
+
+    if ($data = @simplexml_load_file($filename)) {
+
+      if (!empty($data->sitemap)) { // sitemaps index
+
+        foreach ($data->sitemap as $value) {
+
+          if (!empty($value->loc)) {
+
+            $this->_scanFiles(trim(urldecode($value->loc)));
+          }
+        }
+
+      } else if (!empty($data->url)) { // target file
+
+        $this->_files[trim(urldecode($filename))] = []; // @TODO attributes
+      }
+    }
+  }
+
+  private function _scanLinks() {
+
+    foreach ($this->_files as $filename => $attributes) {
+
+      if ($data = @simplexml_load_file($filename)) {
+
+        if (!empty($data->url)) {
+
+          foreach ($data->url as $value) {
+
+            if (!empty($value->loc)) {
+
+              $this->_links[trim(urldecode($value->loc))] = []; // @TODO attributes
+            }
+          }
+        }
+      }
+    }
+  }
+
+  public function getLinks() {
+
+    return $this->_links;
+  }
+}
\ No newline at end of file
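
Usage note: a minimal sketch of how the new `Robots::getSitemap()` and `Sitemap::getLinks()` calls introduced by this patch fit together outside the crawler queue. The `require_once` paths, the sample robots.txt content, and the host URL below are illustrative assumptions, not part of the patch.

```
<?php

// Illustrative paths; adjust to the real project layout
require_once __DIR__ . '/library/robots.php';
require_once __DIR__ . '/library/sitemap.php';

// Example host and robots.txt content, for demonstration only
$hostURL    = 'https://example.com';
$hostRobots = "User-agent: *\nDisallow: /private\nSitemap: https://example.com/sitemap.xml";

// Parse the rules, prefer the sitemap advertised in robots.txt,
// and fall back to the default location otherwise
$robots     = new Robots($hostRobots);
$sitemapURL = $robots->getSitemap() ?: $hostURL . '/sitemap.xml';

// Collect page links; sitemap index files are followed recursively by the class
$sitemap = new Sitemap($sitemapURL);

foreach ($sitemap->getLinks() as $link => $attributes) {

  // Keep only links that the robots.txt rules allow
  if ($robots->uriAllowed(parse_url($link, PHP_URL_PATH) ?: '/')) {

    echo $link . PHP_EOL;
  }
}
```

Since the Sitemap class loads documents with simplexml_load_file(), fetching remote sitemap URLs this way requires allow_url_fopen to be enabled in the PHP configuration.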