
implement hosts crawl queue, move robots, sitemaps, manifests to this task

main · ghost committed 1 year ago · commit ab6c0379c8
Changed files:

  1. config/app.php.example (15 lines changed)
  2. crontab/crawler.php (395 lines changed)
  3. library/mysql.php (12 lines changed)
  4. media/db-prototype.png (BIN)

config/app.php.example (15 lines changed)

@@ -270,7 +270,7 @@ define('CRAWL_HOST_DEFAULT_NSFW', false);
 /*
  * Collect sitemap index when available
  *
- * At this moment, works with CRAWL_ROBOTS_SECONDS_OFFSET/CRAWL_ROBOTS_LIMIT options enabled only
+ * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
  *
  * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
  *

@@ -290,18 +290,23 @@ define('CRAWL_SITEMAPS', true);
 define('CRAWL_PAGE_RANK_UPDATE', true);

 /*
- * Renew robots.txt index by timing offset provided
+ * Renew hosts index by timing offset provided
  *
  */
-define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);

 /*
- * Hosts robots.txt processing limit in the crawler.php queue
+ * Hosts processing limit in the crawler.php queue
  *
  * Set 0 to disable
  *
  */
-define('CRAWL_ROBOTS_LIMIT', 1);
+define('CRAWL_HOST_LIMIT', 1);
+
+/*
+ * Crawl robots.txt
+ */
+define('CRAWL_ROBOTS', true); // true|false

 /*
  * Default robots.txt rules on remote file not exists
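The renamed options now drive host-queue selection rather than robots.txt fetching alone. Below is a minimal sketch of the renewal-window arithmetic they imply; the `hostDue()` helper is hypothetical (the real check lives inside `getHostCrawlQueue()` in library/mysql.php, further down):

    <?php

    // Sketch only: the renewal threshold these options imply. With the default
    // offset of 60*60*24*7, a host becomes due for re-crawl one week after its
    // last update; CRAWL_HOST_LIMIT caps hosts per crawler.php run (0 disables).
    $timeFrom = time() - CRAWL_HOST_SECONDS_OFFSET;

    // Hypothetical per-row eligibility test ($lastUpdated = host's timeUpdated, or null)
    function hostDue(?int $lastUpdated, int $timeFrom): bool
    {
        return null === $lastUpdated || $lastUpdated < $timeFrom;
    }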

crontab/crawler.php (395 lines changed)

@@ -44,11 +44,12 @@ $httpRequestsSizeTotal = 0;
 $httpDownloadSizeTotal = 0;
 $httpRequestsTimeTotal = 0;

+$hostsProcessed = 0;
 $hostsAdded = 0;

-$hostPagesBanned = 0;
-$hostPagesSnapAdded = 0;
 $hostPagesProcessed = 0;
+$hostPagesBanned = 0;
+$hostPagesSnapAdded = 0;
 $hostPagesAdded = 0;

 $manifestsProcessed = 0;
@@ -67,261 +68,288 @@ try {
   exit;
 }

-// Process robots crawl queue
-foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+// Process hosts crawl queue
+foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
+
+  $db->beginTransaction();
+
+  try {
+
+    // Update host crawl queue
+    $hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
+
+    // Crawl robots.txt
+    if (CRAWL_ROBOTS) {

       // Update robots
       $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

       // Update curl stats
       $httpRequestsTotal++;
       $httpRequestsSizeTotal += $curl->getSizeRequest();
       $httpDownloadSizeTotal += $curl->getSizeDownload();
       $httpRequestsTimeTotal += $curl->getTotalTime();

       // Sitemap provided in robots.txt
       if (200 == $curl->getCode()) {

         $hostRobots = $curl->getContent();

       } else {

         $hostRobots = $host->robots;
       }

       // Update host index
       $db->updateHostRobots($host->hostId, $hostRobots, time());
+    }

     // Process sitemaps when enabled
     if (CRAWL_SITEMAPS) {

       // Look for custom sitemap URL served in robots.txt
       $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

       if ($hostSitemapPath = $robots->getSitemap()) {

         // Replace relative paths
         $hostSitemapPath = trim($hostSitemapPath, '/');
         $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
         $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);

       // Set default path when not exists
       } else {

         $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
       }

       // Init sitemap data
       $sitemap = new Sitemap($hostSitemapPath);

       if ($sitemapLinks = $sitemap->getLinks()) {

         $sitemapsProcessed++;

         // Process collected sitemap links
         foreach ($sitemapLinks as $link => $attributes) {

           // Parse formatted link
           $linkURI     = Parser::uri($link);
           $linkHostURL = Parser::hostURL($link);

           // Add host page
           if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
               $linkHostURL->string == $host->url &&                                            // this host links only
               $robots->uriAllowed($linkURI->string) &&                                         // page allowed by robots.txt rules
               $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&                 // pages quantity not reached host limit
               !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) {          // page does not exists

             $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
           }
         }
       }
     }

-    // Update manifest if available for this host
+    // Update manifests
+    if (CRAWL_MANIFEST) {
+
     if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {

       $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);

       // Update curl stats
       $httpRequestsTotal++;
       $httpRequestsSizeTotal += $curl->getSizeRequest();
       $httpDownloadSizeTotal += $curl->getSizeDownload();
       $httpRequestsTimeTotal += $curl->getTotalTime();

       // Skip processing non 200 code
       if (200 != $curl->getCode()) {

         $db->commit();

         continue;
       }

       // Skip processing without returned data
       if (!$remoteManifest = $curl->getContent()) {

         $db->commit();

         continue;
       }

       // Skip processing on json encoding error
       if (!$remoteManifest = @json_decode($remoteManifest)) {

         $db->commit();

         continue;
       }

       // Skip processing on required fields missed
       if (empty($remoteManifest->status) ||
           empty($remoteManifest->result->config->crawlUrlRegexp) ||
           empty($remoteManifest->result->api->version) ||
           empty($remoteManifest->result->api->hosts)) {

         $db->commit();

         continue;
       }

       // Skip processing on API version not compatible
       if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

         $db->commit();

         continue;
       }

       // Skip processing on host API not available
       if (!$remoteManifest->result->api->hosts) {

         $db->commit();

         continue;
       }

       // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
       if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

         $db->commit();

         continue;
       }

       // Skip processing on host link does not match condition
       if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

         $db->commit();

         continue;
       }

       // Begin hosts collection
       $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);

       // Update curl stats
       $httpRequestsTotal++;
       $httpRequestsSizeTotal += $curl->getSizeRequest();
       $httpDownloadSizeTotal += $curl->getSizeDownload();
       $httpRequestsTimeTotal += $curl->getTotalTime();

       // Skip processing non 200 code
       if (200 != $curl->getCode()) {

         $db->commit();

         continue;
       }

       // Skip processing without returned data
       if (!$remoteManifestHosts = $curl->getContent()) {

         $db->commit();

         continue;
       }

       // Skip processing on json encoding error
       if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {

         $db->commit();

         continue;
       }

       // Skip processing on required fields missed
       if (empty($remoteManifestHosts->status) ||
           empty($remoteManifestHosts->result)) {

         $db->commit();

         continue;
       }

       // Begin hosts processing
       foreach ($remoteManifestHosts->result as $remoteManifestHost) {

         // Skip processing on required fields missed
         if (empty($remoteManifestHost->scheme) ||
             empty($remoteManifestHost->name)) {

           continue;
         }

         $hostURL = $remoteManifestHost->scheme . '://' .
                    $remoteManifestHost->name .
                   (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);

         // Validate formatted link
         if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

           // Host not exists
           if (!$db->getHostByCRC32URL(crc32($hostURL))) {

             // Get robots.txt if exists
             $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

             // Update curl stats
             $httpRequestsTotal++;
             $httpRequestsSizeTotal += $curl->getSizeRequest();
             $httpDownloadSizeTotal += $curl->getSizeDownload();
             $httpRequestsTimeTotal += $curl->getTotalTime();

             if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {

               $hostRobots = $curl->getContent();

             } else {

               $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
             }

             $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

             $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
             $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
             $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
             $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

             $hostId = $db->addHost( $remoteManifestHosts->result->scheme,
                                     $remoteManifestHosts->result->name,
                                     $remoteManifestHosts->result->port,
                                     crc32($hostURL),
                                     time(),
                                     null,
                                     $hostPageLimit,
                                     (string) $hostMetaOnly,
                                     (string) $hostStatus,
                                     (string) $hostNsfw,
                                     $hostRobots,
                                     $hostRobotsPostfix);

             // Add web root host page to make host visible in the crawl queue
             $db->addHostPage($hostId, crc32('/'), '/', time());

             // Increase counters
             $hostPagesAdded++;
             $hostsAdded++;
           }
         }
       }
     }
+    }
+
+    $db->commit();
+
+  // Process update errors
+  } catch (Exception $e) {
+
+    // Debug std
+    var_dump($e);
+
+    // Skip item
+    $db->rollBack();
+
+    continue;
+  }
 }
@@ -1207,20 +1235,21 @@ $executionTimeTotal = microtime(true) - $timeStart;
 $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;

 // Debug output
-echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
+echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
+echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;

 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
-echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
+echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL . PHP_EOL;

-echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
+echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL . PHP_EOL;

-echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
+echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL . PHP_EOL;

 echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
 echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
 echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
-echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
+echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL . PHP_EOL;

 echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
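Taken together, the crawler.php changes replace the old fire-and-forget robots pass with one transaction per queued host, so an exception while crawling one host rolls back only that host's work and the loop moves on. A distilled sketch of the pattern this commit introduces (robots, sitemap and manifest steps elided):

    <?php

    // Per-host transaction pattern from the new crawler loop (sketch)
    foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {

      $db->beginTransaction();

      try {

        // ... robots.txt, sitemap and manifest processing for $host ...

        $db->commit();   // persist everything collected for this host

      } catch (Exception $e) {

        var_dump($e);    // debug output, as in the diff

        $db->rollBack(); // discard partial work for this host only
        continue;        // keep the queue moving
      }
    }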

library/mysql.php (12 lines changed)

@@ -667,7 +667,7 @@ class MySQL {
     return $query->rowCount();
   }

-  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostCrawlQueue(int $limit, int $timeFrom) {

     $result = [];

@@ -693,9 +693,19 @@ class MySQL {
     return (object) $result;
   }

+  public function updateHostCrawlQueue(int $hostId, int $timeUpdated) {
+
+    $query = $this->_db->prepare('UPDATE `host` SET `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $hostId]);
+
+    return $query->rowCount();
+  }
+
   public function optimize() {

     $this->_db->query('OPTIMIZE TABLE `host`');
+    $this->_db->query('OPTIMIZE TABLE `hostSetting`');
     $this->_db->query('OPTIMIZE TABLE `hostPage`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDom`');
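The body of getHostCrawlQueue() falls outside the hunks shown above, so the following is only an assumption about its shape, inferred from the method signature, the `timeUpdated` column stamped by updateHostCrawlQueue(), and the `$timeFrom` threshold passed in from crawler.php:

    // Hypothetical sketch of the queue query, as it might appear inside the
    // MySQL class; the actual SELECT is not shown in this diff.
    public function getHostCrawlQueue(int $limit, int $timeFrom) {

      $query = $this->_db->prepare('SELECT * FROM `host`
                                     WHERE `timeUpdated` IS NULL OR `timeUpdated` < ?
                                     ORDER BY `timeUpdated` ASC
                                     LIMIT ' . (int) $limit);

      $query->execute([$timeFrom]);

      return $query->fetchAll();
    }

    // A queued host would then be stamped via the new method, e.g.:
    // $db->updateHostCrawlQueue($host->hostId, time());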

media/db-prototype.png (BIN)

Binary file not shown. Size: 162 KiB before → 144 KiB after.
