diff --git a/config/app.php.example b/config/app.php.example
index 3a24f37..aad771b 100644
--- a/config/app.php.example
+++ b/config/app.php.example
@@ -270,7 +270,7 @@ define('CRAWL_HOST_DEFAULT_NSFW', false);
 /*
  * Collect sitemap index when available
  *
- * At this moment, works with CRAWL_ROBOTS_SECONDS_OFFSET/CRAWL_ROBOTS_LIMIT options enabled only
+ * At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled
  *
  * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
  *
@@ -290,18 +290,23 @@ define('CRAWL_SITEMAPS', true);
 define('CRAWL_PAGE_RANK_UPDATE', true);

 /*
- * Renew robots.txt index by timing offset provided
+ * Renew hosts index by the timing offset provided
  *
  */
-define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);

 /*
- * Hosts Robots.txt processing limit in the crawler.php queue
+ * Hosts processing limit in the crawler.php queue
  *
  * Set 0 to disable
  *
  */
-define('CRAWL_ROBOTS_LIMIT', 1);
+define('CRAWL_HOST_LIMIT', 1);
+
+/*
+ * Crawl robots.txt
+ */
+define('CRAWL_ROBOTS', true); // true|false

 /*
  * Default robots.txt rules on remote file not exists
diff --git a/crontab/crawler.php b/crontab/crawler.php
index a03c3aa..34c32a0 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -44,11 +44,12 @@
 $httpRequestsSizeTotal = 0;
 $httpDownloadSizeTotal = 0;
 $httpRequestsTimeTotal = 0;

+$hostsProcessed     = 0;
 $hostsAdded         = 0;
-$hostPagesBanned    = 0;
-$hostPagesSnapAdded = 0;
 $hostPagesProcessed = 0;
+$hostPagesBanned    = 0;
+$hostPagesSnapAdded = 0;
 $hostPagesAdded     = 0;

 $manifestsProcessed = 0;
@@ -67,261 +68,288 @@ try {
   exit;
 }

-// Process robots crawl queue
-foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+// Process hosts crawl queue
+foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {

-  // Update robots
-  $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+  $db->beginTransaction();

-  // Update curl stats
-  $httpRequestsTotal++;
-  $httpRequestsSizeTotal += $curl->getSizeRequest();
-  $httpDownloadSizeTotal += $curl->getSizeDownload();
-  $httpRequestsTimeTotal += $curl->getTotalTime();
+  try {

-  // Sitemap provided in robots.txt
-  if (200 == $curl->getCode()) {
+    // Update host crawl queue
+    $hostsProcessed += $db->updateHostCrawlQueue($host->hostId, time());

-    $hostRobots = $curl->getContent();
+    // Crawl robots.txt
+    if (CRAWL_ROBOTS) {

-  } else {
+      // Update robots
+      $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

-    $hostRobots = $host->robots;
-  }
+      // Update curl stats
+      $httpRequestsTotal++;
+      $httpRequestsSizeTotal += $curl->getSizeRequest();
+      $httpDownloadSizeTotal += $curl->getSizeDownload();
+      $httpRequestsTimeTotal += $curl->getTotalTime();

-  // Update host index
-  $db->updateHostRobots($host->hostId, $hostRobots, time());
+      // Sitemap provided in robots.txt
+      if (200 == $curl->getCode()) {

-  // Process sitemaps when enabled
-  if (CRAWL_SITEMAPS) {
+        $hostRobots = $curl->getContent();

-    // Look for custom sitemap URL served in robots.txt
-    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+      } else {

-    if ($hostSitemapPath = $robots->getSitemap()) {
+        $hostRobots = $host->robots;
+      }

-      // Replace relative paths
-      $hostSitemapPath = trim($hostSitemapPath, '/');
-      $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
-      $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
+      // Update host index
+      $db->updateHostRobots($host->hostId, $hostRobots, time());
+    }

-    // Set default path when not exists
-    } else {
+    // Process sitemaps when enabled
+    if (CRAWL_SITEMAPS) {

-      $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
-    }
+      // Look for custom sitemap URL served in robots.txt
+      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

-    // Init sitemap data
-    $sitemap = new Sitemap($hostSitemapPath);
+      if ($hostSitemapPath = $robots->getSitemap()) {

-    if ($sitemapLinks = $sitemap->getLinks()) {
+        // Replace relative paths
+        $hostSitemapPath = trim($hostSitemapPath, '/');
+        $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
+        $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);

-      $sitemapsProcessed++;
+      // Set default path when not exists
+      } else {

-      // Process collected sitemap links
-      foreach ($sitemapLinks as $link => $attributes) {
+        $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
+      }
+
+      // Init sitemap data
+      $sitemap = new Sitemap($hostSitemapPath);
+
+      if ($sitemapLinks = $sitemap->getLinks()) {

-        // Parse formatted link
-        $linkURI     = Parser::uri($link);
-        $linkHostURL = Parser::hostURL($link);
+        $sitemapsProcessed++;

-        // Add host page
-        if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
-            $linkHostURL->string == $host->url && // this host links only
-            $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
-            $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
-            !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
+        // Process collected sitemap links
+        foreach ($sitemapLinks as $link => $attributes) {
+
+          // Parse formatted link
+          $linkURI     = Parser::uri($link);
+          $linkHostURL = Parser::hostURL($link);

-          $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+          // Add host page
+          if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
+              $linkHostURL->string == $host->url && // this host links only
+              $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
+              $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
+              !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
+
+            $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+          }
         }
       }
     }
-  }

-  // Update manifest if available for this host
-  if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
+    // Update manifests
+    if (CRAWL_MANIFEST) {
+      if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {

-    $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
+        $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);

-    // Update curl stats
-    $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
-    $httpDownloadSizeTotal += $curl->getSizeDownload();
-    $httpRequestsTimeTotal += $curl->getTotalTime();
+        // Update curl stats
+        $httpRequestsTotal++;
+        $httpRequestsSizeTotal += $curl->getSizeRequest();
+        $httpDownloadSizeTotal += $curl->getSizeDownload();
+        $httpRequestsTimeTotal += $curl->getTotalTime();

-    // Skip processing non 200 code
-    if (200 != $curl->getCode()) {
+        // Skip processing non 200 code
+        if (200 != $curl->getCode()) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing without returned data
-    if (!$remoteManifest = $curl->getContent()) {
+        // Skip processing without returned data
+        if (!$remoteManifest = $curl->getContent()) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on json encoding error
-    if (!$remoteManifest = @json_decode($remoteManifest)) {
+        // Skip processing on json encoding error
+        if (!$remoteManifest = @json_decode($remoteManifest)) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on required fields missed
-    if (empty($remoteManifest->status) ||
-        empty($remoteManifest->result->config->crawlUrlRegexp) ||
-        empty($remoteManifest->result->api->version) ||
-        empty($remoteManifest->result->api->hosts)) {
+        // Skip processing on required fields missed
+        if (empty($remoteManifest->status) ||
+            empty($remoteManifest->result->config->crawlUrlRegexp) ||
+            empty($remoteManifest->result->api->version) ||
+            empty($remoteManifest->result->api->hosts)) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on API version not compatible
-    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
+        // Skip processing on API version not compatible
+        if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on host API not available
-    if (!$remoteManifest->result->api->hosts) {
+        // Skip processing on host API not available
+        if (!$remoteManifest->result->api->hosts) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
-    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
+        // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
+        if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on host link does not match condition
-    if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
+        // Skip processing on host link does not match condition
+        if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Begin hosts collection
-    $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
+        // Begin hosts collection
+        $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);

-    // Update curl stats
-    $httpRequestsTotal++;
-    $httpRequestsSizeTotal += $curl->getSizeRequest();
-    $httpDownloadSizeTotal += $curl->getSizeDownload();
-    $httpRequestsTimeTotal += $curl->getTotalTime();
+        // Update curl stats
+        $httpRequestsTotal++;
+        $httpRequestsSizeTotal += $curl->getSizeRequest();
+        $httpDownloadSizeTotal += $curl->getSizeDownload();
+        $httpRequestsTimeTotal += $curl->getTotalTime();

-    // Skip processing non 200 code
-    if (200 != $curl->getCode()) {
+        // Skip processing non 200 code
+        if (200 != $curl->getCode()) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing without returned data
-    if (!$remoteManifestHosts = $curl->getContent()) {
+        // Skip processing without returned data
+        if (!$remoteManifestHosts = $curl->getContent()) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on json encoding error
-    if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
+        // Skip processing on json encoding error
+        if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Skip processing on required fields missed
-    if (empty($remoteManifestHosts->status) ||
-        empty($remoteManifestHosts->result)) {
+        // Skip processing on required fields missed
+        if (empty($remoteManifestHosts->status) ||
+            empty($remoteManifestHosts->result)) {

-      $db->commit();
+          $db->commit();

-      continue;
-    }
+          continue;
+        }

-    // Begin hosts processing
-    foreach ($remoteManifestHosts->result as $remoteManifestHost) {
+        // Begin hosts processing
+        foreach ($remoteManifestHosts->result as $remoteManifestHost) {

-      // Skip processing on required fields missed
-      if (empty($remoteManifestHost->scheme) ||
-          empty($remoteManifestHost->name)) {
+          // Skip processing on required fields missed
+          if (empty($remoteManifestHost->scheme) ||
+              empty($remoteManifestHost->name)) {

-        continue;
-      }
+            continue;
+          }

-      $hostURL = $remoteManifestHost->scheme . '://' .
-                 $remoteManifestHost->name .
-                (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
+          $hostURL = $remoteManifestHost->scheme . '://' .
+                     $remoteManifestHost->name .
+                    (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);

-      // Validate formatted link
-      if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
+          // Validate formatted link
+          if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

-        // Host not exists
-        if (!$db->getHostByCRC32URL(crc32($hostURL))) {
+            // Host not exists
+            if (!$db->getHostByCRC32URL(crc32($hostURL))) {
+
+              // Get robots.txt if exists
+              $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+              // Update curl stats
+              $httpRequestsTotal++;
+              $httpRequestsSizeTotal += $curl->getSizeRequest();
+              $httpDownloadSizeTotal += $curl->getSizeDownload();
+              $httpRequestsTimeTotal += $curl->getTotalTime();

-          // Get robots.txt if exists
-          $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+              if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+                $hostRobots = $curl->getContent();
+              } else {
+                $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
+              }

-          // Update curl stats
-          $httpRequestsTotal++;
-          $httpRequestsSizeTotal += $curl->getSizeRequest();
-          $httpDownloadSizeTotal += $curl->getSizeDownload();
-          $httpRequestsTimeTotal += $curl->getTotalTime();
+              $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

-          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-            $hostRobots = $curl->getContent();
-          } else {
-            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
-          }
+              $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+              $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+              $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
+              $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

-          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+              $hostId = $db->addHost( $remoteManifestHost->scheme,
+                                      $remoteManifestHost->name,
+                                      $remoteManifestHost->port,
+                                      crc32($hostURL),
+                                      time(),
+                                      null,
+                                      $hostPageLimit,
+                                      (string) $hostMetaOnly,
+                                      (string) $hostStatus,
+                                      (string) $hostNsfw,
+                                      $hostRobots,
+                                      $hostRobotsPostfix);

-          $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
-          $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
-          $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
-          $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+              // Add web root host page to make host visible in the crawl queue
+              $db->addHostPage($hostId, crc32('/'), '/', time());

-          $hostId = $db->addHost( $remoteManifestHosts->result->scheme,
-                                  $remoteManifestHosts->result->name,
-                                  $remoteManifestHosts->result->port,
-                                  crc32($hostURL),
-                                  time(),
-                                  null,
-                                  $hostPageLimit,
-                                  (string) $hostMetaOnly,
-                                  (string) $hostStatus,
-                                  (string) $hostNsfw,
-                                  $hostRobots,
-                                  $hostRobotsPostfix);
-
-          // Add web root host page to make host visible in the crawl queue
-          $db->addHostPage($hostId, crc32('/'), '/', time());
-
-          // Increase counters
-          $hostPagesAdded++;
-          $hostsAdded++;
+              // Increase counters
+              $hostPagesAdded++;
+              $hostsAdded++;
+            }
+          }
         }
       }
     }
+
+    $db->commit();
+
+  // Process update errors
+  } catch (Exception $e) {
+
+    // Debug std
+    var_dump($e);
+
+    // Skip item
+    $db->rollBack();
+
+    continue;
   }
 }
@@ -1207,20 +1235,21 @@ $executionTimeTotal = microtime(true) - $timeStart;

 $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;

 // Debug output
-echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
+echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
+echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;

 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
-echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
+echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL . PHP_EOL;

-echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
+echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL . PHP_EOL;

-echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
+echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL . PHP_EOL;

 echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
 echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
 echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
-echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
+echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL . PHP_EOL;

 echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
diff --git a/library/mysql.php b/library/mysql.php
index c6be019..aad78d9 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -667,7 +667,7 @@ class MySQL {
     return $query->rowCount();
   }

-  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostCrawlQueue(int $limit, int $timeFrom) {

     $result = [];

@@ -693,9 +693,19 @@ class MySQL {
     return (object) $result;
   }

+  public function updateHostCrawlQueue(int $hostId, int $timeUpdated) {
+
+    $query = $this->_db->prepare('UPDATE `host` SET `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $hostId]);
+
+    return $query->rowCount();
+  }
+
   public function optimize() {

     $this->_db->query('OPTIMIZE TABLE `host`');
+    $this->_db->query('OPTIMIZE TABLE `hostSetting`');
     $this->_db->query('OPTIMIZE TABLE `hostPage`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDom`');
diff --git a/media/db-prototype.png b/media/db-prototype.png
index 4416693..e8c3b75 100644
Binary files a/media/db-prototype.png and b/media/db-prototype.png differ
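The patch renames getHostRobotsCrawlQueue() to getHostCrawlQueue() in library/mysql.php, but the SELECT body falls outside the hunk context, so only the new updateHostCrawlQueue() is visible above. A minimal sketch of how the renamed selector is expected to pair with it — the WHERE and ORDER BY clauses here are illustrative assumptions, not the project's exact query:

public function getHostCrawlQueue(int $limit, int $timeFrom) {

  // Hypothetical sketch: pick hosts whose index is older than $timeFrom,
  // mirroring the call in crontab/crawler.php:
  // $db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET)
  $query = $this->_db->prepare('SELECT * FROM `host` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `hostId` LIMIT ' . (int) $limit);

  $query->execute([$timeFrom]);

  // Assumes PDO::FETCH_OBJ default fetch mode, since the crawler reads $host->url
  return $query->fetchAll();
}

Because updateHostCrawlQueue($host->hostId, time()) runs inside the transaction, a failed host rolls its timestamp back and stays due on the next crawler.php run, while a committed host leaves the queue for another CRAWL_HOST_SECONDS_OFFSET seconds. Note also that with CRAWL_ROBOTS disabled, $hostRobots is never assigned before the CRAWL_SITEMAPS block reads it, so that path likely needs a default such as $host->robots.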