diff --git a/config/app.php.example b/config/app.php.example index 48fc761..3a24f37 100644 --- a/config/app.php.example +++ b/config/app.php.example @@ -181,23 +181,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500); */ define('CRAWL_PAGE_LIMIT', 20); -/* - * Manifest (URI) processing limit in the crawler.php queue - * - * Used to collect distributed data index - * that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION - * - * This option related to CRAWL_MANIFEST_SECONDS_OFFSET value - * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) - * - * Usually up to 20 pages per minute, - * to prevent websites overload by sending GET crawling requests - * - * Set 0 to disable - * - */ -define('CRAWL_MANIFEST_LIMIT', 10); - /* * Renew page index by timing offset provided * @@ -234,19 +217,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30); */ define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac'); -/* - * Renew manifests index by timing offset provided - * - * This option works with CRAWL_MANIFEST_LIMIT step queue - * - * Pay attention, that CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair - * must have enough value to crawl all manifests collected in the DB index - * - * or the crawler can stuck in queue - * - */ -define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30); - /* * Only URL addresses match this rule will be auto-crawled * @@ -386,17 +356,6 @@ define('CRAWL_MANIFEST', true); */ define('CRAWL_MANIFEST_API_VERSION', 0.12); -/* - * Set default auto-crawl status for new manifest added - * - * true - crawler autostart manifest indexer - * false - requires manual validation by the moderator in the DB `manifest`.`status` field - * - * This option applying on CRAWL_MANIFEST enabled - * - */ 
-define('CRAWL_MANIFEST_DEFAULT_STATUS', true); - // Cleaner settings /* diff --git a/crontab/crawler.php b/crontab/crawler.php index 4eb7d11..20eb7b1 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -46,7 +46,6 @@ $httpRequestsTimeTotal = 0; $hostPagesProcessed = 0; $manifestsProcessed = 0; -$manifestsAdded = 0; $hostPagesAdded = 0; $hostsAdded = 0; $hostPagesBanned = 0; @@ -65,14 +64,76 @@ try { exit; } -// Process manifests crawl queue -foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { +// Process robots crawl queue +foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { - $db->beginTransaction(); + // Update robots + $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); - try { + // Update curl stats + $httpRequestsTotal++; + $httpRequestsSizeTotal += $curl->getSizeRequest(); + $httpDownloadSizeTotal += $curl->getSizeDownload(); + $httpRequestsTimeTotal += $curl->getTotalTime(); + + // Sitemap provided in robots.txt + if (200 == $curl->getCode()) { + + $hostRobots = $curl->getContent(); + + } else { + + $hostRobots = $host->robots; + } + + // Update host index + $db->updateHostRobots($host->hostId, $hostRobots, time()); + + // Process sitemaps when enabled + if (CRAWL_SITEMAPS) { + + // Look for custom sitemap URL served in robots.txt + $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? 
(string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); + + if ($hostSitemapPath = $robots->getSitemap()) { + + // Replace relative paths + $hostSitemapPath = trim($hostSitemapPath, '/'); + $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath); + $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath); + + // Set default path when not exists + } else { + + $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url); + } - $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT); + // Init sitemap data + $sitemap = new Sitemap($hostSitemapPath); + + // Process collected sitemap links + foreach ($sitemap->getLinks() as $link => $attributes) { + + // Parse formatted link + $linkURI = Parser::uri($link); + $linkHostURL = Parser::hostURL($link); + + // Add host page + if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format + $linkHostURL->string == $host->url && // this host links only + $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules + $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit + !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists + + $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time()); + } + } + } + + // Update manifest if available for this host + if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) { + + $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; @@ -80,9 +141,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES $httpDownloadSizeTotal += $curl->getSizeDownload(); $httpRequestsTimeTotal += $curl->getTotalTime(); - // Update manifest index anyway, with the current time and http code - $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode()); 
- // Skip processing non 200 code if (200 != $curl->getCode()) { @@ -203,7 +261,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES } $hostURL = $remoteManifestHost->scheme . '://' . - $remoteManifestHost->name . + $remoteManifestHost->name . (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); // Validate formatted link @@ -256,87 +314,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES } } } - - // Apply changes - $db->commit(); - - // Process update errors - } catch (Exception $e) { - - // Debug std - var_dump($e); - - // Skip item - $db->rollBack(); - - continue; - } -} - -// Process robots crawl queue -foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { - - // Get robots.txt - $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - // Sitemap provided in robots.txt - if (200 == $curl->getCode()) { - - $hostRobots = $curl->getContent(); - - } else { - - $hostRobots = $host->robots; - } - - // Update host index - $db->updateHostRobots($host->hostId, $hostRobots, time()); - - // Process sitemaps when enabled - if (CRAWL_SITEMAPS) { - - // Look for custom sitemap URL served in robots.txt - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? 
(string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - if ($hostSitemapPath = $robots->getSitemap()) { - - // Replace relative paths - $hostSitemapPath = trim($hostSitemapPath, '/'); - $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath); - $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath); - - // Set default path when not exists - } else { - - $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url); - } - - // Init sitemap data - $sitemap = new Sitemap($hostSitemapPath); - - // Process collected sitemap links - foreach ($sitemap->getLinks() as $link => $attributes) { - - // Parse formatted link - $linkURI = Parser::uri($link); - $linkHostURL = Parser::hostURL($link); - - // Add host page - if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format - $linkHostURL->string == $host->url && // this host links only - $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules - $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit - !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists - - $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time()); - } - } } } @@ -720,9 +697,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND if (false !== stripos(Filter::mime($contentType), 'text/html')) { // Define variables - $metaDescription = null; - $metaKeywords = null; - $metaYggoManifest = null; + $metaDescription = null; + $metaKeywords = null; + $metaYggoManifestURL = null; // Parse page content $dom = new DomDocument(); @@ -782,7 +759,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Grab meta yggo:manifest link when available if (@$meta->getAttribute('name') == 'yggo:manifest') { - $metaYggoManifest = Filter::url(@$meta->getAttribute('content')); + 
$metaYggoManifestURL = Filter::url(@$meta->getAttribute('content')); } } @@ -835,18 +812,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND } // Update manifest registry - if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { - - $metaYggoManifestCRC32 = crc32($metaYggoManifest); + if (CRAWL_MANIFEST && + !empty($metaYggoManifestURL) && + filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) && + preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) { - if (!$db->getManifest($metaYggoManifestCRC32)) { - $db->addManifest($metaYggoManifestCRC32, - $metaYggoManifest, - (string) CRAWL_MANIFEST_DEFAULT_STATUS, - time()); - - $manifestsAdded++; - } + $manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL); } // Begin page links collection @@ -1236,7 +1207,6 @@ echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; -echo 'Manifests added: ' . $manifestsAdded . PHP_EOL; echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL; echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . 
PHP_EOL;
diff --git a/database/yggo.mwb b/database/yggo.mwb
index eb968ee..2c6cab8 100644
Binary files a/database/yggo.mwb and b/database/yggo.mwb differ
diff --git a/library/mysql.php b/library/mysql.php
index c2cbbda..c6be019 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -28,52 +28,6 @@ class MySQL {
     $this->_db->rollBack();
   }
 
-  // Manifest
-  public function getTotalManifests() {
-
-    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');
-
-    $query->execute();
-
-    return $query->fetch()->total;
-  }
-
-  public function getManifests() {
-
-    $query = $this->_db->prepare('SELECT * FROM `manifest`');
-
-    $query->execute();
-
-    return $query->fetchAll();
-  }
-
-  public function getManifest(int $crc32url) {
-
-    $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');
-
-    $query->execute([$crc32url]);
-
-    return $query->fetch();
-  }
-
-  public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {
-
-    $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');
-
-    $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);
-
-    return $this->_db->lastInsertId();
-  }
-
-  public function deleteManifest(int $manifestId) {
-
-    $query = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');
-
-    $query->execute([$manifestId]);
-
-    return $query->rowCount();
-  }
-
   // Host
   public function getAPIHosts(string $apiHostFields) {
 
@@ -175,7 +129,75 @@ class MySQL {
     return $query->rowCount();
   }
 
-  // Pages
+  // Host settings
+  /**
+   * Get a single setting value of the host
+   *
+   * @return mixed `value` column of the matching row, false when the key is not set
+   */
+  public function getHostSetting(int $hostId, mixed $key) {
+
+    $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
+
+    $query->execute([$hostId, $key]);
+
+    return ($row = $query->fetch()) ? $row->value : false;
+  }
+
+  /**
+   * Get all setting rows of the host
+   *
+   * @return mixed result of fetchAll() over `hostSetting` rows matching $hostId
+   */
+  public function getHostSettings(int $hostId) {
+
+    $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ?');
+
+    $query->execute([$hostId]);
+
+    return $query->fetchAll();
+  }
+
+  /**
+   * Add the host setting or update its value when the key already exists
+   *
+   * Relies on ON DUPLICATE KEY UPDATE, so a unique index is expected
+   * on `hostSetting` (presumably `hostId` + `key` - confirm with the DB schema)
+   *
+   * $timeAdded / $timeUpdated fall back to the current time() when not positive
+   *
+   * @return int affected rows reported by the statement
+   */
+  public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {
+
+    $query = $this->_db->prepare('INSERT INTO `hostSetting` SET `hostId` = ?,
+                                                               `key` = ?,
+                                                               `value` = ?,
+                                                               `timeAdded` = ?
+
+                                  ON DUPLICATE KEY UPDATE `value` = ?,
+                                                          `timeUpdated` = ?');
+
+    $query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]);
+
+    return $query->rowCount();
+  }
+
+  /**
+   * Delete the host setting by its `hostSettingId`
+   *
+   * @return int affected rows reported by the statement
+   */
+  public function deleteHostSetting(int $hostSettingId) {
+
+    $query = $this->_db->prepare('DELETE FROM `hostSetting` WHERE `hostSettingId` = ?');
+
+    $query->execute([$hostSettingId]);
+
+    return $query->rowCount();
+  }
+
+  // Host pages
   public function getTotalHostPages(int $hostId) {
 
     $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
@@ -696,30 +718,6 @@ class MySQL {
     return (object) $result;
   }
 
-  public function getManifestCrawlQueue(int $limit, int $timeFrom) {
-
-    $query = $this->_db->prepare('SELECT * FROM `manifest`
-
-                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
-
-                                  ORDER BY RAND()
-
-                                  LIMIT ' . (int) $limit);
-
-    $query->execute([$timeFrom, 0]);
-
-    return $query->fetchAll();
-  }
-
-  public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {
-
-    $query = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');
-
-    $query->execute([$timeUpdated, $httpCode, $manifestId]);
-
-    return $query->rowCount();
-  }
-
   public function optimize() {
 
     $this->_db->query('OPTIMIZE TABLE `host`');
@@ -730,7 +728,5 @@ class MySQL {
     $this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
     $this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
     $this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
-
-    $this->_db->query('OPTIMIZE TABLE `manifest`');
   }
 }