diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 816aae6..ef9e35f 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -49,11 +49,8 @@ try { // Get cleaner queue foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { - // Parse host info - $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false); - // Get robots.txt if exists - $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; diff --git a/crontab/crawler.php b/crontab/crawler.php index 98a2e8b..c0370a0 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -194,7 +194,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES } $hostURL = $remoteManifestHost->scheme . '://' . - $remoteManifestHost->name . + $remoteManifestHost->name . (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); // Validate formatted link @@ -267,13 +267,8 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES // Process robots crawl queue foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { - // Build web root URL - $hostURL = $host->scheme . '://' . - $host->name . - ($host->port ? ':' . $host->port : ''); - // Get robots.txt - $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; @@ -304,13 +299,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ // Replace relative paths $hostSitemapPath = trim($hostSitemapPath, '/'); - $hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath); - $hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath); + $hostSitemapPath = str_replace($host->hostURL, '', $hostSitemapPath); + $hostSitemapPath = sprintf('%s%s', $host->hostURL, $hostSitemapPath); // Set default path when not exists } else { - $hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL); + $hostSitemapPath = sprintf('%s/sitemap.xml', $host->hostURL); } // Init sitemap data @@ -325,7 +320,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ // Add host page if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format - $linkHostURL->string == $hostURL && // this host links only + $linkHostURL->string == $host->hostURL && // this host links only $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists @@ -343,11 +338,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND try { - // Build URL from the DB - $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; - // Init page request - $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; @@ -368,7 +360,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); // Try to receive target page location on page redirect available - $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true); + $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true); // Update curl stats $httpRequestsTotal++; @@ -392,10 +384,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND //Make relative links absolute if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use - $url = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.'); + $url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.'); } // Validate formatted link @@ -693,7 +682,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND sprintf('CRC32: %s', $crc32data . PHP_EOL . sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . - sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { + sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) { // Done $zip->close(); @@ -1055,10 +1044,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND //Make relative links absolute if (!parse_url($link['ref'], PHP_URL_HOST)) { - $link['ref'] = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); + $link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); } // Validate formatted link diff --git a/library/mysql.php b/library/mysql.php index 1372940..2a0a20d 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -242,7 +242,17 @@ class MySQL { `host`.`scheme`, `host`.`name`, - `host`.`port` + `host`.`port`, + + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), + CONCAT(`host`.`scheme`, '://', `host`.`name`) + ) AS `hostURL`, + + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), + CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) + ) AS `hostPageURL` FROM `hostPage` JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) @@ -294,22 +304,33 @@ class MySQL { public function getFoundHostPage(int $hostPageId) { - $query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`, + $query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`, `hostPage`.`uri`, `hostPage`.`timeAdded`, `hostPage`.`timeUpdated`, `hostPage`.`mime`, `hostPage`.`size`, + `host`.`scheme`, `host`.`name`, - `host`.`port` + `host`.`port`, - FROM `hostPage` - JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), + CONCAT(`host`.`scheme`, '://', `host`.`name`) + ) AS `hostURL`, + + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), + CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) + ) AS `hostPageURL` + + FROM `hostPage` + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) - WHERE `hostPage`.`hostPageId` = ? + WHERE `hostPage`.`hostPageId` = ? - LIMIT 1'); + LIMIT 1"); $query->execute([$hostPageId]); @@ -623,13 +644,16 @@ class MySQL { // Cleaner tools public function getCleanerQueue(int $limit, int $timeFrom) { - $query = $this->_db->prepare('SELECT * FROM `host` + $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, + CONCAT(`scheme`, '://', `name`, ':', `port`), + CONCAT(`scheme`, '://', `name`) + ) AS `hostURL` FROM `host` - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? - ORDER BY `hostId` + ORDER BY `hostId` - LIMIT ' . (int) $limit); + LIMIT " . (int) $limit); $query->execute([$timeFrom, 0]); @@ -755,25 +779,36 @@ class MySQL { $query = $this->_db->prepare("SELECT `hostPage`.`hostId`, `hostPage`.`hostPageId`, `hostPage`.`uri`, + `host`.`scheme`, `host`.`name`, `host`.`port`, `host`.`crawlPageLimit`, `host`.`crawlMetaOnly`, `host`.`robots`, - `host`.`robotsPostfix` + `host`.`robotsPostfix`, - FROM `hostPage` - JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), + CONCAT(`host`.`scheme`, '://', `host`.`name`) + ) AS `hostURL`, - WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), + CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) + ) AS `hostPageURL` - AND `host`.`status` <> ? - AND `hostPage`.`timeBanned` IS NULL + FROM `hostPage` + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + + WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) - ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() + AND `host`.`status` <> ? + AND `hostPage`.`timeBanned` IS NULL - LIMIT " . (int) $limit); + ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() + + LIMIT " . (int) $limit); $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); @@ -791,13 +826,18 @@ class MySQL { public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) { - $query = $this->_db->prepare('SELECT * FROM `host` + $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, + CONCAT(`scheme`, '://', `name`, ':', `port`), + CONCAT(`scheme`, '://', `name`) + ) AS `hostURL` - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? + FROM `host` - ORDER BY RAND() + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? - LIMIT ' . (int) $limit); + ORDER BY RAND() + + LIMIT " . (int) $limit); $query->execute([$timeFrom, 0]); diff --git a/public/explore.php b/public/explore.php index fb2143d..c69c7c3 100644 --- a/public/explore.php +++ b/public/explore.php @@ -216,9 +216,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the keywords) ?> - + - scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . urldecode($hostPage->uri)) ?> + hostURL) . urldecode($hostPage->uri)) ?>
-
- scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
+ hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
|
diff --git a/public/search.php b/public/search.php
index a4472a5..a55e39d 100644
--- a/public/search.php
+++ b/public/search.php
@@ -339,9 +339,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
keywords ?>
-
+
- scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
+ hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
|
@@ -359,9 +359,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
-
+
- scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
+ hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>