diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 87a74f0..ba57ddb 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -60,7 +60,7 @@ try { foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { // Get robots.txt if exists - $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; diff --git a/crontab/crawler.php b/crontab/crawler.php index 94cd993..db553a2 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -277,7 +277,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { // Get robots.txt - $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; @@ -308,13 +308,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ // Replace relative paths $hostSitemapPath = trim($hostSitemapPath, '/'); - $hostSitemapPath = str_replace($host->hostURL, '', $hostSitemapPath); - $hostSitemapPath = sprintf('%s%s', $host->hostURL, $hostSitemapPath); + $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath); + $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath); // Set default path when not exists } else { - $hostSitemapPath = sprintf('%s/sitemap.xml', $host->hostURL); + $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url); } // Init sitemap data @@ -329,7 +329,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ // Add host page if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format - $linkHostURL->string == $host->hostURL && // this host links only + 
$linkHostURL->string == $host->url && // this host links only $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists diff --git a/library/mysql.php b/library/mysql.php index 43e3612..2811132 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -3,18 +3,13 @@ class MySQL { private PDO $_db; - private Memcached $_memcached; - public function __construct(string $host, int $port, string $database, string $username, string $password, Memcached $memcached = null) { + public function __construct(string $host, int $port, string $database, string $username, string $password) { $this->_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); - - if ($memcached) { - $this->_memcached = $memcached; - } } // System @@ -96,7 +91,22 @@ class MySQL { return $query->fetchAll(); } - public function getHost(int $crc32url) { + public function getHost(int $hostId) { + + $query = $this->_db->prepare("SELECT *, + IF (`port` IS NOT NULL, + CONCAT(`scheme`, '://', `name`, ':', `port`), + CONCAT(`scheme`, '://', `name`) + ) AS `url` + + FROM `host` WHERE `hostId` = ? LIMIT 1"); + + $query->execute([$hostId]); + + return $query->fetch(); + } + + public function getHostByCRC32URL(int $crc32url) { $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? 
LIMIT 1'); @@ -204,53 +214,57 @@ class MySQL { public function getTopHostPages(int $limit = 100) { - if ($this->_memcached) { + // Get ID (to prevent memory over usage) + $query = $this->_db->query("SELECT `hostPage`.`hostPageId` - if ($result = $this->_memcached->get(sprintf('MySQL.getTopHostPages.%s', $limit))) { + FROM `hostPage` + JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) - return $result; - } - } + WHERE `host`.`status` = '1' + AND `hostPage`.`httpCode` = 200 + AND `hostPage`.`rank` > 0 + AND `hostPage`.`timeBanned` IS NULL + AND `hostPage`.`mime` IS NOT NULL + + ORDER BY `rank` DESC - $query = $this->_db->query(" SELECT + LIMIT " . (int) $limit); - `hostPage`.`hostId`, - `hostPage`.`hostPageId`, - `hostPage`.`uri`, - `hostPage`.`rank`, + // Get required page details + foreach ($query->fetchAll() as $top) { - `host`.`scheme`, - `host`.`name`, - `host`.`port`, + $query = $this->_db->prepare("SELECT `hostPage`.`hostId`, + `hostPage`.`hostPageId`, + `hostPage`.`uri`, + `hostPage`.`rank`, - IF (`host`.`port` IS NOT NULL, - CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), - CONCAT(`host`.`scheme`, '://', `host`.`name`) - ) AS `hostURL`, + `host`.`scheme`, + `host`.`name`, + `host`.`port`, - IF (`host`.`port` IS NOT NULL, - CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), - CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) - ) AS `hostPageURL` + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), + CONCAT(`host`.`scheme`, '://', `host`.`name`) + ) AS `hostURL`, - FROM `hostPage` - JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), + CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) + ) AS `hostPageURL` - WHERE `host`.`status` = '1' - AND `hostPage`.`httpCode` = 200 - AND `hostPage`.`rank` > 0 - AND 
`hostPage`.`timeBanned` IS NULL - AND `hostPage`.`mime` IS NOT NULL + FROM `hostPage` + JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) - ORDER BY `rank` DESC + WHERE `hostPage`.`hostPageId` = ? - LIMIT " . (int) $limit); + LIMIT 1"); - $result = $query->fetchAll(); + $query->execute([$top->hostPageId]); - if ($this->_memcached) { + if ($query->rowCount()) { - $this->_memcached->set(sprintf('MySQL.getTopHostPages.%s', $limit), $result, time() + 3600); + $result[] = $query->fetch(); + } } return $result; @@ -582,20 +596,28 @@ class MySQL { // Cleaner tools public function getCleanerQueue(int $limit, int $timeFrom) { - $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, - CONCAT(`scheme`, '://', `name`, ':', `port`), - CONCAT(`scheme`, '://', `name`) - ) AS `hostURL` FROM `host` + $result = []; + + // Get ID (to prevent memory over usage) + $query = $this->_db->prepare("SELECT `hostId` - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? + FROM `host` - ORDER BY `hostId` + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? - LIMIT " . (int) $limit); + ORDER BY `hostId` + + LIMIT " . (int) $limit); $query->execute([$timeFrom, 0]); - return $query->fetchAll(); + // Get required host details + foreach ($query->fetchAll() as $host) { + + $result[] = $this->getHost($host->hostId); + } + + return (object) $result; } public function getHostPagesBanned() { @@ -702,7 +724,13 @@ class MySQL { FROM `hostPage` JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) - WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) + WHERE ( + `hostPage`.`timeUpdated` IS NULL OR + `hostPage`.`timeUpdated` < ? OR ( + `hostPage`.`uri` = '/' AND + `hostPage`.`timeUpdated` < ? + ) + ) AND `host`.`status` <> ? 
AND `hostPage`.`timeBanned` IS NULL"); @@ -714,32 +742,22 @@ class MySQL { public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { - $query = $this->_db->prepare("SELECT `hostPage`.`hostId`, - `hostPage`.`hostPageId`, - `hostPage`.`uri`, - - `host`.`scheme`, - `host`.`name`, - `host`.`port`, - `host`.`crawlPageLimit`, - `host`.`crawlMetaOnly`, - `host`.`robots`, - `host`.`robotsPostfix`, - - IF (`host`.`port` IS NOT NULL, - CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), - CONCAT(`host`.`scheme`, '://', `host`.`name`) - ) AS `hostURL`, + $result = []; - IF (`host`.`port` IS NOT NULL, - CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), - CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) - ) AS `hostPageURL` + // Get ID (to prevent memory over usage) + $query = $this->_db->prepare("SELECT `hostPage`.`hostPageId` FROM `hostPage` JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) - WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) + WHERE ( + `hostPage`.`timeUpdated` IS NULL OR + `hostPage`.`timeUpdated` < ? + OR ( + `hostPage`.`uri` = '/' AND + `hostPage`.`timeUpdated` < ? + ) + ) AND `host`.`status` <> ? 
AND `hostPage`.`timeBanned` IS NULL @@ -750,7 +768,45 @@ class MySQL { $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); - return $query->fetchAll(); + // Get required page details + foreach ($query->fetchAll() as $queue) { + + $query = $this->_db->prepare("SELECT `hostPage`.`hostId`, + `hostPage`.`hostPageId`, + `hostPage`.`uri`, + + `host`.`scheme`, + `host`.`name`, + `host`.`port`, + `host`.`crawlPageLimit`, + `host`.`crawlMetaOnly`, + `host`.`robots`, + `host`.`robotsPostfix`, + + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), + CONCAT(`host`.`scheme`, '://', `host`.`name`) + ) AS `hostURL`, + + IF (`host`.`port` IS NOT NULL, + CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), + CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) + ) AS `hostPageURL` + + FROM `hostPage` + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + + WHERE `hostPage`.`hostPageId` = ? LIMIT 1"); + + $query->execute([$queue->hostPageId]); + + if ($query->rowCount()) { + + $result[] = $query->fetch(); + } + } + + return (object) $result; } public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) { @@ -764,22 +820,28 @@ class MySQL { public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) { - $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, - CONCAT(`scheme`, '://', `name`, ':', `port`), - CONCAT(`scheme`, '://', `name`) - ) AS `hostURL` + $result = []; + + // Get ID (to prevent memory over usage) + $query = $this->_db->prepare("SELECT `hostId` - FROM `host` + FROM `host` - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? - ORDER BY RAND() + ORDER BY RAND() - LIMIT " . (int) $limit); + LIMIT " . 
(int) $limit); $query->execute([$timeFrom, 0]); - return $query->fetchAll(); + // Get required host details + foreach ($query->fetchAll() as $host) { + + $result[] = $this->getHost($host->hostId); + } + + return (object) $result; } public function getManifestCrawlQueue(int $limit, int $timeFrom) { diff --git a/public/top.php b/public/top.php index 705c8a4..4ccabd4 100644 --- a/public/top.php +++ b/public/top.php @@ -9,12 +9,8 @@ require_once(__DIR__ . '/../library/sphinxql.php'); // Connect Sphinx search server $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); -// Connect Memcached -$memcached = new Memcached(); -$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT); - // Connect database -$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD, $memcached); +$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); // Define page basics $totalPages = $sphinx->getHostPagesTotal();