From 1d7deffc4c3259639aef2c3061346ed94a18eab5 Mon Sep 17 00:00:00 2001 From: ghost Date: Wed, 2 Aug 2023 15:43:44 +0300 Subject: [PATCH] update PR generation, delegate PR value from redirecting pages, update method names --- README.md | 22 +++++++------- cli/yggo.php | 74 ++++++++++++++++++++++++++++++++++----------- crontab/crawler.php | 33 +++++++++++++++++--- library/mysql.php | 51 +++++-------------------------- public/explore.php | 4 +-- public/search.php | 6 ++-- 6 files changed, 108 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 19214df..0aeb068 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ GET m=SphinxQL * [x] Index homepages and shorter URI with higher priority * [x] Collect target location links on page redirect available * [x] Collect referrer pages (redirects including) -* [ ] Aliasing page URL with ending slash +* [x] URL aliasing support on PR calculation * [ ] Host page DOM elements collecting by CSS selectors * [ ] Custom settings for each host * [ ] XML Feeds support @@ -225,23 +225,23 @@ GET m=SphinxQL ##### CLI * [x] help -* [x] crontab +* [x] db + * [x] optimize + [x] crontab * [x] crawl * [x] clean +* [x] hostPage + + [x] rank + + [x] reindex * [x] hostPageSnap - + [x] repair (not tested) - + [x] _sync DB-FS relations_ - + [x] _FTP_ - + [x] _localhost_ - + [x] _delete FS missed in the DB_ - + [x] _FTP_ - + [ ] _localhost_ + + [x] repair + + [x] db + + [x] fs + + [ ] reindex + [ ] truncate * [x] hostPageDom + [x] generate + [x] truncate -* [ ] hostPage - + [ ] add ##### Other diff --git a/cli/yggo.php b/cli/yggo.php index 00cdd3f..bb3b38c 100644 --- a/cli/yggo.php +++ b/cli/yggo.php @@ -313,35 +313,70 @@ if (!empty($argv[1])) { break; case 'hostPage': - switch ($argv[2]) { + if (!empty($argv[2])) { - case 'rank': + switch ($argv[2]) { - if (empty($argv[3])) { + case 'rank': - switch ($argv[3]) { + if (!empty($argv[3])) { - case 'reindex': + switch ($argv[3]) { - foreach ($db->getHosts() as $host) { + case 'reindex': - foreach ($db->getHostPages($host->hostId) as $hostPage) { + CLI::notice(_('hostPage rank fields reindex begin...')); - $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover - } - } + foreach ($db->getHosts() as $host) { - CLI::success(_('hostPage rank successfully updated')); - exit; + foreach ($db->getHostPages($host->hostId) as $hostPage) { - break; - default: + // @TODO add common method + + $hostPageRank = 0; - CLI::danger(_('undefined action argument')); + // Get referrers + foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) { + + // Get source page details + if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) { + + // Increase PR on external referrer only + if ($hostPageSource->hostId != $hostPage->hostId) { + + $hostPageRank++; + } + + // Delegate page rank value from redirected pages + if (false !== strpos($hostPageSource->httpCode, '30')) { + + $hostPageRank += $hostPageSource->rank; + } + } + } + + // Update registry + if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) { + + CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank)); + + } else { + + # CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank)); + } + } + } + + CLI::notice(_('hostPage rank fields successfully 
updated!')); + CLI::break(); + exit; + + break; + } } - } - break; + break; + } } break; @@ -413,6 +448,7 @@ if (!empty($argv[1])) { } CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file')); + CLI::break(); exit; break; @@ -421,6 +457,7 @@ if (!empty($argv[1])) { $db->truncateHostPageDom(); CLI::success(_('hostPageDom table successfully truncated')); + CLI::break(); exit; break; @@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue'); CLI::default(' clean - execute step in cleaner queue'); CLI::break(); CLI::default(' hostPage '); -CLI::default(' rank - generate hostPage.rank fields'); +CLI::default(' rank '); +CLI::default(' reindex - reindex hostPage.rank fields'); CLI::break(); CLI::default(' hostPageSnap '); CLI::default(' repair '); diff --git a/crontab/crawler.php b/crontab/crawler.php index 80441e5..b326b65 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ $linkHostURL->string == $host->hostURL && // this host links only $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit - !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists + !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time()); } @@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $httpRequestsTimeTotal += $curl->getTotalTime(); // Update page rank - $db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover + // @TODO add common method + + $hostPageRank = 0; + + // Get referrers + foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) { + + // Get source page details + if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) { + + // Increase PR on external referrer only + if ($hostPageSource->hostId != $queueHostPage->hostId) { + + $hostPageRank++; + } + + // Delegate page rank value from redirected pages + if (false !== strpos($hostPageSource->httpCode, '30')) { + + $hostPageRank += $hostPageSource->rank; + } + } + } + + // Update registry + $db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank); // Update page index anyway, with the current time and http code $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload()); @@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit - if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) { + if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { $hostPageId = $hostPage->hostPageId; @@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $hostPageLimit > $db->getTotalHostPages($hostId)) { // 
pages quantity not reached host limit - if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) { + if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { $hostPageId = $hostPage->hostPageId; diff --git a/library/mysql.php b/library/mysql.php index f5e1478..43e3612 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -175,37 +175,16 @@ class MySQL { return $query->fetch()->total; } - public function getTotalHostPagesIndexed(int $hostId) { + public function getHostPage(int $hostPageId) { - if ($this->_memcached) { - - if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) { - - return $result; - } - } - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` - - WHERE `hostId` = ? - - AND `httpCode` = 200 - AND `timeBanned` IS NULL - AND `mime` IS NOT NULL'); - - $query->execute([$hostId]); - - $result = $query->fetch()->total; - - if ($this->_memcached) { + $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1'); - $this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600); - } + $query->execute([$hostPageId]); - return $result; + return $query->fetch(); } - public function getHostPage(int $hostId, int $crc32uri) { + public function findHostPageByCRC32URI(int $hostId, int $crc32uri) { $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); @@ -449,23 +428,7 @@ class MySQL { return $query->rowCount(); } - public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` - - FROM `hostPageToHostPage` - JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`) - JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`) - - WHERE `hostPageToHostPage`.`hostPageIdTarget` = ? - AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`'); - - $query->execute([$hostPageIdTarget]); - - return $query->fetch()->total; - } - - public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) { + public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) { $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?'); @@ -474,7 +437,7 @@ class MySQL { return $query->fetch()->total; } - public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) { + public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) { $query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit); diff --git a/public/explore.php b/public/explore.php index f690640..9b02c66 100644 --- a/public/explore.php +++ b/public/explore.php @@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the

-                  getTotalHostPageToHostPageByHostPageIdTarget($hp); ?>
+                  getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>

-                  getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
+                  getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
                   getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
                   getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>

diff --git a/public/search.php b/public/search.php
index 6a7c8b0..da1b1ec 100644
--- a/public/search.php
+++ b/public/search.php
@@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
       if ($hostStatus && // host enabled
           $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
           $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
-          !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
+          !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists

          $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
      }

@@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

-                  mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?>
+                  mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>

-                  getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
+                  getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
                   getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
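
Note on the two "@TODO add common method" markers: the same rank loop is now duplicated in cli/yggo.php (hostPage rank reindex) and crontab/crawler.php. Below is a minimal sketch of how that shared logic could be factored out, assuming the methods used by this patch (getHostPagesToHostPageByHostPageIdTarget(), getHostPage(), updateHostPageRank()); the helper name calculateHostPageRank() is illustrative and not part of the patch.

<?php

// Hypothetical helper, not part of this patch: computes a hostPage rank the same
// way as the duplicated loops in cli/yggo.php and crontab/crawler.php.
function calculateHostPageRank(MySQL $db, object $hostPage) : int {

  $hostPageRank = 0;

  // Get referrers
  foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPage) {

    // Get source page details
    if ($hostPageSource = $db->getHostPage($hostPageToHostPage->hostPageIdSource)) {

      // Increase PR on external referrer only
      if ($hostPageSource->hostId != $hostPage->hostId) {

        $hostPageRank++;
      }

      // Delegate page rank value from redirecting pages (HTTP 30x), mirroring the patch
      if (false !== strpos($hostPageSource->httpCode, '30')) {

        $hostPageRank += $hostPageSource->rank;
      }
    }
  }

  return $hostPageRank;
}

// Both call sites would then reduce to:
// $db->updateHostPageRank($hostPage->hostPageId, calculateHostPageRank($db, $hostPage));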