From 1d7deffc4c3259639aef2c3061346ed94a18eab5 Mon Sep 17 00:00:00 2001
From: ghost
Date: Wed, 2 Aug 2023 15:43:44 +0300
Subject: [PATCH] update PR generation, delegate PR value from redirecting
pages, update method names
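
The rank of a target page is now derived from its referrers: every
external referrer adds one point, and source pages answering with a
30x code delegate their own rank to the target, so pages behind
redirects keep the weight of the redirecting URL. The block is
duplicated in cli/yggo.php and crontab/crawler.php for now (see the
@TODO notes about extracting a common method). A rough sketch of the
calculation, with variable names shortened for readability ($db is
the MySQL library instance, $hostPage the target page row):

    $hostPageRank = 0;

    // Walk the hostPageToHostPage relations pointing to the target page
    foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $referrer) {

      // Resolve the source page of the relation
      if ($hostPageSource = $db->getHostPage($referrer->hostPageIdSource)) {

        // Count external referrers only
        if ($hostPageSource->hostId != $hostPage->hostId) {

          $hostPageRank++;
        }

        // Delegate rank from redirecting sources (httpCode contains '30')
        if (false !== strpos($hostPageSource->httpCode, '30')) {

          $hostPageRank += $hostPageSource->rank;
        }
      }
    }

    // Store the recalculated value
    $db->updateHostPageRank($hostPage->hostPageId, $hostPageRank);

Library renames that go with this: getHostPage(hostId, crc32uri)
becomes findHostPageByCRC32URI(hostId, crc32uri); getHostPage(hostPageId)
now selects a single row by primary key; getHostPageToHostPageByHostPageIdTarget
and getTotalHostPageToHostPageByHostPageIdTarget become
getHostPagesToHostPageByHostPageIdTarget and
getTotalHostPagesToHostPageByHostPageIdTarget; and
getTotalExternalHostPageIdSourcesByHostPageIdTarget and
getTotalHostPagesIndexed are removed.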
---
README.md | 22 +++++++-------
cli/yggo.php | 74 ++++++++++++++++++++++++++++++++++-----------
crontab/crawler.php | 33 +++++++++++++++++---
library/mysql.php | 51 +++++--------------------------
public/explore.php | 4 +--
public/search.php | 6 ++--
6 files changed, 108 insertions(+), 82 deletions(-)
diff --git a/README.md b/README.md
index 19214df..0aeb068 100644
--- a/README.md
+++ b/README.md
@@ -198,7 +198,7 @@ GET m=SphinxQL
* [x] Index homepages and shorter URI with higher priority
* [x] Collect target location links on page redirect available
* [x] Collect referrer pages (redirects including)
-* [ ] Aliasing page URL with ending slash
+* [x] URL aliasing support in PR calculation
* [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host
* [ ] XML Feeds support
@@ -225,23 +225,23 @@ GET m=SphinxQL
##### CLI
* [x] help
-* [x] crontab
+* [x] db
+ * [x] optimize
+* [x] crontab
* [x] crawl
* [x] clean
+* [x] hostPage
+ + [x] rank
+ + [x] reindex
* [x] hostPageSnap
- + [x] repair (not tested)
- + [x] _sync DB-FS relations_
- + [x] _FTP_
- + [x] _localhost_
- + [x] _delete FS missed in the DB_
- + [x] _FTP_
- + [ ] _localhost_
+ + [x] repair
+ + [x] db
+ + [x] fs
+ + [ ] reindex
+ [ ] truncate
* [x] hostPageDom
+ [x] generate
+ [x] truncate
-* [ ] hostPage
- + [ ] add
##### Other
diff --git a/cli/yggo.php b/cli/yggo.php
index 00cdd3f..bb3b38c 100644
--- a/cli/yggo.php
+++ b/cli/yggo.php
@@ -313,35 +313,70 @@ if (!empty($argv[1])) {
break;
case 'hostPage':
- switch ($argv[2]) {
+ if (!empty($argv[2])) {
- case 'rank':
+ switch ($argv[2]) {
- if (empty($argv[3])) {
+ case 'rank':
- switch ($argv[3]) {
+ if (!empty($argv[3])) {
- case 'reindex':
+ switch ($argv[3]) {
- foreach ($db->getHosts() as $host) {
+ case 'reindex':
- foreach ($db->getHostPages($host->hostId) as $hostPage) {
+ CLI::notice(_('hostPage rank fields reindex begins...'));
- $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
- }
- }
+ foreach ($db->getHosts() as $host) {
- CLI::success(_('hostPage rank successfully updated'));
- exit;
+ foreach ($db->getHostPages($host->hostId) as $hostPage) {
- break;
- default:
+ // @TODO add common method
+
+ $hostPageRank = 0;
- CLI::danger(_('undefined action argument'));
+ // Get referrers
+ foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
+
+ // Get source page details
+ if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
+
+ // Increase PR on external referrer only
+ if ($hostPageSource->hostId != $hostPage->hostId) {
+
+ $hostPageRank++;
+ }
+
+ // Delegate page rank value from redirecting pages
+ if (false !== strpos($hostPageSource->httpCode, '30')) {
+
+ $hostPageRank += $hostPageSource->rank;
+ }
+ }
+ }
+
+ // Update registry
+ if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {
+
+ CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
+
+ } else {
+
+ # CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank));
+ }
+ }
+ }
+
+ CLI::notice(_('hostPage rank fields successfully updated!'));
+ CLI::break();
+ exit;
+
+ break;
+ }
}
- }
- break;
+ break;
+ }
}
break;
@@ -413,6 +448,7 @@ if (!empty($argv[1])) {
}
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
+ CLI::break();
exit;
break;
@@ -421,6 +457,7 @@ if (!empty($argv[1])) {
$db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated'));
+ CLI::break();
exit;
break;
@@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue');
CLI::break();
CLI::default(' hostPage ');
-CLI::default(' rank - generate hostPage.rank fields');
+CLI::default(' rank ');
+CLI::default(' reindex - reindex hostPage.rank fields');
CLI::break();
CLI::default(' hostPageSnap ');
CLI::default(' repair ');
diff --git a/crontab/crawler.php b/crontab/crawler.php
index 80441e5..b326b65 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
$linkHostURL->string == $host->hostURL && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
- !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
+ !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exist
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
}
@@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update page rank
- $db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
+ // @TODO add common method
+
+ $hostPageRank = 0;
+
+ // Get referrers
+ foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
+
+ // Get source page details
+ if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
+
+ // Increase PR on external referrer only
+ if ($hostPageSource->hostId != $queueHostPage->hostId) {
+
+ $hostPageRank++;
+ }
+
+ // Delegate page rank value from redirecting pages
+ if (false !== strpos($hostPageSource->httpCode, '30')) {
+
+ $hostPageRank += $hostPageSource->rank;
+ }
+ }
+ }
+
+ // Update registry
+ $db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank);
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
@@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
- if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+ if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
@@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
- if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+ if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
diff --git a/library/mysql.php b/library/mysql.php
index f5e1478..43e3612 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -175,37 +175,16 @@ class MySQL {
return $query->fetch()->total;
}
- public function getTotalHostPagesIndexed(int $hostId) {
+ public function getHostPage(int $hostPageId) {
- if ($this->_memcached) {
-
- if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) {
-
- return $result;
- }
- }
-
- $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`
-
- WHERE `hostId` = ?
-
- AND `httpCode` = 200
- AND `timeBanned` IS NULL
- AND `mime` IS NOT NULL');
-
- $query->execute([$hostId]);
-
- $result = $query->fetch()->total;
-
- if ($this->_memcached) {
+ $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
- $this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600);
- }
+ $query->execute([$hostPageId]);
- return $result;
+ return $query->fetch();
}
- public function getHostPage(int $hostId, int $crc32uri) {
+ public function findHostPageByCRC32URI(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
@@ -449,23 +428,7 @@ class MySQL {
return $query->rowCount();
}
- public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
-
- $query = $this->_db->prepare('SELECT COUNT(*) AS `total`
-
- FROM `hostPageToHostPage`
- JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
- JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
-
- WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
- AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');
-
- $query->execute([$hostPageIdTarget]);
-
- return $query->fetch()->total;
- }
-
- public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
+ public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
@@ -474,7 +437,7 @@ class MySQL {
return $query->fetch()->total;
}
- public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
+ public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);
diff --git a/public/explore.php b/public/explore.php
index f690640..9b02c66 100644
--- a/public/explore.php
+++ b/public/explore.php
@@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getTotalHostPageToHostPageByHostPageIdTarget($hp); ?>
+ getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>
- getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
+ getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
diff --git a/public/search.php b/public/search.php
index 6a7c8b0..da1b1ec 100644
--- a/public/search.php
+++ b/public/search.php
@@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
- !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
+ !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page does not exist
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
@@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?>
+ mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>
- getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
+ getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>