Browse Source

update PR generation, delegate PR value from redirecting pages, update method names

main
ghost 1 year ago
parent
commit
1d7deffc4c
  1. 22
      README.md
  2. 74
      cli/yggo.php
  3. 33
      crontab/crawler.php
  4. 51
      library/mysql.php
  5. 4
      public/explore.php
  6. 6
      public/search.php

22
README.md

@ -198,7 +198,7 @@ GET m=SphinxQL @@ -198,7 +198,7 @@ GET m=SphinxQL
* [x] Index homepages and shorter URI with higher priority
* [x] Collect target location links on page redirect available
* [x] Collect referrer pages (redirects including)
* [ ] Aliasing page URL with ending slash
* [x] URL aliasing support on PR calculation
* [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host
* [ ] XML Feeds support
@ -225,23 +225,23 @@ GET m=SphinxQL @@ -225,23 +225,23 @@ GET m=SphinxQL
##### CLI
* [x] help
* [x] crontab
* [x] db
* [x] optimize
* [x] crontab
* [x] crawl
* [x] clean
* [x] hostPage
+ [x] rank
+ [x] reindex
* [x] hostPageSnap
+ [x] repair (not tested)
+ [x] _sync DB-FS relations_
+ [x] _FTP_
+ [x] _localhost_
+ [x] _delete FS missed in the DB_
+ [x] _FTP_
+ [ ] _localhost_
+ [x] repair
+ [x] db
+ [x] fs
+ [ ] reindex
+ [ ] truncate
* [x] hostPageDom
+ [x] generate
+ [x] truncate
* [ ] hostPage
+ [ ] add
##### Other

74
cli/yggo.php

@ -313,35 +313,70 @@ if (!empty($argv[1])) { @@ -313,35 +313,70 @@ if (!empty($argv[1])) {
break;
case 'hostPage':
switch ($argv[2]) {
if (!empty($argv[2])) {
case 'rank':
switch ($argv[2]) {
if (empty($argv[3])) {
case 'rank':
switch ($argv[3]) {
if (!empty($argv[3])) {
case 'reindex':
switch ($argv[3]) {
foreach ($db->getHosts() as $host) {
case 'reindex':
foreach ($db->getHostPages($host->hostId) as $hostPage) {
CLI::notice(_('hostPage rank fields reindex begin...'));
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
}
}
foreach ($db->getHosts() as $host) {
CLI::success(_('hostPage rank successfully updated'));
exit;
foreach ($db->getHostPages($host->hostId) as $hostPage) {
break;
default:
// @TODO add common method
$hostPageRank = 0;
CLI::danger(_('undefined action argument'));
// Get referrers
// NOTE(review): the referrer list is fetched with the library's default row limit,
// so pages with more referrers than that limit are presumably under-counted — confirm.
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {

    // Get source page details
    if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {

        // Increase PR on external referrer only
        if ($hostPageSource->hostId != $hostPage->hostId) {
            $hostPageRank++;
        }

        // Delegate page rank value from redirected pages.
        // The match is anchored at offset 0 so that only 30x codes qualify: an
        // unanchored search for "30" also accepted codes such as 430 or 530.
        // The string cast keeps strpos() away from an integer or NULL haystack.
        if (0 === strpos((string) $hostPageSource->httpCode, '30')) {
            $hostPageRank += $hostPageSource->rank;
        }
    }
}

// Update registry; a truthy result is reported as a changed rank
if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {
    CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
} else {
    # CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank));
}
}
}
CLI::notice(_('hostPage rank fields successfully updated!'));
CLI::break();
exit;
break;
}
}
}
break;
break;
}
}
break;
@ -413,6 +448,7 @@ if (!empty($argv[1])) { @@ -413,6 +448,7 @@ if (!empty($argv[1])) {
}
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
CLI::break();
exit;
break;
@ -421,6 +457,7 @@ if (!empty($argv[1])) { @@ -421,6 +457,7 @@ if (!empty($argv[1])) {
$db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated'));
CLI::break();
exit;
break;
@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue'); @@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue');
CLI::break();
CLI::default(' hostPage ');
CLI::default(' rank - generate hostPage.rank fields');
CLI::default(' rank ');
CLI::default(' reindex - reindex hostPage.rank fields');
CLI::break();
CLI::default(' hostPageSnap ');
CLI::default(' repair ');

33
crontab/crawler.php

@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ @@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
$linkHostURL->string == $host->hostURL && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
}
@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update page rank
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
// @TODO add common method
// Recalculate the rank of the queued page from the pages that refer to it:
// +1 for every referrer hosted elsewhere, plus the rank of every referrer
// that answered with a 30x redirect.
$hostPageRank = 0;

// Get referrers
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {

    // Get source page details
    if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {

        // Increase PR on external referrer only
        if ($hostPageSource->hostId != $queueHostPage->hostId) {
            $hostPageRank++;
        }

        // Delegate page rank value from redirected pages.
        // The match is anchored at offset 0 so that only 30x codes qualify: an
        // unanchored search for "30" also accepted codes such as 430 or 530.
        // The string cast keeps strpos() away from an integer or NULL haystack.
        if (0 === strpos((string) $hostPageSource->httpCode, '30')) {
            $hostPageRank += $hostPageSource->rank;
        }
    }
}

// Update registry
$db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank);
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;

51
library/mysql.php

@ -175,37 +175,16 @@ class MySQL { @@ -175,37 +175,16 @@ class MySQL {
return $query->fetch()->total;
}
public function getTotalHostPagesIndexed(int $hostId) {
public function getHostPage(int $hostPageId) {
if ($this->_memcached) {
if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) {
return $result;
}
}
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`
WHERE `hostId` = ?
AND `httpCode` = 200
AND `timeBanned` IS NULL
AND `mime` IS NOT NULL');
$query->execute([$hostId]);
$result = $query->fetch()->total;
if ($this->_memcached) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
$this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600);
}
$query->execute([$hostPageId]);
return $result;
return $query->fetch();
}
public function getHostPage(int $hostId, int $crc32uri) {
public function findHostPageByCRC32URI(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
@ -449,23 +428,7 @@ class MySQL { @@ -449,23 +428,7 @@ class MySQL {
return $query->rowCount();
}
/**
 * Count the external referrers of a page.
 *
 * Counts the `hostPageToHostPage` rows that point at the given target page
 * and whose source page belongs to a different host than the target page.
 *
 * @param int $hostPageIdTarget hostPageId of the referenced (target) page
 *
 * @return mixed the `total` value of the fetched COUNT(*) row
 */
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
// `hostPage` is joined twice: once for the linking page, once for the linked page,
// so that their `hostId` values can be compared
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
FROM `hostPageToHostPage`
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');
$query->execute([$hostPageIdTarget]);
// NOTE(review): the row is read as an object, so the connection presumably uses an object fetch mode — confirm
return $query->fetch()->total;
}
public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
@ -474,7 +437,7 @@ class MySQL { @@ -474,7 +437,7 @@ class MySQL {
return $query->fetch()->total;
}
public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);

4
public/explore.php

@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
</p>
<?php } ?>
<?php } ?>
<?php $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($hp); ?>
<?php $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>
<p>
<?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?>
</p>
<?php if ($totalHostPageIdSources) { ?>
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
<p>

6
public/search.php

@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
!$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
<?php echo _('explore'); ?>
</a>
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?>
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>
<p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
@ -347,7 +347,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -347,7 +347,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
]) ?>
</p>
<?php $i = 1 ?>
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $i++ ?>
<p>

Loading…
Cancel
Save