Browse Source

update PR generation, delegate PR value from redirecting pages, update method names

main
ghost 1 year ago
parent
commit
1d7deffc4c
  1. 22
      README.md
  2. 52
      cli/yggo.php
  3. 33
      crontab/crawler.php
  4. 51
      library/mysql.php
  5. 4
      public/explore.php
  6. 6
      public/search.php

22
README.md

@ -198,7 +198,7 @@ GET m=SphinxQL
* [x] Index homepages and shorter URI with higher priority * [x] Index homepages and shorter URI with higher priority
* [x] Collect target location links when a page redirect is available * [x] Collect target location links when a page redirect is available
* [x] Collect referrer pages (including redirects) * [x] Collect referrer pages (including redirects)
* [ ] Aliasing page URL with ending slash * [x] URL aliasing support on PR calculation
* [ ] Host page DOM elements collecting by CSS selectors * [ ] Host page DOM elements collecting by CSS selectors
* [ ] Custom settings for each host * [ ] Custom settings for each host
* [ ] XML Feeds support * [ ] XML Feeds support
@ -225,23 +225,23 @@ GET m=SphinxQL
##### CLI ##### CLI
* [x] help * [x] help
* [x] crontab * [x] db
* [x] optimize
* [x] crontab
* [x] crawl * [x] crawl
* [x] clean * [x] clean
* [x] hostPage
+ [x] rank
+ [x] reindex
* [x] hostPageSnap * [x] hostPageSnap
+ [x] repair (not tested) + [x] repair
+ [x] _sync DB-FS relations_ + [x] db
+ [x] _FTP_ + [x] fs
+ [x] _localhost_ + [ ] reindex
+ [x] _delete FS missed in the DB_
+ [x] _FTP_
+ [ ] _localhost_
+ [ ] truncate + [ ] truncate
* [x] hostPageDom * [x] hostPageDom
+ [x] generate + [x] generate
+ [x] truncate + [x] truncate
* [ ] hostPage
+ [ ] add
##### Other ##### Other

52
cli/yggo.php

@ -313,36 +313,71 @@ if (!empty($argv[1])) {
break; break;
case 'hostPage': case 'hostPage':
if (!empty($argv[2])) {
switch ($argv[2]) { switch ($argv[2]) {
case 'rank': case 'rank':
if (empty($argv[3])) { if (!empty($argv[3])) {
switch ($argv[3]) { switch ($argv[3]) {
case 'reindex': case 'reindex':
CLI::notice(_('hostPage rank fields reindex begin...'));
foreach ($db->getHosts() as $host) { foreach ($db->getHosts() as $host) {
foreach ($db->getHostPages($host->hostId) as $hostPage) { foreach ($db->getHostPages($host->hostId) as $hostPage) {
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover // @TODO add common method
$hostPageRank = 0;
// Get referrers
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
// Get source page details
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
// Increase PR on external referrer only
if ($hostPageSource->hostId != $hostPage->hostId) {
$hostPageRank++;
}
// Delegate page rank value from redirected pages
if (false !== strpos($hostPageSource->httpCode, '30')) {
$hostPageRank += $hostPageSource->rank;
}
}
}
// Update registry
if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {
CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
} else {
# CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank));
}
} }
} }
CLI::success(_('hostPage rank successfully updated')); CLI::notice(_('hostPage rank fields successfully updated!'));
CLI::break();
exit; exit;
break; break;
default:
CLI::danger(_('undefined action argument'));
} }
} }
break; break;
} }
}
break; break;
case 'hostPageDom': case 'hostPageDom':
@ -413,6 +448,7 @@ if (!empty($argv[1])) {
} }
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file')); CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
CLI::break();
exit; exit;
break; break;
@ -421,6 +457,7 @@ if (!empty($argv[1])) {
$db->truncateHostPageDom(); $db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated')); CLI::success(_('hostPageDom table successfully truncated'));
CLI::break();
exit; exit;
break; break;
@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue'); CLI::default(' clean - execute step in cleaner queue');
CLI::break(); CLI::break();
CLI::default(' hostPage '); CLI::default(' hostPage ');
CLI::default(' rank - generate hostPage.rank fields'); CLI::default(' rank ');
CLI::default(' reindex - reindex hostPage.rank fields');
CLI::break(); CLI::break();
CLI::default(' hostPageSnap '); CLI::default(' hostPageSnap ');
CLI::default(' repair '); CLI::default(' repair ');

33
crontab/crawler.php

@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
$linkHostURL->string == $host->hostURL && // this host links only $linkHostURL->string == $host->hostURL && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time()); $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
} }
@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
// Update page rank // Update page rank
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover // @TODO add common method
$hostPageRank = 0;
// Get referrers
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
// Get source page details
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
// Increase PR on external referrer only
if ($hostPageSource->hostId != $queueHostPage->hostId) {
$hostPageRank++;
}
// Delegate page rank value from redirected pages
if (false !== strpos($hostPageSource->httpCode, '30')) {
$hostPageRank += $hostPageSource->rank;
}
}
}
// Update registry
$db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank);
// Update page index anyway, with the current time and http code // Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload()); $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) { if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId; $hostPageId = $hostPage->hostPageId;
@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) { if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId; $hostPageId = $hostPage->hostPageId;

51
library/mysql.php

@ -175,37 +175,16 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getTotalHostPagesIndexed(int $hostId) { public function getHostPage(int $hostPageId) {
if ($this->_memcached) { $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) {
return $result;
}
}
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`
WHERE `hostId` = ?
AND `httpCode` = 200
AND `timeBanned` IS NULL
AND `mime` IS NOT NULL');
$query->execute([$hostId]);
$result = $query->fetch()->total;
if ($this->_memcached) {
$this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600); $query->execute([$hostPageId]);
}
return $result; return $query->fetch();
} }
public function getHostPage(int $hostId, int $crc32uri) { public function findHostPageByCRC32URI(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
@ -449,23 +428,7 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) { public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
FROM `hostPageToHostPage`
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');
$query->execute([$hostPageIdTarget]);
return $query->fetch()->total;
}
public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?'); $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
@ -474,7 +437,7 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) { public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit); $query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);

4
public/explore.php

@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
</p> </p>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<?php $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($hp); ?> <?php $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>
<p> <p>
<?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources), <?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources), sprintf(_('%s referrers'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?> sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?>
</p> </p>
<?php if ($totalHostPageIdSources) { ?> <?php if ($totalHostPageIdSources) { ?>
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?> <?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?> <?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
<p> <p>

6
public/search.php

@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
if ($hostStatus && // host enabled if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
} }
@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>"> <a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
<?php echo _('explore'); ?> <?php echo _('explore'); ?>
</a> </a>
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?> <?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>
<p> <p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources), <?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources), sprintf(_('%s referrers'), $totalHostPageIdSources),
@ -347,7 +347,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
]) ?> ]) ?>
</p> </p>
<?php $i = 1 ?> <?php $i = 1 ?>
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?> <?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $i++ ?> <?php $i++ ?>
<p> <p>

Loading…
Cancel
Save