mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
update PR generation, delegate PR value from redirecting pages, update method names
This commit is contained in:
parent
bba718c901
commit
1d7deffc4c
22
README.md
22
README.md
@ -198,7 +198,7 @@ GET m=SphinxQL
|
|||||||
* [x] Index homepages and shorter URI with higher priority
|
* [x] Index homepages and shorter URI with higher priority
|
||||||
* [x] Collect target location links on page redirect available
|
* [x] Collect target location links on page redirect available
|
||||||
* [x] Collect referrer pages (redirects including)
|
* [x] Collect referrer pages (redirects including)
|
||||||
* [ ] Aliasing page URL with ending slash
|
* [x] URL aliasing support on PR calculation
|
||||||
* [ ] Host page DOM elements collecting by CSS selectors
|
* [ ] Host page DOM elements collecting by CSS selectors
|
||||||
* [ ] Custom settings for each host
|
* [ ] Custom settings for each host
|
||||||
* [ ] XML Feeds support
|
* [ ] XML Feeds support
|
||||||
@ -225,23 +225,23 @@ GET m=SphinxQL
|
|||||||
##### CLI
|
##### CLI
|
||||||
|
|
||||||
* [x] help
|
* [x] help
|
||||||
* [x] crontab
|
* [x] db
|
||||||
|
* [x] optimize
|
||||||
|
[x] crontab
|
||||||
* [x] crawl
|
* [x] crawl
|
||||||
* [x] clean
|
* [x] clean
|
||||||
|
* [x] hostPage
|
||||||
|
+ [x] rank
|
||||||
|
+ [x] reindex
|
||||||
* [x] hostPageSnap
|
* [x] hostPageSnap
|
||||||
+ [x] repair (not tested)
|
+ [x] repair
|
||||||
+ [x] _sync DB-FS relations_
|
+ [x] db
|
||||||
+ [x] _FTP_
|
+ [x] fs
|
||||||
+ [x] _localhost_
|
+ [ ] reindex
|
||||||
+ [x] _delete FS missed in the DB_
|
|
||||||
+ [x] _FTP_
|
|
||||||
+ [ ] _localhost_
|
|
||||||
+ [ ] truncate
|
+ [ ] truncate
|
||||||
* [x] hostPageDom
|
* [x] hostPageDom
|
||||||
+ [x] generate
|
+ [x] generate
|
||||||
+ [x] truncate
|
+ [x] truncate
|
||||||
* [ ] hostPage
|
|
||||||
+ [ ] add
|
|
||||||
|
|
||||||
##### Other
|
##### Other
|
||||||
|
|
||||||
|
74
cli/yggo.php
74
cli/yggo.php
@ -313,35 +313,70 @@ if (!empty($argv[1])) {
|
|||||||
break;
|
break;
|
||||||
case 'hostPage':
|
case 'hostPage':
|
||||||
|
|
||||||
switch ($argv[2]) {
|
if (!empty($argv[2])) {
|
||||||
|
|
||||||
case 'rank':
|
switch ($argv[2]) {
|
||||||
|
|
||||||
if (empty($argv[3])) {
|
case 'rank':
|
||||||
|
|
||||||
switch ($argv[3]) {
|
if (!empty($argv[3])) {
|
||||||
|
|
||||||
case 'reindex':
|
switch ($argv[3]) {
|
||||||
|
|
||||||
foreach ($db->getHosts() as $host) {
|
case 'reindex':
|
||||||
|
|
||||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
CLI::notice(_('hostPage rank fields reindex begin...'));
|
||||||
|
|
||||||
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
|
foreach ($db->getHosts() as $host) {
|
||||||
|
|
||||||
|
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||||
|
|
||||||
|
// @TODO add common method
|
||||||
|
|
||||||
|
$hostPageRank = 0;
|
||||||
|
|
||||||
|
// Get referrers
|
||||||
|
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
|
||||||
|
|
||||||
|
// Get source page details
|
||||||
|
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
|
||||||
|
|
||||||
|
// Increase PR on external referrer only
|
||||||
|
if ($hostPageSource->hostId != $hostPage->hostId) {
|
||||||
|
|
||||||
|
$hostPageRank++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delegate page rank value from redirected pages
|
||||||
|
if (false !== strpos($hostPageSource->httpCode, '30')) {
|
||||||
|
|
||||||
|
$hostPageRank += $hostPageSource->rank;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update registry
|
||||||
|
if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {
|
||||||
|
|
||||||
|
CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
# CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
CLI::success(_('hostPage rank successfully updated'));
|
CLI::notice(_('hostPage rank fields successfully updated!'));
|
||||||
exit;
|
CLI::break();
|
||||||
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
default:
|
}
|
||||||
|
|
||||||
CLI::danger(_('undefined action argument'));
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -413,6 +448,7 @@ if (!empty($argv[1])) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
|
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
|
||||||
|
CLI::break();
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -421,6 +457,7 @@ if (!empty($argv[1])) {
|
|||||||
$db->truncateHostPageDom();
|
$db->truncateHostPageDom();
|
||||||
|
|
||||||
CLI::success(_('hostPageDom table successfully truncated'));
|
CLI::success(_('hostPageDom table successfully truncated'));
|
||||||
|
CLI::break();
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue');
|
|||||||
CLI::default(' clean - execute step in cleaner queue');
|
CLI::default(' clean - execute step in cleaner queue');
|
||||||
CLI::break();
|
CLI::break();
|
||||||
CLI::default(' hostPage ');
|
CLI::default(' hostPage ');
|
||||||
CLI::default(' rank - generate hostPage.rank fields');
|
CLI::default(' rank ');
|
||||||
|
CLI::default(' reindex - reindex hostPage.rank fields');
|
||||||
CLI::break();
|
CLI::break();
|
||||||
CLI::default(' hostPageSnap ');
|
CLI::default(' hostPageSnap ');
|
||||||
CLI::default(' repair ');
|
CLI::default(' repair ');
|
||||||
|
@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
|
|||||||
$linkHostURL->string == $host->hostURL && // this host links only
|
$linkHostURL->string == $host->hostURL && // this host links only
|
||||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||||
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
|
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||||
|
|
||||||
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
||||||
}
|
}
|
||||||
@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update page rank
|
// Update page rank
|
||||||
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
|
// @TODO add common method
|
||||||
|
|
||||||
|
$hostPageRank = 0;
|
||||||
|
|
||||||
|
// Get referrers
|
||||||
|
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
|
||||||
|
|
||||||
|
// Get source page details
|
||||||
|
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
|
||||||
|
|
||||||
|
// Increase PR on external referrer only
|
||||||
|
if ($hostPageSource->hostId != $queueHostPage->hostId) {
|
||||||
|
|
||||||
|
$hostPageRank++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delegate page rank value from redirected pages
|
||||||
|
if (false !== strpos($hostPageSource->httpCode, '30')) {
|
||||||
|
|
||||||
|
$hostPageRank += $hostPageSource->rank;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update registry
|
||||||
|
$db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank);
|
||||||
|
|
||||||
// Update page index anyway, with the current time and http code
|
// Update page index anyway, with the current time and http code
|
||||||
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
||||||
@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||||
|
|
||||||
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||||
|
|
||||||
$hostPageId = $hostPage->hostPageId;
|
$hostPageId = $hostPage->hostPageId;
|
||||||
|
|
||||||
@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||||
|
|
||||||
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||||
|
|
||||||
$hostPageId = $hostPage->hostPageId;
|
$hostPageId = $hostPage->hostPageId;
|
||||||
|
|
||||||
|
@ -175,37 +175,16 @@ class MySQL {
|
|||||||
return $query->fetch()->total;
|
return $query->fetch()->total;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getTotalHostPagesIndexed(int $hostId) {
|
public function getHostPage(int $hostPageId) {
|
||||||
|
|
||||||
if ($this->_memcached) {
|
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
|
||||||
|
|
||||||
if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) {
|
$query->execute([$hostPageId]);
|
||||||
|
|
||||||
return $result;
|
return $query->fetch();
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`
|
|
||||||
|
|
||||||
WHERE `hostId` = ?
|
|
||||||
|
|
||||||
AND `httpCode` = 200
|
|
||||||
AND `timeBanned` IS NULL
|
|
||||||
AND `mime` IS NOT NULL');
|
|
||||||
|
|
||||||
$query->execute([$hostId]);
|
|
||||||
|
|
||||||
$result = $query->fetch()->total;
|
|
||||||
|
|
||||||
if ($this->_memcached) {
|
|
||||||
|
|
||||||
$this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getHostPage(int $hostId, int $crc32uri) {
|
public function findHostPageByCRC32URI(int $hostId, int $crc32uri) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
|
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
|
||||||
|
|
||||||
@ -449,23 +428,7 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
|
public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
|
|
||||||
|
|
||||||
FROM `hostPageToHostPage`
|
|
||||||
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
|
|
||||||
JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
|
|
||||||
|
|
||||||
WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
|
|
||||||
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');
|
|
||||||
|
|
||||||
$query->execute([$hostPageIdTarget]);
|
|
||||||
|
|
||||||
return $query->fetch()->total;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
||||||
|
|
||||||
@ -474,7 +437,7 @@ class MySQL {
|
|||||||
return $query->fetch()->total;
|
return $query->fetch()->total;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
|
public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);
|
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);
|
||||||
|
|
||||||
|
@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
|||||||
</p>
|
</p>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<?php $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($hp); ?>
|
<?php $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>
|
||||||
<p>
|
<p>
|
||||||
<?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
<?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||||
sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?>
|
sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?>
|
||||||
</p>
|
</p>
|
||||||
<?php if ($totalHostPageIdSources) { ?>
|
<?php if ($totalHostPageIdSources) { ?>
|
||||||
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
|
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
|
||||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||||
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
|
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
|
||||||
<p>
|
<p>
|
||||||
|
@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
|||||||
if ($hostStatus && // host enabled
|
if ($hostStatus && // host enabled
|
||||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||||
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||||
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
|
!$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||||
|
|
||||||
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||||
}
|
}
|
||||||
@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
|||||||
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
|
||||||
<?php echo _('explore'); ?>
|
<?php echo _('explore'); ?>
|
||||||
</a>
|
</a>
|
||||||
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?>
|
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>
|
||||||
<p>
|
<p>
|
||||||
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||||
@ -347,7 +347,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
|||||||
]) ?>
|
]) ?>
|
||||||
</p>
|
</p>
|
||||||
<?php $i = 1 ?>
|
<?php $i = 1 ?>
|
||||||
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
|
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
|
||||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||||
<?php $i++ ?>
|
<?php $i++ ?>
|
||||||
<p>
|
<p>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user