mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
update PR generation, delegate PR value from redirecting pages, update method names
This commit is contained in:
parent
bba718c901
commit
1d7deffc4c
22
README.md
22
README.md
@ -198,7 +198,7 @@ GET m=SphinxQL
|
||||
* [x] Index homepages and shorter URI with higher priority
|
||||
* [x] Collect target location links on page redirect available
|
||||
* [x] Collect referrer pages (redirects including)
|
||||
* [ ] Aliasing page URL with ending slash
|
||||
* [x] URL aliasing support on PR calculation
|
||||
* [ ] Host page DOM elements collecting by CSS selectors
|
||||
* [ ] Custom settings for each host
|
||||
* [ ] XML Feeds support
|
||||
@ -225,23 +225,23 @@ GET m=SphinxQL
|
||||
##### CLI
|
||||
|
||||
* [x] help
|
||||
* [x] crontab
|
||||
* [x] db
|
||||
* [x] optimize
|
||||
[x] crontab
|
||||
* [x] crawl
|
||||
* [x] clean
|
||||
* [x] hostPage
|
||||
+ [x] rank
|
||||
+ [x] reindex
|
||||
* [x] hostPageSnap
|
||||
+ [x] repair (not tested)
|
||||
+ [x] _sync DB-FS relations_
|
||||
+ [x] _FTP_
|
||||
+ [x] _localhost_
|
||||
+ [x] _delete FS missed in the DB_
|
||||
+ [x] _FTP_
|
||||
+ [ ] _localhost_
|
||||
+ [x] repair
|
||||
+ [x] db
|
||||
+ [x] fs
|
||||
+ [ ] reindex
|
||||
+ [ ] truncate
|
||||
* [x] hostPageDom
|
||||
+ [x] generate
|
||||
+ [x] truncate
|
||||
* [ ] hostPage
|
||||
+ [ ] add
|
||||
|
||||
##### Other
|
||||
|
||||
|
74
cli/yggo.php
74
cli/yggo.php
@ -313,35 +313,70 @@ if (!empty($argv[1])) {
|
||||
break;
|
||||
case 'hostPage':
|
||||
|
||||
switch ($argv[2]) {
|
||||
if (!empty($argv[2])) {
|
||||
|
||||
case 'rank':
|
||||
switch ($argv[2]) {
|
||||
|
||||
if (empty($argv[3])) {
|
||||
case 'rank':
|
||||
|
||||
switch ($argv[3]) {
|
||||
if (!empty($argv[3])) {
|
||||
|
||||
case 'reindex':
|
||||
switch ($argv[3]) {
|
||||
|
||||
foreach ($db->getHosts() as $host) {
|
||||
case 'reindex':
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
CLI::notice(_('hostPage rank fields reindex begin...'));
|
||||
|
||||
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
|
||||
foreach ($db->getHosts() as $host) {
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
// @TODO add common method
|
||||
|
||||
$hostPageRank = 0;
|
||||
|
||||
// Get referrers
|
||||
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
|
||||
|
||||
// Get source page details
|
||||
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
|
||||
|
||||
// Increase PR on external referrer only
|
||||
if ($hostPageSource->hostId != $hostPage->hostId) {
|
||||
|
||||
$hostPageRank++;
|
||||
}
|
||||
|
||||
// Delegate page rank value from redirected pages
|
||||
if (false !== strpos($hostPageSource->httpCode, '30')) {
|
||||
|
||||
$hostPageRank += $hostPageSource->rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update registry
|
||||
if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {
|
||||
|
||||
CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
|
||||
|
||||
} else {
|
||||
|
||||
# CLI::success(sprintf(_('keep hostPage #%s rank %s;'), $hostPage->hostPageId, $hostPageRank));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLI::success(_('hostPage rank successfully updated'));
|
||||
exit;
|
||||
CLI::notice(_('hostPage rank fields successfully updated!'));
|
||||
CLI::break();
|
||||
exit;
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
CLI::danger(_('undefined action argument'));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
@ -413,6 +448,7 @@ if (!empty($argv[1])) {
|
||||
}
|
||||
|
||||
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
|
||||
CLI::break();
|
||||
exit;
|
||||
|
||||
break;
|
||||
@ -421,6 +457,7 @@ if (!empty($argv[1])) {
|
||||
$db->truncateHostPageDom();
|
||||
|
||||
CLI::success(_('hostPageDom table successfully truncated'));
|
||||
CLI::break();
|
||||
exit;
|
||||
|
||||
break;
|
||||
@ -450,7 +487,8 @@ CLI::default(' crawl - execute step in crawler queue');
|
||||
CLI::default(' clean - execute step in cleaner queue');
|
||||
CLI::break();
|
||||
CLI::default(' hostPage ');
|
||||
CLI::default(' rank - generate hostPage.rank fields');
|
||||
CLI::default(' rank ');
|
||||
CLI::default(' reindex - reindex hostPage.rank fields');
|
||||
CLI::break();
|
||||
CLI::default(' hostPageSnap ');
|
||||
CLI::default(' repair ');
|
||||
|
@ -332,7 +332,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
|
||||
$linkHostURL->string == $host->hostURL && // this host links only
|
||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||
|
||||
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
||||
}
|
||||
@ -357,7 +357,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
// Update page rank
|
||||
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
|
||||
// @TODO add common method
|
||||
|
||||
$hostPageRank = 0;
|
||||
|
||||
// Get referrers
|
||||
foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
|
||||
|
||||
// Get source page details
|
||||
if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
|
||||
|
||||
// Increase PR on external referrer only
|
||||
if ($hostPageSource->hostId != $queueHostPage->hostId) {
|
||||
|
||||
$hostPageRank++;
|
||||
}
|
||||
|
||||
// Delegate page rank value from redirected pages
|
||||
if (false !== strpos($hostPageSource->httpCode, '30')) {
|
||||
|
||||
$hostPageRank += $hostPageSource->rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update registry
|
||||
$db->updateHostPageRank($queueHostPage->hostPageId, $hostPageRank);
|
||||
|
||||
// Update page index anyway, with the current time and http code
|
||||
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
||||
@ -475,7 +500,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||
|
||||
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
||||
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||
|
||||
$hostPageId = $hostPage->hostPageId;
|
||||
|
||||
@ -1139,7 +1164,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||
|
||||
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
||||
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||
|
||||
$hostPageId = $hostPage->hostPageId;
|
||||
|
||||
|
@ -175,37 +175,16 @@ class MySQL {
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function getTotalHostPagesIndexed(int $hostId) {
|
||||
public function getHostPage(int $hostPageId) {
|
||||
|
||||
if ($this->_memcached) {
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
|
||||
|
||||
if ($result = $this->_memcached->get(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId))) {
|
||||
$query->execute([$hostPageId]);
|
||||
|
||||
return $result;
|
||||
}
|
||||
}
|
||||
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`
|
||||
|
||||
WHERE `hostId` = ?
|
||||
|
||||
AND `httpCode` = 200
|
||||
AND `timeBanned` IS NULL
|
||||
AND `mime` IS NOT NULL');
|
||||
|
||||
$query->execute([$hostId]);
|
||||
|
||||
$result = $query->fetch()->total;
|
||||
|
||||
if ($this->_memcached) {
|
||||
|
||||
$this->_memcached->set(sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId), $result, time() + 3600);
|
||||
}
|
||||
|
||||
return $result;
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
public function getHostPage(int $hostId, int $crc32uri) {
|
||||
public function findHostPageByCRC32URI(int $hostId, int $crc32uri) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
|
||||
|
||||
@ -449,23 +428,7 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
|
||||
|
||||
FROM `hostPageToHostPage`
|
||||
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
|
||||
JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
|
||||
|
||||
WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
|
||||
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');
|
||||
|
||||
$query->execute([$hostPageIdTarget]);
|
||||
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function getTotalHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
|
||||
public function getTotalHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
||||
|
||||
@ -474,7 +437,7 @@ class MySQL {
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function getHostPageToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
|
||||
public function getHostPagesToHostPageByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);
|
||||
|
||||
|
@ -253,14 +253,14 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
</p>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<?php $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($hp); ?>
|
||||
<?php $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($hp); ?>
|
||||
<p>
|
||||
<?php echo $totalHostPageIdSources ? Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources)]) : false ?>
|
||||
</p>
|
||||
<?php if ($totalHostPageIdSources) { ?>
|
||||
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
|
||||
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
|
||||
<p>
|
||||
|
@ -108,7 +108,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
if ($hostStatus && // host enabled
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||
!$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||
|
||||
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||
}
|
||||
@ -339,7 +339,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
|
||||
<?php echo _('explore'); ?>
|
||||
</a>
|
||||
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageToHostPageByHostPageIdTarget($result->id)) { ?>
|
||||
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPagesToHostPageByHostPageIdTarget($result->id)) { ?>
|
||||
<p>
|
||||
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
@ -347,7 +347,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
]) ?>
|
||||
</p>
|
||||
<?php $i = 1 ?>
|
||||
<?php foreach ($db->getHostPageToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
|
||||
<?php foreach ($db->getHostPagesToHostPageByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||
<?php $i++ ?>
|
||||
<p>
|
||||
|
Loading…
x
Reference in New Issue
Block a user