mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
make page rank procedural, optimize performance
This commit is contained in:
parent
4a4394fb27
commit
1dd0a8ee2c
51
cli/yggo.php
51
cli/yggo.php
@ -38,6 +38,56 @@ switch ($argv[1]) {
|
||||
|
||||
include_once(__DIR__ . '/../crontab/cleaner.php');
|
||||
|
||||
break;
|
||||
case 'hostPage':

  // The action argument is mandatory. Leave this case right after reporting
  // the problem, so the missing $argv[2] is never read by the switch below
  // and no second "undefined action argument" message is printed.
  if (empty($argv[2])) {

    echo PHP_EOL . _('hostPage method requires action argument') . PHP_EOL;

    break;
  }

  switch ($argv[2]) {

    case 'rank':

      // Same guard for the rank sub-action: this break leaves the
      // $argv[2] switch, after which the outer break below is reached.
      if (empty($argv[3])) {

        echo PHP_EOL . _('hostPage rank requires action argument') . PHP_EOL;

        break;
      }

      switch ($argv[3]) {

        case 'reindex':

          // Recalculate the stored rank of every page of every host:
          // one count query plus one update per page.
          foreach ($db->getHosts() as $host) {

            foreach ($db->getHostPages($host->hostId) as $hostPage) {

              $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
            }
          }

          echo _('hostPage rank successfully updated') . PHP_EOL;
          exit;

        break;

        default:

          echo PHP_EOL . _('undefined action argument') . PHP_EOL;
      }

    break;

    case 'truncate':

      // NOTE(review): this "hostPage truncate" action truncates the hostPageDom
      // table and reports it as such - looks copied from the hostPageDom
      // command; confirm it is intended before relying on it.
      $db->truncateHostPageDom();

      echo _('hostPageDom table successfully truncated') . PHP_EOL;
      exit;

    break;

    default:

      echo PHP_EOL . _('undefined action argument') . PHP_EOL;
  }

break;
|
||||
case 'hostPageDom':
|
||||
|
||||
@ -190,6 +240,7 @@ echo PHP_EOL . _('available options:') . PHP_EOL . PHP_EOL;
|
||||
// Option summary: one translated line per command, each terminated by a line
// break, with one extra blank line after the last entry.
echo implode(
  PHP_EOL,
  [
    _(' help - this message'),
    _(' crawl - execute crawler step in the crontab queue'),
    _(' clean - execute cleaner step in the crontab queue'),
    _(' hostPage rank reindex - generate rank indexes in hostPage table'),
    _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field'),
    _(' hostPageDom truncate - flush hostPageDom table'),
    _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables'),
  ]
) . PHP_EOL . PHP_EOL;
|
||||
|
@ -14,12 +14,9 @@ source hostPage : common
|
||||
sql_query = \
|
||||
SELECT `hostPage`.`hostPageId`, \
|
||||
`hostPage`.`uri`, \
|
||||
`hostPage`.`rank`, \
|
||||
`host`.`name`, \
|
||||
REGEXP_REPLACE(`hostPage`.`mime`, '^([A-z-]+)/[A-z-]+.*', '$1') AS `mime`, \
|
||||
(SELECT COUNT(*) FROM `hostPageToHostPage` \
|
||||
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
|
||||
AND (SELECT `hostPageSource`.`hostId` FROM `hostPage` AS `hostPageSource` \
|
||||
WHERE `hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`) <> `hostPage`.`hostId`) AS `rank`, \
|
||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
|
||||
`hostPageDescription`.`description`, \
|
||||
`hostPageDescription`.`keywords`)) \
|
||||
|
@ -355,6 +355,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
// Update page rank
|
||||
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
|
||||
|
||||
// Update page index anyway, with the current time and http code
|
||||
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
||||
|
||||
|
Binary file not shown.
@ -89,64 +89,6 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
/**
 * Get pages ranked by the number of links pointing to them from other hosts.
 *
 * Selects pages whose host has status '1', whose httpCode is 200, that are
 * not banned and have a MIME type. The `rank` column is the count of
 * hostPageToHostPage rows targeting the page whose source page belongs to a
 * different host. Only rows with rank > 0 are returned, highest rank first.
 *
 * When a cache client is configured, the result is stored under the key
 * 'MySQL.getTopHostPages' with an expiration of time() + 3600.
 *
 * @return mixed Rows as produced by fetchAll() (row format depends on the
 *               connection's fetch mode, configured outside this method)
 */
public function getTopHostPages() {

  $cacheKey = 'MySQL.getTopHostPages';

  if ($this->_memcached) {

    $cached = $this->_memcached->get($cacheKey);

    // Compare against false instead of testing truthiness: an empty result
    // set is a valid cached value and must not trigger the heavy query again.
    // Assumes a \Memcached-compatible client that returns false on a miss.
    if (false !== $cached) {

      return $cached;
    }
  }

  $query = $this->_db->query("SELECT

                              `hostPageTarget`.`hostId`,
                              `hostPageTarget`.`hostPageId`,
                              `hostPageTarget`.`uri`,

                              `hostTarget`.`scheme`,
                              `hostTarget`.`name`,
                              `hostTarget`.`port`,

                              (

                                SELECT COUNT(*)

                                FROM `hostPageToHostPage`
                                JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)

                                WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPageTarget`.`hostPageId`
                                AND   `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`

                              ) AS `rank`

                              FROM `hostPage` AS `hostPageTarget`
                              JOIN `host` AS `hostTarget` ON (`hostTarget`.`hostId` = `hostPageTarget`.`hostId`)

                              WHERE `hostTarget`.`status` = '1'
                              AND   `hostPageTarget`.`httpCode` = 200
                              AND   `hostPageTarget`.`timeBanned` IS NULL
                              AND   `hostPageTarget`.`mime` IS NOT NULL

                              GROUP BY `hostPageTarget`.`hostPageId`

                              HAVING `rank` > 0

                              ORDER BY `rank` DESC

                            ");

  $result = $query->fetchAll();

  if ($this->_memcached) {

    $this->_memcached->set($cacheKey, $result, time() + 3600);
  }

  return $result;
}
|
||||
|
||||
public function getHosts() {
|
||||
|
||||
$query = $this->_db->query('SELECT * FROM `host`');
|
||||
@ -300,6 +242,50 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
/**
 * Get the top pages by their stored `hostPage`.`rank` value.
 *
 * Selects pages whose host has status '1', whose httpCode is 200, whose
 * rank is greater than 0, that are not banned and have a MIME type, ordered
 * by rank descending and limited to $limit rows.
 *
 * When a cache client is configured, the result is stored per limit under
 * the key 'MySQL.getTopHostPages.<limit>' with an expiration of time() + 3600.
 *
 * @param int $limit Maximum number of rows to return
 *
 * @return mixed Rows as produced by fetchAll() (row format depends on the
 *               connection's fetch mode, configured outside this method)
 */
public function getTopHostPages(int $limit = 100) {

  $cacheKey = sprintf('MySQL.getTopHostPages.%s', $limit);

  if ($this->_memcached) {

    $cached = $this->_memcached->get($cacheKey);

    // Compare against false instead of testing truthiness: an empty result
    // set is a valid cached value and must not be queried again on each call.
    // Assumes a \Memcached-compatible client that returns false on a miss.
    if (false !== $cached) {

      return $cached;
    }
  }

  // $limit is an integer by signature; the cast keeps the concatenation
  // obviously safe for readers of the query.
  $query = $this->_db->query("SELECT

                              `hostPage`.`hostId`,
                              `hostPage`.`hostPageId`,
                              `hostPage`.`uri`,
                              `hostPage`.`rank`,

                              `host`.`scheme`,
                              `host`.`name`,
                              `host`.`port`

                              FROM `hostPage`
                              JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)

                              WHERE `host`.`status` = '1'
                              AND   `hostPage`.`httpCode` = 200
                              AND   `hostPage`.`rank` > 0
                              AND   `hostPage`.`timeBanned` IS NULL
                              AND   `hostPage`.`mime` IS NOT NULL

                              ORDER BY `rank` DESC

                              LIMIT " . (int) $limit);

  $result = $query->fetchAll();

  if ($this->_memcached) {

    $this->_memcached->set($cacheKey, $result, time() + 3600);
  }

  return $result;
}
|
||||
|
||||
public function getHostPagesByIndexed() {
|
||||
|
||||
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
|
||||
@ -390,6 +376,15 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Store a new rank value for a single page.
 *
 * @param int $hostPageId Identifier of the hostPage row to update
 * @param int $rank       Rank value to write
 *
 * @return int Row count reported by the executed statement
 */
public function updateHostPageRank(int $hostPageId, int $rank) {

  $statement = $this->_db->prepare('UPDATE `hostPage` SET `rank` = ? WHERE `hostPageId` = ? LIMIT 1');

  // Placeholder order: new rank first, then the page being updated
  $statement->execute(
    [
      $rank,
      $hostPageId
    ]
  );

  return $statement->rowCount();
}
|
||||
|
||||
public function deleteHostPage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
|
||||
@ -452,6 +447,22 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Count the links that point to a page from pages of other hosts.
 *
 * Counts hostPageToHostPage rows targeting $hostPageIdTarget whose source
 * page has a hostId different from the target page's hostId.
 *
 * The row is read as an object (->total), so this relies on the connection's
 * default fetch mode, which is configured outside this method.
 *
 * @param int $hostPageIdTarget Identifier of the target hostPage row
 *
 * @return int Number of external source links
 */
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {

  $query = $this->_db->prepare('SELECT COUNT(*) AS `total`

                                FROM `hostPageToHostPage`
                                JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
                                JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)

                                WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
                                AND   `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');

  $query->execute([$hostPageIdTarget]);

  // Depending on the PHP version and PDO configuration the count may be
  // fetched as a numeric string; normalize it so callers always get an integer.
  return (int) $query->fetch()->total;
}
|
||||
|
||||
public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
||||
|
Loading…
x
Reference in New Issue
Block a user