mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
make page rank procedural, optimize performance
This commit is contained in:
parent
4a4394fb27
commit
1dd0a8ee2c
51
cli/yggo.php
51
cli/yggo.php
@ -38,6 +38,56 @@ switch ($argv[1]) {
|
|||||||
|
|
||||||
include_once(__DIR__ . '/../crontab/cleaner.php');
|
include_once(__DIR__ . '/../crontab/cleaner.php');
|
||||||
|
|
||||||
|
break;
|
||||||
|
// hostPage command: dispatches on the action passed in $argv[2]
case 'hostPage':

  // No action given: report it and leave this case right away, otherwise the
  // nested switch below would read the undefined $argv[2] index and print a
  // second, misleading "undefined action argument" message
  if (empty($argv[2])) {

    echo PHP_EOL . _('hostPage method requires action argument') . PHP_EOL;

    break;
  }

  switch ($argv[2]) {

    case 'rank':

      // Same guard for the rank sub-action in $argv[3];
      // this break leaves switch ($argv[2]), the outer break then leaves the hostPage case
      if (empty($argv[3])) {

        echo PHP_EOL . _('hostPage rank requires action argument') . PHP_EOL;

        break;
      }

      switch ($argv[3]) {

        case 'reindex':

          // Recalculate and store the rank for every page of every host
          foreach ($db->getHosts() as $host) {

            foreach ($db->getHostPages($host->hostId) as $hostPage) {

              $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
            }
          }

          echo _('hostPage rank successfully updated') . PHP_EOL;
          exit;

        break;

        default:

          echo PHP_EOL . _('undefined action argument') . PHP_EOL;
      }

    break;

    case 'truncate':

      // NOTE(review): this action lives under the hostPage command but calls
      // truncateHostPageDom() and reports on the hostPageDom table - looks like
      // a copy of the hostPageDom truncate action; confirm it is intended here
      $db->truncateHostPageDom();

      echo _('hostPageDom table successfully truncated') . PHP_EOL;
      exit;

    break;

    default:

      echo PHP_EOL . _('undefined action argument') . PHP_EOL;
  }

break;
|
||||||
case 'hostPageDom':
|
case 'hostPageDom':
|
||||||
|
|
||||||
@ -190,6 +240,7 @@ echo PHP_EOL . _('available options:') . PHP_EOL . PHP_EOL;
|
|||||||
echo _(' help - this message') . PHP_EOL;
|
echo _(' help - this message') . PHP_EOL;
|
||||||
echo _(' crawl - execute crawler step in the crontab queue') . PHP_EOL;
|
echo _(' crawl - execute crawler step in the crontab queue') . PHP_EOL;
|
||||||
echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL;
|
echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL;
|
||||||
|
echo _(' hostPage rank reindex - generate rank indexes in hostPage table') . PHP_EOL;
|
||||||
echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL;
|
echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL;
|
||||||
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL;
|
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL;
|
||||||
echo _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables') . PHP_EOL . PHP_EOL;
|
echo _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables') . PHP_EOL . PHP_EOL;
|
||||||
|
@ -14,12 +14,9 @@ source hostPage : common
|
|||||||
sql_query = \
|
sql_query = \
|
||||||
SELECT `hostPage`.`hostPageId`, \
|
SELECT `hostPage`.`hostPageId`, \
|
||||||
`hostPage`.`uri`, \
|
`hostPage`.`uri`, \
|
||||||
|
`hostPage`.`rank`, \
|
||||||
`host`.`name`, \
|
`host`.`name`, \
|
||||||
REGEXP_REPLACE(`hostPage`.`mime`, '^([A-z-]+)/[A-z-]+.*', '$1') AS `mime`, \
|
REGEXP_REPLACE(`hostPage`.`mime`, '^([A-z-]+)/[A-z-]+.*', '$1') AS `mime`, \
|
||||||
(SELECT COUNT(*) FROM `hostPageToHostPage` \
|
|
||||||
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
|
|
||||||
AND (SELECT `hostPageSource`.`hostId` FROM `hostPage` AS `hostPageSource` \
|
|
||||||
WHERE `hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`) <> `hostPage`.`hostId`) AS `rank`, \
|
|
||||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
|
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
|
||||||
`hostPageDescription`.`description`, \
|
`hostPageDescription`.`description`, \
|
||||||
`hostPageDescription`.`keywords`)) \
|
`hostPageDescription`.`keywords`)) \
|
||||||
|
@ -355,6 +355,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
|
// Update page rank
|
||||||
|
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
|
||||||
|
|
||||||
// Update page index anyway, with the current time and http code
|
// Update page index anyway, with the current time and http code
|
||||||
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
||||||
|
|
||||||
|
Binary file not shown.
@ -89,64 +89,6 @@ class MySQL {
|
|||||||
return $query->fetchAll();
|
return $query->fetchAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Get pages ordered by rank, highest first
 *
 * The rank is calculated inside the query as the number of hostPageToHostPage
 * rows that point to the page from a page of a different host. Only pages with
 * httpCode 200, no ban time, a known mime and a host with status '1' are
 * selected, and pages with zero rank are dropped.
 *
 * When a memcached instance is configured, a truthy cached value is returned
 * instead of running the query, and a fresh result is stored under the same key.
 *
 * @return mixed cached value or the rows returned by fetchAll()
 */
public function getTopHostPages() {

  $cacheKey = 'MySQL.getTopHostPages';

  // Try the cache first
  if ($this->_memcached) {

    $cached = $this->_memcached->get($cacheKey);

    if ($cached) {

      return $cached;
    }
  }

  $sql = " SELECT

           `hostPageTarget`.`hostId`,
           `hostPageTarget`.`hostPageId`,
           `hostPageTarget`.`uri`,

           `hostTarget`.`scheme`,
           `hostTarget`.`name`,
           `hostTarget`.`port`,

           (

             SELECT COUNT(*)

             FROM `hostPageToHostPage`
             JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)

             WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPageTarget`.`hostPageId`
             AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`

           ) AS `rank`

           FROM `hostPage` AS `hostPageTarget`
           JOIN `host` AS `hostTarget` ON (`hostTarget`.`hostId` = `hostPageTarget`.`hostId`)

           WHERE `hostTarget`.`status` = '1'
           AND `hostPageTarget`.`httpCode` = 200
           AND `hostPageTarget`.`timeBanned` IS NULL
           AND `hostPageTarget`.`mime` IS NOT NULL

           GROUP BY `hostPageTarget`.`hostPageId`

           HAVING `rank` > 0

           ORDER BY `rank` DESC

         ";

  $rows = $this->_db->query($sql)->fetchAll();

  // Remember the result for the following calls
  if ($this->_memcached) {

    $this->_memcached->set($cacheKey, $rows, time() + 3600);
  }

  return $rows;
}
|
|
||||||
|
|
||||||
public function getHosts() {
|
public function getHosts() {
|
||||||
|
|
||||||
$query = $this->_db->query('SELECT * FROM `host`');
|
$query = $this->_db->query('SELECT * FROM `host`');
|
||||||
@ -300,6 +242,50 @@ class MySQL {
|
|||||||
return $query->fetchAll();
|
return $query->fetchAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get pages with a positive stored rank, highest rank first
 *
 * Reads the rank column of hostPage; only pages with httpCode 200, no ban
 * time, a known mime and a host with status '1' are selected.
 *
 * When a memcached instance is configured the result is cached per $limit
 * value. The cache lookup is compared against false, so an empty result set
 * stored in the cache is served from there as well instead of being treated
 * as a miss and re-queried on every call.
 *
 * @param int $limit maximum number of rows to select
 * @return mixed cached value or the rows returned by fetchAll()
 */
public function getTopHostPages(int $limit = 100) {

  $cacheKey = sprintf('MySQL.getTopHostPages.%s', $limit);

  if ($this->_memcached) {

    $result = $this->_memcached->get($cacheKey);

    // get() reports a miss with false; any other value (including an empty array) is a hit
    if (false !== $result) {

      return $result;
    }
  }

  // $limit is an integer by signature, the cast keeps the concatenation obviously safe
  $query = $this->_db->query(" SELECT

                               `hostPage`.`hostId`,
                               `hostPage`.`hostPageId`,
                               `hostPage`.`uri`,
                               `hostPage`.`rank`,

                               `host`.`scheme`,
                               `host`.`name`,
                               `host`.`port`

                               FROM `hostPage`
                               JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)

                               WHERE `host`.`status` = '1'
                               AND `hostPage`.`httpCode` = 200
                               AND `hostPage`.`rank` > 0
                               AND `hostPage`.`timeBanned` IS NULL
                               AND `hostPage`.`mime` IS NOT NULL

                               ORDER BY `rank` DESC

                               LIMIT " . (int) $limit);

  $result = $query->fetchAll();

  if ($this->_memcached) {

    $this->_memcached->set($cacheKey, $result, time() + 3600);
  }

  return $result;
}
|
||||||
|
|
||||||
public function getHostPagesByIndexed() {
|
public function getHostPagesByIndexed() {
|
||||||
|
|
||||||
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
|
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
|
||||||
@ -390,6 +376,15 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Store a rank value for a single hostPage row
 *
 * @param int $hostPageId id of the hostPage row to update
 * @param int $rank value written to the rank column
 * @return mixed value of rowCount() for the executed statement
 */
public function updateHostPageRank(int $hostPageId, int $rank) {

  $sql = 'UPDATE `hostPage` SET `rank` = ? WHERE `hostPageId` = ? LIMIT 1';

  $statement = $this->_db->prepare($sql);

  // Placeholders are positional: the rank comes first, the page id second
  $statement->execute([$rank, $hostPageId]);

  return $statement->rowCount();
}
|
||||||
|
|
||||||
public function deleteHostPage(int $hostPageId) {
|
public function deleteHostPage(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
|
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
|
||||||
@ -452,6 +447,22 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Count the links that point to a page from pages of other hosts
 *
 * Counts hostPageToHostPage rows for the given target page whose source page
 * has a hostId different from the hostId of the target page.
 *
 * @param int $hostPageIdTarget id of the page the links point to
 * @return mixed the `total` field of the fetched row
 */
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total`

                                    FROM `hostPageToHostPage`
                                    JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
                                    JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)

                                    WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
                                    AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');

  $statement->execute([$hostPageIdTarget]);

  // The row is read as an object, the same way the original accessed ->total
  $row = $statement->fetch();

  return $row->total;
}
|
||||||
|
|
||||||
public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
|
public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user