Browse Source

make page rank procedural, optimize performance

main
ghost 1 year ago
parent
commit
1dd0a8ee2c
  1. 51
      cli/yggo.php
  2. 5
      config/sphinx.conf.txt
  3. 3
      crontab/crawler.php
  4. BIN
      database/yggo.mwb
  5. 127
      library/mysql.php

51
cli/yggo.php

@ -38,6 +38,56 @@ switch ($argv[1]) { @@ -38,6 +38,56 @@ switch ($argv[1]) {
include_once(__DIR__ . '/../crontab/cleaner.php');
break;
case 'hostPage':
  // $argv[2] selects the hostPage action. Without it there is nothing to
  // dispatch on, so report the problem and leave this case instead of
  // reading an undefined $argv[2] in the switch below.
  if (empty($argv[2])) {
    echo PHP_EOL . _('hostPage method requires action argument') . PHP_EOL;
    break;
  }
  switch ($argv[2]) {
    case 'rank':
      // Same guard for the rank sub-action passed in $argv[3]
      if (empty($argv[3])) {
        echo PHP_EOL . _('hostPage rank requires action argument') . PHP_EOL;
        break;
      }
      switch ($argv[3]) {
        case 'reindex':
          // Walk every page of every host and store the value returned by
          // getTotalExternalHostPageIdSourcesByHostPageIdTarget() as its rank
          foreach ($db->getHosts() as $host) {
            foreach ($db->getHostPages($host->hostId) as $hostPage) {
              $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
            }
          }
          echo _('hostPage rank successfully updated') . PHP_EOL;
          exit;
          break;
        default:
          echo PHP_EOL . _('undefined action argument') . PHP_EOL;
      }
      break;
    case 'truncate':
      // NOTE(review): this is the hostPage branch, yet it truncates the
      // hostPageDom table and reports hostPageDom - looks like a copy of the
      // hostPageDom truncate action; confirm whether it is intended here.
      $db->truncateHostPageDom();
      echo _('hostPageDom table successfully truncated') . PHP_EOL;
      exit;
      break;
    default:
      echo PHP_EOL . _('undefined action argument') . PHP_EOL;
  }
break;
case 'hostPageDom':
@ -190,6 +240,7 @@ echo PHP_EOL . _('available options:') . PHP_EOL . PHP_EOL; @@ -190,6 +240,7 @@ echo PHP_EOL . _('available options:') . PHP_EOL . PHP_EOL;
// One line per supported command: the command syntax followed by a short description
echo _(' help - this message') . PHP_EOL;
echo _(' crawl - execute crawler step in the crontab queue') . PHP_EOL;
echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL;
echo _(' hostPage rank reindex - generate rank indexes in hostPage table') . PHP_EOL;
echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL;
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL;
// The last entry ends with an extra PHP_EOL to leave a blank line after the list
echo _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables') . PHP_EOL . PHP_EOL;

5
config/sphinx.conf.txt

@ -14,12 +14,9 @@ source hostPage : common @@ -14,12 +14,9 @@ source hostPage : common
sql_query = \
SELECT `hostPage`.`hostPageId`, \
`hostPage`.`uri`, \
`hostPage`.`rank`, \
`host`.`name`, \
REGEXP_REPLACE(`hostPage`.`mime`, '^([A-z-]+)/[A-z-]+.*', '$1') AS `mime`, \
(SELECT COUNT(*) FROM `hostPageToHostPage` \
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
AND (SELECT `hostPageSource`.`hostId` FROM `hostPage` AS `hostPageSource` \
WHERE `hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`) <> `hostPage`.`hostId`) AS `rank`, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
`hostPageDescription`.`description`, \
`hostPageDescription`.`keywords`)) \

3
crontab/crawler.php

@ -355,6 +355,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -355,6 +355,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Add this request's download size and total time to the running totals
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update page rank: store the current number of links to this page coming from pages of other hosts
$db->updateHostPageRank($queueHostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($queueHostPage->hostPageId)); // @TODO add library cover
// Update page index anyway, with the current time and http code
// (the return value of the update is added to the processed pages counter)
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());

BIN
database/yggo.mwb

Binary file not shown.

127
library/mysql.php

@ -89,64 +89,6 @@ class MySQL { @@ -89,64 +89,6 @@ class MySQL {
return $query->fetchAll();
}
/*
 * Select pages together with their host scheme, name and port, ordered by
 * rank (highest first).
 *
 * The rank is computed inside the query as the number of hostPageToHostPage
 * rows that target the page from a source page belonging to a different host.
 * Only rows with host status '1', httpCode 200, no timeBanned value, a
 * non-null mime and a rank above zero are returned.
 *
 * When a memcached handle is configured, the result is read from / written to
 * the 'MySQL.getTopHostPages' key (written with time() + 3600 as the third
 * set() argument).
 */
public function getTopHostPages() {

  $cacheKey = 'MySQL.getTopHostPages';

  // Cached copy wins when it is available and non-empty
  if ($this->_memcached) {
    $cached = $this->_memcached->get($cacheKey);
    if ($cached) {
      return $cached;
    }
  }

  $sql = " SELECT
`hostPageTarget`.`hostId`,
`hostPageTarget`.`hostPageId`,
`hostPageTarget`.`uri`,
`hostTarget`.`scheme`,
`hostTarget`.`name`,
`hostTarget`.`port`,
(
SELECT COUNT(*)
FROM `hostPageToHostPage`
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPageTarget`.`hostPageId`
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`
) AS `rank`
FROM `hostPage` AS `hostPageTarget`
JOIN `host` AS `hostTarget` ON (`hostTarget`.`hostId` = `hostPageTarget`.`hostId`)
WHERE `hostTarget`.`status` = '1'
AND `hostPageTarget`.`httpCode` = 200
AND `hostPageTarget`.`timeBanned` IS NULL
AND `hostPageTarget`.`mime` IS NOT NULL
GROUP BY `hostPageTarget`.`hostPageId`
HAVING `rank` > 0
ORDER BY `rank` DESC
";

  $rows = $this->_db->query($sql)->fetchAll();

  // Remember the fresh result for the following calls
  if ($this->_memcached) {
    $this->_memcached->set($cacheKey, $rows, time() + 3600);
  }

  return $rows;
}
public function getHosts() {
$query = $this->_db->query('SELECT * FROM `host`');
@ -300,6 +242,50 @@ class MySQL { @@ -300,6 +242,50 @@ class MySQL {
return $query->fetchAll();
}
/**
 * Select the highest-ranked pages together with their host scheme, name and port.
 *
 * Uses the stored `hostPage`.`rank` column. Only rows with host status '1',
 * httpCode 200, rank above zero, no timeBanned value and a non-null mime are
 * returned, ordered by rank (highest first).
 *
 * When a memcached handle is configured, the result is cached per $limit
 * value (written with time() + 3600 as the third set() argument).
 *
 * @param int $limit Maximum number of rows to return
 *
 * @return array Rows as produced by fetchAll()
 */
public function getTopHostPages(int $limit = 100) {

  // Build the per-limit cache key once so the read and the write cannot diverge
  $cacheKey = sprintf('MySQL.getTopHostPages.%s', $limit);

  if ($this->_memcached) {

    // get() reports a miss as boolean false. Compare strictly so that a
    // cached empty result list is served from the cache as well, instead of
    // being mistaken for a miss and re-running the query on every call.
    $cached = $this->_memcached->get($cacheKey);

    if (false !== $cached) {
      return $cached;
    }
  }

  // $limit is an integer, so it is safe to append it to the statement
  $query = $this->_db->query(" SELECT
`hostPage`.`hostId`,
`hostPage`.`hostPageId`,
`hostPage`.`uri`,
`hostPage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
WHERE `host`.`status` = '1'
AND `hostPage`.`httpCode` = 200
AND `hostPage`.`rank` > 0
AND `hostPage`.`timeBanned` IS NULL
AND `hostPage`.`mime` IS NOT NULL
ORDER BY `rank` DESC
LIMIT " . (int) $limit);

  $result = $query->fetchAll();

  if ($this->_memcached) {
    $this->_memcached->set($cacheKey, $result, time() + 3600);
  }

  return $result;
}
public function getHostPagesByIndexed() {
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
@ -390,6 +376,15 @@ class MySQL { @@ -390,6 +376,15 @@ class MySQL {
return $query->rowCount();
}
/*
 * Write a new rank value to a single hostPage row.
 *
 * Returns the row count reported by the executed statement.
 */
public function updateHostPageRank(int $hostPageId, int $rank) {

  $statement = $this->_db->prepare('UPDATE `hostPage` SET `rank` = ? WHERE `hostPageId` = ? LIMIT 1');

  // Placeholders are positional: the rank comes first, then the page id
  $statement->execute([$rank, $hostPageId]);

  $affectedRows = $statement->rowCount();

  return $affectedRows;
}
public function deleteHostPage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
@ -452,6 +447,22 @@ class MySQL { @@ -452,6 +447,22 @@ class MySQL {
return $query->rowCount();
}
/*
 * Count the hostPageToHostPage rows that point to the given target page
 * from a source page whose hostId differs from the target page's hostId.
 *
 * Returns the `total` column of the single row produced by the query.
 */
public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total`
FROM `hostPageToHostPage`
JOIN `hostPage` AS `hostPageSource` ON (`hostPageSource`.`hostPageId` = `hostPageToHostPage`.`hostPageIdSource`)
JOIN `hostPage` AS `hostPageTarget` ON (`hostPageTarget`.`hostPageId` = `hostPageToHostPage`.`hostPageIdTarget`)
WHERE `hostPageToHostPage`.`hostPageIdTarget` = ?
AND `hostPageSource`.`hostId` <> `hostPageTarget`.`hostId`');

  $statement->execute([$hostPageIdTarget]);

  // Read the counter from the fetched row object
  $row = $statement->fetch();

  return $row->total;
}
public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');

Loading…
Cancel
Save