Browse Source

implement page ranking

main
ghost 2 years ago
parent
commit
8671fc4bde
  1. 12
      crontab/crawler.php
  2. 17
      library/mysql.php
  3. 6
      library/sphinxql.php

12
crontab/crawler.php

@@ -183,6 +183,18 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Increase page rank when link does not match the current host.
// Both sides of the comparison are built as "scheme://name[:port]" strings;
// the ":port" suffix is appended only when the port value is truthy.
// NOTE(review): $hostURL presumably describes the link target and
// $queueHostPage the page currently being crawled — confirm against the
// surrounding loop, which is not visible in this hunk.
if ($hostURL->scheme . '://' .
$hostURL->name .
($hostURL->port ? ':' . $hostURL->port : '')
!=
$queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
// Add 1 to the rank of the row addressed by $hostId and the CRC32 of the URI string
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
}
// Register new host
} else {

17
library/mysql.php

@@ -143,6 +143,7 @@ class MySQL {
`hostPage`.`metaDescription`,
`hostPage`.`data`,
`hostPage`.`uri`,
`hostPage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
@@ -204,6 +205,22 @@ class MySQL {
return $query->rowCount();
}
/**
 * Add $increment to the `rank` column of a `hostPage` row.
 *
 * The row is selected by `hostId` and `crc32uri`; the statement carries
 * LIMIT 1, so at most one row is updated per call.
 *
 * @param int $hostId    value matched against `hostPage`.`hostId`
 * @param int $crc32uri  value matched against `hostPage`.`crc32uri`
 * @param int $increment amount added to the current `rank`
 *
 * @return mixed whatever rowCount() reports for the executed statement
 */
public function updateHostPageRank(int $hostId, int $crc32uri, int $increment) {

    // The increment is cast to int and inlined into the SQL text, while the
    // two row selectors are bound through placeholders. The continuation
    // lines of the literal stay flush left so the SQL text is unchanged.
    $sql = 'UPDATE `hostPage` SET `rank` = `rank` + ' . (int) $increment . '
WHERE `hostId` = ?
AND `crc32uri` = ?
LIMIT 1';

    $statement = $this->_db->prepare($sql);

    $statement->execute([$hostId, $crc32uri]);

    return $statement->rowCount();
}
public function deleteHostPage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');

6
library/sphinxql.php

@@ -13,10 +13,14 @@ class SphinxQL {
public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT * FROM `hostPage`
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostPage`
WHERE MATCH(?)
ORDER BY `rank` DESC, WEIGHT() DESC
LIMIT ' . (int) ($start > $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . '
OPTION `max_matches`=' . (int) ($maxMatches > 1 ? $maxMatches : 1));

Loading…
Cancel
Save