Browse Source

implement host page description history crawling

main
ghost 2 years ago
parent
commit
0e9d29675f
  1. 21
      config/sphinx.conf.txt
  2. 4
      crontab/cleaner.php
  3. 22
      crontab/crawler.php
  4. BIN
      database/yggo.mwb
  5. 97
      library/mysql.php
  6. 12
      public/search.php

21
config/sphinx.conf.txt

@ -12,7 +12,17 @@ source common @@ -12,7 +12,17 @@ source common
source hostPage : common
{
sql_query = \
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \
SELECT hostPage.hostPageId, \
hostPage.rank, \
hostPage.uri, \
host.name, \
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
hostPageDescription.metaDescription, \
hostPageDescription.metaKeywords) \
FROM hostPageDescription \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
ORDER BY hostPageDescription.timeAdded DESC \
LIMIT 1) AS pageDescription \
FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
@ -24,15 +34,12 @@ source hostImage : common @@ -24,15 +34,12 @@ source hostImage : common
{
sql_query = \
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords)) \
FROM hostPage \
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId ORDER BY hostImageDescription.timeAdded DESC LIMIT 1) AS imageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
sql_attr_uint = rank
}

4
crontab/cleaner.php

@ -94,6 +94,8 @@ try { @@ -94,6 +94,8 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
@ -122,6 +124,8 @@ try { @@ -122,6 +124,8 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}

22
crontab/crawler.php

@ -293,7 +293,7 @@ try { @@ -293,7 +293,7 @@ try {
if ($hostImageBanned) {
$db->updateHostImageMime($queueHostImage->hostImageId, $hostImageContentType, time());
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
@ -387,7 +387,7 @@ try { @@ -387,7 +387,7 @@ try {
if ($hostPageBanned) {
$db->updateHostPageMime($queueHostPage->hostPageId, $contentType, time());
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -456,14 +456,26 @@ try { @@ -456,14 +456,26 @@ try {
continue;
}
// Update queued page data
// Update queued page
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::mime($contentType),
time());
// Format page content
$content = Filter::pageData($content);
// Add queued page description if not exists
$crc32data = crc32($content);
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
$db->addHostPageDescription($queueHostPage->hostPageId,
$crc32data,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
Filter::mime($contentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
time());
}
// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

BIN
database/yggo.mwb

Binary file not shown.

97
library/mysql.php

@ -384,19 +384,34 @@ class MySQL { @@ -384,19 +384,34 @@ class MySQL {
public function getHostPagesByLimit(int $hostId, int $limit) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit);
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
$query->execute([$hostId]);
return $query->fetchAll();
}
/**
 * Look up a stored description row of a page by the checksum of its content.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`.
 * @param int $crc32data  Value matched against `hostPageDescription`.`crc32data`.
 *
 * @return mixed Whatever fetch() yields for the first matching row; presumably
 *               a falsy value when nothing matches (PDO semantics) - confirm
 *               against the connection setup.
 */
public function getHostPageDescription(int $hostPageId, int $crc32data) {

    $statement = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');

    // Positional placeholders: page id first, checksum second
    $bindings = [$hostPageId, $crc32data];

    $statement->execute($bindings);

    return $statement->fetch();
}
/**
 * Fetch the most recently added description row of a page.
 *
 * Rows are ordered by `timeAdded` descending and only the first one is read.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`.
 *
 * @return mixed Whatever fetch() yields for that row; presumably a falsy value
 *               when the page has no description rows (PDO semantics) - confirm
 *               against the connection setup.
 */
public function getLastPageDescription(int $hostPageId) {

    $statement = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');

    $bindings = [$hostPageId];

    $statement->execute($bindings);

    return $statement->fetch();
}
public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
`hostPage`.`metaDescription`,
`hostPage`.`data`,
`hostPage`.`uri`,
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
`hostPage`.`rank`,
`host`.`scheme`,
`host`.`name`,
@ -449,11 +464,7 @@ class MySQL { @@ -449,11 +464,7 @@ class MySQL {
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null,
mixed $metaTitle = null,
mixed $metaDescription = null,
mixed $metaKeywords = null,
mixed $data = null) {
mixed $rank = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`,
@ -463,35 +474,18 @@ class MySQL { @@ -463,35 +474,18 @@ class MySQL {
`timeBanned`,
`httpCode`,
`mime`,
`rank`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
return $this->_db->lastInsertId();
}
public function updateHostPage( int $hostPageId,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
string $mime,
mixed $data,
int $timeUpdated,
mixed $timeBanned = null) {
public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?,
`metaKeywords` = ?,
`mime` = ?,
`data` = ?,
`timeUpdated` = ?,
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]);
$query->execute([$timeUpdated, $mime, $hostPageId]);
return $query->rowCount();
}
@ -548,6 +542,15 @@ class MySQL { @@ -548,6 +542,15 @@ class MySQL {
return $query->rowCount();
}
/**
 * Remove every `hostPageDescription` row that belongs to the given page.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`.
 *
 * @return mixed Result of rowCount() on the executed DELETE statement.
 */
public function deleteHostPageDescriptions(int $hostPageId) {

    $statement = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?');

    $bindings = [$hostPageId];

    $statement->execute($bindings);

    return $statement->rowCount();
}
public function deleteHostPageToHostImage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
@ -557,6 +560,36 @@ class MySQL { @@ -557,6 +560,36 @@ class MySQL {
return $query->rowCount();
}
/**
 * Insert one description row for a page into `hostPageDescription`.
 *
 * @param int   $hostPageId      Stored in `hostPageId`.
 * @param int   $crc32data       Stored in `crc32data`.
 * @param mixed $metaTitle       Stored in `metaTitle`.
 * @param mixed $metaDescription Stored in `metaDescription`.
 * @param mixed $metaKeywords    Stored in `metaKeywords`.
 * @param mixed $data            Stored in `data`.
 * @param int   $timeAdded       Stored in `timeAdded`.
 *
 * @return mixed Result of rowCount() on the executed INSERT statement.
 */
public function addHostPageDescription(int $hostPageId,
                                       int $crc32data,
                                       mixed $metaTitle,
                                       mixed $metaDescription,
                                       mixed $metaKeywords,
                                       mixed $data,
                                       int $timeAdded) {

    // The continuation lines below belong to the SQL string literal itself,
    // so they are kept flush-left to leave the statement text unchanged.
    $statement = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
`crc32data`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`,
`timeAdded`
) VALUES (?, ?, ?, ?, ?, ?, ?)');

    // Values in the same order as the column list above
    $bindings = [$hostPageId, $crc32data, $metaTitle, $metaDescription, $metaKeywords, $data, $timeAdded];

    $statement->execute($bindings);

    return $statement->rowCount();
}
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {

12
public/search.php

@ -436,7 +436,9 @@ if (!empty($q)) { @@ -436,7 +436,9 @@ if (!empty($q)) {
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<h3><?php echo $hostPage->metaTitle ?></h3>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
<?php } ?>
<?php if (!empty($hostImage->description)) { ?>
<span><?php echo $hostImage->description ?></span>
<?php } ?>
@ -469,9 +471,11 @@ if (!empty($q)) { @@ -469,9 +471,11 @@ if (!empty($q)) {
?>
<div>
<h2><?php echo $hostPage->metaTitle ?></h2>
<?php if (!empty($hostPage->metaDescription)) { ?>
<span><?php echo $hostPage->metaDescription ?></span>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h2><?php echo $hostPageDescription->metaTitle ?></h2>
<?php if (!empty($hostPageDescription->metaDescription)) { ?>
<span><?php echo $hostPageDescription->metaDescription ?></span>
<?php } ?>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />

Loading…
Cancel
Save