Browse Source

implement host page description history crawling

main
ghost 2 years ago
parent
commit
0e9d29675f
  1. 21
      config/sphinx.conf.txt
  2. 4
      crontab/cleaner.php
  3. 22
      crontab/crawler.php
  4. BIN
      database/yggo.mwb
  5. 97
      library/mysql.php
  6. 12
      public/search.php

21
config/sphinx.conf.txt

@ -12,7 +12,17 @@ source common
source hostPage : common source hostPage : common
{ {
sql_query = \ sql_query = \
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \ SELECT hostPage.hostPageId, \
hostPage.rank, \
hostPage.uri, \
host.name, \
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
hostPageDescription.metaDescription, \
hostPageDescription.metaKeywords) \
FROM hostPageDescription \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
ORDER BY hostPageDescription.timeAdded DESC \
LIMIT 1) AS pageDescription \
FROM hostPage \ FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \ JOIN host ON (host.hostId = hostPage.hostId) \
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
@ -24,15 +34,12 @@ source hostImage : common
{ {
sql_query = \ sql_query = \
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \ SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \ (SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
FROM hostImageDescription \ FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription, \ WHERE hostImageDescription.hostImageId = hostImage.hostImageId ORDER BY hostImageDescription.timeAdded DESC LIMIT 1) AS imageDescription \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords)) \
FROM hostPage \
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
FROM hostImage \ FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \ JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
sql_attr_uint = rank sql_attr_uint = rank
} }

4
crontab/cleaner.php

@ -94,6 +94,8 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId); $db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
} }
} }
@ -122,6 +124,8 @@ try {
$db->deleteHostPageToHostImage($hostPage->hostPageId); $db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
} }
} }

22
crontab/crawler.php

@ -293,7 +293,7 @@ try {
if ($hostImageBanned) { if ($hostImageBanned) {
$db->updateHostImageMime($queueHostImage->hostImageId, $hostImageContentType, time()); $db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
@ -387,7 +387,7 @@ try {
if ($hostPageBanned) { if ($hostPageBanned) {
$db->updateHostPageMime($queueHostPage->hostPageId, $contentType, time()); $db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
@ -456,14 +456,26 @@ try {
continue; continue;
} }
// Update queued page data // Update queued page
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::mime($contentType),
time());
// Format page content
$content = Filter::pageData($content);
// Add queued page description if not exists
$crc32data = crc32($content);
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
$db->addHostPageDescription($queueHostPage->hostPageId,
$crc32data,
Filter::pageTitle($title->item(0)->nodeValue), Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords), Filter::pageKeywords($metaKeywords),
Filter::mime($contentType), CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
time()); time());
}
// Update manifest registry // Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

BIN
database/yggo.mwb

Binary file not shown.

97
library/mysql.php

@ -384,19 +384,34 @@ class MySQL {
public function getHostPagesByLimit(int $hostId, int $limit) { public function getHostPagesByLimit(int $hostId, int $limit) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit); $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
$query->execute([$hostId]); $query->execute([$hostId]);
return $query->fetchAll(); return $query->fetchAll();
} }
/**
 * Look up a stored page description by page id and data checksum.
 *
 * Selects at most one `hostPageDescription` row whose `hostPageId` and
 * `crc32data` columns both match the given values.
 *
 * @param int $hostPageId Value compared against the `hostPageId` column.
 * @param int $crc32data  Value compared against the `crc32data` column.
 *
 * @return mixed Whatever fetch() yields for the executed statement
 *               (presumably a falsy value when no row matches — confirm
 *               against the configured fetch mode).
 */
public function getHostPageDescription(int $hostPageId, int $crc32data) {

    $sql = 'SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);

    // Positional placeholders are bound in the order they appear in the SQL
    $statement->execute([$hostPageId, $crc32data]);

    return $statement->fetch();
}
/**
 * Fetch the most recently added description of a host page.
 *
 * Rows of `hostPageDescription` matching the page id are ordered by
 * `timeAdded` descending and only the first one is returned.
 *
 * @param int $hostPageId Value compared against the `hostPageId` column.
 *
 * @return mixed Whatever fetch() yields for the executed statement
 *               (presumably a falsy value when the page has no
 *               descriptions — confirm against the configured fetch mode).
 */
public function getLastPageDescription(int $hostPageId) {

    $sql = 'SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->fetch();
}
public function getFoundHostPage(int $hostPageId) { public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, $query = $this->_db->prepare('SELECT `hostPage`.`uri`,
`hostPage`.`metaDescription`,
`hostPage`.`data`,
`hostPage`.`uri`,
`hostPage`.`rank`, `hostPage`.`rank`,
`host`.`scheme`, `host`.`scheme`,
`host`.`name`, `host`.`name`,
@ -449,11 +464,7 @@ class MySQL {
mixed $timeBanned = null, mixed $timeBanned = null,
mixed $httpCode = null, mixed $httpCode = null,
mixed $mime = null, mixed $mime = null,
mixed $rank = null, mixed $rank = null) {
mixed $metaTitle = null,
mixed $metaDescription = null,
mixed $metaKeywords = null,
mixed $data = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`, $query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`, `crc32uri`,
@ -463,35 +474,18 @@ class MySQL {
`timeBanned`, `timeBanned`,
`httpCode`, `httpCode`,
`mime`, `mime`,
`rank`, `rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]); $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function updateHostPage( int $hostPageId, public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
string $mime,
mixed $data,
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?, $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
`metaDescription` = ?,
`metaKeywords` = ?,
`mime` = ?,
`data` = ?,
`timeUpdated` = ?,
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]); $query->execute([$timeUpdated, $mime, $hostPageId]);
return $query->rowCount(); return $query->rowCount();
} }
@ -548,6 +542,15 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
/**
 * Remove every stored description that belongs to a host page.
 *
 * @param int $hostPageId Value compared against the `hostPageId` column.
 *
 * @return mixed Row count reported by the executed DELETE statement.
 */
public function deleteHostPageDescriptions(int $hostPageId) {

    $sql = 'DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->rowCount();
}
public function deleteHostPageToHostImage(int $hostPageId) { public function deleteHostPageToHostImage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?'); $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
@ -557,6 +560,36 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
/**
 * Insert a new description record for a host page.
 *
 * @param int   $hostPageId      Stored in the `hostPageId` column.
 * @param int   $crc32data       Stored in the `crc32data` column.
 * @param mixed $metaTitle       Stored in the `metaTitle` column.
 * @param mixed $metaDescription Stored in the `metaDescription` column.
 * @param mixed $metaKeywords    Stored in the `metaKeywords` column.
 * @param mixed $data            Stored in the `data` column.
 * @param int   $timeAdded       Stored in the `timeAdded` column.
 *
 * @return mixed Row count reported by the executed INSERT statement.
 */
public function addHostPageDescription(int $hostPageId,
                                       int $crc32data,
                                       mixed $metaTitle,
                                       mixed $metaDescription,
                                       mixed $metaKeywords,
                                       mixed $data,
                                       int $timeAdded) {

    // Values are listed in the same order as the columns of the INSERT below
    $values = [
        $hostPageId,
        $crc32data,
        $metaTitle,
        $metaDescription,
        $metaKeywords,
        $data,
        $timeAdded
    ];

    $sql = 'INSERT INTO `hostPageDescription` ( `hostPageId`,
`crc32data`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`,
`timeAdded`
) VALUES (?, ?, ?, ?, ?, ?, ?)';

    $statement = $this->_db->prepare($sql);
    $statement->execute($values);

    return $statement->rowCount();
}
// Cleaner tools // Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) { public function getCleanerQueue(int $limit, int $timeFrom) {

12
public/search.php

@ -436,7 +436,9 @@ if (!empty($q)) {
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?> <?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?> <?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<h3><?php echo $hostPage->metaTitle ?></h3> <?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
<?php } ?>
<?php if (!empty($hostImage->description)) { ?> <?php if (!empty($hostImage->description)) { ?>
<span><?php echo $hostImage->description ?></span> <span><?php echo $hostImage->description ?></span>
<?php } ?> <?php } ?>
@ -469,9 +471,11 @@ if (!empty($q)) {
?> ?>
<div> <div>
<h2><?php echo $hostPage->metaTitle ?></h2> <?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<?php if (!empty($hostPage->metaDescription)) { ?> <h2><?php echo $hostPageDescription->metaTitle ?></h2>
<span><?php echo $hostPage->metaDescription ?></span> <?php if (!empty($hostPageDescription->metaDescription)) { ?>
<span><?php echo $hostPageDescription->metaDescription ?></span>
<?php } ?>
<?php } ?> <?php } ?>
<a href="<?php echo $hostPageURL ?>"> <a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />

Loading…
Cancel
Save