mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-03-13 05:41:02 +00:00
implement host page description history crawling
This commit is contained in:
parent
6371def666
commit
0e9d29675f
@ -12,7 +12,17 @@ source common
|
||||
source hostPage : common
|
||||
{
|
||||
sql_query = \
|
||||
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \
|
||||
SELECT hostPage.hostPageId, \
|
||||
hostPage.rank, \
|
||||
hostPage.uri, \
|
||||
host.name, \
|
||||
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
|
||||
hostPageDescription.metaDescription, \
|
||||
hostPageDescription.metaKeywords) \
|
||||
FROM hostPageDescription \
|
||||
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
|
||||
ORDER BY hostPageDescription.timeAdded DESC \
|
||||
LIMIT 1) AS pageDescription \
|
||||
FROM hostPage \
|
||||
JOIN host ON (host.hostId = hostPage.hostId) \
|
||||
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
|
||||
@ -24,15 +34,12 @@ source hostImage : common
|
||||
{
|
||||
sql_query = \
|
||||
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
|
||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
|
||||
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
|
||||
FROM hostImageDescription \
|
||||
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription, \
|
||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords)) \
|
||||
FROM hostPage \
|
||||
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
|
||||
WHERE hostImageDescription.hostImageId = hostImage.hostImageId ORDER BY hostImageDescription.timeAdded DESC LIMIT 1) AS imageDescription \
|
||||
FROM hostImage \
|
||||
JOIN host ON (host.hostId = hostImage.hostId) \
|
||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL
|
||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
|
||||
|
||||
sql_attr_uint = rank
|
||||
}
|
||||
|
@ -94,6 +94,8 @@ try {
|
||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page descriptions, then the host page itself
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
@ -122,6 +124,8 @@ try {
|
||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page descriptions, then the host page itself
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
|
@ -293,7 +293,7 @@ try {
|
||||
|
||||
if ($hostImageBanned) {
|
||||
|
||||
$db->updateHostImageMime($queueHostImage->hostImageId, $hostImageContentType, time());
|
||||
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
@ -387,7 +387,7 @@ try {
|
||||
|
||||
if ($hostPageBanned) {
|
||||
|
||||
$db->updateHostPageMime($queueHostPage->hostPageId, $contentType, time());
|
||||
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
@ -456,15 +456,27 @@ try {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Update queued page data
|
||||
// Update queued page
|
||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
Filter::mime($contentType),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
|
||||
time());
|
||||
|
||||
// Format page content
|
||||
$content = Filter::pageData($content);
|
||||
|
||||
// Add queued page description if not exists
|
||||
$crc32data = crc32($content);
|
||||
|
||||
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
|
||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||
$crc32data,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
|
||||
time());
|
||||
}
|
||||
|
||||
// Update manifest registry
|
||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||
|
||||
|
Binary file not shown.
@ -384,19 +384,34 @@ class MySQL {
|
||||
|
||||
public function getHostPagesByLimit(int $hostId, int $limit) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit);
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
|
||||
|
||||
$query->execute([$hostId]);
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
/**
 * Look up a stored page description by page id and data checksum.
 *
 * Selects at most one `hostPageDescription` row whose `hostPageId` and
 * `crc32data` columns both equal the given arguments.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`
 * @param int $crc32data  Value matched against `hostPageDescription`.`crc32data`
 *
 * @return mixed Result of fetch() on the executed statement.
 *               NOTE(review): presumably a falsy value when no row matches
 *               (PDO-style connection assumed) - confirm against $this->_db.
 */
public function getHostPageDescription(int $hostPageId, int $crc32data) {

  $sql = 'SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1';

  $statement = $this->_db->prepare($sql);

  // Positional placeholders are bound in the order they appear in the query
  $statement->execute([$hostPageId, $crc32data]);

  $row = $statement->fetch();

  return $row;
}
|
||||
|
||||
/**
 * Fetch the most recently added description of a host page.
 *
 * Rows of `hostPageDescription` for the page are ordered by `timeAdded`
 * descending and only the first one is requested.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`
 *
 * @return mixed Result of fetch() on the executed statement.
 *               NOTE(review): presumably a falsy value when the page has no
 *               descriptions (PDO-style connection assumed) - confirm.
 */
public function getLastPageDescription(int $hostPageId) {

  $sql = 'SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1';

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostPageId]);

  $latest = $statement->fetch();

  return $latest;
}
|
||||
|
||||
public function getFoundHostPage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
||||
`hostPage`.`metaDescription`,
|
||||
`hostPage`.`data`,
|
||||
`hostPage`.`uri`,
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
|
||||
`hostPage`.`rank`,
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
@ -449,11 +464,7 @@ class MySQL {
|
||||
mixed $timeBanned = null,
|
||||
mixed $httpCode = null,
|
||||
mixed $mime = null,
|
||||
mixed $rank = null,
|
||||
mixed $metaTitle = null,
|
||||
mixed $metaDescription = null,
|
||||
mixed $metaKeywords = null,
|
||||
mixed $data = null) {
|
||||
mixed $rank = null) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
|
||||
`crc32uri`,
|
||||
@ -463,35 +474,18 @@ class MySQL {
|
||||
`timeBanned`,
|
||||
`httpCode`,
|
||||
`mime`,
|
||||
`rank`,
|
||||
`metaTitle`,
|
||||
`metaDescription`,
|
||||
`metaKeywords`,
|
||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
|
||||
public function updateHostPage( int $hostPageId,
|
||||
mixed $metaTitle,
|
||||
mixed $metaDescription,
|
||||
mixed $metaKeywords,
|
||||
string $mime,
|
||||
mixed $data,
|
||||
int $timeUpdated,
|
||||
mixed $timeBanned = null) {
|
||||
public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
|
||||
|
||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
|
||||
`metaDescription` = ?,
|
||||
`metaKeywords` = ?,
|
||||
`mime` = ?,
|
||||
`data` = ?,
|
||||
`timeUpdated` = ?,
|
||||
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]);
|
||||
$query->execute([$timeUpdated, $mime, $hostPageId]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
@ -548,6 +542,15 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Remove every stored description that belongs to a host page.
 *
 * @param int $hostPageId Value matched against `hostPageDescription`.`hostPageId`
 *
 * @return mixed Result of rowCount() on the executed DELETE statement
 *               (NOTE(review): presumably the number of deleted rows - confirm)
 */
public function deleteHostPageDescriptions(int $hostPageId) {

  $sql = 'DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?';

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostPageId]);

  $affected = $statement->rowCount();

  return $affected;
}
|
||||
|
||||
public function deleteHostPageToHostImage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
|
||||
@ -557,6 +560,36 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Insert a new description row for a host page.
 *
 * Writes one `hostPageDescription` row made of the page id, the checksum of
 * the page data, the meta fields, the data itself and the creation time.
 *
 * @param int   $hostPageId      Stored in `hostPageId`
 * @param int   $crc32data       Stored in `crc32data`
 * @param mixed $metaTitle       Stored in `metaTitle`
 * @param mixed $metaDescription Stored in `metaDescription`
 * @param mixed $metaKeywords    Stored in `metaKeywords`
 * @param mixed $data            Stored in `data`
 * @param int   $timeAdded       Stored in `timeAdded`
 *
 * @return mixed Result of rowCount() on the executed INSERT statement
 *               (NOTE(review): presumably the number of inserted rows - confirm)
 */
public function addHostPageDescription(int $hostPageId,
                                       int $crc32data,
                                       mixed $metaTitle,
                                       mixed $metaDescription,
                                       mixed $metaKeywords,
                                       mixed $data,
                                       int $timeAdded) {

  $statement = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
                                                                        `crc32data`,
                                                                        `metaTitle`,
                                                                        `metaDescription`,
                                                                        `metaKeywords`,
                                                                        `data`,
                                                                        `timeAdded`
                                                                      ) VALUES (?, ?, ?, ?, ?, ?, ?)');

  // Values are listed in the same order as the column list above
  $values = [
    $hostPageId,
    $crc32data,
    $metaTitle,
    $metaDescription,
    $metaKeywords,
    $data,
    $timeAdded
  ];

  $statement->execute($values);

  $inserted = $statement->rowCount();

  return $inserted;
}
|
||||
|
||||
// Cleaner tools
|
||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||
|
||||
|
@ -436,7 +436,9 @@ if (!empty($q)) {
|
||||
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
|
||||
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
||||
<h3><?php echo $hostPage->metaTitle ?></h3>
|
||||
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
|
||||
<?php } ?>
|
||||
<?php if (!empty($hostImage->description)) { ?>
|
||||
<span><?php echo $hostImage->description ?></span>
|
||||
<?php } ?>
|
||||
@ -469,9 +471,11 @@ if (!empty($q)) {
|
||||
|
||||
?>
|
||||
<div>
|
||||
<h2><?php echo $hostPage->metaTitle ?></h2>
|
||||
<?php if (!empty($hostPage->metaDescription)) { ?>
|
||||
<span><?php echo $hostPage->metaDescription ?></span>
|
||||
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||
<h2><?php echo $hostPageDescription->metaTitle ?></h2>
|
||||
<?php if (!empty($hostPageDescription->metaDescription)) { ?>
|
||||
<span><?php echo $hostPageDescription->metaDescription ?></span>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $hostPageURL ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
||||
|
Loading…
x
Reference in New Issue
Block a user