mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-08-26 13:51:55 +00:00
implement host page description history crawling
This commit is contained in:
parent
6371def666
commit
0e9d29675f
@ -12,7 +12,17 @@ source common
|
|||||||
source hostPage : common
|
source hostPage : common
|
||||||
{
|
{
|
||||||
sql_query = \
|
sql_query = \
|
||||||
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \
|
SELECT hostPage.hostPageId, \
|
||||||
|
hostPage.rank, \
|
||||||
|
hostPage.uri, \
|
||||||
|
host.name, \
|
||||||
|
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
|
||||||
|
hostPageDescription.metaDescription, \
|
||||||
|
hostPageDescription.metaKeywords) \
|
||||||
|
FROM hostPageDescription \
|
||||||
|
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
|
||||||
|
ORDER BY hostPageDescription.timeAdded DESC \
|
||||||
|
LIMIT 1) AS pageDescription \
|
||||||
FROM hostPage \
|
FROM hostPage \
|
||||||
JOIN host ON (host.hostId = hostPage.hostId) \
|
JOIN host ON (host.hostId = hostPage.hostId) \
|
||||||
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
|
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
|
||||||
@ -24,15 +34,12 @@ source hostImage : common
|
|||||||
{
|
{
|
||||||
sql_query = \
|
sql_query = \
|
||||||
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
|
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
|
||||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
|
(SELECT CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title) \
|
||||||
FROM hostImageDescription \
|
FROM hostImageDescription \
|
||||||
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription, \
|
WHERE hostImageDescription.hostImageId = hostImage.hostImageId ORDER BY hostImageDescription.timeAdded DESC LIMIT 1) AS imageDescription \
|
||||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords)) \
|
|
||||||
FROM hostPage \
|
|
||||||
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
|
|
||||||
FROM hostImage \
|
FROM hostImage \
|
||||||
JOIN host ON (host.hostId = hostImage.hostId) \
|
JOIN host ON (host.hostId = hostImage.hostId) \
|
||||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL
|
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
|
||||||
|
|
||||||
sql_attr_uint = rank
|
sql_attr_uint = rank
|
||||||
}
|
}
|
||||||
|
@ -94,6 +94,8 @@ try {
|
|||||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page
|
// Delete host page
|
||||||
|
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||||
|
|
||||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -122,6 +124,8 @@ try {
|
|||||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page
|
// Delete host page
|
||||||
|
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||||
|
|
||||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -293,7 +293,7 @@ try {
|
|||||||
|
|
||||||
if ($hostImageBanned) {
|
if ($hostImageBanned) {
|
||||||
|
|
||||||
$db->updateHostImageMime($queueHostImage->hostImageId, $hostImageContentType, time());
|
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
|
||||||
|
|
||||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||||
|
|
||||||
@ -387,7 +387,7 @@ try {
|
|||||||
|
|
||||||
if ($hostPageBanned) {
|
if ($hostPageBanned) {
|
||||||
|
|
||||||
$db->updateHostPageMime($queueHostPage->hostPageId, $contentType, time());
|
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
|
||||||
|
|
||||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||||
|
|
||||||
@ -456,15 +456,27 @@ try {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update queued page data
|
// Update queued page
|
||||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||||
Filter::pageTitle($title->item(0)->nodeValue),
|
|
||||||
Filter::pageDescription($metaDescription),
|
|
||||||
Filter::pageKeywords($metaKeywords),
|
|
||||||
Filter::mime($contentType),
|
Filter::mime($contentType),
|
||||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
|
|
||||||
time());
|
time());
|
||||||
|
|
||||||
|
// Format page content
|
||||||
|
$content = Filter::pageData($content);
|
||||||
|
|
||||||
|
// Add queued page description if not exists
|
||||||
|
$crc32data = crc32($content);
|
||||||
|
|
||||||
|
if (!$db->getHostPageDescription($queueHostPage->hostPageId, $crc32data)) {
|
||||||
|
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||||
|
$crc32data,
|
||||||
|
Filter::pageTitle($title->item(0)->nodeValue),
|
||||||
|
Filter::pageDescription($metaDescription),
|
||||||
|
Filter::pageKeywords($metaKeywords),
|
||||||
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : $content,
|
||||||
|
time());
|
||||||
|
}
|
||||||
|
|
||||||
// Update manifest registry
|
// Update manifest registry
|
||||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||||
|
|
||||||
|
Binary file not shown.
@ -384,19 +384,34 @@ class MySQL {
|
|||||||
|
|
||||||
public function getHostPagesByLimit(int $hostId, int $limit) {
|
public function getHostPagesByLimit(int $hostId, int $limit) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit);
|
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
|
||||||
|
|
||||||
$query->execute([$hostId]);
|
$query->execute([$hostId]);
|
||||||
|
|
||||||
return $query->fetchAll();
|
return $query->fetchAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function getHostPageDescription(int $hostPageId, int $crc32data) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
|
||||||
|
|
||||||
|
$query->execute([$hostPageId, $crc32data]);
|
||||||
|
|
||||||
|
return $query->fetch();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getLastPageDescription(int $hostPageId) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
|
||||||
|
|
||||||
|
$query->execute([$hostPageId]);
|
||||||
|
|
||||||
|
return $query->fetch();
|
||||||
|
}
|
||||||
|
|
||||||
public function getFoundHostPage(int $hostPageId) {
|
public function getFoundHostPage(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
|
||||||
`hostPage`.`metaDescription`,
|
|
||||||
`hostPage`.`data`,
|
|
||||||
`hostPage`.`uri`,
|
|
||||||
`hostPage`.`rank`,
|
`hostPage`.`rank`,
|
||||||
`host`.`scheme`,
|
`host`.`scheme`,
|
||||||
`host`.`name`,
|
`host`.`name`,
|
||||||
@ -449,11 +464,7 @@ class MySQL {
|
|||||||
mixed $timeBanned = null,
|
mixed $timeBanned = null,
|
||||||
mixed $httpCode = null,
|
mixed $httpCode = null,
|
||||||
mixed $mime = null,
|
mixed $mime = null,
|
||||||
mixed $rank = null,
|
mixed $rank = null) {
|
||||||
mixed $metaTitle = null,
|
|
||||||
mixed $metaDescription = null,
|
|
||||||
mixed $metaKeywords = null,
|
|
||||||
mixed $data = null) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
|
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
|
||||||
`crc32uri`,
|
`crc32uri`,
|
||||||
@ -463,35 +474,18 @@ class MySQL {
|
|||||||
`timeBanned`,
|
`timeBanned`,
|
||||||
`httpCode`,
|
`httpCode`,
|
||||||
`mime`,
|
`mime`,
|
||||||
`rank`,
|
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
`metaTitle`,
|
|
||||||
`metaDescription`,
|
|
||||||
`metaKeywords`,
|
|
||||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function updateHostPage( int $hostPageId,
|
public function updateHostPage(int $hostPageId, string $mime, int $timeUpdated) {
|
||||||
mixed $metaTitle,
|
|
||||||
mixed $metaDescription,
|
|
||||||
mixed $metaKeywords,
|
|
||||||
string $mime,
|
|
||||||
mixed $data,
|
|
||||||
int $timeUpdated,
|
|
||||||
mixed $timeBanned = null) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
|
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `mime` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||||
`metaDescription` = ?,
|
|
||||||
`metaKeywords` = ?,
|
|
||||||
`mime` = ?,
|
|
||||||
`data` = ?,
|
|
||||||
`timeUpdated` = ?,
|
|
||||||
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
|
||||||
|
|
||||||
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]);
|
$query->execute([$timeUpdated, $mime, $hostPageId]);
|
||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
@ -548,6 +542,15 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function deleteHostPageDescriptions(int $hostPageId) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?');
|
||||||
|
|
||||||
|
$query->execute([$hostPageId]);
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
|
|
||||||
public function deleteHostPageToHostImage(int $hostPageId) {
|
public function deleteHostPageToHostImage(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
|
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
|
||||||
@ -557,6 +560,36 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function addHostPageDescription(int $hostPageId,
|
||||||
|
int $crc32data,
|
||||||
|
mixed $metaTitle,
|
||||||
|
mixed $metaDescription,
|
||||||
|
mixed $metaKeywords,
|
||||||
|
mixed $data,
|
||||||
|
int $timeAdded) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
|
||||||
|
`crc32data`,
|
||||||
|
`metaTitle`,
|
||||||
|
`metaDescription`,
|
||||||
|
`metaKeywords`,
|
||||||
|
`data`,
|
||||||
|
`timeAdded`
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
|
$query->execute([
|
||||||
|
$hostPageId,
|
||||||
|
$crc32data,
|
||||||
|
$metaTitle,
|
||||||
|
$metaDescription,
|
||||||
|
$metaKeywords,
|
||||||
|
$data,
|
||||||
|
$timeAdded
|
||||||
|
]);
|
||||||
|
|
||||||
|
return $query->rowCount();
|
||||||
|
}
|
||||||
|
|
||||||
// Cleaner tools
|
// Cleaner tools
|
||||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||||
|
|
||||||
|
@ -436,7 +436,9 @@ if (!empty($q)) {
|
|||||||
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
|
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
|
||||||
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
|
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
|
||||||
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
||||||
<h3><?php echo $hostPage->metaTitle ?></h3>
|
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||||
|
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
|
||||||
|
<?php } ?>
|
||||||
<?php if (!empty($hostImage->description)) { ?>
|
<?php if (!empty($hostImage->description)) { ?>
|
||||||
<span><?php echo $hostImage->description ?></span>
|
<span><?php echo $hostImage->description ?></span>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
@ -469,9 +471,11 @@ if (!empty($q)) {
|
|||||||
|
|
||||||
?>
|
?>
|
||||||
<div>
|
<div>
|
||||||
<h2><?php echo $hostPage->metaTitle ?></h2>
|
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||||
<?php if (!empty($hostPage->metaDescription)) { ?>
|
<h2><?php echo $hostPageDescription->metaTitle ?></h2>
|
||||||
<span><?php echo $hostPage->metaDescription ?></span>
|
<?php if (!empty($hostPageDescription->metaDescription)) { ?>
|
||||||
|
<span><?php echo $hostPageDescription->metaDescription ?></span>
|
||||||
|
<?php } ?>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<a href="<?php echo $hostPageURL ?>">
|
<a href="<?php echo $hostPageURL ?>">
|
||||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
||||||
|
Loading…
x
Reference in New Issue
Block a user