Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-03-13 05:41:02 +00:00)

Commit 3218add372 (parent f4bf6b9fa4): add custom home page reindex settings
@@ -190,6 +190,21 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Renew the home page index by the timing offset provided
+ *
+ * Used to scan new pages with higher priority
+ *
+ * This option works with the CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
+ *
+ * Pay attention: the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
+ * must be large enough to crawl all pages collected in the DB index,
+ *
+ * or the crawler can get stuck in the queue
+ *
+ */
+define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
+
 /*
  * Index pages match MIME types
  *
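The new constant only takes effect through the timestamps derived from it in the crawler. A minimal sketch of how the two offsets become the re-crawl thresholds passed into the queue query further down in this commit (variable names follow the diff; the surrounding cron script is assumed):

    // Regular pages become due once `timeUpdated` is older than ~12 months,
    // home pages ('/') once it is older than 7 days.
    $hostPageTimeFrom     = time() - CRAWL_PAGE_SECONDS_OFFSET;      // 60*60*24*30*12
    $hostPageHomeTimeFrom = time() - CRAWL_PAGE_HOME_SECONDS_OFFSET; // 60*60*24*7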
@@ -314,7 +329,7 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
  * At this moment feature available in the CLI only (cli/yggo.php)
  *
  */
-define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6');
+define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
 
 /*
  * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
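Changing the separator from ',' to ';' suggests the selector list is split before querying the DOM rather than passed through as a single CSS selector group (a comma is itself valid CSS group syntax). A hedged sketch of that split; the consuming code is not part of this diff:

    // Assumption: each selector is applied separately after splitting on ';'.
    foreach (explode(';', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
        $selector = trim($selector); // 'h1', 'h2', ...
        // ... query the parsed document for $selector and index the matched text
    }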
@@ -337,7 +352,7 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.9);
+define('CRAWL_MANIFEST_API_VERSION', 0.10);
 
 /*
  * Set default auto-crawl status for new manifest added
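How compatibility is actually checked is outside this diff; a minimal sketch under the assumption that the local constant is compared against a version reported by a remote manifest. Note that 0.10 as a PHP float literal evaluates to 0.1, so a numeric comparison treats the new value as lower than 0.9:

    // $remoteApiVersion is hypothetical (e.g. taken from a remote node's manifest response).
    $compatible = $remoteApiVersion == CRAWL_MANIFEST_API_VERSION; // float compare: 0.10 == 0.1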
@@ -264,7 +264,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
 }
 
 // Process pages crawl queue
-foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
+foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
   $db->beginTransaction();
 
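The extra argument only widens which rows qualify for the queue. The selection rule encoded by the updated query in the MySQL class (later in this commit), restated as plain PHP over a hypothetical $page row for readability:

    // A page is due when it was never crawled, is older than the regular offset,
    // or is a home page ('/') older than the shorter home-page offset.
    // (Banned pages and hosts whose status equals 0 are excluded by extra AND conditions in SQL.)
    $due = is_null($page->timeUpdated)
        || $page->timeUpdated < $hostPageTimeFrom
        || ($page->uri === '/' && $page->timeUpdated < $hostPageHomeTimeFrom);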
@@ -504,7 +504,9 @@ class MySQL {
 
   public function deleteHostPageDoms(int $hostPageId) {
 
-    $query = $this->_db->query('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
+    $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
+
+    $query->execute([$hostPageId]);
 
     return $query->rowCount();
   }
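PDO::query() executes its SQL immediately and offers no way to bind the '?' placeholder, so the original line could not work as intended; prepare()/execute() is the standard pattern. A standalone sketch of that pattern ($pdo stands in for the wrapped connection):

    // Prepared-statement pattern used by the fix.
    $statement = $pdo->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
    $statement->execute([$hostPageId]); // binds the placeholder safely
    $deleted = $statement->rowCount();  // number of rows removed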
@@ -636,9 +638,26 @@ class MySQL {
   }
 
   // Crawl tools
-  public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
 
-    $query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
+    $query = $this->_db->prepare("SELECT COUNT(*) AS `total`
+
+                                  FROM `hostPage`
+                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
+
+                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
+
+                                  AND `host`.`status` <> ?
+                                  AND `hostPage`.`timeBanned` IS NULL");
+
+    $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
+
+    return $query->fetch()->total;
+  }
+
+  public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
+
+    $query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
                                   `hostPage`.`hostPageId`,
                                   `hostPage`.`uri`,
                                   `host`.`scheme`,
@@ -652,33 +671,20 @@ class MySQL {
                                   FROM `hostPage`
                                   JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
 
-                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
-                                  AND `hostPage`.`timeBanned` IS NULL
+                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
+
+                                  AND `host`.`status` <> ?
+                                  AND `hostPage`.`timeBanned` IS NULL
 
                                   ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
 
-                                  LIMIT ' . (int) $limit);
+                                  LIMIT " . (int) $limit);
 
-    $query->execute([$timeFrom, 0]);
+    $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
 
     return $query->fetchAll();
   }
 
-  public function getHostPageCrawlQueueTotal(int $timeFrom) {
-
-    $query = $this->_db->prepare('SELECT COUNT(*) AS `total`
-
-                                  FROM `hostPage`
-                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
-
-                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
-                                  AND `hostPage`.`timeBanned` IS NULL');
-
-    $query->execute([$timeFrom, 0]);
-
-    return $query->fetch()->total;
-  }
-
   public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {
 
     $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1');
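Both rewritten methods bind positionally, so the execute() arguments must follow the order of the '?' placeholders in the SQL. A commented restatement of that binding with the values used in this commit; because of the existing ORDER BY LENGTH(`hostPage`.`uri`) ASC, the short '/' URIs surface first within the limit:

    $query->execute([
        $hostPageTimeFrom,     // `hostPage`.`timeUpdated` < ?                    (regular pages)
        $hostPageHomeTimeFrom, // `uri` = '/' AND `hostPage`.`timeUpdated` < ?    (home pages)
        0,                     // `host`.`status` <> ?                            (hosts with status 0 are skipped)
    ]);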
@@ -1,7 +1,7 @@
 <?php
 
 // Current version
-define('API_VERSION', 0.9);
+define('API_VERSION', 0.10);
 
 // Load system dependencies
 require_once('../config/app.php');
@@ -102,18 +102,19 @@ if (API_ENABLED) {
       'status' => true,
       'result' => [
         'config' => [
-          'websiteDomain' => WEBSITE_DOMAIN,
-          'crawlUrlRegexp' => CRAWL_URL_REGEXP,
-          'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
-          'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
-          'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
-          'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
-          'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
-          'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
-          'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
-          'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
-          'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
-          'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
+          'websiteDomain' => WEBSITE_DOMAIN,
+          'crawlUrlRegexp' => CRAWL_URL_REGEXP,
+          'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
+          'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
+          'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
+          'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
+          'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
+          'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
+          'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
+          'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
+          'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
+          'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
+          'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
         ],
         'api' => [
           'version' => API_VERSION,
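The manifest API now exposes the new offset next to the existing ones. A sketch of reading it from a decoded response: the field layout follows this diff, while the request URL and action parameter are assumptions:

    // Hypothetical fetch of a remote node's manifest endpoint.
    $response = json_decode(file_get_contents('http://node.example/api.php?action=manifest'));

    if (!empty($response->status)) {
        $apiVersion = $response->result->api->version;                           // 0.10
        $homeOffset = $response->result->config->crawlHostPageHomeSecondsOffset; // 604800 (60*60*24*7)
    }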
@@ -274,7 +274,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
     <?php } else { ?>
       <div style="text-align:center">
         <span><?php echo _('Not found') ?></span>
-        <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+        <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
           <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
         <?php } ?>
       </div>
@@ -321,7 +321,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
     <?php if ($results) { ?>
       <div>
         <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
-        <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+        <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
          <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
        <?php } ?>
      </div>
@@ -391,7 +391,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
     <?php } else { ?>
       <div style="text-align:center">
         <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
-        <?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+        <?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
          <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
        <?php } ?>
      </div>
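All three templates above now pass the same pair of thresholds to getHostPageCrawlQueueTotal(). A hypothetical wrapper, not part of the repository, that would keep those call sites identical if the signature changes again:

    // Hypothetical helper; name and placement are assumptions.
    function getCrawlQueueTotal(MySQL $db) {
        return $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET,
                                               time() - CRAWL_PAGE_HOME_SECONDS_OFFSET);
    }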