Browse Source

add custom home page reindex settings

main
ghost 2 years ago
parent
commit
3218add372
  1. 19
      config/app.php.txt
  2. 2
      crontab/crawler.php
  3. 50
      library/mysql.php
  4. 27
      public/api.php
  5. 2
      public/explore.php
  6. 4
      public/search.php

19
config/app.php.txt

@ -190,6 +190,21 @@ define('CRAWL_MANIFEST_LIMIT', 10);
*/ */
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Renew the home page index after the timing offset provided
*
* Used to scan for new pages with higher priority
*
* This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
*
* Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enough value to crawl all pages collected in the DB index
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
/* /*
* Index pages match MIME types * Index pages match MIME types
* *
@ -314,7 +329,7 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
* At this moment feature available in the CLI only (cli/yggo.php) * At this moment feature available in the CLI only (cli/yggo.php)
* *
*/ */
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6'); define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
/* /*
* Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
@ -337,7 +352,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility * Manifest API version compatibility
* *
*/ */
define('CRAWL_MANIFEST_API_VERSION', 0.9); define('CRAWL_MANIFEST_API_VERSION', 0.10);
/* /*
* Set default auto-crawl status for new manifest added * Set default auto-crawl status for new manifest added

2
crontab/crawler.php

@ -264,7 +264,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
} }
// Process pages crawl queue // Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction(); $db->beginTransaction();

50
library/mysql.php

@ -504,7 +504,9 @@ class MySQL {
public function deleteHostPageDoms(int $hostPageId) { public function deleteHostPageDoms(int $hostPageId) {
$query = $this->_db->query('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?'); $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
$query->execute([$hostPageId]);
return $query->rowCount(); return $query->rowCount();
} }
@ -636,9 +638,26 @@ class MySQL {
} }
// Crawl tools // Crawl tools
public function getHostPageCrawlQueue(int $limit, int $timeFrom) { public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL");
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
return $query->fetch()->total;
}
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`, $query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`, `hostPage`.`hostPageId`,
`hostPage`.`uri`, `hostPage`.`uri`,
`host`.`scheme`, `host`.`scheme`,
@ -652,33 +671,20 @@ class MySQL {
FROM `hostPage` FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ? WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
AND `hostPage`.`timeBanned` IS NULL
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
LIMIT ' . (int) $limit); LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]); $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
return $query->fetchAll(); return $query->fetchAll();
} }
public function getHostPageCrawlQueueTotal(int $timeFrom) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL');
$query->execute([$timeFrom, 0]);
return $query->fetch()->total;
}
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) { public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1'); $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1');

27
public/api.php

@ -1,7 +1,7 @@
<?php <?php
// Current version // Current version
define('API_VERSION', 0.9); define('API_VERSION', 0.10);
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
@ -102,18 +102,19 @@ if (API_ENABLED) {
'status' => true, 'status' => true,
'result' => [ 'result' => [
'config' => [ 'config' => [
'websiteDomain' => WEBSITE_DOMAIN, 'websiteDomain' => WEBSITE_DOMAIN,
'crawlUrlRegexp' => CRAWL_URL_REGEXP, 'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW, 'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX, 'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL, 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
], ],
'api' => [ 'api' => [
'version' => API_VERSION, 'version' => API_VERSION,

2
public/explore.php

@ -274,7 +274,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?> <?php } else { ?>
<div style="text-align:center"> <div style="text-align:center">
<span><?php echo _('Not found') ?></span> <span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?> <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span> <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?> <?php } ?>
</div> </div>

4
public/search.php

@ -321,7 +321,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($results) { ?> <?php if ($results) { ?>
<div> <div>
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span> <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?> <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span> <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?> <?php } ?>
</div> </div>
@ -391,7 +391,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php } else { ?> <?php } else { ?>
<div style="text-align:center"> <div style="text-align:center">
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span> <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?> <?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span> <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?> <?php } ?>
</div> </div>

Loading…
Cancel
Save