Browse Source

add custom home page reindex settings

main
ghost 2 years ago
parent
commit
3218add372
  1. 19
      config/app.php.txt
  2. 2
      crontab/crawler.php
  3. 48
      library/mysql.php
  4. 3
      public/api.php
  5. 2
      public/explore.php
  6. 4
      public/search.php

19
config/app.php.txt

@ -190,6 +190,21 @@ define('CRAWL_MANIFEST_LIMIT', 10); @@ -190,6 +190,21 @@ define('CRAWL_MANIFEST_LIMIT', 10);
*/
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Renew the home page index using the time offset provided
*
* Used for scanning new pages with higher priority
*
* This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
*
* Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must be large enough to crawl all pages collected in the DB index,
*
* otherwise the crawler can get stuck in the queue
*
*/
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
/*
* Index pages match MIME types
*
@ -314,7 +329,7 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null @@ -314,7 +329,7 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
* At the moment this feature is available in the CLI only (cli/yggo.php)
*
*/
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6');
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
/*
* Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
@ -337,7 +352,7 @@ define('CRAWL_MANIFEST', true); @@ -337,7 +352,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
// NOTE(review): these two lines appear to be the before/after versions of the same statement in this change.
// The float literal 0.10 is equal to 0.1, so numerically it is LOWER than the previous 0.9 (and prints as 0.1).
// Confirm how manifest versions are compared; a string such as '0.10' with version_compare() may be what is intended.
define('CRAWL_MANIFEST_API_VERSION', 0.9);
define('CRAWL_MANIFEST_API_VERSION', 0.10);
/*
* Set default auto-crawl status for new manifest added

2
crontab/crawler.php

@ -264,7 +264,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES @@ -264,7 +264,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction();

48
library/mysql.php

@ -504,7 +504,9 @@ class MySQL { @@ -504,7 +504,9 @@ class MySQL {
/*
 * Delete every `hostPageDom` row whose `hostPageId` matches the given page
 *
 * @param int $hostPageId value bound to the `hostPageId` placeholder
 * @return number of rows reported by rowCount() for the executed DELETE
 */
public function deleteHostPageDoms(int $hostPageId) {
// NOTE(review): query() runs the SQL immediately with no value supplied for `?` (assuming `_db` is PDO);
// this looks like the pre-change line that the prepare() call below replaces - confirm it is not kept
$query = $this->_db->query('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
// Prepare the statement so $hostPageId is bound by execute() instead of being part of the SQL text
$query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
$query->execute([$hostPageId]);
return $query->rowCount();
}
@ -636,9 +638,26 @@ class MySQL { @@ -636,9 +638,26 @@ class MySQL {
}
// Crawl tools
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL");
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
return $query->fetch()->total;
}
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`,
`hostPage`.`uri`,
`host`.`scheme`,
@ -652,33 +671,20 @@ class MySQL { @@ -652,33 +671,20 @@ class MySQL {
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
LIMIT ' . (int) $limit);
LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]);
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
return $query->fetchAll();
}
/*
 * Count the host pages currently waiting in the crawl queue (single time threshold variant)
 *
 * A page is counted when its `timeUpdated` is NULL or earlier than $timeFrom,
 * the joined host `status` is not 0 and the page `timeBanned` is NULL
 *
 * @param int $timeFrom threshold compared against `hostPage`.`timeUpdated`
 *                      (visible callers pass time() minus CRAWL_PAGE_SECONDS_OFFSET)
 * @return value of the `total` column (COUNT(*)) read from the fetched row
 */
public function getHostPageCrawlQueueTotal(int $timeFrom) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL');
// Placeholders in order: timeUpdated threshold, host status value to exclude (0)
$query->execute([$timeFrom, 0]);
// The result is read as an object property, so the connection presumably uses an object fetch mode - verify
return $query->fetch()->total;
}
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1');

3
public/api.php

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
<?php
// Current version
// NOTE(review): these two lines appear to be the before/after versions of the same statement in this change.
// The float literal 0.10 is equal to 0.1, so numerically it is LOWER than the previous 0.9 (and prints as 0.1).
// Confirm how API versions are compared by clients; a string such as '0.10' with version_compare() may be intended.
define('API_VERSION', 0.9);
define('API_VERSION', 0.10);
// Load system dependencies
require_once('../config/app.php');
@ -109,6 +109,7 @@ if (API_ENABLED) { @@ -109,6 +109,7 @@ if (API_ENABLED) {
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,

2
public/explore.php

@ -274,7 +274,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -274,7 +274,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?>
<div style="text-align:center">
<span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>

4
public/search.php

@ -321,7 +321,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -321,7 +321,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($results) { ?>
<div>
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>
@ -391,7 +391,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -391,7 +391,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php } else { ?>
<div style="text-align:center">
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>

Loading…
Cancel
Save