From 3218add372f6089a7f3cde8655eb4881b9846a46 Mon Sep 17 00:00:00 2001
From: ghost
Date: Fri, 30 Jun 2023 13:28:22 +0300
Subject: [PATCH] add custom home page reindex settings

---
 config/app.php.txt  | 19 +++++++++++++++--
 crontab/crawler.php |  2 +-
 library/mysql.php   | 50 +++++++++++++++++++++++++--------------------
 public/api.php      | 27 ++++++++++++------------
 public/explore.php  |  2 +-
 public/search.php   |  4 ++--
 6 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/config/app.php.txt b/config/app.php.txt
index 191e5c1..76d2588 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -190,6 +190,21 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Renew the home page index by the time offset provided
+ *
+ * Used to scan new pages with higher priority
+ *
+ * This option works together with the CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT queue stepping
+ *
+ * Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
+ * must be large enough to crawl all pages collected in the DB index
+ *
+ * otherwise the crawler can get stuck in the queue
+ *
+ */
+define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
+
 /*
  * Index pages match MIME types
  *
@@ -314,7 +329,7 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
  * At this moment feature available in the CLI only (cli/yggo.php)
  *
  */
-define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6');
+define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
 
 /*
  * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
@@ -337,7 +352,7 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.9);
+define('CRAWL_MANIFEST_API_VERSION', 0.10);
 
 /*
  * Set default auto-crawl status for new manifest added
diff --git a/crontab/crawler.php b/crontab/crawler.php
index 9a8bf7d..75ca6d7 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -264,7 +264,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
 }
 
 // Process pages crawl queue
-foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
+foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
   $db->beginTransaction();
 
diff --git a/library/mysql.php b/library/mysql.php
index 6c7d28a..7b35744 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -504,7 +504,9 @@ class MySQL {
 
   public function deleteHostPageDoms(int $hostPageId) {
 
-    $query = $this->_db->query('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
+    $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');
+
+    $query->execute([$hostPageId]);
 
     return $query->rowCount();
   }
@@ -636,9 +638,26 @@ class MySQL {
   }
 
   // Crawl tools
-  public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
+
+    $query = $this->_db->prepare("SELECT COUNT(*) AS `total`
+
+                                  FROM `hostPage`
+                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
+
+                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
+
+                                  AND `host`.`status` <> ?
+                                  AND `hostPage`.`timeBanned` IS NULL");
+
+    $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
+
+    return $query->fetch()->total;
+  }
+
+  public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
 
-    $query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
+    $query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
                                          `hostPage`.`hostPageId`,
                                          `hostPage`.`uri`,
                                          `host`.`scheme`,
@@ -652,33 +671,20 @@ class MySQL {
                                  FROM `hostPage`
                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
 
-                                 WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
-                                 AND `hostPage`.`timeBanned` IS NULL
+                                 WHERE (`hostPage`.`timeUpdated` IS NULL OR (`hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)))
+
+                                 AND `host`.`status` <> ?
+                                 AND `hostPage`.`timeBanned` IS NULL
 
                                  ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
 
-                                 LIMIT ' . (int) $limit);
+                                 LIMIT " . (int) $limit);
 
-    $query->execute([$timeFrom, 0]);
+    $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
 
     return $query->fetchAll();
   }
 
-  public function getHostPageCrawlQueueTotal(int $timeFrom) {
-
-    $query = $this->_db->prepare('SELECT COUNT(*) AS `total`
-
-                                  FROM `hostPage`
-                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
-
-                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> ?
-                                  AND `hostPage`.`timeBanned` IS NULL');
-
-    $query->execute([$timeFrom, 0]);
-
-    return $query->fetch()->total;
-  }
-
   public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {
 
     $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1');
diff --git a/public/api.php b/public/api.php
index cd6f9b1..48a8f2d 100644
--- a/public/api.php
+++ b/public/api.php
@@ -1,7 +1,7 @@
        'status' => true,
        'result' => [
          'config' => [
-          'websiteDomain'              => WEBSITE_DOMAIN,
-          'crawlUrlRegexp'             => CRAWL_URL_REGEXP,
-          'crawlHostDefaultNsfw'       => CRAWL_HOST_DEFAULT_NSFW,
-          'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
-          'crawlHostDefaultStatus'     => CRAWL_HOST_DEFAULT_STATUS,
-          'crawlHostDefaultMetaOnly'   => CRAWL_HOST_DEFAULT_META_ONLY,
-          'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
-          'crawlHostPageMimeIndex'     => CRAWL_PAGE_MIME_INDEX,
-          'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
-          'cleanHostSecondsOffset'     => CLEAN_HOST_SECONDS_OFFSET,
-          'crawlRobotsDefaultRules'    => CRAWL_ROBOTS_DEFAULT_RULES,
-          'crawlRobotsPostfixRules'    => CRAWL_ROBOTS_POSTFIX_RULES,
+          'websiteDomain'                  => WEBSITE_DOMAIN,
+          'crawlUrlRegexp'                 => CRAWL_URL_REGEXP,
+          'crawlHostDefaultNsfw'           => CRAWL_HOST_DEFAULT_NSFW,
+          'crawlHostDefaultPagesLimit'     => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
+          'crawlHostDefaultStatus'         => CRAWL_HOST_DEFAULT_STATUS,
+          'crawlHostDefaultMetaOnly'       => CRAWL_HOST_DEFAULT_META_ONLY,
+          'crawlHostPageSecondsOffset'     => CRAWL_PAGE_SECONDS_OFFSET,
+          'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
+          'crawlHostPageMimeIndex'         => CRAWL_PAGE_MIME_INDEX,
+          'crawlHostPageMimeSnapLocal'     => CRAWL_PAGE_MIME_SNAP_LOCAL,
+          'cleanHostSecondsOffset'         => CLEAN_HOST_SECONDS_OFFSET,
+          'crawlRobotsDefaultRules'        => CRAWL_ROBOTS_DEFAULT_RULES,
+          'crawlRobotsPostfixRules'        => CRAWL_ROBOTS_POSTFIX_RULES,
        ],
        'api' => [
          'version' => API_VERSION,
diff --git a/public/explore.php b/public/explore.php
index a959c70..a96986b 100644
--- a/public/explore.php
+++ b/public/explore.php
@@ -274,7 +274,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
diff --git a/public/search.php b/public/search.php
index 37cdacb..f4ebc70 100644
--- a/public/search.php
+++ b/public/search.php
@@ -321,7 +321,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
@@ -391,7 +391,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
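
For reference, a minimal sketch of how the crawler consumes the two offsets after
this patch. It assumes config/app.php is loaded and that $db is the library/mysql.php
wrapper already constructed in crontab/crawler.php; the values in the comments are
only the defaults from config/app.php.txt:

<?php

// Regular pages become eligible for re-crawl once per CRAWL_PAGE_SECONDS_OFFSET,
// home pages (uri = '/') once per the much shorter CRAWL_PAGE_HOME_SECONDS_OFFSET
$hostPageTimeFrom     = time() - CRAWL_PAGE_SECONDS_OFFSET;      // default ~12 months
$hostPageHomeTimeFrom = time() - CRAWL_PAGE_HOME_SECONDS_OFFSET; // default 7 days

foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, $hostPageTimeFrom, $hostPageHomeTimeFrom) as $queueHostPage) {

  // a home page last updated 8 days ago is returned here even though it is far
  // "younger" than CRAWL_PAGE_SECONDS_OFFSET; any other page that recent is skipped
}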
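
The warning in the new config comment can be sanity-checked with simple arithmetic.
The sketch below is only an illustration: the crontab period and the index size are
made-up numbers, and config/app.php is assumed to be loaded:

<?php

// The crawler touches at most CRAWL_PAGE_LIMIT pages per crontab run, so the pages
// it can process within one CRAWL_PAGE_SECONDS_OFFSET window must cover the whole
// index, otherwise old pages never get their turn in the queue
$crontabPeriod = 60;     // assumed: crontab/crawler.php runs once a minute
$indexedPages  = 100000; // assumed: e.g. the value reported by getHostPageCrawlQueueTotal()

$pagesPerWindow = CRAWL_PAGE_LIMIT * floor(CRAWL_PAGE_SECONDS_OFFSET / $crontabPeriod);

if ($pagesPerWindow < $indexedPages) {

  echo 'Queue cannot drain: increase CRAWL_PAGE_LIMIT or CRAWL_PAGE_SECONDS_OFFSET' . PHP_EOL;
}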
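
A side note on the deleteHostPageDoms() change: PDO::query() executes the statement
immediately and cannot bind the ? placeholder, so the previous version never received
$hostPageId; prepare()/execute() is the correct pattern. A minimal sketch, with $pdo
and $hostPageId standing in for the wrapped connection and the page id:

<?php

// prepare() compiles the statement with the placeholder, execute() binds the id,
// and rowCount() reports how many hostPageDom rows were removed for that page
$query = $pdo->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');

$query->execute([$hostPageId]);

$deleted = $query->rowCount();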