From 5df59661d8c191cd7e071f589949b56c41c518f6 Mon Sep 17 00:00:00 2001
From: ghost
Date: Wed, 2 Aug 2023 21:21:23 +0300
Subject: [PATCH] add page rank update optional in the crawl queue

---
Notes (added in review; text between "---" and the diffstat is not part of
the commit message):
- Adds the CRAWL_PAGE_RANK_UPDATE flag (default true) to config/app.php.txt
  and wraps the page rank re-calculation in crontab/crawler.php in a check
  of that flag.
- Leading whitespace inside the hunks was lost in this copy and has been
  reconstructed; context/removed lines may not match the target files
  byte-for-byte, so applying may need `git apply --ignore-whitespace`.
- NOTE(review): strpos($hostPageSource->httpCode, '30') is true for any code
  containing "30" (e.g. 530), not only 30x redirects; presumably only 30x is
  intended. Behaviour is unchanged by this patch; confirm separately.

 config/app.php.txt  | 10 ++++++++++
 crontab/crawler.php | 29 ++++++++++++++++-------------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/config/app.php.txt b/config/app.php.txt
index a01f713..3e8a1d6 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -321,6 +321,16 @@ define('CRAWL_HOST_DEFAULT_NSFW', false);
  */
 define('CRAWL_SITEMAPS', true);
 
+/*
+ * Re-calculate page rank on page update in the crawl queue
+ *
+ * When enabled, may increase execution time (extra referrer lookups per page)
+ *
+ * true|false
+ *
+ */
+define('CRAWL_PAGE_RANK_UPDATE', true);
+
 /*
  * Renew robots.txt index by timing offset provided
  *
diff --git a/crontab/crawler.php b/crontab/crawler.php
index b326b65..f302bb4 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -357,26 +357,29 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
     $httpRequestsTimeTotal += $curl->getTotalTime();
 
     // Update page rank
-    // @TODO add common method
+    if (CRAWL_PAGE_RANK_UPDATE) {
 
-    $hostPageRank = 0;
+      // @TODO add common method
 
-    // Get referrers
-    foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
+      $hostPageRank = 0;
 
-      // Get source page details
-      if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
+      // Get referrers
+      foreach ($db->getHostPagesToHostPageByHostPageIdTarget($queueHostPage->hostPageId) as $hostPageToHostPageByHostPageIdTarget) {
 
-        // Increase PR on external referrer only
-        if ($hostPageSource->hostId != $queueHostPage->hostId) {
+        // Get source page details
+        if ($hostPageSource = $db->getHostPage($hostPageToHostPageByHostPageIdTarget->hostPageIdSource)) {
 
-          $hostPageRank++;
-        }
+          // Increase page rank on external (other host) referrer only
+          if ($hostPageSource->hostId != $queueHostPage->hostId) {
+
+            $hostPageRank++;
+          }
 
-        // Delegate page rank value from redirected pages
-        if (false !== strpos($hostPageSource->httpCode, '30')) {
+          // Delegate page rank value from redirected pages (HTTP code contains '30')
+          if (false !== strpos($hostPageSource->httpCode, '30')) {
 
-          $hostPageRank += $hostPageSource->rank;
+            $hostPageRank += $hostPageSource->rank;
+          }
         }
       }
     }