diff --git a/src/crontab/crawler.php b/src/crontab/crawler.php index 117c4d5..cc86a12 100644 --- a/src/crontab/crawler.php +++ b/src/crontab/crawler.php @@ -74,11 +74,10 @@ try { exit; } -// Connect memcached +// Connect Yggverse\Cache\Memory try { - $memcached = new Memcached(); - $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT); + $memory = new Yggverse\Cache\Memory(MEMCACHED_HOST, MEMCACHED_PORT, MEMCACHED_NAMESPACE, MEMCACHED_TIMEOUT + time()); } catch(Exception $e) { @@ -100,21 +99,21 @@ if (CRAWL_YGGSTATE) { try { - if (!$memcached->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) { + if (!$memory->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) { $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password); foreach ($yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) { // Register new host - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, sprintf('http://[%s]/', $yggStatePeer->address))) { $hostsAdded += count($linkToDBresult->new->hostId); $hostPagesAdded += count($linkToDBresult->new->hostPageId); } } - $memcached->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout); + $memory->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout); } } catch(Exception $e) { @@ -154,7 +153,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Update robots.txt rules if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) { - Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent()); + Helper::setHostSetting($db, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent()); } } @@ -163,8 +162,8 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Look for custom sitemap URL served in robots.txt $robots = new Robots( - Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . - Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) + Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . + Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) ); if ($sitemapLink = $robots->getSitemap()) { @@ -204,7 +203,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF } // Register new link - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $loc)) { $hostsAdded += count($linkToDBresult->new->hostId); $hostPagesAdded += count($linkToDBresult->new->hostPageId); @@ -217,7 +216,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF if (CRAWL_MANIFEST) { // Host have manifest provided - if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) { + if ($manifestURL = Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'MANIFEST_URL', NULL)) { // Get remote manifest $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); @@ -269,13 +268,13 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Skip processing on remote host URL does not match local condition if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP != - Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) { + Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) { continue; } // Skip processing on remote host link does not match local condition - if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), + if (false === preg_match(Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $remoteManifest->result->api->hosts)) { continue; @@ -324,7 +323,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF } // Register new link - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $remoteManifestHost->url)) { $hostsAdded += count($linkToDBresult->new->hostId); $hostPagesAdded += count($linkToDBresult->new->hostPageId); @@ -431,7 +430,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ } // Register new link - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $url)) { $hostsAdded += count($linkToDBresult->new->hostId); $hostPagesAdded += count($linkToDBresult->new->hostPageId); @@ -480,7 +479,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ // Check for MIME $hostPageInMime = false; - foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) { + foreach ((array) explode(',', Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) { // Ban page on MIME type not allowed in settings if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { @@ -735,11 +734,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ $metaTitle, $metaDescription ? Filter::pageDescription($metaDescription) : null, $metaKeywords ? Filter::pageKeywords($metaKeywords) : null, - $content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null, + $content ? (Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null, time()); // Collect page DOM elements data on enabled - if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) { + if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) { // Begin selectors extraction $html = str_get_html($content); @@ -753,7 +752,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ $db->addHostPageDom($queueHostPage->hostPageId, time(), $selector, - trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/', + trim(Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/', ' ', str_replace(['
', '
', '
', 'getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ } // Register new link - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $link['href'])) { // Increase new hosts counters if ($linkToDBresult->new->hostId) { diff --git a/src/library/helper.php b/src/library/helper.php index 14ddfab..6fda9b7 100644 --- a/src/library/helper.php +++ b/src/library/helper.php @@ -5,48 +5,42 @@ require_once __DIR__ . '/../../vendor/autoload.php'; class Helper { - public static function getHostSetting(MySQL $db, - Memcached $memcached, + public static function getHostSettingValue(MySQL $db, + Yggverse\Cache\Memory $memory, int $hostId, string $key, mixed $defaultValue) : mixed { - if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) { + if (false !== $value = $memory->getByMethodCallback( + $db, 'findHostSettingValue', [$hostId, $key], time() + 3600 + )) { return $value; - } - if (!$value = $db->findHostSettingValue($hostId, $key)) { + } else { - $value = $defaultValue; + return $defaultValue; } - - $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600); - - return $value; } public static function setHostSetting(MySQL $db, - Memcached $memcached, int $hostId, string $key, mixed $value) : int { if ($hostSetting = $db->findHostSetting($hostId, $key)) { - $rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time()); + return $db->updateHostSetting($hostSetting->hostSettingId, $value, time()); } else { - $rowsAffected = $db->addHostSetting($hostId, $key, $value, time()); + return $db->addHostSetting($hostId, $key, $value, time()); } - $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600); - - return $rowsAffected; + // @TODO update cache } - public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed { + public static function addLinkToDB(MySQL $db, Yggverse\Cache\Memory $memory, string $link) : mixed { // Define variables $result = (object) @@ -79,7 +73,7 @@ class Helper { if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) { // Make sure host URL compatible with this host rules before continue - if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) { + if (!preg_match(self::getHostSettingValue($db, $memory, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) { return false; } @@ -131,21 +125,21 @@ class Helper { } else { // Make sure host page URL compatible with this host rules before continue - if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) { + if (!preg_match(self::getHostSettingValue($db, $memory, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) { return false; } // Validate page limits for this host - if ($db->getTotalHostPages($hostId) >= self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) { + if ($db->getTotalHostPages($hostId) >= self::getHostSettingValue($db, $memory, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) { return false; } // Validate ROBOTS.TXT $robots = new Robots( - self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . - self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) + self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . + self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) ); if (!$robots->uriAllowed($link->page->uri)) { diff --git a/src/public/search.php b/src/public/search.php index f27d2f3..773cc62 100644 --- a/src/public/search.php +++ b/src/public/search.php @@ -32,22 +32,6 @@ try { exit; } -// Connect memcached -// @TODO -// legacy, upgrade to yggverse/cache instead -// https://github.com/YGGverse/cache-php -try { - - $memcached = new Memcached(); - $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT); - -} catch(Exception $e) { - - var_dump($e); - - exit; -} - // Connect Yggverse\Cache\Memory try { @@ -94,7 +78,7 @@ if (Yggverse\Parser\Url::is($q)) { $db->beginTransaction(); - if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) { + if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $q)) { if (count($linkToDBresult->new->hostPageId)) {