Browse Source

replace memcached with Yggverse\Cache\Memory API

main
ghost 1 year ago
parent
commit
a27cb61f69
  1. 39
      src/crontab/crawler.php
  2. 38
      src/library/helper.php
  3. 18
      src/public/search.php

39
src/crontab/crawler.php

@ -74,11 +74,10 @@ try {
exit; exit;
} }
// Connect memcached // Connect Yggverse\Cache\Memory
try { try {
$memcached = new Memcached(); $memory = new Yggverse\Cache\Memory(MEMCACHED_HOST, MEMCACHED_PORT, MEMCACHED_NAMESPACE, MEMCACHED_TIMEOUT + time());
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
} catch(Exception $e) { } catch(Exception $e) {
@ -100,21 +99,21 @@ if (CRAWL_YGGSTATE) {
try { try {
if (!$memcached->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) { if (!$memory->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) {
$yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password); $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);
foreach ($yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) { foreach ($yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) {
// Register new host // Register new host
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, sprintf('http://[%s]/', $yggStatePeer->address))) {
$hostsAdded += count($linkToDBresult->new->hostId); $hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId); $hostPagesAdded += count($linkToDBresult->new->hostPageId);
} }
} }
$memcached->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout); $memory->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout);
} }
} catch(Exception $e) { } catch(Exception $e) {
@ -154,7 +153,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Update robots.txt rules // Update robots.txt rules
if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) { if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) {
Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent()); Helper::setHostSetting($db, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
} }
} }
@ -163,8 +162,8 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Look for custom sitemap URL served in robots.txt // Look for custom sitemap URL served in robots.txt
$robots = new Robots( $robots = new Robots(
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
); );
if ($sitemapLink = $robots->getSitemap()) { if ($sitemapLink = $robots->getSitemap()) {
@ -204,7 +203,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
} }
// Register new link // Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $loc)) {
$hostsAdded += count($linkToDBresult->new->hostId); $hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId); $hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -217,7 +216,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
if (CRAWL_MANIFEST) { if (CRAWL_MANIFEST) {
// Host have manifest provided // Host have manifest provided
if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) { if ($manifestURL = Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
// Get remote manifest // Get remote manifest
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
@ -269,13 +268,13 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Skip processing on remote host URL does not match local condition // Skip processing on remote host URL does not match local condition
if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP != if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP !=
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) { Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
continue; continue;
} }
// Skip processing on remote host link does not match local condition // Skip processing on remote host link does not match local condition
if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), if (false === preg_match(Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
$remoteManifest->result->api->hosts)) { $remoteManifest->result->api->hosts)) {
continue; continue;
@ -324,7 +323,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
} }
// Register new link // Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $remoteManifestHost->url)) {
$hostsAdded += count($linkToDBresult->new->hostId); $hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId); $hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -431,7 +430,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
} }
// Register new link // Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $url)) {
$hostsAdded += count($linkToDBresult->new->hostId); $hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId); $hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -480,7 +479,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
// Check for MIME // Check for MIME
$hostPageInMime = false; $hostPageInMime = false;
foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) { foreach ((array) explode(',', Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
// Ban page on MIME type not allowed in settings // Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
@ -735,11 +734,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
$metaTitle, $metaTitle,
$metaDescription ? Filter::pageDescription($metaDescription) : null, $metaDescription ? Filter::pageDescription($metaDescription) : null,
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null, $metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
$content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null, $content ? (Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
time()); time());
// Collect page DOM elements data on enabled // Collect page DOM elements data on enabled
if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) { if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
// Begin selectors extraction // Begin selectors extraction
$html = str_get_html($content); $html = str_get_html($content);
@ -753,7 +752,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
$db->addHostPageDom($queueHostPage->hostPageId, $db->addHostPageDom($queueHostPage->hostPageId,
time(), time(),
$selector, $selector,
trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/', trim(Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
' ', ' ',
str_replace(['<br />', '<br/>', '<br>', '</'], str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'], [' ', ' ', ' ', ' </'],
@ -1028,7 +1027,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
} }
// Register new link // Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $link['href'])) {
// Increase new hosts counters // Increase new hosts counters
if ($linkToDBresult->new->hostId) { if ($linkToDBresult->new->hostId) {

38
src/library/helper.php

@ -5,48 +5,42 @@ require_once __DIR__ . '/../../vendor/autoload.php';
class Helper { class Helper {
public static function getHostSetting(MySQL $db, public static function getHostSettingValue(MySQL $db,
Memcached $memcached, Yggverse\Cache\Memory $memory,
int $hostId, int $hostId,
string $key, string $key,
mixed $defaultValue) : mixed { mixed $defaultValue) : mixed {
if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) { if (false !== $value = $memory->getByMethodCallback(
$db, 'findHostSettingValue', [$hostId, $key], time() + 3600
)) {
return $value; return $value;
}
if (!$value = $db->findHostSettingValue($hostId, $key)) { } else {
$value = $defaultValue; return $defaultValue;
} }
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $value;
} }
public static function setHostSetting(MySQL $db, public static function setHostSetting(MySQL $db,
Memcached $memcached,
int $hostId, int $hostId,
string $key, string $key,
mixed $value) : int { mixed $value) : int {
if ($hostSetting = $db->findHostSetting($hostId, $key)) { if ($hostSetting = $db->findHostSetting($hostId, $key)) {
$rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time()); return $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
} else { } else {
$rowsAffected = $db->addHostSetting($hostId, $key, $value, time()); return $db->addHostSetting($hostId, $key, $value, time());
} }
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600); // @TODO update cache
return $rowsAffected;
} }
public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed { public static function addLinkToDB(MySQL $db, Yggverse\Cache\Memory $memory, string $link) : mixed {
// Define variables // Define variables
$result = (object) $result = (object)
@ -79,7 +73,7 @@ class Helper {
if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) { if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {
// Make sure host URL compatible with this host rules before continue // Make sure host URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) { if (!preg_match(self::getHostSettingValue($db, $memory, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
return false; return false;
} }
@ -131,21 +125,21 @@ class Helper {
} else { } else {
// Make sure host page URL compatible with this host rules before continue // Make sure host page URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) { if (!preg_match(self::getHostSettingValue($db, $memory, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
return false; return false;
} }
// Validate page limits for this host // Validate page limits for this host
if ($db->getTotalHostPages($hostId) >= self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) { if ($db->getTotalHostPages($hostId) >= self::getHostSettingValue($db, $memory, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
return false; return false;
} }
// Validate ROBOTS.TXT // Validate ROBOTS.TXT
$robots = new Robots( $robots = new Robots(
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
); );
if (!$robots->uriAllowed($link->page->uri)) { if (!$robots->uriAllowed($link->page->uri)) {

18
src/public/search.php

@ -32,22 +32,6 @@ try {
exit; exit;
} }
// Connect memcached
// @TODO
// legacy, upgrade to yggverse/cache instead
// https://github.com/YGGverse/cache-php
try {
$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect Yggverse\Cache\Memory // Connect Yggverse\Cache\Memory
try { try {
@ -94,7 +78,7 @@ if (Yggverse\Parser\Url::is($q)) {
$db->beginTransaction(); $db->beginTransaction();
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) { if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $q)) {
if (count($linkToDBresult->new->hostPageId)) { if (count($linkToDBresult->new->hostPageId)) {

Loading…
Cancel
Save