Browse Source

replace memcached with Yggverse\Cache\Memory API

main
ghost 1 year ago
parent
commit
a27cb61f69
  1. 39
      src/crontab/crawler.php
  2. 38
      src/library/helper.php
  3. 18
      src/public/search.php

39
src/crontab/crawler.php

@ -74,11 +74,10 @@ try { @@ -74,11 +74,10 @@ try {
exit;
}
// Connect memcached
// Connect Yggverse\Cache\Memory
try {
$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
$memory = new Yggverse\Cache\Memory(MEMCACHED_HOST, MEMCACHED_PORT, MEMCACHED_NAMESPACE, MEMCACHED_TIMEOUT + time());
} catch(Exception $e) {
@ -100,21 +99,21 @@ if (CRAWL_YGGSTATE) { @@ -100,21 +99,21 @@ if (CRAWL_YGGSTATE) {
try {
if (!$memcached->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) {
if (!$memory->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) {
$yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);
foreach ($yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) {
// Register new host
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, sprintf('http://[%s]/', $yggStatePeer->address))) {
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
}
}
$memcached->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout);
$memory->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout);
}
} catch(Exception $e) {
@ -154,7 +153,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -154,7 +153,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Update robots.txt rules
if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) {
Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
Helper::setHostSetting($db, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
}
}
@ -163,8 +162,8 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -163,8 +162,8 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Look for custom sitemap URL served in robots.txt
$robots = new Robots(
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
);
if ($sitemapLink = $robots->getSitemap()) {
@ -204,7 +203,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -204,7 +203,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
}
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $loc)) {
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -217,7 +216,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -217,7 +216,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
if (CRAWL_MANIFEST) {
// Host has a manifest provided
if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
if ($manifestURL = Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
// Get remote manifest
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
@ -269,13 +268,13 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -269,13 +268,13 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Skip processing when the remote host URL does not match the local condition
if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP !=
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
continue;
}
// Skip processing when the remote host link does not match the local condition
if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
if (false === preg_match(Helper::getHostSettingValue($db, $memory, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
$remoteManifest->result->api->hosts)) {
continue;
@ -324,7 +323,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF @@ -324,7 +323,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
}
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $remoteManifestHost->url)) {
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -431,7 +430,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ @@ -431,7 +430,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
}
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $url)) {
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
@ -480,7 +479,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ @@ -480,7 +479,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
// Check for MIME
$hostPageInMime = false;
foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
foreach ((array) explode(',', Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
// Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
@ -735,11 +734,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ @@ -735,11 +734,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
$metaTitle,
$metaDescription ? Filter::pageDescription($metaDescription) : null,
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
$content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
$content ? (Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
time());
// Collect page DOM elements data when enabled
if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
// Begin selectors extraction
$html = str_get_html($content);
@ -753,7 +752,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ @@ -753,7 +752,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
$db->addHostPageDom($queueHostPage->hostPageId,
time(),
$selector,
trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
trim(Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
@ -1028,7 +1027,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_ @@ -1028,7 +1027,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
}
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $link['href'])) {
// Increase new hosts counters
if ($linkToDBresult->new->hostId) {

38
src/library/helper.php

@ -5,48 +5,42 @@ require_once __DIR__ . '/../../vendor/autoload.php'; @@ -5,48 +5,42 @@ require_once __DIR__ . '/../../vendor/autoload.php';
class Helper {
public static function getHostSetting(MySQL $db,
Memcached $memcached,
public static function getHostSettingValue(MySQL $db,
Yggverse\Cache\Memory $memory,
int $hostId,
string $key,
mixed $defaultValue) : mixed {
if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) {
if (false !== $value = $memory->getByMethodCallback(
$db, 'findHostSettingValue', [$hostId, $key], time() + 3600
)) {
return $value;
}
if (!$value = $db->findHostSettingValue($hostId, $key)) {
} else {
$value = $defaultValue;
return $defaultValue;
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $value;
}
public static function setHostSetting(MySQL $db,
Memcached $memcached,
int $hostId,
string $key,
mixed $value) : int {
if ($hostSetting = $db->findHostSetting($hostId, $key)) {
$rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
return $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
} else {
$rowsAffected = $db->addHostSetting($hostId, $key, $value, time());
return $db->addHostSetting($hostId, $key, $value, time());
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $rowsAffected;
// @TODO update cache
}
public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed {
public static function addLinkToDB(MySQL $db, Yggverse\Cache\Memory $memory, string $link) : mixed {
// Define variables
$result = (object)
@ -79,7 +73,7 @@ class Helper { @@ -79,7 +73,7 @@ class Helper {
if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {
// Make sure host URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
if (!preg_match(self::getHostSettingValue($db, $memory, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
return false;
}
@ -131,21 +125,21 @@ class Helper { @@ -131,21 +125,21 @@ class Helper {
} else {
// Make sure host page URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
if (!preg_match(self::getHostSettingValue($db, $memory, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
return false;
}
// Validate page limits for this host
if ($db->getTotalHostPages($hostId) >= self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
if ($db->getTotalHostPages($hostId) >= self::getHostSettingValue($db, $memory, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
return false;
}
// Validate ROBOTS.TXT
$robots = new Robots(
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
self::getHostSettingValue($db, $memory, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
);
if (!$robots->uriAllowed($link->page->uri)) {

18
src/public/search.php

@ -32,22 +32,6 @@ try { @@ -32,22 +32,6 @@ try {
exit;
}
// Connect memcached
// @TODO
// legacy, upgrade to yggverse/cache instead
// https://github.com/YGGverse/cache-php
try {
$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect Yggverse\Cache\Memory
try {
@ -94,7 +78,7 @@ if (Yggverse\Parser\Url::is($q)) { @@ -94,7 +78,7 @@ if (Yggverse\Parser\Url::is($q)) {
$db->beginTransaction();
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $q)) {
if (count($linkToDBresult->new->hostPageId)) {

Loading…
Cancel
Save