mirror of
https://github.com/YGGverse/Yo.git
synced 2025-01-29 16:04:20 +00:00
implement DNS resolver with memory cache feature #15
This commit is contained in:
parent
298322a4c3
commit
1b8bcb084a
@ -31,7 +31,7 @@ To use `HTTP` version, please checkout [main branch](https://github.com/YGGverse
|
||||
* `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb`
|
||||
* `dpkg -i manticore-repo.noarch.deb`
|
||||
* `apt update`
|
||||
* `apt install git composer manticore manticore-extra php-fpm php-mbstring`
|
||||
* `apt install git composer manticore manticore-extra memcached php-fpm php-mbstring php-memcached`
|
||||
|
||||
Yo search engine uses Manticore as the primary database. If your server is sensitive to power loss,
|
||||
change default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1`
|
||||
|
@ -16,8 +16,8 @@
|
||||
"require": {
|
||||
"manticoresoftware/manticoresearch-php": "^3.1",
|
||||
"yggverse/ftp": "^1.0",
|
||||
"yggverse/net": "^1.3",
|
||||
"yggverse/gemini": "^0.5",
|
||||
"yggverse/net": "^1.7",
|
||||
"yggverse/gemini": "^0.6",
|
||||
"yggverse/yo-tools": "dev-main"
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,14 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"memcached":
|
||||
{
|
||||
"server":
|
||||
{
|
||||
"host":"127.0.0.1",
|
||||
"port":11211
|
||||
}
|
||||
},
|
||||
"gui":
|
||||
{
|
||||
"pagination":
|
||||
@ -117,7 +125,42 @@
|
||||
"connection":
|
||||
{
|
||||
"timeout":3,
|
||||
"length":10485760
|
||||
"length":10485760,
|
||||
"options":
|
||||
{
|
||||
"ssl":
|
||||
{
|
||||
"verify_peer": false,
|
||||
"verify_peer_name": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"resolver":
|
||||
{
|
||||
"enabled":true,
|
||||
"providers":
|
||||
[
|
||||
"1.1.1.1",
|
||||
"8.8.8.8"
|
||||
],
|
||||
"records":
|
||||
[
|
||||
"A",
|
||||
"AAAA"
|
||||
],
|
||||
"connection":
|
||||
{
|
||||
"timeout":5,
|
||||
"delay":60
|
||||
},
|
||||
"result":
|
||||
{
|
||||
"shuffle":false,
|
||||
"cache":
|
||||
{
|
||||
"timeout":3600
|
||||
}
|
||||
}
|
||||
},
|
||||
"queue":
|
||||
{
|
||||
|
@ -17,8 +17,7 @@ $config = json_decode(
|
||||
$semaphore = sem_get(
|
||||
crc32(
|
||||
__DIR__ . '.yo.cli.document.crawl'
|
||||
),
|
||||
1
|
||||
), 1
|
||||
);
|
||||
|
||||
if (false === sem_acquire($semaphore, true))
|
||||
@ -125,6 +124,34 @@ catch (Exception $exception)
|
||||
exit;
|
||||
}
|
||||
|
||||
// Init memory
|
||||
try
|
||||
{
|
||||
$memory = new \Memcached();
|
||||
|
||||
$memory->addServer(
|
||||
$config->memcached->server->host,
|
||||
$config->memcached->server->port
|
||||
);
|
||||
}
|
||||
|
||||
catch (Exception $exception)
|
||||
{
|
||||
if ($config->cli->document->crawl->debug->level->error)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [error] %s') . PHP_EOL,
|
||||
date('c'),
|
||||
print_r(
|
||||
$exception,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Debug totals
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
@ -172,6 +199,125 @@ foreach($index->search('')
|
||||
$document->get('url'),
|
||||
$config->manticore->index->document->name
|
||||
);
|
||||
} // @TODO
|
||||
|
||||
// Init base address
|
||||
$base = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
// Init worker address
|
||||
$address = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
// Custom resolver enabled
|
||||
if ($config->cli->document->crawl->resolver->enabled
|
||||
&&
|
||||
// Host still not resolved
|
||||
\Yggverse\Net\Valid::domainHostName(
|
||||
$address->getHost()
|
||||
)
|
||||
) {
|
||||
// Generate memory ID
|
||||
$id = sprintf(
|
||||
'%s.%s.resolved',
|
||||
$config->manticore->index->document->name,
|
||||
$address->getHost()
|
||||
);
|
||||
|
||||
// Check for cached results
|
||||
if ($host = $memory->get($id))
|
||||
{
|
||||
$address->setHost(
|
||||
$host
|
||||
);
|
||||
|
||||
// Debug event
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$address->getHost()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Retry the lookup in a loop until the address is resolved
|
||||
else
|
||||
{
|
||||
$attempt = 1;
|
||||
|
||||
do
|
||||
{
|
||||
// Resolve begin
|
||||
$resolve = new \Yggverse\Net\Resolve(
|
||||
$config->cli->document->crawl->resolver->records,
|
||||
$config->cli->document->crawl->resolver->providers,
|
||||
$config->cli->document->crawl->resolver->connection->timeout,
|
||||
$config->cli->document->crawl->resolver->result->shuffle
|
||||
);
|
||||
|
||||
$result = [];
|
||||
$errors = [];
|
||||
|
||||
$resolved = $resolve->address(
|
||||
$address,
|
||||
$result,
|
||||
$errors
|
||||
);
|
||||
|
||||
if ($resolved)
|
||||
{
|
||||
// Update address
|
||||
$address = $resolved;
|
||||
|
||||
// Update cache
|
||||
$memory->set(
|
||||
$id,
|
||||
$address->getHost(),
|
||||
$config->cli->document->crawl->resolver->result->cache->timeout + time()
|
||||
);
|
||||
|
||||
// Debug event
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$address->getHost()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
// Log event
|
||||
if ($config->cli->document->crawl->debug->level->warning)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$attempt++,
|
||||
print_r(
|
||||
$errors,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Next connection delay
|
||||
sleep(
|
||||
$config->cli->document->crawl->resolver->connection->delay
|
||||
);
|
||||
}
|
||||
|
||||
} while (!$resolved);
|
||||
}
|
||||
}
|
||||
|
||||
// Update index time anyway and set reset code to 51
|
||||
@ -184,9 +330,21 @@ foreach($index->search('')
|
||||
$document->getId()
|
||||
);
|
||||
|
||||
// Request remote URL
|
||||
// Prepare remote request
|
||||
$request = new \Yggverse\Gemini\Client\Request(
|
||||
$document->get('url')
|
||||
$address->get()
|
||||
);
|
||||
|
||||
// Apply stream options
|
||||
$request->setOptions(
|
||||
[
|
||||
'ssl' =>
|
||||
[
|
||||
'peer_name' => $base->getHost(), // SNI
|
||||
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer,
|
||||
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name
|
||||
]
|
||||
]
|
||||
);
|
||||
|
||||
$response = new \Yggverse\Gemini\Client\Response(
|
||||
@ -301,10 +459,6 @@ foreach($index->search('')
|
||||
);
|
||||
|
||||
// Crawl links
|
||||
$base = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
$documents = [];
|
||||
|
||||
foreach ($body->getLinks() as $line)
|
||||
|
Loading…
x
Reference in New Issue
Block a user