Browse Source

implement DNS resolver with memory cache feature #15

gemini
yggverse 9 months ago
parent
commit
1b8bcb084a
  1. 2
      README.md
  2. 4
      composer.json
  3. 45
      example/config.json
  4. 170
      src/cli/document/crawl.php

2
README.md

@ -31,7 +31,7 @@ To use `HTTP` version, please checkout [main branch](https://github.com/YGGverse @@ -31,7 +31,7 @@ To use `HTTP` version, please checkout [main branch](https://github.com/YGGverse
* `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb`
* `dpkg -i manticore-repo.noarch.deb`
* `apt update`
* `apt install git composer manticore manticore-extra php-fpm php-mbstring`
* `apt install git composer manticore manticore-extra memcached php-fpm php-mbstring php-memcached`
The Yo search engine uses Manticore as its primary database. If your server is sensitive to power loss,
change the default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1`

4
composer.json

@ -16,8 +16,8 @@ @@ -16,8 +16,8 @@
"require": {
"manticoresoftware/manticoresearch-php": "^3.1",
"yggverse/ftp": "^1.0",
"yggverse/net": "^1.3",
"yggverse/gemini": "^0.5",
"yggverse/net": "^1.7",
"yggverse/gemini": "^0.6",
"yggverse/yo-tools": "dev-main"
}
}

45
example/config.json

@ -21,6 +21,14 @@ @@ -21,6 +21,14 @@
}
}
},
"memcached":
{
"server":
{
"host":"127.0.0.1",
"port":11211
}
},
"gui":
{
"pagination":
@ -117,7 +125,42 @@ @@ -117,7 +125,42 @@
"connection":
{
"timeout":3,
"length":10485760
"length":10485760,
"options":
{
"ssl":
{
"verify_peer": false,
"verify_peer_name": false
}
}
},
"resolver":
{
"enabled":true,
"providers":
[
"1.1.1.1",
"8.8.8.8"
],
"records":
[
"A",
"AAAA"
],
"connection":
{
"timeout":5,
"delay":60
},
"result":
{
"shuffle":false,
"cache":
{
"timeout":3600
}
}
},
"queue":
{

170
src/cli/document/crawl.php

@ -17,8 +17,7 @@ $config = json_decode( @@ -17,8 +17,7 @@ $config = json_decode(
$semaphore = sem_get(
crc32(
__DIR__ . '.yo.cli.document.crawl'
),
1
), 1
);
if (false === sem_acquire($semaphore, true))
@ -125,6 +124,34 @@ catch (Exception $exception) @@ -125,6 +124,34 @@ catch (Exception $exception)
exit;
}
// Init memory
//
// Creates the Memcached client stored in $memory and registers the server
// configured under memcached.server (host / port). On failure the error is
// printed when the "error" debug level is enabled, then the script exits.
try
{
  $memory = new \Memcached();

  // addServer() signals failure through its boolean result instead of
  // throwing, so convert a false result into an exception; otherwise the
  // handler below would never see a rejected server definition
  $added = $memory->addServer(
    $config->memcached->server->host,
    $config->memcached->server->port
  );

  if (false === $added)
  {
    throw new \RuntimeException(
      sprintf(
        _('could not add memcached server "%s:%s"'),
        $config->memcached->server->host,
        $config->memcached->server->port
      )
    );
  }
}
catch (Exception $exception)
{
  // Report the failure only when error-level debug output is enabled
  if ($config->cli->document->crawl->debug->level->error)
  {
    echo sprintf(
      _('[%s] [error] %s') . PHP_EOL,
      date('c'),
      print_r(
        $exception,
        true
      )
    );
  }

  // The cache client is unavailable: stop here in either case
  exit;
}
// Debug totals
if ($config->cli->document->crawl->debug->level->notice)
{
@ -172,6 +199,125 @@ foreach($index->search('') @@ -172,6 +199,125 @@ foreach($index->search('')
$document->get('url'),
$config->manticore->index->document->name
);
} // @TODO
// Resolve the document host through the custom DNS resolver, using $memory
// as a cache for already resolved hosts.
//
// Two address objects are built from the same document URL:
//   $base    - never modified in this section; used below to print the
//              original host name in log messages
//   $address - working copy whose host is replaced by the resolved value

// Init base address
$base = new \Yggverse\Net\Address(
$document->get('url')
);
// Init worker address
$address = new \Yggverse\Net\Address(
$document->get('url')
);
// Custom resolver enabled in config
if ($config->cli->document->crawl->resolver->enabled
&&
// Host passes the domain host name check, i.e. it still has to be resolved
// (presumably false for hosts that are already IP literals - verify against Valid::domainHostName)
\Yggverse\Net\Valid::domainHostName(
$address->getHost()
)
) {
// Generate memory ID in the form "<document index name>.<host>.resolved"
$id = sprintf(
'%s.%s.resolved',
$config->manticore->index->document->name,
$address->getHost()
);
// Check for cached results; any falsy value returned by get() is handled as a cache miss
if ($host = $memory->get($id))
{
// Replace the working address host with the cached value
$address->setHost(
$host
);
// Debug event
if ($config->cli->document->crawl->debug->level->notice)
{
echo sprintf(
_('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
date('c'),
$base->getHost(),
$address->getHost()
);
}
}
// Cache miss: retry the lookup until the address is resolved
// NOTE(review): the loop has no attempt limit, so a host that never
// resolves keeps this process waiting here - confirm this is intended
else
{
// Attempt counter, used by the warning message only
$attempt = 1;
do
{
// Resolve begin: a new resolver is built on every attempt from the
// configured record types, providers, connection timeout and shuffle flag
$resolve = new \Yggverse\Net\Resolve(
$config->cli->document->crawl->resolver->records,
$config->cli->document->crawl->resolver->providers,
$config->cli->document->crawl->resolver->connection->timeout,
$config->cli->document->crawl->resolver->result->shuffle
);
// Reset the buffers handed to the resolver on each attempt
// (presumably filled by reference - $errors is printed in the warning below)
$result = [];
$errors = [];
$resolved = $resolve->address(
$address,
$result,
$errors
);
// A truthy return value is used as the new working address
if ($resolved)
{
// Update address
$address = $resolved;
// Update cache; the expiration argument is the current Unix time plus the configured cache timeout
$memory->set(
$id,
$address->getHost(),
$config->cli->document->crawl->resolver->result->cache->timeout + time()
);
// Debug event
if ($config->cli->document->crawl->debug->level->notice)
{
echo sprintf(
_('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
date('c'),
$base->getHost(),
$address->getHost()
);
}
}
else
{
// Log event
// $attempt is incremented only inside this branch, so it advances only
// while warning-level output is enabled
if ($config->cli->document->crawl->debug->level->warning)
{
echo sprintf(
_('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
date('c'),
$base->getHost(),
$attempt++,
print_r(
$errors,
true
)
);
}
// Next connection delay: the configured value is passed to sleep() before the next attempt
sleep(
$config->cli->document->crawl->resolver->connection->delay
);
}
} while (!$resolved);
}
}
// Update index time anyway and set reset code to 51
@ -184,9 +330,21 @@ foreach($index->search('') @@ -184,9 +330,21 @@ foreach($index->search('')
$document->getId()
);
// Request remote URL
// Prepare remote request
$request = new \Yggverse\Gemini\Client\Request(
$document->get('url')
$address->get()
);
// Apply stream options
$request->setOptions(
[
'ssl' =>
[
'peer_name' => $base->getHost(), // SNI
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer,
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name
]
]
);
$response = new \Yggverse\Gemini\Client\Response(
@ -301,10 +459,6 @@ foreach($index->search('') @@ -301,10 +459,6 @@ foreach($index->search('')
);
// Crawl links
$base = new \Yggverse\Net\Address(
$document->get('url')
);
$documents = [];
foreach ($body->getLinks() as $line)

Loading…
Cancel
Save