Browse Source

implement DNS resolver with memory cache feature #15

gemini
yggverse 8 months ago
parent
commit
1b8bcb084a
  1. 2
      README.md
  2. 4
      composer.json
  3. 45
      example/config.json
  4. 170
      src/cli/document/crawl.php

2
README.md

@ -31,7 +31,7 @@ To use `HTTP` version, please checkout [main branch](https://github.com/YGGverse
* `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb` * `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb`
* `dpkg -i manticore-repo.noarch.deb` * `dpkg -i manticore-repo.noarch.deb`
* `apt update` * `apt update`
* `apt install git composer manticore manticore-extra php-fpm php-mbstring` * `apt install git composer manticore manticore-extra memcached php-fpm php-mbstring php-memcached`
Yo search engine uses Manticore as the primary database. If your server is sensitive to power loss, Yo search engine uses Manticore as the primary database. If your server is sensitive to power loss,
change the default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1` change the default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1`

4
composer.json

@ -16,8 +16,8 @@
"require": { "require": {
"manticoresoftware/manticoresearch-php": "^3.1", "manticoresoftware/manticoresearch-php": "^3.1",
"yggverse/ftp": "^1.0", "yggverse/ftp": "^1.0",
"yggverse/net": "^1.3", "yggverse/net": "^1.7",
"yggverse/gemini": "^0.5", "yggverse/gemini": "^0.6",
"yggverse/yo-tools": "dev-main" "yggverse/yo-tools": "dev-main"
} }
} }

45
example/config.json

@ -21,6 +21,14 @@
} }
} }
}, },
"memcached":
{
"server":
{
"host":"127.0.0.1",
"port":11211
}
},
"gui": "gui":
{ {
"pagination": "pagination":
@ -117,7 +125,42 @@
"connection": "connection":
{ {
"timeout":3, "timeout":3,
"length":10485760 "length":10485760,
"options":
{
"ssl":
{
"verify_peer": false,
"verify_peer_name": false
}
}
},
"resolver":
{
"enabled":true,
"providers":
[
"1.1.1.1",
"8.8.8.8"
],
"records":
[
"A",
"AAAA"
],
"connection":
{
"timeout":5,
"delay":60
},
"result":
{
"shuffle":false,
"cache":
{
"timeout":3600
}
}
}, },
"queue": "queue":
{ {

170
src/cli/document/crawl.php

@ -17,8 +17,7 @@ $config = json_decode(
$semaphore = sem_get( $semaphore = sem_get(
crc32( crc32(
__DIR__ . '.yo.cli.document.crawl' __DIR__ . '.yo.cli.document.crawl'
), ), 1
1
); );
if (false === sem_acquire($semaphore, true)) if (false === sem_acquire($semaphore, true))
@ -125,6 +124,34 @@ catch (Exception $exception)
exit; exit;
} }
// Init memory
//
// Creates the Memcached client used as the DNS resolver cache and registers
// the server taken from the "memcached.server" section of the config.
//
// Memcached::addServer() reports failure through its boolean return value
// instead of throwing, so the result is checked explicitly and converted to
// an exception: this way a failed registration is reported by the same
// handler below and the crawler stops instead of running without a cache.
try
{
    $memory = new \Memcached();

    if (false === $memory->addServer(
        $config->memcached->server->host,
        $config->memcached->server->port
    )) {
        throw new Exception(
            sprintf(
                _('could not add memcached server %s:%d'),
                $config->memcached->server->host,
                $config->memcached->server->port
            )
        );
    }
}

catch (Exception $exception)
{
    // Report the failure only when error-level debug output is enabled
    if ($config->cli->document->crawl->debug->level->error)
    {
        echo sprintf(
            _('[%s] [error] %s') . PHP_EOL,
            date('c'),
            print_r(
                $exception,
                true
            )
        );
    }

    // The crawler cannot continue without the memory cache
    exit;
}
// Debug totals // Debug totals
if ($config->cli->document->crawl->debug->level->notice) if ($config->cli->document->crawl->debug->level->notice)
{ {
@ -172,6 +199,125 @@ foreach($index->search('')
$document->get('url'), $document->get('url'),
$config->manticore->index->document->name $config->manticore->index->document->name
); );
} // @TODO
// Init base address: the document URL exactly as stored in the index.
// It is never modified; its host name is used below for log output
// and as the TLS peer name (SNI) of the outgoing request
$base = new \Yggverse\Net\Address(
$document->get('url')
);
// Init worker address: starts from the same document URL, but when the
// custom resolver is enabled its host is replaced by the resolved one,
// and this address (not $base) is what the request is actually sent to
$address = new \Yggverse\Net\Address(
$document->get('url')
);
// Custom resolver enabled
if ($config->cli->document->crawl->resolver->enabled
&&
// Host still not resolved
\Yggverse\Net\Valid::domainHostName(
$address->getHost()
)
) {
// Build the cache key for this host: <index name>.<host name>.resolved
$id = sprintf(
    '%s.%s.resolved',
    $config->manticore->index->document->name,
    $address->getHost()
);

// Ask the memory cache first; an empty result is treated as a cache miss
$host = $memory->get($id);

if (!$host)
{
    // Cache miss: keep querying the resolver until it returns an address,
    // sleeping for the configured delay between failed attempts
    $attempt = 1;

    while (true)
    {
        // A fresh resolver and fresh output buffers for every attempt
        $resolve = new \Yggverse\Net\Resolve(
            $config->cli->document->crawl->resolver->records,
            $config->cli->document->crawl->resolver->providers,
            $config->cli->document->crawl->resolver->connection->timeout,
            $config->cli->document->crawl->resolver->result->shuffle
        );

        $result = [];
        $errors = [];

        $resolved = $resolve->address(
            $address,
            $result,
            $errors
        );

        // Failed attempt: optionally log it, wait, then try again
        if (!$resolved)
        {
            if ($config->cli->document->crawl->debug->level->warning)
            {
                printf(
                    _('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
                    date('c'),
                    $base->getHost(),
                    $attempt++,
                    print_r(
                        $errors,
                        true
                    )
                );
            }

            sleep(
                $config->cli->document->crawl->resolver->connection->delay
            );

            continue;
        }

        // Success: switch the worker address to the resolved one
        $address = $resolved;

        // Remember the resolved host; the expiration is passed as an
        // absolute time (now + configured cache timeout)
        $memory->set(
            $id,
            $address->getHost(),
            time() + $config->cli->document->crawl->resolver->result->cache->timeout
        );

        if ($config->cli->document->crawl->debug->level->notice)
        {
            printf(
                _('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
                date('c'),
                $base->getHost(),
                $address->getHost()
            );
        }

        break;
    }
}

else
{
    // Cache hit: reuse the stored host without contacting the resolver
    $address->setHost(
        $host
    );

    if ($config->cli->document->crawl->debug->level->notice)
    {
        printf(
            _('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
            date('c'),
            $base->getHost(),
            $address->getHost()
        );
    }
}
} }
// Update index time anyway and set reset code to 51 // Update index time anyway and set reset code to 51
@ -184,9 +330,21 @@ foreach($index->search('')
$document->getId() $document->getId()
); );
// Request remote URL // Prepare remote request
$request = new \Yggverse\Gemini\Client\Request( $request = new \Yggverse\Gemini\Client\Request(
$document->get('url') $address->get()
);
// Apply stream options
$request->setOptions(
[
'ssl' =>
[
'peer_name' => $base->getHost(), // SNI
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer,
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name
]
]
); );
$response = new \Yggverse\Gemini\Client\Response( $response = new \Yggverse\Gemini\Client\Response(
@ -301,10 +459,6 @@ foreach($index->search('')
); );
// Crawl links // Crawl links
$base = new \Yggverse\Net\Address(
$document->get('url')
);
$documents = []; $documents = [];
foreach ($body->getLinks() as $line) foreach ($body->getLinks() as $line)

Loading…
Cancel
Save