|
|
@ -17,8 +17,7 @@ $config = json_decode( |
|
|
|
$semaphore = sem_get( |
|
|
|
$semaphore = sem_get( |
|
|
|
crc32( |
|
|
|
crc32( |
|
|
|
__DIR__ . '.yo.cli.document.crawl' |
|
|
|
__DIR__ . '.yo.cli.document.crawl' |
|
|
|
), |
|
|
|
), 1 |
|
|
|
1 |
|
|
|
|
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
if (false === sem_acquire($semaphore, true)) |
|
|
|
if (false === sem_acquire($semaphore, true)) |
|
|
@ -125,6 +124,34 @@ catch (Exception $exception) |
|
|
|
exit; |
|
|
|
exit; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Init memory
//
// Creates the Memcached client stored in $memory and registers the server
// configured under $config->memcached->server (host, port). On failure the
// error is printed when the "error" debug level is enabled, then the script
// terminates.
try
{
    $memory = new \Memcached();

    // addServer() reports failure through its boolean return value instead of
    // throwing, so turn a false result into an exception: this routes it into
    // the error path below rather than continuing with an unusable cache
    if (false === $memory->addServer(
        $config->memcached->server->host,
        $config->memcached->server->port
    )) {
        throw new Exception(
            _('Could not add memcached server')
        );
    }
}

catch (Exception $exception)
{
    // Report the failure only when error-level debug output is enabled
    if ($config->cli->document->crawl->debug->level->error)
    {
        echo sprintf(
            _('[%s] [error] %s') . PHP_EOL,
            date('c'),
            print_r(
                $exception,
                true
            )
        );
    }

    // Stop the script regardless of the debug settings
    exit;
}
|
|
|
|
|
|
|
|
|
|
|
// Debug totals |
|
|
|
// Debug totals |
|
|
|
if ($config->cli->document->crawl->debug->level->notice) |
|
|
|
if ($config->cli->document->crawl->debug->level->notice) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -172,6 +199,125 @@ foreach($index->search('') |
|
|
|
$document->get('url'), |
|
|
|
$document->get('url'), |
|
|
|
$config->manticore->index->document->name |
|
|
|
$config->manticore->index->document->name |
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
} // @TODO |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Init base address
//
// Address built from the stored document URL; its host is not modified in this
// section and is the one printed in the log messages below
$base = new \Yggverse\Net\Address(
    $document->get('url')
);

// Init worker address
//
// Second instance for the same URL; this is the one whose host gets replaced
// by a cached or freshly resolved value
$address = new \Yggverse\Net\Address(
    $document->get('url')
);

// Run the custom resolver only when it is enabled and the host still needs
// resolving
// NOTE(review): domainHostName() presumably matches domain names only (not
// literal IP addresses) - confirm against \Yggverse\Net\Valid
if (
    $config->cli->document->crawl->resolver->enabled
    &&
    \Yggverse\Net\Valid::domainHostName($address->getHost())
)
{
    // Generate memory ID in form "<index name>.<host>.resolved"
    $id = sprintf(
        '%s.%s.resolved',
        $config->manticore->index->document->name,
        $address->getHost()
    );

    // Check for cached results
    $host = $memory->get($id);

    if ($host)
    {
        // Cache hit: reuse the stored host for the worker address
        $address->setHost(
            $host
        );

        // Debug event
        if ($config->cli->document->crawl->debug->level->notice)
        {
            printf(
                _('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
                date('c'),
                $base->getHost(),
                $address->getHost()
            );
        }
    }

    // Nothing usable in cache: keep trying until the address gets resolved
    else
    {
        // Attempt number reported in the warning message only
        $attempt = 1;

        while (true)
        {
            // Resolve begin: a new resolver instance is built for every attempt
            $resolve = new \Yggverse\Net\Resolve(
                $config->cli->document->crawl->resolver->records,
                $config->cli->document->crawl->resolver->providers,
                $config->cli->document->crawl->resolver->connection->timeout,
                $config->cli->document->crawl->resolver->result->shuffle
            );

            // Reset the containers handed over to the resolver
            // (presumably filled by reference - $errors is printed on failure)
            $result = [];
            $errors = [];

            $resolved = $resolve->address(
                $address,
                $result,
                $errors
            );

            // Failed attempt: report, wait and try again
            if (!$resolved)
            {
                // Log event
                if ($config->cli->document->crawl->debug->level->warning)
                {
                    printf(
                        _('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
                        date('c'),
                        $base->getHost(),
                        $attempt++,
                        print_r(
                            $errors,
                            true
                        )
                    );
                }

                // Next connection delay
                sleep(
                    $config->cli->document->crawl->resolver->connection->delay
                );

                continue;
            }

            // Update address
            $address = $resolved;

            // Update cache; expiration is given as current time plus the
            // configured cache timeout
            $memory->set(
                $id,
                $address->getHost(),
                $config->cli->document->crawl->resolver->result->cache->timeout + time()
            );

            // Debug event
            if ($config->cli->document->crawl->debug->level->notice)
            {
                printf(
                    _('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
                    date('c'),
                    $base->getHost(),
                    $address->getHost()
                );
            }

            // Resolved, leave the retry loop
            break;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
// Update index time anyway and set reset code to 51 |
|
|
|
// Update index time anyway and set reset code to 51 |
|
|
@ -184,9 +330,21 @@ foreach($index->search('') |
|
|
|
$document->getId() |
|
|
|
$document->getId() |
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
// Request remote URL |
|
|
|
// Prepare remote request |
|
|
|
$request = new \Yggverse\Gemini\Client\Request( |
|
|
|
$request = new \Yggverse\Gemini\Client\Request( |
|
|
|
$document->get('url') |
|
|
|
$address->get() |
|
|
|
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Apply stream options |
|
|
|
|
|
|
|
$request->setOptions( |
|
|
|
|
|
|
|
[ |
|
|
|
|
|
|
|
'ssl' => |
|
|
|
|
|
|
|
[ |
|
|
|
|
|
|
|
'peer_name' => $base->getHost(), // SNI |
|
|
|
|
|
|
|
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer, |
|
|
|
|
|
|
|
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name |
|
|
|
|
|
|
|
] |
|
|
|
|
|
|
|
] |
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
$response = new \Yggverse\Gemini\Client\Response( |
|
|
|
$response = new \Yggverse\Gemini\Client\Response( |
|
|
@ -301,10 +459,6 @@ foreach($index->search('') |
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
// Crawl links |
|
|
|
// Crawl links |
|
|
|
$base = new \Yggverse\Net\Address( |
|
|
|
|
|
|
|
$document->get('url') |
|
|
|
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$documents = []; |
|
|
|
$documents = []; |
|
|
|
|
|
|
|
|
|
|
|
foreach ($body->getLinks() as $line) |
|
|
|
foreach ($body->getLinks() as $line) |
|
|
|