diff --git a/README.md b/README.md index 61f5912..a075321 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ To use `HTTP` version, please checkout [main branch](https://github.com/YGGverse * `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb` * `dpkg -i manticore-repo.noarch.deb` * `apt update` -* `apt install git composer manticore manticore-extra php-fpm php-mbstring` +* `apt install git composer manticore manticore-extra memcached php-fpm php-mbstring php-memcached` Yo search engine uses Manticore as the primary database. If your server sensitive to power down, change default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1` diff --git a/composer.json b/composer.json index 6212b6d..fee34ef 100644 --- a/composer.json +++ b/composer.json @@ -16,8 +16,8 @@ "require": { "manticoresoftware/manticoresearch-php": "^3.1", "yggverse/ftp": "^1.0", - "yggverse/net": "^1.3", - "yggverse/gemini": "^0.5", + "yggverse/net": "^1.7", + "yggverse/gemini": "^0.6", "yggverse/yo-tools": "dev-main" } } diff --git a/example/config.json b/example/config.json index 2acd350..c0ba097 100644 --- a/example/config.json +++ b/example/config.json @@ -21,6 +21,14 @@ } } }, + "memcached": + { + "server": + { + "host":"127.0.0.1", + "port":11211 + } + }, "gui": { "pagination": @@ -117,7 +125,42 @@ "connection": { "timeout":3, - "length":10485760 + "length":10485760, + "options": + { + "ssl": + { + "verify_peer": false, + "verify_peer_name": false + } + } + }, + "resolver": + { + "enabled":true, + "providers": + [ + "1.1.1.1", + "8.8.8.8" + ], + "records": + [ + "A", + "AAAA" + ], + "connection": + { + "timeout":5, + "delay":60 + }, + "result": + { + "shuffle":false, + "cache": + { + "timeout":3600 + } + } }, "queue": { diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index 4085361..bce3101 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -17,8 +17,7 @@ $config = json_decode( $semaphore = sem_get( crc32( __DIR__ . '.yo.cli.document.crawl' - ), - 1 + ), 1 ); if (false === sem_acquire($semaphore, true)) @@ -125,6 +124,34 @@ catch (Exception $exception) exit; } +// Init memory +try +{ + $memory = new \Memcached(); + + $memory->addServer( + $config->memcached->server->host, + $config->memcached->server->port + ); +} + +catch (Exception $exception) +{ + if ($config->cli->document->crawl->debug->level->error) + { + echo sprintf( + _('[%s] [error] %s') . PHP_EOL, + date('c'), + print_r( + $exception, + true + ) + ); + } + + exit; +} + // Debug totals if ($config->cli->document->crawl->debug->level->notice) { @@ -172,6 +199,125 @@ foreach($index->search('') $document->get('url'), $config->manticore->index->document->name ); + } // @TODO + + // Init base address + $base = new \Yggverse\Net\Address( + $document->get('url') + ); + + // Init worker address + $address = new \Yggverse\Net\Address( + $document->get('url') + ); + + // Custom resolver enabled + if ($config->cli->document->crawl->resolver->enabled + && + // Host still not resolved + \Yggverse\Net\Valid::domainHostName( + $address->getHost() + ) + ) { + // Generate memory ID + $id = sprintf( + '%s.%s.resolved', + $config->manticore->index->document->name, + $address->getHost() + ); + + // Check for cached results + if ($host = $memory->get($id)) + { + $address->setHost( + $host + ); + + // Debug event + if ($config->cli->document->crawl->debug->level->notice) + { + echo sprintf( + _('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL, + date('c'), + $base->getHost(), + $address->getHost() + ); + } + } + + // Init connection loop until the address will be resolved + else + { + $attempt = 1; + + do + { + // Resolve begin + $resolve = new \Yggverse\Net\Resolve( + $config->cli->document->crawl->resolver->records, + $config->cli->document->crawl->resolver->providers, + $config->cli->document->crawl->resolver->connection->timeout, + $config->cli->document->crawl->resolver->result->shuffle + ); + + $result = []; + $errors = []; + + $resolved = $resolve->address( + $address, + $result, + $errors + ); + + if ($resolved) + { + // Update address + $address = $resolved; + + // Update cache + $memory->set( + $id, + $address->getHost(), + $config->cli->document->crawl->resolver->result->cache->timeout + time() + ); + + // Debug event + if ($config->cli->document->crawl->debug->level->notice) + { + echo sprintf( + _('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL, + date('c'), + $base->getHost(), + $address->getHost() + ); + } + } + + else + { + // Log event + if ($config->cli->document->crawl->debug->level->warning) + { + echo sprintf( + _('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL, + date('c'), + $base->getHost(), + $attempt++, + print_r( + $errors, + true + ) + ); + } + + // Next connection delay + sleep( + $config->cli->document->crawl->resolver->connection->delay + ); + } + + } while (!$resolved); + } } // Update index time anyway and set reset code to 51 @@ -184,9 +330,21 @@ foreach($index->search('') $document->getId() ); - // Request remote URL + // Prepare remote request $request = new \Yggverse\Gemini\Client\Request( - $document->get('url') + $address->get() + ); + + // Apply stream options + $request->setOptions( + [ + 'ssl' => + [ + 'peer_name' => $base->getHost(), // SNI + 'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer, + 'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name + ] + ] ); $response = new \Yggverse\Gemini\Client\Response( @@ -301,10 +459,6 @@ foreach($index->search('') ); // Crawl links - $base = new \Yggverse\Net\Address( - $document->get('url') - ); - $documents = []; foreach ($body->getLinks() as $line)