diff --git a/README.md b/README.md index 4f19ff6..97f2712 100644 --- a/README.md +++ b/README.md @@ -2,26 +2,24 @@ Micro Web Crawler in PHP & Manticore -Yo! is the super thin layer for Manticore search server that extends official [manticoresearch-php](https://github.com/manticoresoftware/manticoresearch-php) client with CLI tools and simple JS-less WebUI. +Yo! Gemini is the super thin layer for Manticore search server that extends official [manticoresearch-php](https://github.com/manticoresoftware/manticoresearch-php) client with CLI tools and Gemini protocol UI. + +This branch contains the implementation for [Gemini Protocol](https://geminiprotocol.net). + +To use the `HTTP` version, please check out the [main branch](https://github.com/YGGverse/Yo)! ## Features * MIME-based crawler with flexible filter settings by regular expressions, selectors, external links etc * Page snap history with local and remote mirrors support (including FTP protocol) * CLI tools for index administration and crontab tasks -* JS-less frontend to run local or public search web portal -* API tools to make search index distributed +* Gemini Protocol UI (coming soon) ## Components * [Manticore Server](https://github.com/manticoresoftware/manticoresearch) * [PHP library for Manticore](https://github.com/manticoresoftware/manticoresearch-php) -* [Symfony DOM crawler](https://github.com/symfony/dom-crawler) -* [Symfony CSS selector](https://github.com/symfony/css-selector) * [FTP client for snap mirrors](https://github.com/YGGverse/ftp-php) -* [Hostname ident icons](https://github.com/dmester/jdenticon-php) -* [Captcha](https://github.com/Gregwar/Captcha) -* [Bootstrap icons](https://icons.getbootstrap.com/) ### Install @@ -32,22 +30,23 @@ Yo! 
is the super thin layer for Manticore search server that extends official [m * `wget https://repo.manticoresearch.com/manticore-repo.noarch.deb` * `dpkg -i manticore-repo.noarch.deb` * `apt update` -* `apt install git composer manticore manticore-extra php-fpm php-curl php-mbstring php-gd` +* `apt install git composer manticore manticore-extra php-fpm php-mbstring` Yo search engine uses Manticore as the primary database. If your server sensitive to power down, change default [binlog flush strategy](https://manual.manticoresearch.com/Logging/Binary_logging#Binary-flushing-strategies) to `binlog_flush = 1` #### Deployment -Project in development, to create new search project, use `dev-main` branch: - -* `composer create-project yggverse/yo:dev-main` +* `git clone https://github.com/YGGverse/Yo.git` +* `cd Yo` +* `git checkout gemini` +* `composer update` #### Development * `git clone https://github.com/YGGverse/Yo.git` * `cd Yo` -* `composer update` +* `git checkout gemini` * `git checkout -b pr-branch` * `git commit -m 'new fix'` * `git push` @@ -69,11 +68,9 @@ Project in development, to create new search project, use `dev-main` branch: * `php src/cli/document/crawl.php` * `php src/cli/document/search.php '*'` -#### Web UI +#### Gemini UI -1. `cd src/webui` -2. `php -S 127.0.0.1:8080` -3. open `http://127.0.0.1:8080` in browser +Coming soon.. 
## Documentation @@ -134,27 +131,6 @@ php src/cli/document/search.php '@title "*"' [limit] * `query` - required * `limit` - optional search results limit -##### Migration - -###### YGGo - -Import index from YGGo database - -``` -php src/cli/yggo/import.php 'host' 'port' 'user' 'password' 'database' [unique=off] [start=0] [limit=100] -``` - -Source DB fields required: - -* `host` -* `port` -* `user` -* `password` -* `database` -* `unique` - optional, check for unique URL (takes more time) -* `start` - optional, offset to start queue -* `limit` - optional, limit queue - ### Backup #### Logical @@ -171,13 +147,4 @@ Better for infrastructure administration and includes original data binaries. ## Instances -### [Yggdrasil](https://github.com/yggdrasil-network) - -* `http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yo/` - IPv6 `0200::/7` addresses only | [index](http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yo/index.sql) - -### [Alfis DNS](https://github.com/Revertron/Alfis) - -* `http://yo.ygg` - `.ygg` domain zone search only | [index](http://yo.ygg/index.sql) -* `http://ygg.yo.index` - alias of `http://yo.ygg` | [index](http://ygg.yo.index/index.sql) - -_*`*.yo.index` reserved for domain-oriented instances e.g. `.btn`, `.conf`, `.mirror` - feel free to request the address_ \ No newline at end of file +Coming soon.. 
\ No newline at end of file diff --git a/composer.json b/composer.json index 9e09263..38c0f4c 100644 --- a/composer.json +++ b/composer.json @@ -15,11 +15,8 @@ ], "require": { "manticoresoftware/manticoresearch-php": "^3.1", - "symfony/css-selector": "^6.3", - "symfony/dom-crawler": "^6.3", - "jdenticon/jdenticon": "^1.0", "yggverse/ftp": "^1.0", - "gregwar/captcha": "^1.2", - "yggverse/net": "^1.2" + "yggverse/net": "^1.2", + "yggverse/gemini": "^0.4.0" } } diff --git a/example/config.json b/example/config.json index 9324b30..95b64c5 100644 --- a/example/config.json +++ b/example/config.json @@ -21,7 +21,7 @@ } } }, - "webui": + "gui": { "pagination": { @@ -35,7 +35,7 @@ { "url":{ "enabled":false, - "regex":"/.*/ui" + "regex":"/^gemini:\/\/.*/ui" } } }, @@ -59,9 +59,9 @@ "fields": [ "url", - "title", - "description", - "keywords", + "h1", + "h2", + "h3", "body" ], "options": @@ -71,57 +71,6 @@ } } }, - "footer": - { - "links": - [ - { - "text":"0200::/7", - "attributes": - { - "title":"Search in 0200::/7 IPv6", - "href":"http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yo/" - }, - "index": - [ - "http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yo/index.sql" - ] - }, - { - "text":"yo.ygg", - "attributes": - { - "title":"Search in .ygg zone", - "href":"http://yo.ygg" - }, - "index": - [ - "http://yo.ygg/index.sql" - ] - }, - { - "text":"ygg.yo.index", - "attributes": - { - "title":"Search in .ygg zone", - "href":"http://ygg.yo.index" - }, - "index": - [ - "http://ygg.yo.index/index.sql" - ] - }, - { - "text":"GitHub", - "attributes": - { - "title":"Source code", - "href":"https://github.com/YGGverse/Yo" - }, - "index":[] - } - ] - }, "index": { "enabled":true @@ -161,119 +110,30 @@ "timeout":5, "socket": { - "201:5eb5:f061:678e:7565:6338:c02c:5251":80 + "8.8.8.8":80 } } }, - "curl": + "connection": { - "connection": - { - "timeout":3 - }, - "download": - { - "size": - { - "max":10000024 - } - } + "timeout":3, + "length":1048576, + "chunk":1 }, "queue": { "limit":1, 
"delay":1 }, - "selector": - { - "a:not([rel=nofollow])": - { - "attribute":"href", - "external":false, - "regex":"/.*/ui" - }, - "image": - { - "attribute":"src", - "external":false, - "regex":"/.*/ui" - }, - "audio": - { - "attribute":"src", - "external":false, - "regex":"/.*/ui" - }, - "video": - { - "attribute":"src", - "external":false, - "regex":"/.*/ui" - }, - "script": - { - "attribute":"href", - "external":false, - "regex":"/.*/ui" - } - }, - "skip": + "url": { - "stripos": + "external":true, + "regex":"/^gemini:\/\/.*/ui", + "skip": { - "url": + "stripos": [ - "#", - "?", - "javascript:", - "mailto:", - "magnet:", - "xmpp:", - "/commit", - "/diff", - "/print", - "/raw", - "/cache", - "/download", - "/share", - "/explore", - "/register", - "/login", - "/password", - "/forgot", - "/restore", - "/account", - "/reply", - "/read", - "/compose", - "/comment", - "/add", - "/edit", - "/delete", - "/quote", - "/report", - "/export", - "/import", - "/mobile", - "/mwiki", - "/branch", - "/block", - "/transaction", - "/search", - "/tag", - "/page", - "/sort", - "/order", - "/pdf", - "/fb2", - "/mobi", - "/epub", - "/djvu", - "/_detail", - "/_media", - "/t/", - "/q/", - "/s/" + "?" 
] } }, @@ -297,28 +157,21 @@ "directory":"storage/snap", "size": { - "max":10000024 + "max":1048576 }, - "mime": + "meta": { "stripos": [ - "application/xhtml+xml", - "application/javascript", - "text/html", - "text/plain", - "text/css", - "image/webp", - "image/png", - "image/gif", - "image/ico" + "text/gemini", + "image/" ] }, "url": { "stripos": [ - "http" + "gemini://" ] } }, @@ -345,28 +198,21 @@ }, "size": { - "max":10000024 + "max":1048576 }, - "mime": + "meta": { "stripos": [ - "application/xhtml+xml", - "application/javascript", - "text/html", - "text/plain", - "text/css", - "image/webp", - "image/png", - "image/gif", - "image/ico" + "text/gemini", + "image/" ] }, "url": { "stripos": [ - "http" + "gemini://" ] } } diff --git a/src/cli/document/clean.php b/src/cli/document/clean.php index 481f939..fcc0580 100644 --- a/src/cli/document/clean.php +++ b/src/cli/document/clean.php @@ -39,7 +39,7 @@ $index = $client->index( // Apply new configuration rules echo _('apply new configuration rules...') . PHP_EOL; -foreach ($config->cli->document->crawl->skip->stripos->url as $condition) +foreach ($config->cli->document->crawl->url->skip->stripos as $condition) { echo sprintf( _('cleanup documents with url that contain substring "%s"...') . PHP_EOL, diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index 972fe18..239b948 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -6,7 +6,7 @@ $microtime = microtime(true); // Load dependencies require_once __DIR__ . 
'/../../../vendor/autoload.php'; -// Define helpers +// Define helpers @TODO move to separated library (yo-php) function getLastSnapTime(array $files): int { $time = []; @@ -37,6 +37,40 @@ function getLastSnapTime(array $files): int return 0; } +function relative2absolute( + string $source, // current document url to grab the base + string $target, // relative or absolute link + ?string &$scheme = null, + ?string &$host = null, + ?int &$port = null +) { + // Base parts always come from the source document URL: the crawl loop compares link hosts against $host + $scheme = parse_url($source, PHP_URL_SCHEME); + $host = parse_url($source, PHP_URL_HOST); + $port = parse_url($source, PHP_URL_PORT); + if (!parse_url($target, PHP_URL_HOST)) + { + return $scheme . '://' . $host . ($port ? ':' . $port : null) . + '/' . + trim( + ltrim( + str_replace( + [ + './', + '../' + ], + '', + $target + ), + '/' + ), + '.' + ); + } + + return $target; +} + // Init config $config = json_decode( file_get_contents( @@ -182,16 +216,16 @@ foreach($index->search('') $data = [ - 'url' => $document->get('url'), - 'title' => $document->get('title'), - 'description' => $document->get('description'), - 'keywords' => $document->get('keywords'), - 'code' => $document->get('code'), - 'size' => $document->get('size'), - 'mime' => $document->get('mime'), - 'rank' => $document->get('rank'), - 'time' => $time, - 'index' => 0 + 'url' => $document->get('url'), + 'h1' => $document->get('h1'), + 'h2' => $document->get('h2'), + 'h3' => $document->get('h3'), + 'code' => $document->get('code'), + 'size' => $document->get('size'), + 'meta' => $document->get('meta'), + 'rank' => $document->get('rank'), + 'time' => $time, + 'index' => 0 ]; // Debug target @@ -205,114 +239,50 @@ foreach($index->search('') ); } - // Update index time anyway and set reset code to 404 + // Update index time anyway and set reset code to 51 $index->updateDocument( [ 'time' => time(), - 'code' => 200, + 'code' => 20, 'index' => 0 ], $document->getId() ); // Request remote URL - $request = curl_init( + $request = new 
\Yggverse\Gemini\Client\Request( $document->get('url') ); - // Drop URL with long response - curl_setopt( - $request, - CURLOPT_CONNECTTIMEOUT, - $config->cli->document->crawl->curl->connection->timeout - ); - - curl_setopt( - $request, - CURLOPT_TIMEOUT, - $config->cli->document->crawl->curl->connection->timeout - ); - - // Prevent huge content download e.g. media streams URL - curl_setopt( - $request, - CURLOPT_RETURNTRANSFER, - true - ); - - curl_setopt( - $request, - CURLOPT_NOPROGRESS, - false - ); - - curl_setopt( - $request, - CURLOPT_PROGRESSFUNCTION, - function( - $download, - $downloaded, - $upload, - $uploaded - ) { - global $config; - - global $index; - global $document; - - $index->updateDocument( - [ - 'time' => time(), - 'code' => 200, - 'index' => 0 - ], - $document->getId() - ); - - return $downloaded > $config->cli->document->crawl->curl->download->size->max ? 1 : 0; - } + $response = new \Yggverse\Gemini\Client\Response( + $request->getResponse( + $config->cli->document->crawl->connection->timeout, + $config->cli->document->crawl->connection->length, + $config->cli->document->crawl->connection->chunk, + $length + ) ); // Begin request - if ($response = curl_exec($request)) + if ($code = $response->getCode()) // @TODO process redirects { - // Update HTTP code or skip on empty - if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE)) - { - // Delete deprecated document from index as HTTP code still not 200 - /* - if ($code != 200 && !empty($data['code']) && $data['code'] != 200) - { - $index->deleteDocument( - $document->getId() - ); - - continue; - } - */ - - $data['code'] = $code; - - } else continue; + // Update status code + $data['code'] = $code; // Update size or skip on empty - if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD)) + if ($length) { - $size = round( // float - $size - ); - - $data['size'] = $size; + $data['size'] = $length; } else continue; - // Update MIME type or skip on empty - if ($type = curl_getinfo($request, 
CURLINFO_CONTENT_TYPE)) + // Update meta or skip on empty + if ($meta = $response->getMeta()) { - $data['mime'] = $type; + $data['meta'] = $meta; // On document charset specified - if (preg_match('/charset=([^\s;]+)/i', $type, $charset)) + if (preg_match('/charset=([^\s;]+)/i', $meta, $charset)) { if (!empty($charset[1])) { @@ -322,10 +292,12 @@ foreach($index->search('') if (strtolower($charset[1]) == strtolower($encoding)) { // Convert response to UTF-8 - $response = mb_convert_encoding( - $response, - 'UTF-8', - $charset[1] + $response->setBody( + mb_convert_encoding( + $response->getBody(), + 'UTF-8', + $charset[1] + ) ); break; @@ -336,241 +308,102 @@ foreach($index->search('') } else continue; - // DOM crawler - if ( - false !== stripos($type, 'text/html') - || - false !== stripos($type, 'text/xhtml') - || - false !== stripos($type, 'application/xhtml') - ) { - $crawler = new Symfony\Component\DomCrawler\Crawler(); - $crawler->addHtmlContent( - $response + // Gemtext parser + if (false !== stripos($response->getMeta(), 'text/gemini')) + { + $body = new \Yggverse\Gemini\Gemtext\Body( + $response->getBody() ); - // Get title - foreach ($crawler->filter('head > title')->each(function($node) { - - return $node->text(); - - }) as $value) + // Get H1 + $h1 = []; + foreach ($body->getH1() as $value) { - if (!empty($value)) - { - $data['title'] = trim( - strip_tags( - html_entity_decode( - $value - ) - ) - ); - } + $h1[] = $value; } - // Get description - foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) { - - return $node->attr('content'); + $data['h1'] = implode( + ',', + array_unique( + $h1 + ) + ); - }) as $value) + // Get H2 + $h2 = []; + foreach ($body->getH2() as $value) { - if (!empty($value)) - { - $data['description'] = trim( - strip_tags( - html_entity_decode( - $value - ) - ) - ); - } + $h2[] = $value; } - // Get keywords - $keywords = []; - - // Extract from meta tag - foreach ($crawler->filter('head > 
meta[name="keywords"]')->each(function($node) { - - return $node->attr('content'); - - }) as $value) - { - if (!empty($value)) - { - foreach ((array) explode( - ',', - mb_strtolower( - strip_tags( - html_entity_decode( - $value - ) - ) - ) - ) as $keyword) - { - // Remove extra spaces - $keyword = trim( - $keyword - ); - - // Skip short words - if (mb_strlen($keyword) > 2) - { - $keywords[] = $keyword; - } - } - } - } - - // Get keywords from headers - /* Disable keywords collection from headers as body index enabled - - foreach ($crawler->filter('h1,h2,h3,h4,h5,h6')->each(function($node) { - - return $node->text(); + $data['h2'] = implode( + ',', + array_unique( + $h2 + ) + ); - }) as $value) + // Get H3 + $h3 = []; + foreach ($body->getH3() as $value) { - if (!empty($value)) - { - foreach ((array) explode( - ',', - mb_strtolower( - strip_tags( - html_entity_decode( - $value - ) - ) - ) - ) as $keyword) - { - // Remove extra spaces - $keyword = trim( - $keyword - ); - - // Skip short words - if (mb_strlen($keyword) > 2) - { - $keywords[] = $keyword; - } - } - } + $h3[] = $value; } - */ - // Keep keywords unique - $keywords = array_unique( - $keywords + $data['h3'] = implode( + ',', + array_unique( + $h3 + ) ); - // Update previous keywords when new value exists - if ($keywords) - { - $data['keywords'] = implode(',', $keywords); - } - // Save document body text to index - foreach ($crawler->filter('html > body')->each(function($node) { - - return $node->html(); - - }) as $value) - { - if (!empty($value)) - { - $data['body'] = trim( - preg_replace( - '/[\s]{2,}/', // strip extra separators - ' ', - strip_tags( - str_replace( // make text separators before strip any closing tag, new line, etc - [ - '<', - '>', - PHP_EOL, - ], - [ - ' <', - '> ', - PHP_EOL . 
' ', - ], - preg_replace( - [ - '/]*)>([\s\S]*?)<\/script>/i', // strip js content - '/]*)>([\s\S]*?)<\/style>/i', // strip css content - '/]*)>([\s\S]*?)<\/pre>/i', // strip code content - '/]*)>([\s\S]*?)<\/code>/i', - ], - '', - html_entity_decode( - $value - ) - ) - ) - ) - ) - ); - } - } + $data['body'] = trim( + preg_replace( + '/[\s]{2,}/', // strip extra separators + ' ', + $response->getBody() + ) + ); - // Crawl documents + // Crawl links $documents = []; - $scheme = parse_url($document->get('url'), PHP_URL_SCHEME); - $host = parse_url($document->get('url'), PHP_URL_HOST); - $port = parse_url($document->get('url'), PHP_URL_PORT); - - foreach ($config->cli->document->crawl->selector as $selector => $settings) + foreach ($body->getLinks() as $line) { - foreach ($crawler->filter($selector)->each(function($node) { - - return $node; + $link = new \Yggverse\Gemini\Gemtext\Link( + $line + ); - }) as $value) { + if ($url = $link->getAddress()) + { + //Make relative links absolute + $url = relative2absolute( + $document->get('url'), + $url, + $scheme, + $host, + $port, + ); - if ($url = $value->attr($settings->attribute)) + // Regex rules + if (!preg_match($config->cli->document->crawl->url->regex, $url)) { - //Make relative links absolute - if (!parse_url($url, PHP_URL_HOST)) - { - $url = $scheme . '://' . $host . ($port ? ':' . $port : null) . - '/' . - trim( - ltrim( - str_replace( - [ - './', - '../' - ], - '', - $url - ), - '/' - ), - '.' 
- ); - } - - // Regex rules - if (!preg_match($settings->regex, $url)) - { - continue; - } - - // External host rules - if (!$settings->external && parse_url($url, PHP_URL_HOST) != $host) - { - continue; - } + continue; + } - $documents[] = $url; + // External host rules + if (!$config->cli->document->crawl->url->external && parse_url($url, PHP_URL_HOST) != $host) + { + continue; } + + $documents[] = $url; } } + // @TODO find document links by protocol ($body->findLinks('gemini')) + if ($documents) { foreach (array_unique($documents) as $url) @@ -578,7 +411,7 @@ foreach($index->search('') // Apply stripos condition $skip = false; - foreach ($config->cli->document->crawl->skip->stripos->url as $condition) + foreach ($config->cli->document->crawl->url->skip->stripos as $condition) { if (false !== stripos($url, $condition)) { @@ -597,7 +430,7 @@ foreach($index->search('') date('c'), $url, print_r( - $config->cli->document->crawl->skip->stripos->url, + $config->cli->document->crawl->url->skip->stripos, true ) ); @@ -701,7 +534,7 @@ foreach($index->search('') } // Create snap - if ($config->cli->document->crawl->snap->enabled && $code === 200) + if ($config->cli->document->crawl->snap->enabled && $response->getCode() === 20) { try { @@ -734,12 +567,12 @@ foreach($index->search('') $snap->addFromString( 'DATA', - $response + $response->getBody() ); $snap->addFromString( - 'MIME', - $type + 'META', + $response->getMeta() ); $snap->addFromString( @@ -767,12 +600,12 @@ foreach($index->search('') // Copy to local storage on enabled if ($config->snap->storage->local->enabled) { - // Check for mime allowed + // Check for meta allowed $allowed = false; - foreach ($config->snap->storage->local->mime->stripos as $whitelist) + foreach ($config->snap->storage->local->meta->stripos as $whitelist) { - if (false !== stripos($type, $whitelist)) + if (false !== stripos($response->getMeta(), $whitelist)) { $allowed = true; break; @@ -904,12 +737,12 @@ foreach($index->search('') continue; 
} - // Check for mime allowed + // Check for meta allowed $allowed = false; - foreach ($ftp->mime->stripos as $whitelist) + foreach ($ftp->meta->stripos as $whitelist) { - if (false !== stripos($type, $whitelist)) + if (false !== stripos($response->getMeta(), $whitelist)) { $allowed = true; break; diff --git a/src/cli/index/init.php b/src/cli/index/init.php index a893d1e..8b71cc8 100644 --- a/src/cli/index/init.php +++ b/src/cli/index/init.php @@ -52,15 +52,15 @@ $result = $index->create( [ 'type' => 'text' ], - 'title' => + 'h1' => [ 'type' => 'text' ], - 'description' => + 'h2' => [ 'type' => 'text' ], - 'keywords' => + 'h3' => [ 'type' => 'text' ], @@ -68,7 +68,7 @@ $result = $index->create( [ 'type' => 'text' ], - 'mime' => + 'meta' => [ 'type' => 'text' ], diff --git a/src/cli/yggo/import.php b/src/cli/yggo/import.php deleted file mode 100644 index 26a9a89..0000000 --- a/src/cli/yggo/import.php +++ /dev/null @@ -1,177 +0,0 @@ - $config->manticore->server->host, - 'port' => $config->manticore->server->port, - ] -); - -// Init index -$index = $client->index( - $config->manticore->index->document->name -); - -// Connect Yggo DB -try -{ - $yggo = new PDO( - 'mysql:dbname=' . $argv[5] . ';host=' . $argv[1] . ';port=' . $argv[2] . ';charset=utf8', - $argv[3], - $argv[4], - [ - PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8' - ] - ); - - $yggo->setAttribute( - PDO::ATTR_ERRMODE, - PDO::ERRMODE_EXCEPTION - ); - - $yggo->setAttribute( - PDO::ATTR_DEFAULT_FETCH_MODE, - PDO::FETCH_OBJ - ); - - $yggo->setAttribute( - PDO::ATTR_TIMEOUT, - 600 - ); -} - -catch (Exception $error) -{ - var_dump( - $error - ); - - exit; -} - -$start = isset($argv[7]) ? (int) $argv[7] : 0; -$limit = isset($argv[8]) ? 
(int) $argv[8] : 100; - -$total = $yggo->query('SELECT COUNT(*) AS `total` FROM `hostPage` - - WHERE `hostPage`.`httpCode` = 200 - AND `hostPage`.`timeUpdated` IS NOT NULL - AND `hostPage`.`mime` IS NOT NULL - AND `hostPage`.`size` IS NOT NULL')->fetch()->total; - -$processed = $start; - -for ($i = 0; $i <= $total; $i++) -{ - $query = $yggo->query('SELECT `hostPage`.`hostPageId`, - `hostPage`.`httpCode`, - `hostPage`.`mime`, - `hostPage`.`size`, - `hostPage`.`timeUpdated`, - `hostPage`.`uri`, - - `host`.`scheme`, - `host`.`name`, - `host`.`port`, - - ( - SELECT `hostPageDescription`.`title` FROM `hostPageDescription` - WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId` - ORDER BY `hostPageDescription`.`timeAdded` DESC - LIMIT 1 - ) AS `title`, - - ( - SELECT `hostPageDescription`.`description` FROM `hostPageDescription` - WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId` - ORDER BY `hostPageDescription`.`timeAdded` DESC - LIMIT 1 - ) AS `description`, - - ( - SELECT `hostPageDescription`.`keywords` FROM `hostPageDescription` - WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId` - ORDER BY `hostPageDescription`.`timeAdded` DESC - LIMIT 1 - ) AS `keywords` - - FROM `hostPage` - JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) - - WHERE `hostPage`.`httpCode` = 200 - AND `hostPage`.`timeUpdated` IS NOT NULL - AND `hostPage`.`mime` IS NOT NULL - AND `hostPage`.`size` IS NOT NULL - - GROUP BY `hostPage`.`hostPageId` - - LIMIT ' . $start . ',' . $limit); - - - foreach ($query->fetchAll() as $remote) - { - $url = $remote->scheme . '://' . $remote->name . ($remote->port ? ':' . $remote->port : false) . $remote->uri; - $crc32url = crc32($url); - - // Check for unique URL requested - if (isset($argv[6])) - { - $local = $index->search('') - ->filter('id', $crc32url) - ->limit(1) - ->get(); - - if ($local->getTotal()) - { - // Result - echo sprintf( - _('[%s/%s] [skip duplicate] %s') . 
PHP_EOL, - $processed++, - $total, - $url - ); - - continue; - } - } - - $index->addDocument( - [ - 'url' => $url, - 'time' => (int) $remote->timeUpdated, - 'code' => (int) $remote->httpCode, - 'size' => (int) $remote->size, - 'mime' => (string) $remote->mime, - 'title' => (string) $remote->title, - 'description' => (string) $remote->description, - 'keywords' => (string) $remote->keywords - ], - (int) $crc32url - ); - - // Result - echo sprintf( - _('[%s/%s] [add] %s') . PHP_EOL, - $processed++, - $total, - $url - ); - } - - // Update queue offset - $start = $start + $limit; -} - -// Done -echo _('import completed!') . PHP_EOL; \ No newline at end of file diff --git a/src/webui/api.php b/src/webui/api.php deleted file mode 100644 index 85c07e3..0000000 --- a/src/webui/api.php +++ /dev/null @@ -1,271 +0,0 @@ - false, - 'message' => _('valid source required') - ] - ); - - exit; - - case isset($_GET['id']) && preg_match('/^[\d]+$/', $_GET['id']): - - echo json_encode( - [ - 'status' => false, - 'message' => _('valid document identifier required') - ] - ); - - exit; - - case isset($_GET['time']) && preg_match('/^[\d]+$/', $_GET['time']): - - echo json_encode( - [ - 'status' => false, - 'message' => _('valid time required') - ] - ); - - exit; - } - - // Detect remote snap source - if (preg_match('/^[\d]+$/', $_GET['source'])) - { - if (!isset($config->snap->storage->remote->ftp[$_GET['source']]) || !$config->snap->storage->remote->ftp[$_GET['source']]->enabled) - { - echo json_encode( - [ - 'status' => false, - 'message' => _('requested source not found') - ] - ); - - exit; - } - - // Connect remote - $remote = new \Yggverse\Ftp\Client(); - - $connection = $remote->connect( - $config->snap->storage->remote->ftp[$_GET['source']]->connection->host, - $config->snap->storage->remote->ftp[$_GET['source']]->connection->port, - $config->snap->storage->remote->ftp[$_GET['source']]->connection->username, - 
$config->snap->storage->remote->ftp[$_GET['source']]->connection->password, - $config->snap->storage->remote->ftp[$_GET['source']]->connection->directory, - $config->snap->storage->remote->ftp[$_GET['source']]->connection->timeout, - $config->snap->storage->remote->ftp[$_GET['source']]->connection->passive - ); - - // Remote host connected - if ($connection) { - - // Prepare snap path - $filename = sprintf( - '%s/%s.tar.gz', - implode( - '/', - str_split( - $_GET['id'] - ) - ), - $_GET['time'] - ); - - // Check snap exist - if (!$size = $remote->size($filename)) - { - echo json_encode( - [ - 'status' => false, - 'message' => _('requested snap not found') - ] - ); - - exit; - } - - // Set headers - header( - 'Content-Type: application/tar+gzip' - ); - - header( - sprintf( - 'Content-Length: %s', - $size - ) - ); - - header( - sprintf( - 'Content-Disposition: filename="snap.%s.%s"', - $_GET['id'], - basename( - $filename - ) - ) - ); - - // Return file - $remote->get( - $filename, - 'php://output' - ); - - $remote->close(); - } - } - - // Local - else if ($config->snap->storage->local->enabled) - { - // Prefix absolute - if ('/' === substr($config->snap->storage->local->directory, 0, 1)) - { - $prefix = $config->snap->storage->local->directory; - } - - // Prefix relative - else - { - $prefix = __DIR__ . '/../../' . 
$config->snap->storage->local->directory; - } - - // Prepare snap path - $filename = sprintf( - '%s/%s/%s.tar.gz', - $prefix, - implode( - '/', - str_split( - $_GET['id'] - ) - ), - $_GET['time'] - ); - - // Check snap exist - if (!file_exists($filename) || !is_readable($filename)) - { - echo json_encode( - [ - 'status' => false, - 'message' => _('requested snap not found') - ] - ); - - exit; - } - - // Check snap has valid size - if (!$size = filesize($filename)) - { - echo json_encode( - [ - 'status' => false, - 'message' => _('requested snap has invalid size') - ] - ); - - exit; - } - - // Set headers - header( - 'Content-Type: application/tar+gzip' - ); - - header( - sprintf( - 'Content-Length: %s', - $size - ) - ); - - header( - sprintf( - 'Content-Disposition: filename="snap.%s.%s"', - $_GET['id'], - basename( - $filename - ) - ) - ); - - readfile( - $filename - ); - - exit; - } - - else - { - echo json_encode( - [ - 'status' => false, - 'message' => _('requested source not found') - ] - ); - } - - break; - - default: - - echo json_encode( - [ - 'status' => false, - 'message' => _('Undefined API method') - ] - ); - } - - break; - - default: - - echo json_encode( - [ - 'status' => false, - 'message' => _('Undefined API action') - ] - ); -} diff --git a/src/webui/explore.php b/src/webui/explore.php deleted file mode 100644 index dcb5f4d..0000000 --- a/src/webui/explore.php +++ /dev/null @@ -1,563 +0,0 @@ - $config->manticore->server->host, - 'port' => $config->manticore->server->port, - ] -); - -// Init index -$index = $client->index( - $config->manticore->index->document->name -); - -// Get totals -$total = $index->search('') - ->option('cutoff', 0) - ->limit(0) - ->get() - ->getTotal(); - -$placeholder = sprintf( - _('Search in %s documents %s'), - number_format( - $total - ), - $config->webui->search->index->request->url->enabled ? 
_('or enter new address to crawl...') : false -); - -// Get document data -$document = $index->getDocumentById( - isset($_GET['i']) ? $_GET['i'] : 0 -); - -// Get icon -$hostname = parse_url( - $document->url, - PHP_URL_HOST -); - -$identicon = new \Jdenticon\Identicon(); - -$identicon->setValue( - $hostname -); - -$identicon->setSize(36); - -$identicon->setStyle( - [ - 'backgroundColor' => 'rgba(255, 255, 255, 0)', - 'padding' => 0 - ] -); - -$icon = $identicon->getImageDataUri('webp'); - -// Get snaps info -$snaps = []; - -/// Prepare location -$filepath = implode( - '/', - str_split( - $document->getId() - ) -); - -/// Local snaps -if ($config->snap->storage->local->enabled) -{ - /// absolute - if ('/' === substr($config->snap->storage->local->directory, 0, 1)) - { - $prefix = $config->snap->storage->local->directory; - } - - /// relative - else - { - $prefix = __DIR__ . '/../../' . $config->snap->storage->local->directory; - } - - $directory = sprintf('%s/%s', $prefix, $filepath); - - if (is_dir($directory)) - { - foreach ((array) scandir($directory) as $filename) - { - if (!str_ends_with($filename, '.tar.gz')) - { - continue; - } - - $basename = basename( - $filename - ); - - $time = preg_replace( - '/^([\d]+)\.tar\.gz$/', - '$1', - $basename - ); - - $snaps[_('Local')][] = (object) - [ - 'source' => 'local', - 'id' => $document->getId(), - 'name' => $basename, - 'time' => $time, - 'size' => filesize( - sprintf( - '%s/%s', - $directory, - $filename - ) - ), - ]; - } - } -} - -/// Remote snaps -foreach ($config->snap->storage->remote->ftp as $i => $ftp) -{ - // Resource enabled - if (!$ftp->enabled) - { - continue; - } - - $remote = new \Yggverse\Ftp\Client(); - - $connection = $remote->connect( - $ftp->connection->host, - $ftp->connection->port, - $ftp->connection->username, - $ftp->connection->password, - $ftp->connection->directory, - $ftp->connection->timeout, - $ftp->connection->passive - ); - - // Remote host connected - if ($connection) { - - foreach 
((array) $remote->nlist($filepath) as $filename) - { - if (!str_ends_with($filename, '.tar.gz')) - { - continue; - } - - $basename = basename( - $filename - ); - - $time = preg_replace( - '/^([\d]+)\.tar\.gz$/', - '$1', - $basename - ); - - $snaps[sprintf(_('Server #%s'), $i + 1)][] = (object) - [ - 'source' => $i, - 'id' => $document->getId(), - 'name' => $basename, - 'time' => $time, - 'size' => $remote->size($filename), - ]; - } - - $remote->close(); - } -} - -// Process index request -if ($config->webui->index->enabled) -{ - session_start(); - - if (isset($_POST['captcha']) && $_POST['captcha'] == $_SESSION['captcha']) - { - $index->updateDocument( - [ - 'index' => time() - ], - $document->getId() - ); - - header( - sprintf( - 'Location: explore.php?i=%d', - $document->getId() - ) - ); - } - - $captcha = new \Gregwar\Captcha\CaptchaBuilder( - null, - new \Gregwar\Captcha\PhraseBuilder( - $config->webui->captcha->length, - $config->webui->captcha->phrase - ) - ); - - $captcha->setBackgroundColor( - $config->webui->captcha->background->r, - $config->webui->captcha->background->g, - $config->webui->captcha->background->b - ); - - $captcha->build(); - - $_SESSION['captcha'] = $captcha->getPhrase(); -} - -?> - - - - - <?php echo _('Yo! explore') ?> - - - - -
-
-

- - webui->search->extended->enabled) { ?> - - - -
-
-
- -
- time)) { ?> -
- -
- - title)) { ?> -

- title) ?> -

- - description)) { ?> -
- description) ?> -
- - keywords)) { ?> -
- keywords) ?> -
- - -
- url)) ?> -
-
-
-
- identicon -
- code)) { ?> -

- code == 200) { ?> -
- code ?> -
- -
- code ?> -
- - - mime)) { ?> -

-
mime ?>
- - size)) { ?> -

-
size)) ?>
- - time)) { ?> -

-
time) ?>
- - -

- - - body)) { ?> -

-
body) ?>
- - webui->index->enabled) { ?> -

-
- get('index')) { ?> - get('index'))) ?> - - captcha -
-
- - - -
-
- -
- -
- -
- -
- -
- - \ No newline at end of file diff --git a/src/webui/index.php b/src/webui/index.php deleted file mode 100644 index bbaea8b..0000000 --- a/src/webui/index.php +++ /dev/null @@ -1,336 +0,0 @@ - $config->manticore->server->host, - 'port' => $config->manticore->server->port, - ] -); - -// Init index -$index = $client->index( - $config->manticore->index->document->name -); - -// Get totals -$total = $index->search('') - ->option('cutoff', 0) - ->limit(0) - ->get() - ->getTotal(); - -$placeholder = sprintf( - _('Search in %s documents %s'), - number_format( - $total - ), - $config->webui->search->index->request->url->enabled ? _('or enter new address to crawl...') : false -); - -?> - - - - - <?php echo _('Yo! Web Search Engine') ?> - - - - - - -
-
-

- - -
-
- -
-
    -
  • -
  • -
  • -
  • -
  • -
  • -
  • -
  • -
  • -
  • -
-
- - - - \ No newline at end of file diff --git a/src/webui/search.php b/src/webui/search.php deleted file mode 100644 index 4242a74..0000000 --- a/src/webui/search.php +++ /dev/null @@ -1,514 +0,0 @@ - $config->manticore->server->host, - 'port' => $config->manticore->server->port, - ] -); - -// Init index -$index = $client->index( - $config->manticore->index->document->name -); - -// Get totals -$total = $index->search('') - ->option('cutoff', 0) - ->limit(0) - ->get() - ->getTotal(); - -$placeholder = sprintf( - _('Search in %s documents %s'), - number_format( - $total - ), - $config->webui->search->index->request->url->enabled ? _('or enter new address to crawl...') : false -); - -$response = false; - -// Request -$q = !empty($_GET['q']) ? trim($_GET['q']) : ''; -$p = !empty($_GET['p']) ? (int) $_GET['p'] : 1; - -// Register new URL by request on enabled -if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTER_VALIDATE_URL)) -{ - if (preg_match($config->webui->search->index->request->url->regex, $q)) - { - // Prepare URL - $url = $q; - $crc32url = crc32($url); - - // Check URL for exist - $exist = $index->search('') - ->filter('id', $crc32url) - ->limit(1) - ->get() - ->getTotal(); - - if ($exist) - { - /* disable as regular search request possible - $response = sprintf( - _('URL "%s" exists in search index'), - htmlentities($q) - ); - */ - } - - // Add URL - else - { - // @TODO check http code - - $index->addDocument( - [ - 'url' => $url, - 'rank' => (int) mb_strlen( - (string) - urldecode( - (string) - parse_url( - $url, - PHP_URL_PATH - ) - ) - ) - ], - $crc32url - ); - - $response = sprintf( - _('URL "%s" added to the crawl queue!'), - htmlentities($q) - ); - } - } - - else { - $response = sprintf( - _('URL "%s" does not match node settings!'), - htmlentities($q) - ); - } -} - -// Extended corrections -switch (true) -{ - // Empty query - case empty($q): - - $query = $index->search('')->sort('RAND()'); - - break; - - // URL request - 
case filter_var($q, FILTER_VALIDATE_URL): - - $query = $index->search('')->filter('id', crc32($q)); - - break; - - default: - - // Allow raw requests on extended syntax mode requested - // http://sphinxsearch.com/docs/current/extended-syntax.html - if (isset($_GET['e']) && $config->webui->search->extended->enabled) - { - $query = $index->search($q); - } - - // Regular request - else - { - $query = $index->search( - @\Manticoresearch\Utils::escape( - $q - ) - ); - } -} - -// Apply search options (e.g. field_weights) -foreach ($config->webui->search->options as $key => $value) -{ - if (is_int($value) || is_string($value)) - { - $query->option( - $key, - $value - ); - } - - else - { - $query->option( - $key, - (array) $value - ); - } -} - -// Apply highlight options -if ($config->webui->search->highlight->fields) -{ - $query->highlight( - (array) $config->webui->search->highlight->fields, - (array) $config->webui->search->highlight->options - ); -} - -// Get found -$found = empty($q) ? $total : $query->get()->getTotal(); - -// Search request begin -$results = $query->offset($p * $config->webui->pagination->limit - $config->webui->pagination->limit) - ->limit($config->webui->pagination->limit) - ->get(); - -?> - - - - - <?php echo sprintf(_('Yo! %s'), htmlentities($q)) ?> - - - - - -
-
-

- - webui->search->extended->enabled) { ?> - - - -
-
-
- webui->search->extended->enabled) { ?> -
-

- - - - - - - - -

-

- - @title - @description - @keywords - @mime - @url -

-
- - -
- -
- -
- -
- -
- url, - PHP_URL_HOST - ); - - $identicon = new \Jdenticon\Identicon(); - - $identicon->setValue( - $hostname - ); - - $identicon->setSize(14); - - $identicon->setStyle( - [ - 'backgroundColor' => 'rgba(255, 255, 255, 0)', - 'padding' => 0 - ] - ); - - $icon = $identicon->getImageDataUri('webp'); - - ?> - identicon - getHighlight()['title'])) { ?> -
-

- getHighlight()['title'] as $title) { ?> -

- -

-
- title)) { ?> -
-

title ?>

-
- - getHighlight()['description'])) { ?> -
- getHighlight()['description'] as $description) { ?> -

- -
- description)) { ?> -
- description ?> -
- - getHighlight()['keywords'])) { ?> -
- getHighlight()['keywords'] as $keywords) { ?> -

- -
- keywords)) { ?> -
- keywords ?> -
- - getHighlight()['body'])) { ?> -
- getHighlight()['body'] as $body) { ?> -

- -
- -
- getHighlight()['url'])) { ?> - getHighlight()['url'] as $url) { ?> - - - title)) { ?> - url)) ?> - - get('code'), [0, 200])) { ?> - - - - - get('code') ?> - - - -
-
- - webui->pagination->limit <= $results->getTotal()) { ?> -
-
- - - -
-
- -
- - \ No newline at end of file