From a8ffe143493730fb0b45b19506a5e0915c30540f Mon Sep 17 00:00:00 2001
From: ghost
Date: Thu, 17 Aug 2023 14:58:06 +0300
Subject: [PATCH] implement 'hostPage add' CLI method

---
 README.md        |  2 ++
 src/cli/yggo.php | 86 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1a558a7..c56a0c3 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ sphinxsearch
 * Configuration examples presented at `/config` folder
 * Make sure `/src/storage/cache`, `/src/storage/tmp`, `/src/storage/snap` folders are writable
 * Set up the `/src/crontab` by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
+* To start the crawler, add at least one initial URL using the search form or CLI
 
 #### JSON API
 
@@ -241,6 +242,7 @@ _*CLI interface still under construction, use it for your own risk!_
   + [ ] delete
   + [ ] flush
 * [x] hostPage
+  + [x] add
   + [x] rank
   + [x] reindex
 * [x] hostPageSnap
diff --git a/src/cli/yggo.php b/src/cli/yggo.php
index 23928cb..3d6dd60 100644
--- a/src/cli/yggo.php
+++ b/src/cli/yggo.php
@@ -6,6 +6,7 @@ require_once(__DIR__ . '/../library/cli.php');
 require_once(__DIR__ . '/../library/mysql.php');
 require_once(__DIR__ . '/../library/filter.php');
 require_once(__DIR__ . '/../library/ftp.php');
+require_once(__DIR__ . '/../library/helper.php');
 
 require_once __DIR__ . '/../../vendor/autoload.php';
 // CLI only to prevent https server connection timeout
@@ -42,8 +43,29 @@ if (false === sem_acquire($semaphore, true)) {
   exit;
 }
 
-// Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+// Connect DB
+try {
+
+  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}
+
+// Connect Yggverse\Cache\Memory
+try {
+
+  $memory = new Yggverse\Cache\Memory(MEMCACHED_HOST, MEMCACHED_PORT, MEMCACHED_NAMESPACE, MEMCACHED_TIMEOUT + time());
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}
 
 // CLI begin
 if (!empty($argv[1])) {
@@ -401,6 +423,65 @@ if (!empty($argv[1])) {
 
       switch ($argv[2]) {
 
+        case 'add':
+
+          if (empty($argv[3])) {
+
+            CLI::danger('URL required');
+
+            exit;
+          }
+
+          if (false === Yggverse\Parser\Url::is($argv[3])) {
+
+            CLI::danger('URL address invalid');
+
+            exit;
+          }
+
+          try {
+
+            $db->beginTransaction();
+
+            if ($linkToDBresult = Helper::addLinkToDB($db, $memory, $argv[3])) {
+
+              if (count($linkToDBresult->new->hostPageId)) {
+
+                CLI::success('URL successfully registered in the crawler queue!');
+
+                $db->commit();
+
+                exit;
+
+              } else {
+
+                CLI::warning('URL already registered in the crawler queue!');
+
+                $db->rollBack();
+
+                exit;
+              }
+
+            } else {
+
+              CLI::danger('URL address not supported by the host rules!');
+
+              $db->rollBack();
+
+              exit;
+            }
+
+          } catch(Exception $e) {
+
+            var_dump($e);
+
+            $db->rollBack();
+
+            exit;
+          }
+
+          break;
+
         case 'rank':
 
           if (!empty($argv[3])) {
@@ -675,6 +756,7 @@ CLI::default('  crawl  - execute step in crawler queue');
 CLI::default('  clean  - execute step in cleaner queue');
 CLI::break();
 CLI::default('  hostPage ');
+CLI::default('    add [URL]  - register new address in the crawler queue');
 CLI::default('    rank ');
 CLI::default('      reindex  - reindex hostPage.rank fields');
 CLI::break();
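
A minimal usage sketch, assuming the entry-point path shown in the diff above, a placeholder URL, and that the PHP CLI can reach the configured MySQL and Memcached instances (run from the repository root):

    php src/cli/yggo.php hostPage add http://example.com/

On success the transaction is committed and the URL is registered in the crawler queue; a URL that is already queued or rejected by the host rules rolls the transaction back and prints a warning or danger message instead.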