From 3448eb85f7d7087a98497f079d512f2d37a25f6f Mon Sep 17 00:00:00 2001
From: ghost
Date: Sat, 25 Nov 2023 04:44:07 +0200
Subject: [PATCH] implement yggo db migration cli tool

---
NOTE(review): diffstat and hunk sizes are recalculated for the revised
import.php; the blob id on its "index" line is carried over unchanged.

 README.md               |  21 ++++-
 src/cli/yggo/import.php | 202 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 222 insertions(+), 1 deletion(-)
 create mode 100644 src/cli/yggo/import.php

diff --git a/README.md b/README.md
index 600eb11..1c11283 100644
--- a/README.md
+++ b/README.md
@@ -78,4 +78,23 @@ php src/cli/document/crawl.php
 php src/cli/document/search.php '@title "*"' [limit]
 ```
 * `query` - required
-* `limit` - optional search results limit
\ No newline at end of file
+* `limit` - optional search results limit
+
+##### Migration
+
+###### YGGo
+
+Import index from YGGo database
+
+```
+php src/cli/yggo/import.php 'host' 'port' 'user' 'password' 'database' [unique]
+```
+
+Source DB fields required:
+
+* `host`
+* `port`
+* `user`
+* `password`
+* `database`
+* `unique` - optional, check for unique URL (takes more time)
\ No newline at end of file
diff --git a/src/cli/yggo/import.php b/src/cli/yggo/import.php
new file mode 100644
index 0000000..2b48da7
--- /dev/null
+++ b/src/cli/yggo/import.php
@@ -0,0 +1,202 @@
+<?php
+
+// Import pages from a YGGo MySQL database into the Manticore document index
+//
+// Usage:
+//   php src/cli/yggo/import.php 'host' 'port' 'user' 'password' 'database' [unique]
+//
+// NOTE(review): in the reviewed copy of this patch everything between "<?php"
+// and the 'host' key of the client options was missing. The dependency, config
+// and client bootstrap below is reconstructed on the assumption of a Composer
+// autoloader and a config.json in the repository root - TODO confirm both paths
+// against the original commit
+
+// Load dependencies
+require_once __DIR__ . '/../../../vendor/autoload.php';
+
+// Init config
+$config = json_decode(
+    file_get_contents(
+        __DIR__ . '/../../../config.json'
+    )
+);
+
+// Require host, port, user, password and database ($argv[1] .. $argv[5])
+if (count($argv) < 6)
+{
+    fwrite(
+        STDERR,
+        _('usage: php src/cli/yggo/import.php host port user password database [unique]') . PHP_EOL
+    );
+
+    exit(1);
+}
+
+// Init client
+$client = new \Manticoresearch\Client(
+    [
+        'host' => $config->manticore->server->host,
+        'port' => $config->manticore->server->port,
+    ]
+);
+
+// Init index
+$index = $client->index(
+    $config->manticore->index->document
+);
+
+// Connect Yggo DB
+try
+{
+    $yggo = new PDO(
+        'mysql:dbname=' . $argv[5] . ';host=' . $argv[1] . ';port=' . $argv[2] . ';charset=utf8',
+        $argv[3],
+        $argv[4],
+        [
+            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8'
+        ]
+    );
+
+    $yggo->setAttribute(
+        PDO::ATTR_ERRMODE,
+        PDO::ERRMODE_EXCEPTION
+    );
+
+    $yggo->setAttribute(
+        PDO::ATTR_DEFAULT_FETCH_MODE,
+        PDO::FETCH_OBJ
+    );
+
+    $yggo->setAttribute(
+        PDO::ATTR_TIMEOUT,
+        600
+    );
+}
+
+catch (Exception $error)
+{
+    // Print the reason only: dumping the whole exception object also prints
+    // its stack trace, which can include the connection arguments
+    fwrite(
+        STDERR,
+        $error->getMessage() . PHP_EOL
+    );
+
+    // Non-zero status lets a calling script detect the failure
+    exit(1);
+}
+
+// Batch size, also the step of the LIMIT offset
+$limit = 100;
+
+// Conditions shared by the count and the batch query, so the reported total
+// matches the rows the import walks through
+$where = '`hostPage`.`httpCode` = 200
+          AND `hostPage`.`timeUpdated` IS NOT NULL
+          AND `hostPage`.`mime` IS NOT NULL
+          AND `hostPage`.`size` IS NOT NULL';
+
+$total = (int) $yggo->query('SELECT COUNT(DISTINCT `hostPage`.`hostPageId`) AS `total`
+                             FROM `hostPage`
+                             JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
+                             WHERE ' . $where)->fetch()->total;
+
+$processed = 0;
+
+// One query per batch: the offset advances by $limit until it reaches the
+// total. A fixed ORDER BY keeps the LIMIT windows from overlapping or
+// skipping rows between queries
+for ($start = 0; $start < $total; $start += $limit)
+{
+    $query = $yggo->query('SELECT `hostPage`.`hostPageId`,
+                                  `hostPage`.`httpCode`,
+                                  `hostPage`.`mime`,
+                                  `hostPage`.`size`,
+                                  `hostPage`.`timeUpdated`,
+                                  `hostPage`.`uri`,
+
+                                  `host`.`scheme`,
+                                  `host`.`name`,
+                                  `host`.`port`,
+
+                                  (
+                                    SELECT `hostPageDescription`.`title` FROM `hostPageDescription`
+                                    WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`
+                                    ORDER BY `hostPageDescription`.`timeAdded` DESC
+                                    LIMIT 1
+                                  ) AS `title`,
+
+                                  (
+                                    SELECT `hostPageDescription`.`description` FROM `hostPageDescription`
+                                    WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`
+                                    ORDER BY `hostPageDescription`.`timeAdded` DESC
+                                    LIMIT 1
+                                  ) AS `description`,
+
+                                  (
+                                    SELECT `hostPageDescription`.`keywords` FROM `hostPageDescription`
+                                    WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`
+                                    ORDER BY `hostPageDescription`.`timeAdded` DESC
+                                    LIMIT 1
+                                  ) AS `keywords`
+
+                                  FROM `hostPage`
+                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
+
+                                  WHERE ' . $where . '
+
+                                  GROUP BY `hostPage`.`hostPageId`
+                                  ORDER BY `hostPage`.`hostPageId`
+
+                                  LIMIT ' . $start . ',' . $limit);
+
+    foreach ($query->fetchAll() as $remote)
+    {
+        $url = $remote->scheme . '://' . $remote->name . ($remote->port ? ':' . $remote->port : '') . $remote->uri;
+
+        // Check for unique URL requested: look up the page URL in the local index
+        if (isset($argv[6]))
+        {
+            $local = $index->search('@url "' . $url . '"')
+                           ->limit(1)
+                           ->get();
+
+            if ($local->getTotal())
+            {
+                // Result
+                echo sprintf(
+                    _('[%s/%s] [skip duplicate] %s') . PHP_EOL,
+                    ++$processed,
+                    $total,
+                    $url
+                );
+
+                continue;
+            }
+        }
+
+        $index->addDocument(
+            [
+                'url'         => $url,
+                'time'        => $remote->timeUpdated,
+                'code'        => $remote->httpCode,
+                'mime'        => $remote->mime,
+                'size'        => $remote->size,
+                'title'       => $remote->title,
+                'description' => $remote->description,
+                'keywords'    => $remote->keywords
+            ]
+        );
+
+        // Result
+        echo sprintf(
+            _('[%s/%s] [add] %s') . PHP_EOL,
+            ++$processed,
+            $total,
+            $url
+        );
+    }
+}
+
+// Done
+echo _('import completed!') . PHP_EOL;
\ No newline at end of file