implement index cleaner tool #5

This commit is contained in:
ghost 2023-11-27 19:29:17 +02:00
parent 7ea9cbffcd
commit 2961045c76
2 changed files with 89 additions and 0 deletions

View File

@ -70,6 +70,15 @@ php src/cli/document/add.php URL
php src/cli/document/crawl.php
```
##### Clean
```
php src/cli/document/clean.php
```
* remove duplicate `url` entries
* optimize the index
##### Search
```

View File

@ -0,0 +1,80 @@
<?php

// Index cleaner CLI tool #5
//
// Removes documents that share the same `crc32url` value, keeping one
// document per value, and then runs index optimization.
//
// Connection settings and the index name are read from config.json
// (manticore.server.host, manticore.server.port, manticore.index.document.name).

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// Init config
$configPath = __DIR__ . '/../../../config.json';
$configJson = file_get_contents($configPath);

if ($configJson === false)
{
  // Stop early instead of failing later on a property access on false
  fwrite(
    STDERR,
    sprintf(
      _('could not read config file: %s') . PHP_EOL,
      $configPath
    )
  );

  exit(1);
}

$config = json_decode($configJson);

if (!is_object($config))
{
  fwrite(
    STDERR,
    sprintf(
      _('could not parse config file: %s') . PHP_EOL,
      json_last_error_msg()
    )
  );

  exit(1);
}

// Init client
$client = new \Manticoresearch\Client(
  [
    'host' => $config->manticore->server->host,
    'port' => $config->manticore->server->port,
  ]
);

// Init index
$index = $client->index(
  $config->manticore->index->document->name
);

// Get totals
$total = $index->search('')
               ->option('cutoff', 0)
               ->limit(0)
               ->get()
               ->getTotal();

// Group document IDs by URL checksum #5
//
// A single pass over the index is enough: every document carries its own
// crc32url, so there is no need to run one extra search per document.
$groups = [];

if ($total > 0)
{
  // NOTE: Manticore limits a result set to max_matches (1000 by default,
  // see the search options documentation), so it is raised here to make
  // the listing cover the whole index.
  $documents = $index->search('')
                     ->limit($total)
                     ->maxMatches($total)
                     ->get();

  foreach ($documents as $document)
  {
    // Keyed by ID, so a document can never be listed twice in its group
    $groups[$document->crc32url][$document->getId()] = true;
  }
}

// Delete duplicates, keep the first document of every group
$deleted = 0;

foreach ($groups as $ids)
{
  $ids = array_keys($ids);

  // Unique URL, nothing to delete
  if (count($ids) < 2)
  {
    continue;
  }

  foreach (array_slice($ids, 1) as $id)
  {
    $index->deleteDocument($id);

    // Count real deletions only, the kept document is not included
    $deleted++;
  }
}

// Free mem
unset($groups);

// Dump operation result
echo sprintf(
  _('duplicated URLs deleted: %s') . PHP_EOL,
  number_format($deleted)
);

// Optimize indexes
echo _('indexes optimization begin') . PHP_EOL;

$index->optimize();

echo _('indexes optimization completed') . PHP_EOL;