mirror of
https://github.com/YGGverse/Yo.git
synced 2025-01-29 16:04:20 +00:00
implement index cleaner tool #5
This commit is contained in:
parent
7ea9cbffcd
commit
2961045c76
@ -70,6 +70,15 @@ php src/cli/document/add.php URL
|
||||
php src/cli/document/crawl.php
|
||||
```
|
||||
|
||||
##### Clean
|
||||
|
||||
```
|
||||
php src/cli/document/clean.php
|
||||
```
|
||||
|
||||
* remove `url` duplicates
|
||||
* optimize the index
|
||||
|
||||
##### Search
|
||||
|
||||
```
|
||||
|
80
src/cli/document/clean.php
Normal file
80
src/cli/document/clean.php
Normal file
@ -0,0 +1,80 @@
|
||||
<?php

/*
 * Index cleaner tool (#5)
 *
 * 1. Removes documents that share the same `crc32url` value, keeping one
 *    document per group.
 * 2. Runs index optimization.
 *
 * Usage: php src/cli/document/clean.php
 */

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// Init config
$config = json_decode(
    file_get_contents(
        __DIR__ . '/../../../config.json'
    )
);

// Fail early with a clear message instead of a property access on null
// when the config file is missing or does not contain a JSON object
if (!is_object($config))
{
    fwrite(
        STDERR,
        _('could not load config.json') . PHP_EOL
    );

    exit(1);
}

// Init client
$client = new \Manticoresearch\Client(
    [
        'host' => $config->manticore->server->host,
        'port' => $config->manticore->server->port,
    ]
);

// Init index
$index = $client->index(
    $config->manticore->index->document->name
);

// Get totals
$total = $index->search('')
               ->option('cutoff', 0)
               ->limit(0)
               ->get()
               ->getTotal();

// Delete duplicates #5
// Map of crc32url => list of document IDs sharing that value
$delete = [];

// NOTE(review): the number of rows returned here may be capped by the
// server-side `max_matches` setting when $total is large - confirm
foreach ($index->search('')->limit($total)->get() as $queue)
{
    // Resolve every group only once: the outer loop also visits the
    // duplicates themselves, and collecting the same IDs again would
    // later cause the document we intend to keep to be deleted as well
    if (isset($delete[$queue->crc32url]))
    {
        continue;
    }

    $duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get();

    if ($duplicates->getTotal() > 1)
    {
        $delete[$queue->crc32url] = [];

        foreach ($duplicates as $duplicate)
        {
            $delete[$queue->crc32url][] = $duplicate->getId();
        }
    }
}

// Number of documents actually deleted
$i = 0;

foreach ($delete as $ids)
{
    // Keep the first document of the group, delete the rest
    foreach (array_slice($ids, 1) as $id)
    {
        $index->deleteDocument($id);

        $i++;
    }
}

// Free mem
$delete = [];

// Dump operation result
echo sprintf(
    _('duplicated URLs deleted: %s') . PHP_EOL,
    number_format($i)
);

// Optimize indexes
echo _('indexes optimization begin') . PHP_EOL;

$index->optimize();

echo _('indexes optimization completed') . PHP_EOL;
|
Loading…
x
Reference in New Issue
Block a user