mirror of
https://github.com/YGGverse/Yo.git
synced 2025-01-29 16:04:20 +00:00
set crc32url as document id
This commit is contained in:
parent
93baed4b90
commit
6f4abe4729
@ -88,7 +88,6 @@ php src/cli/document/crawl.php
|
||||
php src/cli/document/clean.php
|
||||
```
|
||||
|
||||
* remove `url` duplicates
|
||||
* make index optimization
|
||||
|
||||
##### Search
|
||||
|
@ -29,7 +29,7 @@ $crc32url = crc32($url);
|
||||
|
||||
// Check URL for exist
|
||||
$result = $index->search('')
|
||||
->filter('crc32url', $crc32url)
|
||||
->filter('id', $crc32url)
|
||||
->limit(1)
|
||||
->get();
|
||||
|
||||
@ -47,9 +47,9 @@ if ($result->getTotal())
|
||||
// Add
|
||||
$result = $index->addDocument(
|
||||
[
|
||||
'url' => $url,
|
||||
'crc32url' => $crc32url
|
||||
]
|
||||
'url' => $url
|
||||
],
|
||||
$crc32url
|
||||
);
|
||||
|
||||
echo sprintf(
|
||||
|
@ -36,57 +36,6 @@ $index = $client->index(
|
||||
$config->manticore->index->document->name
|
||||
);
|
||||
|
||||
// Get totals
|
||||
$total = $index->search('')
|
||||
->option('cutoff', 0)
|
||||
->limit(0)
|
||||
->get()
|
||||
->getTotal();
|
||||
|
||||
// Delete duplicates #5
|
||||
$delete = [];
|
||||
|
||||
foreach($index->search('')->limit($total)->get() as $queue)
|
||||
{
|
||||
$duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get();
|
||||
|
||||
if ($duplicates->getTotal() > 1)
|
||||
{
|
||||
foreach ($duplicates as $duplicate)
|
||||
{
|
||||
$delete[$duplicate->crc32url][] = $duplicate->getId();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$i = 0;
|
||||
foreach ($delete as $crc32url => $ids)
|
||||
{
|
||||
$j = 0;
|
||||
foreach ($ids as $id)
|
||||
{
|
||||
$i++;
|
||||
$j++;
|
||||
|
||||
// Skip first link
|
||||
if ($j == 1) continue;
|
||||
|
||||
// Delete duplicate
|
||||
$index->deleteDocument($id);
|
||||
}
|
||||
}
|
||||
|
||||
// Free mem
|
||||
$delete = [];
|
||||
|
||||
// @TODO $config->cli->document->crawl->skip->stripos->url
|
||||
|
||||
// Dump operation result
|
||||
echo sprintf(
|
||||
_('duplicated URLs deleted: %s') . PHP_EOL,
|
||||
number_format($i)
|
||||
);
|
||||
|
||||
// Optimize indexes
|
||||
echo _('indexes optimization begin') . PHP_EOL;
|
||||
|
||||
|
@ -98,7 +98,6 @@ foreach($index->search('')
|
||||
$data =
|
||||
[
|
||||
'url' => $document->get('url'),
|
||||
'crc32url' => $document->get('crc32url'),
|
||||
'title' => $document->get('title'),
|
||||
'description' => $document->get('description'),
|
||||
'keywords' => $document->get('keywords'),
|
||||
@ -355,17 +354,20 @@ foreach($index->search('')
|
||||
$crc32url = crc32($url);
|
||||
|
||||
if (!$index->search('')
|
||||
->filter('crc32url', $crc32url)
|
||||
->filter('id', $crc32url)
|
||||
->limit(1)
|
||||
->get()
|
||||
->getTotal())
|
||||
{
|
||||
echo 'add';
|
||||
/*
|
||||
$index->addDocument(
|
||||
[
|
||||
'url' => $url,
|
||||
'crc32url' => $crc32url
|
||||
]
|
||||
'url' => $url
|
||||
],
|
||||
$crc32url
|
||||
);
|
||||
*/
|
||||
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
@ -383,10 +385,16 @@ foreach($index->search('')
|
||||
|
||||
// Replace document data
|
||||
// https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
|
||||
|
||||
// @TODO optimization for replacements required
|
||||
// https://manual.manticoresearch.com/Data_creation_and_modification/Updating_documents/REPLACE
|
||||
echo 'replace';
|
||||
/*
|
||||
$result = $index->replaceDocument(
|
||||
$data,
|
||||
$document->getId()
|
||||
);
|
||||
*/
|
||||
|
||||
// Debug result
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
|
@ -79,10 +79,6 @@ $result = $index->create(
|
||||
'time' =>
|
||||
[
|
||||
'type' => 'integer'
|
||||
],
|
||||
'crc32url' =>
|
||||
[
|
||||
'type' => 'bigint'
|
||||
]
|
||||
],
|
||||
(array) $config->manticore->index->document->settings
|
||||
|
@ -128,7 +128,7 @@ for ($i = 0; $i <= $total; $i++)
|
||||
if (isset($argv[6]))
|
||||
{
|
||||
$local = $index->search('')
|
||||
->filter('crc32url', $crc32url)
|
||||
->filter('id', $crc32url)
|
||||
->limit(1)
|
||||
->get();
|
||||
|
||||
@ -149,7 +149,6 @@ for ($i = 0; $i <= $total; $i++)
|
||||
$index->addDocument(
|
||||
[
|
||||
'url' => $url,
|
||||
'crc32url' => (int) $crc32url,
|
||||
'time' => (int) $remote->timeUpdated,
|
||||
'code' => (int) $remote->httpCode,
|
||||
'size' => (int) $remote->size,
|
||||
@ -157,7 +156,8 @@ for ($i = 0; $i <= $total; $i++)
|
||||
'title' => (string) $remote->title,
|
||||
'description' => (string) $remote->description,
|
||||
'keywords' => (string) $remote->keywords
|
||||
]
|
||||
],
|
||||
(int) $crc32url
|
||||
);
|
||||
|
||||
// Result
|
||||
|
@ -60,7 +60,7 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
|
||||
|
||||
// Check URL for exist
|
||||
$exist = $index->search('')
|
||||
->filter('crc32url', $crc32url)
|
||||
->filter('id', $crc32url)
|
||||
->limit(1)
|
||||
->get()
|
||||
->getTotal();
|
||||
@ -82,9 +82,9 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
|
||||
|
||||
$index->addDocument(
|
||||
[
|
||||
'url' => $url,
|
||||
'crc32url' => $crc32url
|
||||
]
|
||||
'url' => $url
|
||||
],
|
||||
$crc32url
|
||||
);
|
||||
|
||||
$response = sprintf(
|
||||
@ -113,7 +113,7 @@ switch (true)
|
||||
|
||||
case filter_var($q, FILTER_VALIDATE_URL):
|
||||
|
||||
$query = $index->search('')->filter('crc32url', crc32($q));
|
||||
$query = $index->search('')->filter('id', crc32($q));
|
||||
|
||||
break;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user