Browse Source

set crc32url as document id

main
ghost 11 months ago
parent
commit
6f4abe4729
  1. 1
      README.md
  2. 8
      src/cli/document/add.php
  3. 51
      src/cli/document/clean.php
  4. 18
      src/cli/document/crawl.php
  5. 4
      src/cli/index/init.php
  6. 6
      src/cli/yggo/import.php
  7. 10
      src/webui/search.php

1
README.md

@@ -88,7 +88,6 @@ php src/cli/document/crawl.php
php src/cli/document/clean.php
```
* remove `url` duplicates
* make index optimization
##### Search

8
src/cli/document/add.php

@@ -29,7 +29,7 @@ $crc32url = crc32($url);
// Check URL for exist
$result = $index->search('')
->filter('crc32url', $crc32url)
->filter('id', $crc32url)
->limit(1)
->get();
@@ -47,9 +47,9 @@ if ($result->getTotal())
// Add
$result = $index->addDocument(
[
'url' => $url,
'crc32url' => $crc32url
]
'url' => $url
],
$crc32url
);
echo sprintf(

51
src/cli/document/clean.php

@@ -36,57 +36,6 @@ $index = $client->index(
$config->manticore->index->document->name
);
// Get totals
$total = $index->search('')
->option('cutoff', 0)
->limit(0)
->get()
->getTotal();
// Delete duplicates #5
$delete = [];
foreach($index->search('')->limit($total)->get() as $queue)
{
$duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get();
if ($duplicates->getTotal() > 1)
{
foreach ($duplicates as $duplicate)
{
$delete[$duplicate->crc32url][] = $duplicate->getId();
}
}
}
$i = 0;
foreach ($delete as $crc32url => $ids)
{
$j = 0;
foreach ($ids as $id)
{
$i++;
$j++;
// Skip first link
if ($j == 1) continue;
// Delete duplicate
$index->deleteDocument($id);
}
}
// Free mem
$delete = [];
// @TODO $config->cli->document->crawl->skip->stripos->url
// Dump operation result
echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL,
number_format($i)
);
// Optimize indexes
echo _('indexes optimization begin') . PHP_EOL;

18
src/cli/document/crawl.php

@@ -98,7 +98,6 @@ foreach($index->search('')
$data =
[
'url' => $document->get('url'),
'crc32url' => $document->get('crc32url'),
'title' => $document->get('title'),
'description' => $document->get('description'),
'keywords' => $document->get('keywords'),
@@ -355,17 +354,20 @@ foreach($index->search('')
$crc32url = crc32($url);
if (!$index->search('')
->filter('crc32url', $crc32url)
->filter('id', $crc32url)
->limit(1)
->get()
->getTotal())
{
echo 'add';
/*
$index->addDocument(
[
'url' => $url,
'crc32url' => $crc32url
]
'url' => $url
],
$crc32url
);
*/
if ($config->cli->document->crawl->debug->level->notice)
{
@@ -383,10 +385,16 @@ foreach($index->search('')
// Replace document data
// https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
// @TODO optimization for replacements required
// https://manual.manticoresearch.com/Data_creation_and_modification/Updating_documents/REPLACE
echo 'replace';
/*
$result = $index->replaceDocument(
$data,
$document->getId()
);
*/
// Debug result
if ($config->cli->document->crawl->debug->level->notice)

4
src/cli/index/init.php

@@ -79,10 +79,6 @@ $result = $index->create(
'time' =>
[
'type' => 'integer'
],
'crc32url' =>
[
'type' => 'bigint'
]
],
(array) $config->manticore->index->document->settings

6
src/cli/yggo/import.php

@@ -128,7 +128,7 @@ for ($i = 0; $i <= $total; $i++)
if (isset($argv[6]))
{
$local = $index->search('')
->filter('crc32url', $crc32url)
->filter('id', $crc32url)
->limit(1)
->get();
@@ -149,7 +149,6 @@ for ($i = 0; $i <= $total; $i++)
$index->addDocument(
[
'url' => $url,
'crc32url' => (int) $crc32url,
'time' => (int) $remote->timeUpdated,
'code' => (int) $remote->httpCode,
'size' => (int) $remote->size,
@@ -157,7 +156,8 @@ for ($i = 0; $i <= $total; $i++)
'title' => (string) $remote->title,
'description' => (string) $remote->description,
'keywords' => (string) $remote->keywords
]
],
(int) $crc32url
);
// Result

10
src/webui/search.php

@@ -60,7 +60,7 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
// Check URL for exist
$exist = $index->search('')
->filter('crc32url', $crc32url)
->filter('id', $crc32url)
->limit(1)
->get()
->getTotal();
@@ -82,9 +82,9 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
$index->addDocument(
[
'url' => $url,
'crc32url' => $crc32url
]
'url' => $url
],
$crc32url
);
$response = sprintf(
@@ -113,7 +113,7 @@ switch (true)
case filter_var($q, FILTER_VALIDATE_URL):
$query = $index->search('')->filter('crc32url', crc32($q));
$query = $index->search('')->filter('id', crc32($q));
break;

Loading…
Cancel
Save