Browse Source

set crc32url as document id

main
ghost 10 months ago
parent
commit
6f4abe4729
  1. 1
      README.md
  2. 8
      src/cli/document/add.php
  3. 51
      src/cli/document/clean.php
  4. 18
      src/cli/document/crawl.php
  5. 4
      src/cli/index/init.php
  6. 6
      src/cli/yggo/import.php
  7. 10
      src/webui/search.php

1
README.md

@ -88,7 +88,6 @@ php src/cli/document/crawl.php
php src/cli/document/clean.php php src/cli/document/clean.php
``` ```
* remove `url` duplicates
* make index optimization * make index optimization
##### Search ##### Search

8
src/cli/document/add.php

@ -29,7 +29,7 @@ $crc32url = crc32($url);
// Check whether the URL already exists // Check whether the URL already exists
$result = $index->search('') $result = $index->search('')
->filter('crc32url', $crc32url) ->filter('id', $crc32url)
->limit(1) ->limit(1)
->get(); ->get();
@ -47,9 +47,9 @@ if ($result->getTotal())
// Add // Add
$result = $index->addDocument( $result = $index->addDocument(
[ [
'url' => $url, 'url' => $url
'crc32url' => $crc32url ],
] $crc32url
); );
echo sprintf( echo sprintf(

51
src/cli/document/clean.php

@ -36,57 +36,6 @@ $index = $client->index(
$config->manticore->index->document->name $config->manticore->index->document->name
); );
// Get totals
$total = $index->search('')
->option('cutoff', 0)
->limit(0)
->get()
->getTotal();
// Delete duplicates #5
$delete = [];
foreach($index->search('')->limit($total)->get() as $queue)
{
$duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get();
if ($duplicates->getTotal() > 1)
{
foreach ($duplicates as $duplicate)
{
$delete[$duplicate->crc32url][] = $duplicate->getId();
}
}
}
$i = 0;
foreach ($delete as $crc32url => $ids)
{
$j = 0;
foreach ($ids as $id)
{
$i++;
$j++;
// Skip first link
if ($j == 1) continue;
// Delete duplicate
$index->deleteDocument($id);
}
}
// Free mem
$delete = [];
// @TODO $config->cli->document->crawl->skip->stripos->url
// Dump operation result
echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL,
number_format($i)
);
// Optimize indexes // Optimize indexes
echo _('indexes optimization begin') . PHP_EOL; echo _('indexes optimization begin') . PHP_EOL;

18
src/cli/document/crawl.php

@ -98,7 +98,6 @@ foreach($index->search('')
$data = $data =
[ [
'url' => $document->get('url'), 'url' => $document->get('url'),
'crc32url' => $document->get('crc32url'),
'title' => $document->get('title'), 'title' => $document->get('title'),
'description' => $document->get('description'), 'description' => $document->get('description'),
'keywords' => $document->get('keywords'), 'keywords' => $document->get('keywords'),
@ -355,17 +354,20 @@ foreach($index->search('')
$crc32url = crc32($url); $crc32url = crc32($url);
if (!$index->search('') if (!$index->search('')
->filter('crc32url', $crc32url) ->filter('id', $crc32url)
->limit(1) ->limit(1)
->get() ->get()
->getTotal()) ->getTotal())
{ {
echo 'add';
/*
$index->addDocument( $index->addDocument(
[ [
'url' => $url, 'url' => $url
'crc32url' => $crc32url ],
] $crc32url
); );
*/
if ($config->cli->document->crawl->debug->level->notice) if ($config->cli->document->crawl->debug->level->notice)
{ {
@ -383,10 +385,16 @@ foreach($index->search('')
// Replace document data // Replace document data
// https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916 // https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
// @TODO optimization for replacements required
// https://manual.manticoresearch.com/Data_creation_and_modification/Updating_documents/REPLACE
echo 'replace';
/*
$result = $index->replaceDocument( $result = $index->replaceDocument(
$data, $data,
$document->getId() $document->getId()
); );
*/
// Debug result // Debug result
if ($config->cli->document->crawl->debug->level->notice) if ($config->cli->document->crawl->debug->level->notice)

4
src/cli/index/init.php

@ -79,10 +79,6 @@ $result = $index->create(
'time' => 'time' =>
[ [
'type' => 'integer' 'type' => 'integer'
],
'crc32url' =>
[
'type' => 'bigint'
] ]
], ],
(array) $config->manticore->index->document->settings (array) $config->manticore->index->document->settings

6
src/cli/yggo/import.php

@ -128,7 +128,7 @@ for ($i = 0; $i <= $total; $i++)
if (isset($argv[6])) if (isset($argv[6]))
{ {
$local = $index->search('') $local = $index->search('')
->filter('crc32url', $crc32url) ->filter('id', $crc32url)
->limit(1) ->limit(1)
->get(); ->get();
@ -149,7 +149,6 @@ for ($i = 0; $i <= $total; $i++)
$index->addDocument( $index->addDocument(
[ [
'url' => $url, 'url' => $url,
'crc32url' => (int) $crc32url,
'time' => (int) $remote->timeUpdated, 'time' => (int) $remote->timeUpdated,
'code' => (int) $remote->httpCode, 'code' => (int) $remote->httpCode,
'size' => (int) $remote->size, 'size' => (int) $remote->size,
@ -157,7 +156,8 @@ for ($i = 0; $i <= $total; $i++)
'title' => (string) $remote->title, 'title' => (string) $remote->title,
'description' => (string) $remote->description, 'description' => (string) $remote->description,
'keywords' => (string) $remote->keywords 'keywords' => (string) $remote->keywords
] ],
(int) $crc32url
); );
// Result // Result

10
src/webui/search.php

@ -60,7 +60,7 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
// Check whether the URL already exists // Check whether the URL already exists
$exist = $index->search('') $exist = $index->search('')
->filter('crc32url', $crc32url) ->filter('id', $crc32url)
->limit(1) ->limit(1)
->get() ->get()
->getTotal(); ->getTotal();
@ -82,9 +82,9 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE
$index->addDocument( $index->addDocument(
[ [
'url' => $url, 'url' => $url
'crc32url' => $crc32url ],
] $crc32url
); );
$response = sprintf( $response = sprintf(
@ -113,7 +113,7 @@ switch (true)
case filter_var($q, FILTER_VALIDATE_URL): case filter_var($q, FILTER_VALIDATE_URL):
$query = $index->search('')->filter('crc32url', crc32($q)); $query = $index->search('')->filter('id', crc32($q));
break; break;

Loading…
Cancel
Save