Browse Source

crawl newest pages by rand in queue

main
ghost 12 months ago
parent
commit
33cc778999
  1. 36
      src/cli/document/crawl.php

36
src/cli/document/crawl.php

@@ -50,30 +50,6 @@ try {
]
);
-// Init search
-$search = new \Manticoresearch\Search(
-$client
-);
-$search->setIndex(
-$config->manticore->index->document->name
-);
-$search->match(
-'*',
-'url'
-);
-$search->sort(
-'time',
-'asc'
-);
-$search->limit(
-$config->cli->document->crawl->queue->limit
-);
// Init index
$index = $client->index(
$config->manticore->index->document->name
);
@@ -105,8 +81,16 @@ if ($config->cli->document->crawl->debug->level->notice)
);
}
-// Begin queue
-foreach($search->get() as $document)
+// Begin crawl queue
+// thanks to @manticoresearch for help with random feature implementation:
+// https://github.com/manticoresoftware/manticoresearch-php/discussions/176
+foreach($index->search('')
+->expression('random', 'rand()')
+->sort('time', 'asc')
+->sort('random', 'asc')
+->limit($config->cli->document->crawl->queue->limit)
+->get() as $document)
{
// Define data
$time = time();

Loading…
Cancel
Save