Browse Source

add CURL options that prevent crawl queue stuck

main
ghost 1 year ago
parent
commit
02dd3649a7
  1. 14
      example/config.json
  2. 47
      src/cli/document/crawl.php

14
example/config.json

@ -51,6 +51,20 @@
{ {
"crawl": "crawl":
{ {
"curl":
{
"connection":
{
"timeout":3
},
"download":
{
"size":
{
"max":10000024
}
}
},
"queue": "queue":
{ {
"limit":1, "limit":1,

47
src/cli/document/crawl.php

@ -18,6 +18,12 @@ $config = json_decode(
) )
); );
// Set global options
define(
'CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX',
$config->cli->document->crawl->curl->download->size->max
);
// Init client // Init client
$client = new \Manticoresearch\Client( $client = new \Manticoresearch\Client(
[ [
@ -57,6 +63,13 @@ $index = $client->index(
// Begin queue // Begin queue
foreach($search->get() as $document) foreach($search->get() as $document)
{ {
// Debug target
echo sprintf(
'index "%s" in "%s"' . PHP_EOL,
$document->get('url'),
$config->manticore->index->document->name
);
// Update index time // Update index time
$index->updateDocument( $index->updateDocument(
[ [
@ -70,12 +83,46 @@ foreach($search->get() as $document)
$document->get('url') $document->get('url')
); );
// Drop URL with long response
curl_setopt(
$request,
CURLOPT_CONNECTTIMEOUT,
$config->cli->document->crawl->curl->connection->timeout
);
curl_setopt(
$request,
CURLOPT_TIMEOUT,
$config->cli->document->crawl->curl->connection->timeout
);
// Prevent huge content download e.g. media streams URL
curl_setopt( curl_setopt(
$request, $request,
CURLOPT_RETURNTRANSFER, CURLOPT_RETURNTRANSFER,
true true
); );
curl_setopt(
$request,
CURLOPT_NOPROGRESS,
false
);
curl_setopt(
$request,
CURLOPT_PROGRESSFUNCTION,
function(
$download,
$downloaded,
$upload,
$uploaded
) {
return $downloaded > CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX ? 1 : 0;
}
);
// Begin request
if ($response = curl_exec($request)) if ($response = curl_exec($request))
{ {
// Update HTTP code // Update HTTP code

Loading…
Cancel
Save