From 02dd3649a7d9eb40f4ab37c540ddf01f9c2dbb28 Mon Sep 17 00:00:00 2001
From: ghost
Date: Mon, 27 Nov 2023 16:54:26 +0200
Subject: [PATCH] add CURL options that prevent crawl queue stuck

---
Review notes (this section is ignored by `git am`):

 * The CURLOPT_PROGRESSFUNCTION callback takes the cURL handle as its
   first parameter. With the previous four-parameter closure the value
   compared against the size limit was the expected total download size,
   which is 0 when the server sends no Content-Length, so endless
   streams were never aborted.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Two-space indentation is assumed; verify context
   lines against the tree or apply with `git apply --ignore-whitespace`.
 * The post-image blob id on the crawl.php index line predates the
   callback fix.

 example/config.json        | 14 ++++++++++++
 src/cli/document/crawl.php | 50 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/example/config.json b/example/config.json
index 87b8dfb..31981bf 100644
--- a/example/config.json
+++ b/example/config.json
@@ -51,6 +51,20 @@
     {
       "crawl":
       {
+        "curl":
+        {
+          "connection":
+          {
+            "timeout":3
+          },
+          "download":
+          {
+            "size":
+            {
+              "max":10000024
+            }
+          }
+        },
         "queue":
         {
           "limit":1,
diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php
index 7948c09..a19e00d 100644
--- a/src/cli/document/crawl.php
+++ b/src/cli/document/crawl.php
@@ -18,6 +18,12 @@ $config = json_decode(
   )
 );
 
+// Set global options
+define(
+  'CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX',
+  $config->cli->document->crawl->curl->download->size->max
+);
+
 // Init client
 $client = new \Manticoresearch\Client(
   [
@@ -57,6 +63,13 @@ $index = $client->index(
 // Begin queue
 foreach($search->get() as $document)
 {
+  // Debug target
+  echo sprintf(
+    'index "%s" in "%s"' . PHP_EOL,
+    $document->get('url'),
+    $config->manticore->index->document->name
+  );
+
   // Update index time
   $index->updateDocument(
     [
@@ -70,12 +83,49 @@ foreach($search->get() as $document)
     $document->get('url')
   );
 
+  // Drop URL with long response
+  curl_setopt(
+    $request,
+    CURLOPT_CONNECTTIMEOUT,
+    $config->cli->document->crawl->curl->connection->timeout
+  );
+
+  curl_setopt(
+    $request,
+    CURLOPT_TIMEOUT,
+    $config->cli->document->crawl->curl->connection->timeout
+  );
+
+  // Prevent huge content download e.g. media streams URL
   curl_setopt(
     $request,
     CURLOPT_RETURNTRANSFER,
     true
   );
 
+  curl_setopt(
+    $request,
+    CURLOPT_NOPROGRESS,
+    false
+  );
+
+  curl_setopt(
+    $request,
+    CURLOPT_PROGRESSFUNCTION,
+    function(
+      $resource,
+      $download,
+      $downloaded,
+      $upload,
+      $uploaded
+    ) {
+      // cURL passes the handle first, then expected/received download
+      // bytes and expected/sent upload bytes; non-zero aborts the transfer
+      return $downloaded > CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX ? 1 : 0;
+    }
+  );
+
+  // Begin request
   if ($response = curl_exec($request))
   {
     // Update HTTP code