From c6e9ba9d09bd3f34f509d46b3a366fd80175f72e Mon Sep 17 00:00:00 2001
From: ghost
Date: Fri, 24 Nov 2023 19:51:43 +0200
Subject: [PATCH] implement local storage feature with tar.gz compression

---
Review amendments (text in this section is not part of the commit message):
 - the intermediate .tar is removed in a finally block, so a failed
   compress() no longer leaves it on disk
 - a directory that cannot be created raises an exception instead of being
   hidden behind @mkdir
 - the three local-storage checks collapse into one keep/remove decision
 - the blob ids on the crawl.php index line predate these amendments, and
   leading whitespace of context lines was rebuilt from a flattened copy;
   use `git apply --ignore-whitespace` if a hunk is rejected

 src/cli/document/crawl.php | 132 ++++++++++++++++++++++++++++++++++++-
 src/config.json            |   2 +-
 2 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php
index 62b9ed0..c847ccd 100644
--- a/src/cli/document/crawl.php
+++ b/src/cli/document/crawl.php
@@ -252,6 +252,8 @@ foreach($search->get() as $document)
 
       foreach (array_unique($documents) as $url)
       {
+        $url = trim($url);
+
         if (!$index->search('@url "' . $url . '"')
                    ->limit(1)
                    ->get()
@@ -272,9 +274,137 @@ foreach($search->get() as $document)
         }
       }
     }
+
+    // Create snap: archive the response body, MIME type and source URL as
+    // {storage}/{one directory per MD5 hex char of the URL}/{unix time}.tar.gz
+    if ($config->cli->document->crawl->snap->enabled && $code === 200)
+    {
+      // Intermediate uncompressed tarball; removed in finally, also on failure
+      $snapTar = null;
+
+      try
+      {
+        // Storage root: absolute path as is, relative path from the project root
+        $snapDirectory = $config->snap->storage->local->directory;
+
+        if ('/' !== substr($snapDirectory, 0, 1))
+        {
+          $snapDirectory = __DIR__ . '/../../../' . $snapDirectory;
+        }
+
+        // Generate path
+        $filepath = sprintf(
+          '%s/%s',
+          $snapDirectory,
+          implode(
+            '/',
+            str_split(
+              md5(
+                $document->get('url')
+              )
+            )
+          )
+        );
+
+        // Fail loudly here instead of with an opaque PharData error later
+        if (!is_dir($filepath) && !mkdir($filepath, 0755, true) && !is_dir($filepath))
+        {
+          throw new Exception(
+            sprintf(
+              'Could not create snap directory "%s"',
+              $filepath
+            )
+          );
+        }
+
+        $snapTar = sprintf(
+          '%s/%s.tar',
+          $filepath,
+          time()
+        );
+
+        // Compress response to archive
+        $snap = new PharData($snapTar);
+
+        $snap->addFromString(
+          'DATA',
+          $response
+        );
+
+        $snap->addFromString(
+          'MIME',
+          $mime
+        );
+
+        $snap->addFromString(
+          'URL',
+          $document->get('url')
+        );
+
+        $snap->compress(
+          Phar::GZ
+        );
+
+        // compress() wrote the gzipped copy next to the tarball
+        $filename = sprintf(
+          '%s.gz',
+          $snapTar
+        );
+
+        // Copy to mirror storage when enabled
+        if ($config->snap->storage->mirror->enabled)
+        {
+          // @TODO copy
+          // Snap match remote storage size/mime conditions
+        }
+
+        // Keep the local copy only when local storage is enabled, the response
+        // fits the size limit and its MIME type matches the whitelist
+        $snapKeep = $config->snap->storage->local->enabled &&
+                    $size <= $config->snap->storage->local->size->max;
+
+        if ($snapKeep)
+        {
+          $snapKeep = false;
+
+          foreach ($config->snap->storage->local->mime as $whitelist)
+          {
+            if (false !== stripos($mime, $whitelist))
+            {
+              $snapKeep = true;
+              break;
+            }
+          }
+        }
+
+        if (!$snapKeep && is_file($filename))
+        {
+          unlink(
+            $filename
+          );
+        }
+      }
+
+      catch (Exception $exception)
+      {
+        var_dump(
+          $exception
+        );
+      }
+
+      finally
+      {
+        if (null !== $snapTar && is_file($snapTar))
+        {
+          unlink(
+            $snapTar
+          );
+        }
+      }
+    }
   }
 
-  // Apply delay
+  // Crawl queue delay
   sleep(
     $config->cli->document->crawl->queue->limit
   );
diff --git a/src/config.json b/src/config.json
index d047a56..dfdb430 100644
--- a/src/config.json
+++ b/src/config.json
@@ -68,7 +68,7 @@
     {
       "local":{
         "enabled":true,
-        "location":"/storage/snap",
+        "directory":"storage/snap",
         "size":
         {
           "max":100024