From c492a980948b6135c3f0c31300af90bc5bcb5c52 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 3 Apr 2024 18:18:12 +0300 Subject: [PATCH] use yo-tools-php library --- composer.json | 3 +- src/cli/document/crawl.php | 65 ++++++-------------------------------- 2 files changed, 11 insertions(+), 57 deletions(-) diff --git a/composer.json b/composer.json index 9e09263..5122b2e 100644 --- a/composer.json +++ b/composer.json @@ -20,6 +20,7 @@ "jdenticon/jdenticon": "^1.0", "yggverse/ftp": "^1.0", "gregwar/captcha": "^1.2", - "yggverse/net": "^1.2" + "yggverse/net": "^1.2", + "yggverse/yo-tools": "^0.1.0" } } diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index 972fe18..838a0ec 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -6,37 +6,6 @@ $microtime = microtime(true); // Load dependencies require_once __DIR__ . '/../../../vendor/autoload.php'; -// Define helpers -function getLastSnapTime(array $files): int -{ - $time = []; - - foreach ($files as $file) - { - if (!str_ends_with($file, '.tar.gz')) - { - continue; - } - - $time[] = preg_replace( - '/^([\d]+)\.tar\.gz$/', - '$1', - basename( - $file - ) - ); - } - - if ($time) - { - return max( - $time - ); - } - - return 0; -} - // Init config $config = json_decode( file_get_contents( @@ -519,10 +488,6 @@ foreach($index->search('') // Crawl documents $documents = []; - $scheme = parse_url($document->get('url'), PHP_URL_SCHEME); - $host = parse_url($document->get('url'), PHP_URL_HOST); - $port = parse_url($document->get('url'), PHP_URL_PORT); - foreach ($config->cli->document->crawl->selector as $selector => $settings) { foreach ($crawler->filter($selector)->each(function($node) { @@ -534,25 +499,13 @@ foreach($index->search('') if ($url = $value->attr($settings->attribute)) { //Make relative links absolute - if (!parse_url($url, PHP_URL_HOST)) - { - $url = $scheme . '://' . $host . ($port ? ':' . $port : null) . - '/' . - trim( - ltrim( - str_replace( - [ - './', - '../' - ], - '', - $url - ), - '/' - ), - '.' - ); - } + $url = \Yggverse\YoTools\Link::relative2absolute( + $document->get('url'), + $url, + $scheme, + $host, + $port, + ); // Regex rules if (!preg_match($settings->regex, $url)) @@ -834,7 +787,7 @@ foreach($index->search('') @mkdir($filepath, 0755, true); // Check latest snap older than defined in settings - if (time() - getLastSnapTime((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout) + if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout) { $filename = sprintf( '%s/%s', @@ -1003,7 +956,7 @@ foreach($index->search('') ); // Check latest snap older than defined in settings - if (time() - getLastSnapTime((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout) + if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout) { if ($remote->copy($tmp, $filename)) {