Browse Source

use yo-tools-php library

main
yggverse 8 months ago
parent
commit
c492a98094
  1. 3
      composer.json
  2. 65
      src/cli/document/crawl.php

3
composer.json

@ -20,6 +20,7 @@ @@ -20,6 +20,7 @@
"jdenticon/jdenticon": "^1.0",
"yggverse/ftp": "^1.0",
"gregwar/captcha": "^1.2",
"yggverse/net": "^1.2"
"yggverse/net": "^1.2",
"yggverse/yo-tools": "^0.1.0"
}
}

65
src/cli/document/crawl.php

@ -6,37 +6,6 @@ $microtime = microtime(true); @@ -6,37 +6,6 @@ $microtime = microtime(true);
// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';
// Define helpers
/**
 * Find the most recent snapshot timestamp in a list of file names.
 *
 * Only entries whose base name has the exact form `<digits>.tar.gz` are
 * taken into account. Everything else is ignored: directory entries such as
 * `.` and `..`, archives with a non-numeric name, and non-string values
 * (for example the `false` produced by a failed directory listing that was
 * cast to array).
 *
 * @param array $files File names or paths, e.g. the result of scandir().
 *
 * @return int Largest numeric prefix found, or 0 when no snapshot is present.
 */
function getLastSnapTime(array $files): int
{
    $last = 0;

    foreach ($files as $file)
    {
        // A listing may contain non-string values; they cannot be snapshots
        if (!is_string($file))
        {
            continue;
        }

        // Accept "<digits>.tar.gz" only. A non-matching archive name must be
        // skipped, otherwise a non-numeric string could win the comparison
        // and break the integer return type.
        if (!preg_match('/^(\d+)\.tar\.gz\z/', basename($file), $matches))
        {
            continue;
        }

        // Compare as integers, never as strings
        $last = max($last, (int) $matches[1]);
    }

    return $last;
}
// Init config
$config = json_decode(
file_get_contents(
@ -519,10 +488,6 @@ foreach($index->search('') @@ -519,10 +488,6 @@ foreach($index->search('')
// Crawl documents
// Start from an empty list
$documents = [];
// Split the current document URL into its components. With a component flag
// parse_url() returns only that part, or null when the URL does not contain it.
// NOTE(review): the URL is fetched and parsed three times - presumably
// $document->get('url') is cheap and side-effect free; confirm.
$scheme = parse_url($document->get('url'), PHP_URL_SCHEME);
$host = parse_url($document->get('url'), PHP_URL_HOST);
// Port is only set when the URL specifies one explicitly
$port = parse_url($document->get('url'), PHP_URL_PORT);
foreach ($config->cli->document->crawl->selector as $selector => $settings)
{
foreach ($crawler->filter($selector)->each(function($node) {
@ -534,25 +499,13 @@ foreach($index->search('') @@ -534,25 +499,13 @@ foreach($index->search('')
if ($url = $value->attr($settings->attribute))
{
//Make relative links absolute
if (!parse_url($url, PHP_URL_HOST))
{
$url = $scheme . '://' . $host . ($port ? ':' . $port : null) .
'/' .
trim(
ltrim(
str_replace(
[
'./',
'../'
],
'',
$url
),
'/'
),
'.'
);
}
$url = \Yggverse\YoTools\Link::relative2absolute(
$document->get('url'),
$url,
$scheme,
$host,
$port,
);
// Regex rules
if (!preg_match($settings->regex, $url))
@ -834,7 +787,7 @@ foreach($index->search('') @@ -834,7 +787,7 @@ foreach($index->search('')
@mkdir($filepath, 0755, true);
// Check latest snap older than defined in settings
if (time() - getLastSnapTime((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout)
if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout)
{
$filename = sprintf(
'%s/%s',
@ -1003,7 +956,7 @@ foreach($index->search('') @@ -1003,7 +956,7 @@ foreach($index->search('')
);
// Check latest snap older than defined in settings
if (time() - getLastSnapTime((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout)
if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout)
{
if ($remote->copy($tmp, $filename))
{

Loading…
Cancel
Save