|
|
|
@ -6,37 +6,6 @@ $microtime = microtime(true);
@@ -6,37 +6,6 @@ $microtime = microtime(true);
|
|
|
|
|
// Load dependencies |
|
|
|
|
require_once __DIR__ . '/../../../vendor/autoload.php'; |
|
|
|
|
|
|
|
|
|
// Define helpers |
|
|
|
|
/**
 * Find the most recent snapshot time in a list of file names.
 *
 * Only entries whose base name has the exact form `<digits>.tar.gz` are
 * treated as snapshots; the digits are read as the snapshot time. Any other
 * entry (directories, `.`/`..`, archives with non-numeric names) is ignored,
 * so a stray `backup.tar.gz` can no longer leak a non-numeric string into the
 * `int` return value.
 *
 * @param array $files File names or paths, e.g. a directory listing
 *
 * @return int Largest snapshot time found, or 0 when the list has no snapshot
 */
function getLastSnapTime(array $files): int
{
    $last = 0;

    foreach ($files as $file)
    {
        // Callers pass `(array) scandir(...)` / `(array) nlist(...)`, which
        // turns a failed listing into `[false]` - skip anything not a string
        if (!is_string($file))
        {
            continue;
        }

        // `D` keeps `$` from matching before a trailing newline
        if (!preg_match('/^(\d+)\.tar\.gz$/D', basename($file), $matches))
        {
            continue;
        }

        // Compare as integers, not as strings
        $last = max($last, (int) $matches[1]);
    }

    return $last;
}
|
|
|
|
|
|
|
|
|
// Init config |
|
|
|
|
$config = json_decode( |
|
|
|
|
file_get_contents( |
|
|
|
@ -519,10 +488,6 @@ foreach($index->search('')
@@ -519,10 +488,6 @@ foreach($index->search('')
|
|
|
|
|
// Crawl documents |
|
|
|
|
$documents = []; |
|
|
|
|
|
|
|
|
|
$scheme = parse_url($document->get('url'), PHP_URL_SCHEME); |
|
|
|
|
$host = parse_url($document->get('url'), PHP_URL_HOST); |
|
|
|
|
$port = parse_url($document->get('url'), PHP_URL_PORT); |
|
|
|
|
|
|
|
|
|
foreach ($config->cli->document->crawl->selector as $selector => $settings) |
|
|
|
|
{ |
|
|
|
|
foreach ($crawler->filter($selector)->each(function($node) { |
|
|
|
@ -534,25 +499,13 @@ foreach($index->search('')
@@ -534,25 +499,13 @@ foreach($index->search('')
|
|
|
|
|
if ($url = $value->attr($settings->attribute)) |
|
|
|
|
{ |
|
|
|
|
//Make relative links absolute |
|
|
|
|
if (!parse_url($url, PHP_URL_HOST)) |
|
|
|
|
{ |
|
|
|
|
$url = $scheme . '://' . $host . ($port ? ':' . $port : null) . |
|
|
|
|
'/' . |
|
|
|
|
trim( |
|
|
|
|
ltrim( |
|
|
|
|
str_replace( |
|
|
|
|
[ |
|
|
|
|
'./', |
|
|
|
|
'../' |
|
|
|
|
], |
|
|
|
|
'', |
|
|
|
|
$url |
|
|
|
|
), |
|
|
|
|
'/' |
|
|
|
|
), |
|
|
|
|
'.' |
|
|
|
|
); |
|
|
|
|
} |
|
|
|
|
$url = \Yggverse\YoTools\Link::relative2absolute( |
|
|
|
|
$document->get('url'), |
|
|
|
|
$url, |
|
|
|
|
$scheme, |
|
|
|
|
$host, |
|
|
|
|
$port, |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
// Regex rules |
|
|
|
|
if (!preg_match($settings->regex, $url)) |
|
|
|
@ -834,7 +787,7 @@ foreach($index->search('')
@@ -834,7 +787,7 @@ foreach($index->search('')
|
|
|
|
|
@mkdir($filepath, 0755, true); |
|
|
|
|
|
|
|
|
|
// Proceed only if the latest snap is older than the timeout defined in settings |
|
|
|
|
if (time() - getLastSnapTime((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout) |
|
|
|
|
if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout) |
|
|
|
|
{ |
|
|
|
|
$filename = sprintf( |
|
|
|
|
'%s/%s', |
|
|
|
@ -1003,7 +956,7 @@ foreach($index->search('')
@@ -1003,7 +956,7 @@ foreach($index->search('')
|
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
// Proceed only if the latest snap is older than the timeout defined in settings |
|
|
|
|
if (time() - getLastSnapTime((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout) |
|
|
|
|
if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout) |
|
|
|
|
{ |
|
|
|
|
if ($remote->copy($tmp, $filename)) |
|
|
|
|
{ |
|
|
|
|