Browse Source

use yo-tools-php library

main
yggverse 8 months ago
parent
commit
c492a98094
  1. 3
      composer.json
  2. 63
      src/cli/document/crawl.php

3
composer.json

@ -20,6 +20,7 @@
"jdenticon/jdenticon": "^1.0", "jdenticon/jdenticon": "^1.0",
"yggverse/ftp": "^1.0", "yggverse/ftp": "^1.0",
"gregwar/captcha": "^1.2", "gregwar/captcha": "^1.2",
"yggverse/net": "^1.2" "yggverse/net": "^1.2",
"yggverse/yo-tools": "^0.1.0"
} }
} }

63
src/cli/document/crawl.php

@ -6,37 +6,6 @@ $microtime = microtime(true);
// Load dependencies // Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php'; require_once __DIR__ . '/../../../vendor/autoload.php';
// Define helpers
/**
 * Find the newest snapshot timestamp among a list of file names.
 *
 * A snapshot is any entry whose base name is made of digits followed by
 * ".tar.gz" (for example "1700000000.tar.gz"); the digits are taken as
 * the snapshot time. Entries may be bare names or full paths.
 *
 * Entries that are not strings (such as the `false` left behind when a
 * failed directory listing is cast to array) and archives whose name is
 * not purely numeric (such as "backup.tar.gz") are ignored, so they can
 * never be returned in place of a timestamp.
 *
 * @param array $files File names or paths to inspect
 *
 * @return int Highest timestamp found, or 0 when there is no snapshot
 */
function getLastSnapTime(array $files): int
{
    $latest = 0;

    foreach ($files as $file)
    {
        // Listing helpers may hand over non-string values on failure
        if (!is_string($file))
        {
            continue;
        }

        // "D" anchors "$" at the very end, so a trailing newline never matches
        if (
            preg_match(
                '/^(\d+)\.tar\.gz$/D',
                basename(
                    $file
                ),
                $matches
            ) !== 1
        )
        {
            continue;
        }

        // Compare as integers: the declared return type is int, not string
        $latest = max(
            $latest,
            (int) $matches[1]
        );
    }

    return $latest;
}
// Init config // Init config
$config = json_decode( $config = json_decode(
file_get_contents( file_get_contents(
@ -519,10 +488,6 @@ foreach($index->search('')
// Crawl documents // Crawl documents
$documents = []; $documents = [];
$scheme = parse_url($document->get('url'), PHP_URL_SCHEME);
$host = parse_url($document->get('url'), PHP_URL_HOST);
$port = parse_url($document->get('url'), PHP_URL_PORT);
foreach ($config->cli->document->crawl->selector as $selector => $settings) foreach ($config->cli->document->crawl->selector as $selector => $settings)
{ {
foreach ($crawler->filter($selector)->each(function($node) { foreach ($crawler->filter($selector)->each(function($node) {
@ -534,25 +499,13 @@ foreach($index->search('')
if ($url = $value->attr($settings->attribute)) if ($url = $value->attr($settings->attribute))
{ {
//Make relative links absolute //Make relative links absolute
if (!parse_url($url, PHP_URL_HOST)) $url = \Yggverse\YoTools\Link::relative2absolute(
{ $document->get('url'),
$url = $scheme . '://' . $host . ($port ? ':' . $port : null) . $url,
'/' . $scheme,
trim( $host,
ltrim( $port,
str_replace(
[
'./',
'../'
],
'',
$url
),
'/'
),
'.'
); );
}
// Regex rules // Regex rules
if (!preg_match($settings->regex, $url)) if (!preg_match($settings->regex, $url))
@ -834,7 +787,7 @@ foreach($index->search('')
@mkdir($filepath, 0755, true); @mkdir($filepath, 0755, true);
// Check latest snap older than defined in settings // Check latest snap older than defined in settings
if (time() - getLastSnapTime((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout) if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout)
{ {
$filename = sprintf( $filename = sprintf(
'%s/%s', '%s/%s',
@ -1003,7 +956,7 @@ foreach($index->search('')
); );
// Check latest snap older than defined in settings // Check latest snap older than defined in settings
if (time() - getLastSnapTime((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout) if (time() - \Yggverse\YoTools\Snap::getTimeLast((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout)
{ {
if ($remote->copy($tmp, $filename)) if ($remote->copy($tmp, $filename))
{ {

Loading…
Cancel
Save