<?php

// Debug
$microtime = microtime(true);

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// Define helpers
function getLastSnapTime(array $files): int
{
    $time = [];

    foreach ($files as $file)
    {
        if (in_array($file, ['.', '..']))
        {
            continue;
        }

        $time[] = preg_replace(
            '/\D/',
            '',
            basename(
                $file
            )
        );
    }

    if ($time)
    {
        return max(
            $time
        );
    }

    return 0;
}
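// For example (illustrative file names, not real snapshots):
// getLastSnapTime(['.', '..', '1700000000.tar.gz', '1700000500.tar.gz']) returns 1700000500,
// i.e. the largest timestamp found among the snap file names.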

// Init config
$config = json_decode(
    file_get_contents(
        __DIR__ . '/../../../config.json'
    )
);
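// Reference: keys this script reads from config.json (collected from the code below, not a full schema):
//   manticore.server.{host,port}, manticore.index.document.name
//   cli.document.crawl.queue.{limit,delay}
//   cli.document.crawl.curl.connection.timeout, cli.document.crawl.curl.download.size.max
//   cli.document.crawl.debug.level.{notice,warning,error}
//   cli.document.crawl.snap.{enabled,timeout}
//   cli.document.crawl.selector.*.{attribute,regex,external}, cli.document.crawl.skip.stripos.url[]
//   snap.storage.tmp.directory, snap.storage.local.{enabled,directory,mime.stripos[],url.stripos[],size.max}
//   snap.storage.remote.ftp[].{enabled,mime.stripos[],url.stripos[],size.max,connection.*}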

// Prevent multi-thread execution
$semaphore = sem_get(
    crc32(
        __DIR__ . '.yo.cli.document.crawl'
    ),
    1
);

if (false === sem_acquire($semaphore, true))
{
    if ($config->cli->document->crawl->debug->level->warning)
    {
        echo sprintf(
            _('[%s] [warning] process execution locked by another thread!') . PHP_EOL,
            date('c')
        );
    }

    exit;
}
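// Note: the second argument to sem_acquire() enables non-blocking mode,
// so a concurrent run exits immediately instead of waiting for the lock.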

// Set global options
define(
    'CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX',
    $config->cli->document->crawl->curl->download->size->max
);
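// The download limit is defined as a constant because, unlike a local variable,
// it stays visible inside the CURLOPT_PROGRESSFUNCTION closure below without use() or globals.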

// Init client
try {

    $client = new \Manticoresearch\Client(
        [
            'host' => $config->manticore->server->host,
            'port' => $config->manticore->server->port,
        ]
    );

    $index = $client->index(
        $config->manticore->index->document->name
    );
}

catch (Exception $exception)
{
    if ($config->cli->document->crawl->debug->level->error)
    {
        echo sprintf(
            _('[%s] [error] %s') . PHP_EOL,
            date('c'),
            print_r(
                $exception,
                true
            )
        );
    }

    exit;
}

// Debug totals
if ($config->cli->document->crawl->debug->level->notice)
{
    echo sprintf(
        _('[%s] [notice] crawl queue begin...') . PHP_EOL,
        date('c')
    );
}

// Begin crawl queue
// thanks to @manticoresearch for help with random feature implementation:
// https://github.com/manticoresoftware/manticoresearch-php/discussions/176
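//
// Queue ordering, restating the sorts below: documents are picked by the index flag
// first, then oldest crawl time, then rank, with rand() as a tiebreaker; the batch
// size comes from cli.document.crawl.queue.limit.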

foreach ($index->search('')
        ->expression('random', 'rand()')
        ->sort('index', 'desc')
        ->sort('time', 'asc')
        ->sort('rank', 'asc')
        ->sort('random', 'asc')
        ->limit($config->cli->document->crawl->queue->limit)
        ->get() as $document)
{
    // Define data
    $time = time();

    $data =
    [
        'url' => $document->get('url'),
        'title' => $document->get('title'),
        'description' => $document->get('description'),
        'keywords' => $document->get('keywords'),
        'code' => $document->get('code'),
        'size' => $document->get('size'),
        'mime' => $document->get('mime'),
        'rank' => $document->get('rank'),
        'time' => $time,
        'index' => 0
    ];

    // Debug target
    if ($config->cli->document->crawl->debug->level->notice)
    {
        echo sprintf(
            _('[%s] [notice] index "%s" in "%s"') . PHP_EOL,
            date('c'),
            $document->get('url'),
            $config->manticore->index->document->name
        );
    }

    // Update the index time anyway and reset the code and index flags before the crawl attempt
    $index->updateDocument(
        [
            'time' => time(),
            'code' => 200,
            'index' => 0
        ],
        $document->getId()
    );

    // Request remote URL
    $request = curl_init(
        $document->get('url')
    );

    // Drop URLs with slow responses
    curl_setopt(
        $request,
        CURLOPT_CONNECTTIMEOUT,
        $config->cli->document->crawl->curl->connection->timeout
    );

    curl_setopt(
        $request,
        CURLOPT_TIMEOUT,
        $config->cli->document->crawl->curl->connection->timeout
    );

    // Prevent huge content downloads, e.g. media stream URLs
    curl_setopt(
        $request,
        CURLOPT_RETURNTRANSFER,
        true
    );

    curl_setopt(
        $request,
        CURLOPT_NOPROGRESS,
        false
    );

    curl_setopt(
        $request,
        CURLOPT_PROGRESSFUNCTION,
        function(
            $handle, // the cURL handle is passed as the first callback argument
            $download,
            $downloaded,
            $upload,
            $uploaded
        ) {
            global $index;
            global $document;

            if (!$mime = curl_getinfo($handle, CURLINFO_CONTENT_TYPE))
            {
                $mime = $document->get('mime');
            }

            $index->updateDocument(
                [
                    'mime' => $mime,
                    'time' => time(),
                    'code' => 200,
                    'index' => 0
                ],
                $document->getId()
            );

            return $downloaded > CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX ? 1 : 0;
        }
    );
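    // Note: returning a non-zero value from a CURLOPT_PROGRESSFUNCTION callback aborts
    // the transfer, so curl_exec() below fails once the downloaded byte count exceeds
    // CONFIG_CLI_DOCUMENT_CRAWL_CURL_DOWNLOAD_SIZE_MAX.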

    // Begin request
    if ($response = curl_exec($request))
    {
        // Update HTTP code or skip on empty
        if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE))
        {
            // Delete deprecated document from index while the HTTP code is still not 200
            /*
            if ($code != 200 && !empty($data['code']) && $data['code'] != 200)
            {
                $index->deleteDocument(
                    $document->getId()
                );

                continue;
            }
            */

            $data['code'] = $code;

        } else continue;

        // Update size or skip on empty
        if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD))
        {
            $data['size'] = $size;

        } else continue;

        // Update MIME type or skip on empty
        if ($type = curl_getinfo($request, CURLINFO_CONTENT_TYPE))
        {
            $data['mime'] = $type;

            // Convert the response to UTF-8 when the document charset is specified
            if (preg_match('/charset=([^\s;]+)/i', $type, $charset))
            {
                if (!empty($charset[1]))
                {
                    // Get system encodings
                    foreach (mb_list_encodings() as $encoding)
                    {
                        if (strtolower($charset[1]) == strtolower($encoding))
                        {
                            // Convert response to UTF-8
                            $response = mb_convert_encoding(
                                $response,
                                'UTF-8',
                                $charset[1]
                            );

                            break;
                        }
                    }
                }
            }

        } else continue;

        // DOM crawler
        if (
            false !== stripos($type, 'text/html')
            ||
            false !== stripos($type, 'text/xhtml')
            ||
            false !== stripos($type, 'application/xhtml')
        ) {
            $crawler = new Symfony\Component\DomCrawler\Crawler();
            $crawler->addHtmlContent(
                $response
            );

            // Get title
            foreach ($crawler->filter('head > title')->each(function($node) {

                return $node->text();

            }) as $value)
            {
                if (!empty($value))
                {
                    $data['title'] = trim(
                        strip_tags(
                            html_entity_decode(
                                $value
                            )
                        )
                    );
                }
            }

            // Get description
            foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) {

                return $node->attr('content');

            }) as $value)
            {
                if (!empty($value))
                {
                    $data['description'] = trim(
                        strip_tags(
                            html_entity_decode(
                                $value
                            )
                        )
                    );
                }
            }

            // Get keywords
            $keywords = [];

            // Extract from meta tag
            foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) {

                return $node->attr('content');

            }) as $value)
            {
                if (!empty($value))
                {
                    foreach ((array) explode(
                        ',',
                        mb_strtolower(
                            strip_tags(
                                html_entity_decode(
                                    $value
                                )
                            )
                        )
                    ) as $keyword)
                    {
                        // Remove extra spaces
                        $keyword = trim(
                            $keyword
                        );

                        // Skip short words
                        if (mb_strlen($keyword) > 2)
                        {
                            $keywords[] = $keyword;
                        }
                    }
                }
            }

            // Get keywords from headings (h1-h6)
            /* Disabled: keywords collection from headings, as the body index is enabled

            foreach ($crawler->filter('h1,h2,h3,h4,h5,h6')->each(function($node) {

                return $node->text();

            }) as $value)
            {
                if (!empty($value))
                {
                    foreach ((array) explode(
                        ',',
                        mb_strtolower(
                            strip_tags(
                                html_entity_decode(
                                    $value
                                )
                            )
                        )
                    ) as $keyword)
                    {
                        // Remove extra spaces
                        $keyword = trim(
                            $keyword
                        );

                        // Skip short words
                        if (mb_strlen($keyword) > 2)
                        {
                            $keywords[] = $keyword;
                        }
                    }
                }
            }
            */

            // Keep keywords unique
            $keywords = array_unique(
                $keywords
            );

            // Update previous keywords when a new value exists
            if ($keywords)
            {
                $data['keywords'] = implode(',', $keywords);
            }

            // Save document body text to index
            foreach ($crawler->filter('html > body')->each(function($node) {

                return $node->html();

            }) as $value)
            {
                if (!empty($value))
                {
                    $data['body'] = trim(
                        preg_replace(
                            '/[\s]{2,}/', // strip extra separators
                            ' ',
                            strip_tags(
                                str_replace( // add text separators before stripping closing tags, new lines, etc.
                                    [
                                        '<',
                                        '>',
                                        PHP_EOL,
                                    ],
                                    [
                                        ' <',
                                        '> ',
                                        PHP_EOL . ' ',
                                    ],
                                    preg_replace(
                                        [
                                            '/<script([^>]*)>([\s\S]*?)<\/script>/i', // strip js content
                                            '/<style([^>]*)>([\s\S]*?)<\/style>/i', // strip css content
                                            '/<pre([^>]*)>([\s\S]*?)<\/pre>/i', // strip code content
                                            '/<code([^>]*)>([\s\S]*?)<\/code>/i',
                                        ],
                                        '',
                                        html_entity_decode(
                                            $value
                                        )
                                    )
                                )
                            )
                        )
                    );
                }
            }
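            // Reading the pipeline above from the inside out: decode HTML entities,
            // drop script/style/pre/code blocks, pad tag boundaries and newlines with
            // spaces so words do not get glued together, strip the remaining tags,
            // then collapse repeated whitespace into single spaces.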

            // Crawl documents
            $documents = [];

            $scheme = parse_url($document->get('url'), PHP_URL_SCHEME);
            $host = parse_url($document->get('url'), PHP_URL_HOST);
            $port = parse_url($document->get('url'), PHP_URL_PORT);

            foreach ($config->cli->document->crawl->selector as $selector => $settings)
            {
                foreach ($crawler->filter($selector)->each(function($node) {

                    return $node;

                }) as $value) {

                    if ($url = $value->attr($settings->attribute))
                    {
                        // Make relative links absolute
                        if (!parse_url($url, PHP_URL_HOST))
                        {
                            $url = $scheme . '://' . $host . ($port ? ':' . $port : null) .
                                   '/' .
                                   trim(
                                       ltrim(
                                           str_replace(
                                               [
                                                   './',
                                                   '../'
                                               ],
                                               '',
                                               $url
                                           ),
                                           '/'
                                       ),
                                       '.'
                                   );
                        }
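                        // For example (illustrative values): with $scheme = 'https', $host = 'example.org'
                        // and no port, a relative href of './docs/page.html' becomes
                        // 'https://example.org/docs/page.html'; note the path is always joined to the
                        // site root rather than to the current document's directory.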

                        // Regex rules
                        if (!preg_match($settings->regex, $url))
                        {
                            continue;
                        }

                        // External host rules
                        if (!$settings->external && parse_url($url, PHP_URL_HOST) != $host)
                        {
                            continue;
                        }

                        $documents[] = $url;
                    }
                }
            }

            if ($documents)
            {
                foreach (array_unique($documents) as $url)
                {
                    // Apply stripos condition
                    $skip = false;

                    foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
                    {
                        if (false !== stripos($url, $condition)) {

                            $skip = true;

                            break;
                        }
                    }

                    if ($skip)
                    {
                        if ($config->cli->document->crawl->debug->level->notice)
                        {
                            echo sprintf(
                                _('[%s] [notice] skip "%s" by stripos condition "%s"') . PHP_EOL,
                                date('c'),
                                $url,
                                print_r(
                                    $config->cli->document->crawl->skip->stripos->url,
                                    true
                                )
                            );
                        }

                        continue;
                    }

                    // Save index
                    $url = trim($url);
                    $crc32url = crc32($url);
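                    // Note: crc32($url) doubles as the Manticore document id, which is why the
                    // existence check below filters on id; distinct URLs that happen to share a
                    // crc32 value would therefore map to the same document.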

                    if (!$index->search('')
                        ->filter('id', $crc32url)
                        ->limit(1)
                        ->get()
                        ->getTotal())
                    {

                        $index->addDocument(
                            [
                                'url' => $url,
                                'rank' => (int) mb_strlen(
                                    (string)
                                    urldecode(
                                        (string)
                                        parse_url(
                                            $url,
                                            PHP_URL_PATH
                                        )
                                    )
                                )
                            ],
                            $crc32url
                        );

                        if ($config->cli->document->crawl->debug->level->notice)
                        {
                            echo sprintf(
                                _('[%s] [notice] add "%s" to "%s"') . PHP_EOL,
                                date('c'),
                                $url,
                                $config->manticore->index->document->name
                            );
                        }
                    }
                }
            }
        }

        // Replace document data
        // https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
        $result = $index->replaceDocument(
            $data,
            $document->getId()
        );

        // Debug result
        if ($config->cli->document->crawl->debug->level->notice)
        {
            echo sprintf(
                '[%s] [notice] index "%s" updated: %s %s' . PHP_EOL,
                date('c'),
                $config->manticore->index->document->name,
                print_r(
                    $result,
                    true
                ),
                print_r(
                    $data,
                    true
                ),
            );
        }

        // Create snap
        if ($config->cli->document->crawl->snap->enabled && $code === 200)
        {
            try
            {
                // Generate path
                $time = time();

                $md5url = md5(
                    $document->get('url')
                );

                /// absolute
                if (str_starts_with($config->snap->storage->tmp->directory, '/'))
                {
                    $filepath = $config->snap->storage->tmp->directory;
                }

                /// relative
                else
                {
                    $filepath = __DIR__ . '/../../../' . $config->snap->storage->tmp->directory;
                }

                @mkdir($filepath, 0755, true);

                $tmp = sprintf(
                    '%s/%s.%s.tar',
                    $filepath,
                    $md5url,
                    $time
                );

                // Compress response to archive
                $snap = new PharData($tmp);

                $snap->addFromString(
                    'DATA',
                    $response
                );

                $snap->addFromString(
                    'MIME',
                    $type
                );

                $snap->addFromString(
                    'URL',
                    $document->get('url')
                );

                $snap->compress(
                    Phar::GZ
                );

                unlink( // remove tarball
                    $tmp
                );

                $tmp = sprintf(
                    '%s.gz',
                    $tmp
                );
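                // At this point the snap is a gzip-compressed tarball named
                // <md5(url)>.<unix time>.tar.gz with three entries: DATA (the raw response
                // body), MIME (the Content-Type string) and URL (the source URL).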

                // Copy to local storage on enabled
                if ($config->snap->storage->local->enabled)
                {
                    // Check for mime allowed
                    $allowed = false;

                    foreach ($config->snap->storage->local->mime->stripos as $whitelist)
                    {
                        if (false !== stripos($type, $whitelist))
                        {
                            $allowed = true;
                            break;
                        }
                    }

                    // Check for url allowed
                    if ($allowed)
                    {
                        $allowed = false;

                        foreach ($config->snap->storage->local->url->stripos as $whitelist)
                        {
                            if (false !== stripos($document->get('url'), $whitelist))
                            {
                                $allowed = true;
                                break;
                            }
                        }

                        // Check size limits
                        if ($allowed)
                        {
                            $allowed = false;

                            if ($size <= $config->snap->storage->local->size->max)
                            {
                                $allowed = true;
                            }
                        }
                    }

                    // Copy snap to the permanent storage
                    if ($allowed)
                    {
                        /// absolute
                        if (str_starts_with($config->snap->storage->local->directory, '/'))
                        {
                            $filepath = $config->snap->storage->local->directory;
                        }

                        /// relative
                        else
                        {
                            $filepath = __DIR__ . '/../../../' . $config->snap->storage->local->directory;
                        }

                        $filepath = sprintf(
                            '%s/%s',
                            $filepath,
                            implode(
                                '/',
                                str_split(
                                    $md5url
                                )
                            )
                        );
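                        // Each hex character of the md5 hash becomes one directory level, so a hash
                        // starting with "a94f..." (illustrative prefix) is stored under .../a/9/4/f/...,
                        // spreading snaps across many small directories instead of one flat folder.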

                        @mkdir($filepath, 0755, true);

                        // Snap again only when the latest local snap is older than the configured timeout
                        if (time() - getLastSnapTime((array) scandir($filepath)) > $config->cli->document->crawl->snap->timeout)
                        {
                            $filename = sprintf(
                                '%s/%s',
                                $filepath,
                                sprintf(
                                    '%s.tar.gz',
                                    $time
                                )
                            );

                            if (copy($tmp, $filename))
                            {
                                if ($config->cli->document->crawl->debug->level->notice)
                                {
                                    echo sprintf(
                                        _('[%s] [notice] save snap to "%s" on local storage') . PHP_EOL,
                                        date('c'),
                                        $filename
                                    );
                                }
                            }

                            else
                            {
                                if ($config->cli->document->crawl->debug->level->error)
                                {
                                    echo sprintf(
                                        _('[%s] [error] could not copy "%s" to "%s" on local storage') . PHP_EOL,
                                        date('c'),
                                        $tmp,
                                        $filename
                                    );
                                }
                            }
                        }

                        else
                        {
                            if ($config->cli->document->crawl->debug->level->notice)
                            {
                                echo sprintf(
                                    _('[%s] [notice] local snap is up to date by timeout settings') . PHP_EOL,
                                    date('c')
                                );
                            }
                        }
                    }

                    else
                    {
                        if ($config->cli->document->crawl->debug->level->notice)
                        {
                            echo sprintf(
                                _('[%s] [notice] local snap skipped by settings condition') . PHP_EOL,
                                date('c')
                            );
                        }
                    }
                }

                // Copy to FTP storage on enabled
                foreach ($config->snap->storage->remote->ftp as $ftp)
                {
                    // Resource enabled
                    if (!$ftp->enabled)
                    {
                        continue;
                    }

                    // Check for mime allowed
                    $allowed = false;

                    foreach ($ftp->mime->stripos as $whitelist)
                    {
                        if (false !== stripos($type, $whitelist))
                        {
                            $allowed = true;
                            break;
                        }
                    }

                    if (!$allowed)
                    {
                        continue;
                    }

                    // Check for url allowed
                    $allowed = false;

                    foreach ($ftp->url->stripos as $whitelist)
                    {
                        if (false !== stripos($document->get('url'), $whitelist))
                        {
                            $allowed = true;
                            break;
                        }
                    }

                    if (!$allowed)
                    {
                        continue;
                    }

                    // Check size limits
                    $allowed = false;

                    if ($size <= $ftp->size->max)
                    {
                        $allowed = true;
                    }

                    if (!$allowed)
                    {
                        if ($config->cli->document->crawl->debug->level->notice)
                        {
                            echo sprintf(
                                _('[%s] [notice] remote snap skipped on "%s" by settings condition') . PHP_EOL,
                                date('c'),
                                $ftp->connection->host
                            );
                        }

                        continue;
                    }

                    // Prepare location
                    $filepath = implode(
                        '/',
                        str_split(
                            $md5url
                        )
                    );

                    $filename = sprintf(
                        '%s/%s',
                        $filepath,
                        sprintf(
                            '%s.tar.gz',
                            $time
                        )
                    );

                    // Init connection
                    $attempt = 1;

                    do {

                        $remote = new \Yggverse\Ftp\Client();

                        $connection = $remote->connect(
                            $ftp->connection->host,
                            $ftp->connection->port,
                            $ftp->connection->username,
                            $ftp->connection->password,
                            $ftp->connection->directory,
                            $ftp->connection->timeout,
                            $ftp->connection->passive
                        );

                        // Remote host connected
                        if ($connection) {

                            $remote->mkdir(
                                $filepath,
                                true
                            );

                            // Snap again only when the latest remote snap is older than the configured timeout
                            if (time() - getLastSnapTime((array) $remote->nlist($filepath)) > $config->cli->document->crawl->snap->timeout)
                            {
                                if ($remote->copy($tmp, $filename))
                                {
                                    if ($config->cli->document->crawl->debug->level->notice)
                                    {
                                        echo sprintf(
                                            _('[%s] [notice] save snap to "%s" on remote host "%s"') . PHP_EOL,
                                            date('c'),
                                            $filename,
                                            $ftp->connection->host
                                        );
                                    }
                                }

                                else
                                {
                                    if ($config->cli->document->crawl->debug->level->error)
                                    {
                                        echo sprintf(
                                            _('[%s] [error] could not copy snap "%s" to "%s" on destination "%s"') . PHP_EOL,
                                            date('c'),
                                            $tmp,
                                            $filename,
                                            $ftp->connection->host
                                        );
                                    }
                                }
                            }

                            else
                            {
                                if ($config->cli->document->crawl->debug->level->notice)
                                {
                                    echo sprintf(
                                        _('[%s] [notice] remote snap on destination "%s" is up to date by timeout settings') . PHP_EOL,
                                        date('c'),
                                        $ftp->connection->host
                                    );
                                }
                            }

                            $remote->close();

                        // On remote connection failure, repeat the attempt
                        } else {

                            // Stop connection attempts when the configured limit is reached
                            if ($ftp->connection->attempts->limit > 0 && $attempt > $ftp->connection->attempts->limit)
                            {
                                break;
                            }

                            // Log event
                            if ($config->cli->document->crawl->debug->level->warning)
                            {
                                echo sprintf(
                                    _('[%s] [warning] attempt: %s, wait for remote storage "%s" reconnection...') . PHP_EOL,
                                    date('c'),
                                    $attempt++,
                                    $ftp->connection->host,
                                );
                            }

                            // Delay next attempt
                            if ($ftp->connection->attempts->delay)
                            {
                                if ($config->cli->document->crawl->debug->level->warning)
                                {
                                    echo sprintf(
                                        _('[%s] [warning] pending %s seconds to reconnect...') . PHP_EOL,
                                        date('c'),
                                        $ftp->connection->attempts->delay
                                    );
                                }

                                sleep(
                                    $ftp->connection->attempts->delay
                                );
                            }
                        }

                    } while ($connection === false);
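                    // Retry note: with attempts.limit set to 0 or less, the loop above keeps
                    // reconnecting until it succeeds, since the break condition only triggers
                    // for a positive limit; attempts.delay adds a pause between attempts.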
                }

                // Remove tmp data
                if (unlink($tmp))
                {
                    if ($config->cli->document->crawl->debug->level->notice)
                    {
                        echo sprintf(
                            _('[%s] [notice] remove tmp snap file "%s"') . PHP_EOL,
                            date('c'),
                            $tmp
                        );
                    }
                }

                else
                {
                    if ($config->cli->document->crawl->debug->level->error)
                    {
                        echo sprintf(
                            _('[%s] [error] could not remove tmp snap file "%s"') . PHP_EOL,
                            date('c'),
                            $tmp
                        );
                    }
                }
            }

            catch (Exception $exception)
            {
                if ($config->cli->document->crawl->debug->level->error)
                {
                    echo sprintf(
                        _('[%s] [error] %s') . PHP_EOL,
                        date('c'),
                        print_r(
                            $exception,
                            true
                        )
                    );
                }
            }
        }
    }

    // Crawl queue delay
    if ($config->cli->document->crawl->queue->delay)
    {
        if ($config->cli->document->crawl->debug->level->notice)
        {
            echo sprintf(
                _('[%s] [notice] pending %s seconds...') . PHP_EOL,
                date('c'),
                $config->cli->document->crawl->queue->delay
            );
        }

        sleep(
            $config->cli->document->crawl->queue->delay
        );
    }

    // Debug totals
    if ($config->cli->document->crawl->debug->level->notice)
    {
        echo sprintf(
            _('[%s] [notice] crawl queue completed in %s') . PHP_EOL,
            date('c'),
            microtime(true) - $microtime
        );
    }
}