Browse Source

fix document fields update

main
ghost 1 year ago
parent
commit
38fbc32151
  1. 99
      src/cli/document/crawl.php

99
src/cli/document/crawl.php

@ -67,6 +67,21 @@ $index = $client->index(
// Begin queue // Begin queue
foreach($search->get() as $document) foreach($search->get() as $document)
{ {
// Define data
$time = time();
$data =
[
'url' => $document->get('url'),
'title' => $document->get('title'),
'description' => $document->get('description'),
'keywords' => $document->get('keywords'),
'code' => $document->get('code'),
'size' => $document->get('size'),
'mime' => $document->get('mime'),
'time' => $time
];
// Debug target // Debug target
echo sprintf( echo sprintf(
'index "%s" in "%s"' . PHP_EOL, 'index "%s" in "%s"' . PHP_EOL,
@ -74,7 +89,7 @@ foreach($search->get() as $document)
$config->manticore->index->document->name $config->manticore->index->document->name
); );
// Update index time // Update index time anyway and reset code to 404
$index->updateDocument( $index->updateDocument(
[ [
'time' => time(), 'time' => time(),
@ -130,39 +145,24 @@ foreach($search->get() as $document)
// Begin request // Begin request
if ($response = curl_exec($request)) if ($response = curl_exec($request))
{ {
// Update HTTP code // Update HTTP code or skip on empty
if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE)) if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE))
{ {
$index->updateDocument( $data['code'] = $code;
[
'code' => $code
],
$document->getId()
);
} else continue; } else continue;
// Update size // Update size or skip on empty
if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD)) if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD))
{ {
$index->updateDocument( $data['size'] = $size;
[
'size' => $size
],
$document->getId()
);
} else continue; } else continue;
// Update MIME type // Update MIME type or skip on empty
if ($mime = curl_getinfo($request, CURLINFO_CONTENT_TYPE)) if ($mime = curl_getinfo($request, CURLINFO_CONTENT_TYPE))
{ {
$index->updateDocument( $data['mime'] = $mime;
[
'mime' => $mime
],
$document->getId()
);
} else continue; } else continue;
@ -175,30 +175,34 @@ foreach($search->get() as $document)
); );
// Get title // Get title
$title = '';
foreach ($crawler->filter('head > title')->each(function($node) { foreach ($crawler->filter('head > title')->each(function($node) {
return $node->text(); return $node->text();
}) as $value) { }) as $value)
{
$title = html_entity_decode( if (!empty($value))
{
$data['title'] = html_entity_decode(
$value $value
); );
} }
}
// Get description // Get description
$description = '';
foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) { foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) {
return $node->attr('content'); return $node->attr('content');
}) as $value) { }) as $value)
{
$description = html_entity_decode( if (!empty($value))
{
$data['description'] = html_entity_decode(
$value $value
); );
} }
}
// Get keywords // Get keywords
$keywords = ''; $keywords = '';
@ -206,31 +210,15 @@ foreach($search->get() as $document)
return $node->attr('content'); return $node->attr('content');
}) as $value) { }) as $value)
{
$keywords = html_entity_decode( if (!empty($value))
{
$data['keywords'] = html_entity_decode(
$value $value
); );
} }
}
// Replace document
// https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
$data =
[
'url' => $document->get('url'),
'title' => $title,
'description' => $description,
'keywords' => $keywords,
'code' => $code,
'size' => $size,
'mime' => $mime,
'time' => time()
];
$result = $index->replaceDocument(
$data,
$document->getId()
);
echo sprintf( echo sprintf(
'index "%s" updated: %s %s' . PHP_EOL, 'index "%s" updated: %s %s' . PHP_EOL,
@ -330,6 +318,13 @@ foreach($search->get() as $document)
} }
} }
// Replace document data
// https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
$result = $index->replaceDocument(
$data,
$document->getId()
);
// Create snap // Create snap
if ($config->cli->document->crawl->snap->enabled && $code === 200) if ($config->cli->document->crawl->snap->enabled && $code === 200)
{ {

Loading…
Cancel
Save