save document body text to index

This commit is contained in:
yggverse 2024-03-20 19:31:56 +02:00
parent 1f27a7e105
commit 3884f375d4
2 changed files with 43 additions and 0 deletions

View File

@ -335,6 +335,45 @@ foreach($index->search('')
$data['keywords'] = implode(',', $keywords);
}
// Save document body text to index
//
// Turns the inner HTML of every node matched by `html > body` into plain,
// single-spaced text and stores it in $data['body']. When several nodes
// match, the last non-empty one wins.
foreach ($crawler->filter('html > body')->each(function ($node) {
    return $node->html();
}) as $value)
{
    if (empty($value))
    {
        continue;
    }

    // Drop <script> and <style> elements together with their content.
    // The lazy `.*?` also matches code that contains `<` (e.g. `a < b`),
    // which a `[^<]*` body pattern cannot match, so such code would
    // otherwise leak into the indexed text.
    $html = preg_replace(
        '/<(script|style)\b[^>]*>.*?<\/\1\s*>/is',
        ' ',
        $value
    );

    if (null === $html) // PCRE failure (e.g. backtrack limit): keep the markup as is
    {
        $html = $value;
    }

    // Pad every tag with spaces before stripping, so words that belong to
    // adjacent elements (`<p>foo</p><p>bar</p>`) do not glue together.
    $text = strip_tags(
        str_replace(
            [
                '<',
                '>',
            ],
            [
                ' <',
                '> ',
            ],
            $html
        )
    );

    // Decode entities only AFTER the tags are gone: decoding first turns
    // escaped text such as `&lt;b&gt;` or `a&lt;b` into markup, which
    // strip_tags() would then delete from the document text.
    $text = html_entity_decode(
        $text,
        ENT_QUOTES | ENT_HTML5,
        'UTF-8'
    );

    // Collapse any whitespace run (spaces, tabs, line breaks, no-break spaces)
    // into a single space. The `u` modifier makes the match UTF-8 aware.
    $clean = preg_replace(
        '/[\s\x{00A0}]+/u',
        ' ',
        $text
    );

    if (null === $clean) // invalid UTF-8 in the document: retry in byte mode
    {
        $clean = preg_replace(
            '/\s+/',
            ' ',
            $text
        );
    }

    $data['body'] = trim(
        (string) $clean
    );
}
// Crawl documents
$documents = [];

View File

@ -64,6 +64,10 @@ $result = $index->create(
[
'type' => 'text'
],
'body' =>
[
'type' => 'text'
],
'mime' =>
[
'type' => 'text'