Browse Source

save document body text to index

main
yggverse 9 months ago
parent
commit
3884f375d4
  1. 39
      src/cli/document/crawl.php
  2. 4
      src/cli/index/init.php

39
src/cli/document/crawl.php

@@ -335,6 +335,45 @@ foreach($index->search('')
$data['keywords'] = implode(',', $keywords);
}
// Save document body text to index
//
// Takes the inner HTML of every `html > body` match, reduces it to plain
// text and stores the result in $data['body']. If the selector yields
// several nodes, the last non-empty one wins.
foreach ($crawler->filter('html > body')->each(function($node) {
    return $node->html();
}) as $value)
{
    if (!empty($value))
    {
        // Drop <script> and <style> elements together with their content.
        // strip_tags() only removes the tags themselves, so without this step
        // JS/CSS source would end up in the indexed text. The lazy `.*?` is
        // needed because script bodies routinely contain `<` (comparisons,
        // HTML inside string literals), which a `[^<]*` pattern fails to match.
        $markup = preg_replace(
            '/<(script|style)\b[^>]*>.*?<\/\1\s*>/is',
            '',
            $value
        ) ?? $value; // preg_replace() returns null on PCRE failure: keep the raw markup

        // Pad every tag boundary and new line with a space before stripping,
        // so words from adjacent elements do not get glued together.
        $text = strip_tags(
            str_replace(
                [
                    '<',
                    '>',
                    PHP_EOL,
                ],
                [
                    ' <',
                    '> ',
                    PHP_EOL . ' ',
                ],
                $markup
            )
        );

        // Decode entities only AFTER the tags are gone: decoding first would
        // turn escaped visible text such as `&lt;b&gt;` into real tags that
        // strip_tags() then deletes from the document text.
        $text = html_entity_decode($text);

        // Collapse runs of whitespace left over from the padding above.
        $data['body'] = trim(
            preg_replace(
                '/[\s]{2,}/', // strip extra separators
                ' ',
                $text
            ) ?? $text
        );
    }
}
// Crawl documents
$documents = [];

4
src/cli/index/init.php

@@ -64,6 +64,10 @@ $result = $index->create(
[
'type' => 'text'
],
'body' =>
[
'type' => 'text'
],
'mime' =>
[
'type' => 'text'

Loading…
Cancel
Save