diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php
index 222fc5f..b20be42 100644
--- a/src/cli/document/crawl.php
+++ b/src/cli/document/crawl.php
@@ -335,6 +335,58 @@ foreach($index->search('')
             $data['keywords'] = implode(',', $keywords);
         }
 
+        // Save document body text to index
+        foreach ($crawler->filter('html > body')->each(function($node) {
+
+            return $node->html();
+
+        }) as $value)
+        {
+            // Skip missing or empty bodies (a body of '0' is still indexable text)
+            if (!is_string($value) || '' === $value)
+            {
+                continue;
+            }
+
+            // Drop <script> / <style> elements together with their content;
+            // the lazy match tolerates "<" inside the code, e.g. "if (a < b)"
+            $text = preg_replace(
+                '/<(script|style)\b[^>]*>.*?<\/\1\s*>/is',
+                ' ',
+                $value
+            );
+
+            // preg_replace returns null on PCRE failure (e.g. backtrack limit)
+            if (null === $text)
+            {
+                $text = $value;
+            }
+
+            // Pad tags with spaces so adjacent elements do not merge into one word
+            $text = strip_tags(
+                str_replace(
+                    [
+                        '<',
+                        '>',
+                    ],
+                    [
+                        ' <',
+                        '> ',
+                    ],
+                    $text
+                )
+            );
+
+            // Decode entities only after the markup is gone, so escaped text like
+            // "&lt;b&gt;" is indexed instead of being stripped as a tag
+            $text = html_entity_decode($text);
+
+            // Collapse whitespace runs (new lines and tabs included) to single spaces
+            $collapsed = preg_replace('/\s+/', ' ', $text);
+
+            $data['body'] = trim(null === $collapsed ? $text : $collapsed);
+        }
+
         // Crawl documents
         $documents = [];
 
diff --git a/src/cli/index/init.php b/src/cli/index/init.php
index e7d60f3..6cf1a8a 100644
--- a/src/cli/index/init.php
+++ b/src/cli/index/init.php
@@ -64,6 +64,10 @@ $result = $index->create(
         [
             'type' => 'text'
         ],
+        'body' =>
+        [
+            'type' => 'text'
+        ],
         'mime' =>
         [
             'type' => 'text'