collect keywords from document headers

This commit is contained in:
ghost 2024-01-23 02:49:52 +02:00
parent cfbc84cbaf
commit 1c2e8dafb2

View File

@ -224,8 +224,12 @@ foreach($index->search('')
{ {
if (!empty($value)) if (!empty($value))
{ {
$data['title'] = html_entity_decode( $data['title'] = trim(
$value strip_tags(
html_entity_decode(
$value
)
)
); );
} }
} }
@ -239,14 +243,20 @@ foreach($index->search('')
{ {
if (!empty($value)) if (!empty($value))
{ {
$data['description'] = html_entity_decode( $data['description'] = trim(
$value strip_tags(
html_entity_decode(
$value
)
)
); );
} }
} }
// Get keywords // Get keywords
$keywords = ''; $keywords = [];
// Extract from meta tag
foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) { foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) {
return $node->attr('content'); return $node->attr('content');
@ -255,12 +265,76 @@ foreach($index->search('')
{ {
if (!empty($value)) if (!empty($value))
{ {
$data['keywords'] = html_entity_decode( foreach ((array) explode(
$value ',',
); mb_strtolower(
strip_tags(
html_entity_decode(
$value
)
)
)
) as $keyword)
{
// Remove extra spaces
$keyword = trim(
$keyword
);
// Skip short words
if (mb_strlen($keyword) > 2)
{
$keywords[] = $keyword;
}
}
} }
} }
// Harvest keywords from every heading element (h1-h6) of the document;
// they are appended to whatever $keywords already holds at this point
$headerTexts = $crawler->filter('h1,h2,h3,h4,h5,h6')->each(function($node) {
    return $node->text();
});

foreach ($headerTexts as $value)
{
    // Nothing to extract from an empty heading
    if (empty($value))
    {
        continue;
    }

    // Decode entities, drop any markup, then lowercase before splitting
    $normalized = mb_strtolower(
        strip_tags(
            html_entity_decode(
                $value
            )
        )
    );

    // The heading text is split on commas only, so a heading without
    // commas yields a single phrase rather than individual words
    foreach (explode(',', $normalized) as $keyword)
    {
        // Strip surrounding whitespace
        $keyword = trim($keyword);

        // Ignore fragments of two characters or fewer
        if (mb_strlen($keyword) <= 2)
        {
            continue;
        }

        $keywords[] = $keyword;
    }
}

// Drop duplicate entries collected so far
$keywords = array_unique($keywords);

// Overwrite the stored keywords only when at least one was collected,
// otherwise the previous $data['keywords'] value is left untouched
if (!empty($keywords))
{
    $data['keywords'] = implode(',', $keywords);
}
// Crawl documents // Crawl documents
$documents = []; $documents = [];