add skip url filter by stripos condition

This commit is contained in:
ghost 2023-11-30 02:24:02 +02:00
parent ee074b684a
commit 3306dc1961
3 changed files with 32 additions and 0 deletions

View File

@ -99,6 +99,20 @@
"regex":"/.*/ui"
}
},
"skip":
{
"stripos":
{
"url":
[
"#",
"javascript:",
"mailto:",
"magnet:",
"xmpp:"
]
}
},
"snap":
{
"enabled":true

View File

@ -79,6 +79,8 @@ foreach ($delete as $crc32url => $ids)
// Free mem
$delete = [];
// @TODO handle $config->cli->document->crawl->skip->stripos->url (skip filter is not applied at this point yet)
// Dump operation result
echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL,

View File

@ -298,6 +298,22 @@ foreach($search->get() as $document)
{
foreach (array_unique($documents) as $url)
{
// Apply stripos condition:
// skip this URL when it contains (case-insensitively) any substring listed in
// $config->cli->document->crawl->skip->stripos->url, so it never reaches the
// "Save index" step below.
//
// The property chain is null-coalesced to an empty list, so a config file that
// lacks the "skip" section means "no filtering" instead of raising
// undefined-property / invalid-foreach-argument warnings.
$skip = false;
foreach ($config->cli->document->crawl->skip->stripos->url ?? [] as $condition)
{
    // stripos() returns the match offset, which may be 0, so compare strictly against false
    if (false !== stripos($url, $condition)) {
        $skip = true;
        break;
    }
}
if ($skip) continue;
// Save index
$url = trim($url);
$crc32url = crc32($url);