Add a URL skip filter based on stripos conditions

This commit is contained in:
ghost 2023-11-30 02:24:02 +02:00
parent ee074b684a
commit 3306dc1961
3 changed files with 32 additions and 0 deletions

View File

@ -99,6 +99,20 @@
"regex":"/.*/ui" "regex":"/.*/ui"
} }
}, },
"skip":
{
"stripos":
{
"url":
[
"#",
"javascript:",
"mailto:",
"magnet:",
"xmpp:"
]
}
},
"snap": "snap":
{ {
"enabled":true "enabled":true

View File

@ -79,6 +79,8 @@ foreach ($delete as $crc32url => $ids)
// Free mem // Free mem
$delete = []; $delete = [];
// @TODO apply the $config->cli->document->crawl->skip->stripos->url conditions here
// Dump operation result // Dump operation result
echo sprintf( echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL, _('duplicated URLs deleted: %s') . PHP_EOL,

View File

@ -298,6 +298,22 @@ foreach($search->get() as $document)
{ {
foreach (array_unique($documents) as $url) foreach (array_unique($documents) as $url)
{ {
// Skip this URL when it contains any of the configured substrings.
// stripos() is case-insensitive and unanchored, so a condition matches
// anywhere inside the URL, not only at its beginning.
// NOTE(review): the check runs on the raw URL, before it is trimmed below;
// a condition such as "#" therefore also skips URLs that merely contain it — confirm this is intended.
$skip = false;

foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
{
  // No match for this condition, try the next one
  if (stripos($url, $condition) === false)
  {
    continue;
  }

  // First match is enough, no need to test the remaining conditions
  $skip = true;

  break;
}

// Matched a skip condition: move on to the next URL
if ($skip)
{
  continue;
}
// Save index
$url = trim($url); $url = trim($url);
$crc32url = crc32($url); $crc32url = crc32($url);