Browse Source

Add a URL skip filter based on stripos conditions

main
ghost 12 months ago
parent
commit
3306dc1961
  1. 14
      example/config.json
  2. 2
      src/cli/document/clean.php
  3. 16
      src/cli/document/crawl.php

14
example/config.json

@ -99,6 +99,20 @@
"regex":"/.*/ui" "regex":"/.*/ui"
} }
}, },
"skip":
{
"stripos":
{
"url":
[
"#",
"javascript:",
"mailto:",
"magnet:",
"xmpp:"
]
}
},
"snap": "snap":
{ {
"enabled":true "enabled":true

2
src/cli/document/clean.php

@ -79,6 +79,8 @@ foreach ($delete as $crc32url => $ids)
// Free mem // Free mem
$delete = []; $delete = [];
// @TODO $config->cli->document->crawl->skip->stripos->url
// Dump operation result // Dump operation result
echo sprintf( echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL, _('duplicated URLs deleted: %s') . PHP_EOL,

16
src/cli/document/crawl.php

@ -298,6 +298,22 @@ foreach($search->get() as $document)
{ {
foreach (array_unique($documents) as $url) foreach (array_unique($documents) as $url)
{ {
// Apply stripos condition: skip the current URL when it contains any of the
// substrings listed under cli->document->crawl->skip->stripos->url in the config.
// stripos() is case-insensitive and matches at any position in the URL.
$skip = false;
foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
{
// stripos() returns an integer offset on a match (0 at the very start),
// so the result must be compared strictly against false
if (false !== stripos($url, $condition)) {
$skip = true;
// A single matching condition is enough; stop checking the rest
break;
}
}
// Skip the remainder of this iteration of the enclosing loop for a matching URL
if ($skip) continue;
// Save index
$url = trim($url); $url = trim($url);
$crc32url = crc32($url); $crc32url = crc32($url);

Loading…
Cancel
Save