Browse Source

Add a URL skip filter based on stripos conditions

main
ghost 12 months ago
parent
commit
3306dc1961
  1. 14
      example/config.json
  2. 2
      src/cli/document/clean.php
  3. 16
      src/cli/document/crawl.php

14
example/config.json

@@ -99,6 +99,20 @@
"regex":"/.*/ui"
}
},
"skip":
{
"stripos":
{
"url":
[
"#",
"javascript:",
"mailto:",
"magnet:",
"xmpp:"
]
}
},
"snap":
{
"enabled":true

2
src/cli/document/clean.php

@@ -79,6 +79,8 @@ foreach ($delete as $crc32url => $ids)
// Free mem
$delete = [];
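// @TODO apply $config->cli->document->crawl->skip->stripos->url here as well (presumably to purge already-indexed URLs that match the skip conditions; confirm)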
// Dump operation result
echo sprintf(
_('duplicated URLs deleted: %s') . PHP_EOL,

16
src/cli/document/crawl.php

@@ -298,6 +298,22 @@ foreach($search->get() as $document)
{
foreach (array_unique($documents) as $url)
{
// Apply stripos condition: skip this URL when it contains any of the
// substrings listed in $config->cli->document->crawl->skip->stripos->url.
// The match is case-insensitive and may occur anywhere in the URL, not only
// at its start. The URL is tested as-is, before it is trimmed below.
$skip = false;
foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
{
// Strict comparison with false: a match at position 0 returns int 0,
// which must still count as a hit
if (false !== stripos($url, $condition)) {
$skip = true;
// One matching condition is enough, stop checking the rest
break;
}
}
// Move on to the next URL of the enclosing foreach without indexing this one
if ($skip) continue;
// Save index
$url = trim($url);
$crc32url = crc32($url);

Loading…
Cancel
Save