diff --git a/example/config.json b/example/config.json index dbcfe21..b92292e 100644 --- a/example/config.json +++ b/example/config.json @@ -99,6 +99,20 @@ "regex":"/.*/ui" } }, + "skip": + { + "stripos": + { + "url": + [ + "#", + "javascript:", + "mailto:", + "magnet:", + "xmpp:" + ] + } + }, "snap": { "enabled":true diff --git a/src/cli/document/clean.php b/src/cli/document/clean.php index ff05f6f..8e85d6a 100644 --- a/src/cli/document/clean.php +++ b/src/cli/document/clean.php @@ -79,6 +79,8 @@ foreach ($delete as $crc32url => $ids) // Free mem $delete = []; +// @TODO $config->cli->document->crawl->skip->stripos->url + // Dump operation result echo sprintf( _('duplicated URLs deleted: %s') . PHP_EOL, diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index 5650713..2c1c9f0 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -298,6 +298,22 @@ foreach($search->get() as $document) { foreach (array_unique($documents) as $url) { + // Apply stripos condition + $skip = false; + + foreach ($config->cli->document->crawl->skip->stripos->url as $condition) + { + if (false !== stripos($url, $condition)) { + + $skip = true; + + break; + } + } + + if ($skip) continue; + + // Save index $url = trim($url); $crc32url = crc32($url);