From 27564c4fbc2d128fd9146d5f078185623251c75f Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 27 Mar 2024 04:27:49 +0200 Subject: [PATCH] add collision events debug --- src/cli/document/crawl.php | 57 +++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index a5999c0..74a0232 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -603,19 +603,22 @@ foreach($index->search('') } // Save index - $url = trim($url); - $crc32url = crc32($url); - - if (!$index->search('') - ->filter('id', $crc32url) - ->limit(1) - ->get() - ->getTotal()) - { + $url = trim( + $url + ); + + $crc32url = crc32( + $url + ); + // Check url is not registered yet + $results = $index->search('')->filter('id', $crc32url)->get(); + + if (!$results->getTotal()) + { $index->addDocument( [ - 'url' => $url + 'url' => $url ], $crc32url ); @@ -630,6 +633,40 @@ foreach($index->search('') ); } } + + // URL already exists + else + { + // Print notice level message + if ($config->cli->document->crawl->debug->level->notice) + { + echo sprintf( + _('[%s] [notice] URL "%s" already registered with CRC32 "%d"') . PHP_EOL, + date('c'), + $url, + $crc32url + ); + } + + // Check for event details + foreach ($results as $result) + { + // Is collision + if ($url != $result->get('url')) + { + if ($config->cli->document->crawl->debug->level->warning) + { + echo sprintf( + _('[%s] [warning] ID "%d" collision for target url "%s" stored as "%s"') . PHP_EOL, + date('c'), + $crc32url, + $url, + $result->get('url') + ); + } + } + } + } } } }