From 04dbbc3adf1a4fff53de8023986df2d9d2ec7bbe Mon Sep 17 00:00:00 2001 From: ghost Date: Sun, 2 Apr 2023 18:56:56 +0300 Subject: [PATCH] make url/src column unique keys numeric by using crc32 --- crontab/crawler.php | 3 ++- library/sqlite.php | 18 ++++++++++-------- public/search.php | 2 +- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index 9bab142..07a877a 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -100,6 +100,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) // Add page images $db->addImage($queue->pageId, Filter::url($src), + crc32(Filter::url($src)), /* hash the filtered value that is stored in src, not the raw attribute */ Filter::imageAlt($alt)); } } @@ -134,7 +135,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) // Save valid internal links to the index queue if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { - $db->initPage($href, time()); + $db->initPage($href, crc32($href), time()); } } } \ No newline at end of file diff --git a/library/sqlite.php b/library/sqlite.php index 84dd0bc..6ace4ab 100644 --- a/library/sqlite.php +++ b/library/sqlite.php @@ -14,6 +14,7 @@ class SQLite { $this->_db->query(' CREATE TABLE IF NOT EXISTS "page" ( "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + "crc32url" INTEGER NOT NULL UNIQUE, "httpCode" INTEGER, "timeAdded" INTEGER NOT NULL, "timeUpdated" INTEGER, @@ -21,16 +22,17 @@ class SQLite { "data" TEXT, "description" TEXT, "keywords" TEXT, - "url" TEXT NOT NULL UNIQUE + "url" TEXT NOT NULL ) '); $this->_db->query(' CREATE TABLE IF NOT EXISTS "image" ( "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + "crc32src" INTEGER NOT NULL UNIQUE, "pageId" INTEGER NOT NULL, "alt" TEXT NOT NULL, - "src" TEXT NOT NULL UNIQUE + "src" TEXT NOT NULL ) '); @@ -103,20 +105,20 @@ class SQLite { return $this->_db->lastInsertId(); } - public function initPage(string $url, int $timeAdded) { + public function initPage(string $url, int 
$crc32url, int $timeAdded) { - $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `timeAdded`) VALUES (?, ?)'); + $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)'); - $query->execute([$url, $timeAdded]); + $query->execute([$url, $crc32url, $timeAdded]); return $this->_db->lastInsertId(); } - public function addImage(int $pageId, string $src, string $alt) { + public function addImage(int $pageId, string $src, int $crc32src, string $alt) { - $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `alt`) VALUES (?, ?, ?)'); + $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)'); - $query->execute([$pageId, $src, $alt]); + $query->execute([$pageId, $src, $crc32src, $alt]); return $this->_db->lastInsertId(); } diff --git a/public/search.php b/public/search.php index 8e76f4a..06fde5a 100644 --- a/public/search.php +++ b/public/search.php @@ -22,7 +22,7 @@ $q = !empty($_GET['q']) ? Filter::url($_GET['q']) : ''; // Crawl request if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { - $db->initPage($q, time()); + $db->initPage($q, crc32($q), time()); } // Search request