Browse Source

Make the unique keys for the url/src columns numeric (integer) by using crc32

sqliteway
ghost 2 years ago
parent
commit
04dbbc3adf
  1. 3
      crontab/crawler.php
  2. 18
      library/sqlite.php
  3. 2
      public/search.php

3
crontab/crawler.php

@ -100,6 +100,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
// Add page images // Add page images
$db->addImage($queue->pageId, $db->addImage($queue->pageId,
Filter::url($src), Filter::url($src),
crc32($src),
Filter::imageAlt($alt)); Filter::imageAlt($alt));
} }
} }
@ -134,7 +135,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
// Save valid internal links to the index queue // Save valid internal links to the index queue
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
$db->initPage($href, time()); $db->initPage($href, crc32($href), time());
} }
} }
} }

18
library/sqlite.php

@ -14,6 +14,7 @@ class SQLite {
$this->_db->query(' $this->_db->query('
CREATE TABLE IF NOT EXISTS "page" ( CREATE TABLE IF NOT EXISTS "page" (
"pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32url" INTEGER NOT NULL UNIQUE,
"httpCode" INTEGER, "httpCode" INTEGER,
"timeAdded" INTEGER NOT NULL, "timeAdded" INTEGER NOT NULL,
"timeUpdated" INTEGER, "timeUpdated" INTEGER,
@ -21,16 +22,17 @@ class SQLite {
"data" TEXT, "data" TEXT,
"description" TEXT, "description" TEXT,
"keywords" TEXT, "keywords" TEXT,
"url" TEXT NOT NULL UNIQUE "url" TEXT NOT NULL
) )
'); ');
$this->_db->query(' $this->_db->query('
CREATE TABLE IF NOT EXISTS "image" ( CREATE TABLE IF NOT EXISTS "image" (
"imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32src" INTEGER NOT NULL UNIQUE,
"pageId" INTEGER NOT NULL, "pageId" INTEGER NOT NULL,
"alt" TEXT NOT NULL, "alt" TEXT NOT NULL,
"src" TEXT NOT NULL UNIQUE "src" TEXT NOT NULL
) )
'); ');
@ -103,20 +105,20 @@ class SQLite {
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function initPage(string $url, int $timeAdded) { public function initPage(string $url, int $crc32url, int $timeAdded) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `timeAdded`) VALUES (?, ?)'); $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');
$query->execute([$url, $timeAdded]); $query->execute([$url, $crc32url, $timeAdded]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function addImage(int $pageId, string $src, string $alt) { public function addImage(int $pageId, string $src, int $crc32src, string $alt) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `alt`) VALUES (?, ?, ?)'); $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');
$query->execute([$pageId, $src, $alt]); $query->execute([$pageId, $src, $crc32src, $alt]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }

2
public/search.php

@ -22,7 +22,7 @@ $q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
// Crawl request // Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->initPage($q, time()); $db->initPage($q, crc32($q), time());
} }
// Search request // Search request

Loading…
Cancel
Save