Browse Source

Make the unique keys for the url/src columns numeric by using crc32

sqliteway
ghost 2 years ago
parent
commit
04dbbc3adf
  1. 3
      crontab/crawler.php
  2. 18
      library/sqlite.php
  3. 2
      public/search.php

3
crontab/crawler.php

@@ -100,6 +100,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
// Add page images
$db->addImage($queue->pageId,
Filter::url($src),
crc32($src),
Filter::imageAlt($alt));
}
}
@@ -134,7 +135,7 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
// Save valid internal links to the index queue
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
$db->initPage($href, time());
$db->initPage($href, crc32($href), time());
}
}
}

18
library/sqlite.php

@@ -14,6 +14,7 @@ class SQLite {
$this->_db->query('
CREATE TABLE IF NOT EXISTS "page" (
"pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32url" INTEGER NOT NULL UNIQUE,
"httpCode" INTEGER,
"timeAdded" INTEGER NOT NULL,
"timeUpdated" INTEGER,
@@ -21,16 +22,17 @@ class SQLite {
"data" TEXT,
"description" TEXT,
"keywords" TEXT,
"url" TEXT NOT NULL UNIQUE
"url" TEXT NOT NULL
)
');
$this->_db->query('
CREATE TABLE IF NOT EXISTS "image" (
"imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32src" INTEGER NOT NULL UNIQUE,
"pageId" INTEGER NOT NULL,
"alt" TEXT NOT NULL,
"src" TEXT NOT NULL UNIQUE
"src" TEXT NOT NULL
)
');
@@ -103,20 +105,20 @@ class SQLite {
return $this->_db->lastInsertId();
}
public function initPage(string $url, int $timeAdded) {
public function initPage(string $url, int $crc32url, int $timeAdded) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `timeAdded`) VALUES (?, ?)');
$query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');
$query->execute([$url, $timeAdded]);
$query->execute([$url, $crc32url, $timeAdded]);
return $this->_db->lastInsertId();
}
public function addImage(int $pageId, string $src, string $alt) {
public function addImage(int $pageId, string $src, int $crc32src, string $alt) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `alt`) VALUES (?, ?, ?)');
$query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');
$query->execute([$pageId, $src, $alt]);
$query->execute([$pageId, $src, $crc32src, $alt]);
return $this->_db->lastInsertId();
}

2
public/search.php

@@ -22,7 +22,7 @@ $q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->initPage($q, time());
$db->initPage($q, crc32($q), time());
}
// Search request

Loading…
Cancel
Save