Browse Source

initial commit

sqliteway
ghost 2 years ago
parent
commit
72985eaf9e
  1. 22
      config/app.php.txt
  2. 135
      crontab/crawler.php
  3. 44
      library/curl.php
  4. 73
      library/filter.php
  5. 160
      library/sqlite.php
  6. 123
      public/index.php
  7. 197
      public/search.php

22
config/app.php.txt

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
<?php

// Debug
// NOTE: these settings print every PHP error into the response; switch them off on public deployments.
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website
// Prefix placed in front of "/search.php" in the form actions; leave empty to use host-relative links.
define('WEBSITE_DOMAIN', '');

// Database
// Absolute path anchored to the project root (assuming this file is installed as config/app.php).
// A bare relative file name would be resolved against the current working directory, so the
// crawler (crontab/) and the web pages (public/) would each open a different database file.
define('DB_NAME', dirname(__DIR__) . '/database.sqlite');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');

// Crawl settings
define('CRAWL_IMAGE', false); // @TODO
// Maximum number of queued pages processed per crawler run
define('CRAWL_PAGE_LIMIT', 10);
// Pages updated more than this many seconds ago are crawled again
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
// Only URLs matching this pattern are added to the crawl queue
define('CRAWL_URL_REGEXP', '/^.*$/ui');

135
crontab/crawler.php

@ -0,0 +1,135 @@ @@ -0,0 +1,135 @@
<?php

// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');

/**
 * Turn a link found on a crawled page into an absolute URL.
 *
 * - Links that already name a host are returned as written.
 * - Links with a scheme but no host (mailto:, tel:, javascript:, data:) are rejected.
 * - Root-relative links ("/a/b") are appended to the page origin.
 * - Query-only links ("?x=1") are appended to the page path.
 * - Any other link is resolved against the directory of the page path.
 *
 * "./" and "../" segments are NOT normalised.
 *
 * @param string $baseUrl URL of the page the link was found on
 * @param string $link    Raw attribute value (href / src)
 *
 * @return string|null Absolute URL, or null when the link cannot be resolved
 */
function crawlerResolveUrl(string $baseUrl, string $link): ?string
{
    $link = trim($link);

    if ('' === $link) {
        return null;
    }

    // Already absolute
    if (parse_url($link, PHP_URL_HOST)) {
        return $link;
    }

    // Scheme without host: not a location that could be fetched relative to the page
    if (parse_url($link, PHP_URL_SCHEME)) {
        return null;
    }

    $base = parse_url($baseUrl);

    if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return null;
    }

    $origin = $base['scheme'] . '://' . $base['host'];

    // The port must be separated from the host by a colon
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }

    $path = $base['path'] ?? '/';

    if ('/' === $link[0]) {
        return $origin . $link;
    }

    if ('?' === $link[0]) {
        return $origin . $path . $link;
    }

    // Document-relative link: keep everything up to the last slash of the page path
    $slash     = strrpos($path, '/');
    $directory = false === $slash ? '/' : substr($path, 0, $slash + 1);

    return $origin . $directory . $link;
}

// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);

// Keep libxml parse errors internal instead of emitting a warning for every malformed page
libxml_use_internal_errors(true);

// Process crawl queue
foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) {

    $url  = new Curl($queue->url);
    $code = $url->getCode();

    // Record the attempt first, so a failing page is not retried on every run
    $db->updatePageQueue($queue->pageId, time(), $code);

    // Skip processing non 200 code
    if (200 !== $code) {
        continue;
    }

    // Skip processing pages without returned data
    $content = $url->getContent();

    if (!$content) {
        continue;
    }

    $dom = new DomDocument();
    $dom->loadHTML($content);
    libxml_clear_errors();

    // Skip index page links without titles
    $title = $dom->getElementsByTagName('title');

    if ($title->length == 0) {
        continue;
    }

    // Get optional page meta data
    $description = '';
    $keywords    = '';

    foreach ($dom->getElementsByTagName('meta') as $meta) {

        // Compare case-insensitively: pages commonly write "Description" / "Keywords"
        $name = strtolower($meta->getAttribute('name'));

        if ('description' === $name) {
            $description = $meta->getAttribute('content');
        }

        if ('keywords' === $name) {
            $keywords = $meta->getAttribute('content');
        }
    }

    // Index page data (reuse the body already fetched instead of requesting it again)
    $db->updatePage($queue->pageId,
                    Filter::pageTitle($title->item(0)->nodeValue),
                    Filter::pageDescription($description),
                    Filter::pageKeywords($keywords),
                    Filter::pageData($content),
                    time());

    // Update images
    $db->deleteImages($queue->pageId);

    if (CRAWL_IMAGE) {

        foreach ($dom->getElementsByTagName('img') as $image) {

            // Skip images without required attributes
            $src = $image->getAttribute('src');
            $alt = $image->getAttribute('alt');

            if (!$src || !$alt) {
                continue;
            }

            // Add absolute prefixes to the relative links
            $src = crawlerResolveUrl($queue->url, $src);

            if (null === $src) {
                continue;
            }

            // Add page images
            $db->addImage($queue->pageId,
                          Filter::url($src),
                          Filter::imageAlt($alt));
        }
    }

    // Collect internal links from page content
    foreach ($dom->getElementsByTagName('a') as $a) {

        // Skip links without required attribute
        $href = $a->getAttribute('href');

        if (!$href) {
            continue;
        }

        // Skip anchor links
        if (false !== strpos($href, '#')) {
            continue;
        }

        // Add absolute prefixes to the relative links
        $href = crawlerResolveUrl($queue->url, $href);

        if (null === $href) {
            continue;
        }

        // Filter href URL
        $href = Filter::url($href);

        // Save valid internal links to the index queue
        if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {

            if (!$db->getPage($href)) {
                $db->initPage($href, time());
            }
        }
    }
}

44
library/curl.php

@ -0,0 +1,44 @@ @@ -0,0 +1,44 @@
<?php
/**
 * Minimal cURL wrapper: performs one GET request when constructed and
 * exposes its status code, error number and body.
 */
class Curl {

    private $_connection;

    /** @var string|bool Result of the single curl_exec() call made by the constructor */
    private $_response;

    /**
     * Run the request immediately.
     *
     * @param string $url Address to fetch
     */
    public function __construct(string $url) {

        $this->_connection = curl_init($url);

        curl_setopt($this->_connection, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->_connection, CURLOPT_TIMEOUT, 5);

        // Keep the result: executing the handle again would send another request to the server
        $this->_response = curl_exec($this->_connection);
    }

    public function __destruct() {

        curl_close($this->_connection);
    }

    /**
     * @return int|false cURL error number of the transfer, false when there was none
     */
    public function getError() {

        if (curl_errno($this->_connection)) {

            return curl_errno($this->_connection);

        } else {

            return false;
        }
    }

    /**
     * @return int HTTP status code of the response
     */
    public function getCode() {

        return curl_getinfo($this->_connection, CURLINFO_HTTP_CODE);
    }

    /**
     * @return string|bool Response body, false when the transfer failed
     */
    public function getContent() {

        return $this->_response;
    }
}

73
library/filter.php

@ -0,0 +1,73 @@ @@ -0,0 +1,73 @@
<?php
/**
 * Static helpers that normalise crawled values before they are stored or displayed.
 */
class Filter {

    /**
     * @param string $url
     *
     * @return string URL without surrounding whitespace
     */
    static public function url(string $url) {

        return trim($url);
    }

    /**
     * @param string $alt Image alt text
     *
     * @return string Text with whitespace runs collapsed to one space and trimmed
     */
    static public function imageAlt(string $alt) {

        return self::_collapseWhitespace($alt);
    }

    /**
     * @param string $title Page title
     *
     * @return string Text with whitespace runs collapsed to one space and trimmed
     */
    static public function pageTitle(string $title) {

        return self::_collapseWhitespace($title);
    }

    /**
     * @param string $description Page meta description
     *
     * @return string Text with whitespace runs collapsed to one space and trimmed
     */
    static public function pageDescription(string $description) {

        return self::_collapseWhitespace($description);
    }

    /**
     * @param string $keywords Page meta keywords
     *
     * @return string Text with whitespace runs collapsed to one space and trimmed
     */
    static public function pageKeywords(string $keywords) {

        return self::_collapseWhitespace($keywords);
    }

    /**
     * Reduce an HTML document to plain text for the search index.
     *
     * @param string $data Raw HTML
     *
     * @return string Text without script/style blocks and tags, entities decoded,
     *                runs of two or more whitespace characters replaced by one space
     */
    static public function pageData(string $data) {

        // Case-insensitive: "<SCRIPT>" blocks must be dropped too, otherwise their code gets indexed as text
        $filterDataPre = [
            '/<script.*?\/script>/si',
            '/<style.*?\/style>/si'
        ];

        $filterDataPost = [
            '/[\s]{2,}/',
        ];

        // preg_replace() returns null on a PCRE error; fall back to the unfiltered value
        $data = preg_replace($filterDataPre, ' ', $data) ?? $data;

        $data = html_entity_decode($data);
        $data = strip_tags($data);

        $data = preg_replace($filterDataPost, ' ', $data) ?? $data;

        return $data;
    }

    /**
     * Pick one of three plural forms for a number.
     *
     * Index 0 is used for numbers ending in 1 (except 11), index 1 for numbers
     * ending in 2-4 (except 12-14), index 2 for everything else. The sign is ignored.
     *
     * @param int   $number
     * @param array $texts  Three forms, indexed 0..2
     *
     * @return mixed The selected element of $texts
     */
    static public function plural(int $number, array $texts) {

        $cases = [2, 0, 1, 1, 1, 2];

        // Remainders of negative numbers are negative; abs() keeps the lookup index in range
        $lastTwo = abs($number % 100);
        $lastOne = abs($number % 10);

        $form = ($lastTwo > 4 && $lastTwo < 20) ? 2 : $cases[min($lastOne, 5)];

        return $texts[$form];
    }

    /**
     * Replace every whitespace run with a single space and trim the result.
     */
    static private function _collapseWhitespace(string $text): string {

        return trim(preg_replace('/[\s]+/', ' ', $text) ?? $text);
    }
}

160
library/sqlite.php

@ -0,0 +1,160 @@ @@ -0,0 +1,160 @@
<?php
/**
 * PDO/SQLite storage for the crawl queue, the indexed pages, their images
 * and the FTS5 full-text index that mirrors the `page` table.
 */
class SQLite {

    private PDO $_db;

    /**
     * Open the database and create the schema (tables, FTS5 index, sync triggers) when missing.
     *
     * @param string $database Path to the SQLite file
     * @param string $username Passed through to PDO
     * @param string $password Passed through to PDO
     */
    public function __construct(string $database, string $username, string $password) {

        $this->_db = new PDO('sqlite:' . $database, $username, $password);
        $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
        $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);

        $this->_db->query('
            CREATE TABLE IF NOT EXISTS "page" (
                "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                "httpCode" INTEGER,
                "timeAdded" INTEGER NOT NULL,
                "timeUpdated" INTEGER,
                "title" TEXT,
                "data" TEXT,
                "description" TEXT,
                "keywords" TEXT,
                "url" TEXT NOT NULL
            )
        ');

        $this->_db->query('
            CREATE TABLE IF NOT EXISTS "image" (
                "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                "pageId" INTEGER NOT NULL,
                "alt" TEXT NOT NULL,
                "src" TEXT NOT NULL
            )
        ');

        // FTS5: external-content index over `page`, kept in sync by the triggers below
        $this->_db->query('
            CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`)
        ');

        $this->_db->query('
            CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN
                INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
            END
        ');

        $this->_db->query('
            CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN
                INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
            END
        ');

        $this->_db->query('
            CREATE TRIGGER IF NOT EXISTS `pageUpdate` UPDATE ON `page` BEGIN
                INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
                INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
            END
        ');
    }

    /**
     * @return int|string Number of rows in `page` (crawled or still queued)
     */
    public function getTotalPages() {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`');

        $query->execute();

        return $query->fetch()->total;
    }

    /**
     * @param string $url Exact URL to look up
     *
     * @return object|false The page row, false when the URL is unknown
     */
    public function getPage(string $url) {

        $query = $this->_db->prepare('SELECT * FROM `page` WHERE `url` = ?');

        $query->execute([$url]);

        return $query->fetch();
    }

    /**
     * Store the crawled content of an existing page.
     *
     * @return int Number of affected rows
     */
    public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) {

        // `keywords` has to be written as well, otherwise the argument is silently discarded
        $query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `keywords` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?');

        $query->execute([$title, $description, $keywords, $data, $timeUpdated, $pageId]);

        return $query->rowCount();
    }

    /**
     * Insert a page together with its content.
     *
     * @param string|null $url Page URL. The column is NOT NULL, so omitting it makes
     *                         the insert fail with a PDOException.
     *
     * @return string|false Id of the inserted row
     */
    public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded, ?string $url = null) {

        $query = $this->_db->prepare('INSERT INTO `page` (`url`, `title`, `description`, `keywords`, `data`, `timeAdded`) VALUES (?, ?, ?, ?, ?, ?)');

        $query->execute([$url, $title, $description, $keywords, $data, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /**
     * Queue a URL for crawling: the row has no content and no `timeUpdated` yet.
     *
     * @return string|false Id of the inserted row
     */
    public function initPage(string $url, int $timeAdded) {

        $query = $this->_db->prepare('INSERT INTO `page` (`url`, `timeAdded`) VALUES (?, ?)');

        $query->execute([$url, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /**
     * @return string|false Id of the inserted image row
     */
    public function addImage(int $pageId, string $src, string $alt) {

        $query = $this->_db->prepare('INSERT INTO `image` (`pageId`, `src`, `alt`) VALUES (?, ?, ?)');

        $query->execute([$pageId, $src, $alt]);

        return $this->_db->lastInsertId();
    }

    /**
     * @return int Number of image rows removed for the page
     */
    public function deleteImages(int $pageId) {

        $query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?');

        $query->execute([$pageId]);

        return $query->rowCount();
    }

    /**
     * Pages never crawled, or last crawled before $timeFrom, oldest id first.
     *
     * @param int $limit    Maximum number of rows
     * @param int $timeFrom Unix timestamp threshold
     *
     * @return array List of page rows
     */
    public function getPageQueue(int $limit, int $timeFrom) {

        $query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit);

        $query->execute([$timeFrom]);

        return $query->fetchAll();
    }

    /**
     * Record a crawl attempt (time and HTTP status) for a page.
     *
     * @return int Number of affected rows
     */
    public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) {

        // No LIMIT clause: SQLite only accepts it on UPDATE when built with
        // SQLITE_ENABLE_UPDATE_DELETE_LIMIT, and `pageId` is the primary key anyway
        $query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ?');

        $query->execute([$timeUpdated, $httpCode, $pageId]);

        return $query->rowCount();
    }

    /**
     * Full-text search in the indexed page data, best match first.
     *
     * @param string $q FTS5 query expression
     *
     * @return array Rows with `title`, `description` and `url`
     */
    public function searchPages(string $q) {

        $query = $this->_db->prepare('SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank`');

        $query->execute([$q]);

        return $query->fetchAll();
    }

    /**
     * @param string $q FTS5 query expression
     *
     * @return int|string Number of pages matching the expression
     */
    public function searchPagesTotal(string $q) {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?');

        $query->execute([$q]);

        return $query->fetch()->total;
    }
}

123
public/index.php

@ -0,0 +1,123 @@ @@ -0,0 +1,123 @@
<?php

// System dependencies
require_once '../config/app.php';
require_once '../library/filter.php';
require_once '../library/sqlite.php';

// Database connection
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);

// Search box hint: shows how many pages are known, in the matching plural form
$totalPages = $db->getTotalPages();

$placeholder = Filter::plural(
    $totalPages,
    [
        sprintf(_('Over %s page or enter the new one...'), $totalPages),
        sprintf(_('Over %s pages or enter the new one...'), $totalPages),
        sprintf(_('Over %s pages or enter the new one...'), $totalPages),
    ]
);

?>
<!DOCTYPE html>
<html lang="<?php echo _('en-US') ?>">
<head>
<!-- Landing page: every visible string goes through gettext _() -->
<title><?php echo _('YGGo! Web Search Engine') ?></title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="Description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="Keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, sqlite, fts5, yggdrasil, js-less, open source') ?>" />
<style>
/* Reset borders, margins and padding for every element */
* {
border: 0;
margin: 0;
padding: 0;
font-family: Sans-serif;
}
body {
background-color: #2e3436
}
h1 {
color: #fff;
font-weight: normal;
font-size: 48px;
margin: 16px 0
}
/* Search form: centered column, at most 640px wide */
form {
display: block;
max-width: 640px;
margin: 280px auto;
text-align: center;
}
/* Full-width rounded query field */
input {
width: 100%;
margin: 16px 0;
padding: 18px 0;
border-radius: 32px;
background-color: #000;
color: #fff;
font-size: 16px;
text-align: center;
}
input:hover {
background-color: #111
}
input:focus {
outline: none;
background-color: #111
}
/* Dim the placeholder text while the field has focus */
input:focus::placeholder {
color: #090808;
}
button {
margin: 22px 0;
padding: 12px 16px;
border-radius: 4px;
cursor: pointer;
background-color: #3394fb;
color: #fff;
font-size: 14px;
}
button:hover {
background-color: #4b9df4;
}
/* Footer pinned to the bottom edge of the viewport */
footer {
position: fixed;
bottom: 0;
left:0;
right: 0;
text-align: center;
padding: 24px;
}
a, a:visited, a:active {
color: #ccc;
}
a:hover {
color: #fff;
}
</style>
</head>
<body>
<header>
<!-- The query is sent as GET parameter "q" to search.php; the placeholder is the page-count hint built at the top of this file -->
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
<h1><?php echo _('YGGo!') ?></h1>
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="" />
<button type="submit"><?php echo _('Search') ?></button>
</form>
</header>
<footer>
<a href="https://github.com/d47081/YGGo"><?php echo _('meow') ?></a>
</footer>
</body>
</html>

197
public/search.php

@ -0,0 +1,197 @@ @@ -0,0 +1,197 @@
<?php

// Load system dependencies
require_once('../config/app.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');

// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);

// Define page basics
$totalPages = $db->getTotalPages();

$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
                                            sprintf(_('Over %s pages or enter the new one...'), $totalPages),
                                            sprintf(_('Over %s pages or enter the new one...'), $totalPages),
                                            ]);

// Filter request data
// Only a string is accepted: "?q[]=x" delivers an array, which Filter::url() cannot take
$q = (isset($_GET['q']) && is_string($_GET['q'])) ? Filter::url($_GET['q']) : '';

// Crawl request: a query that is itself a valid URL is added to the crawl queue
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

    if (!$db->getPage($q)) {
        $db->initPage($q, time());
    }
}

// Search request
// Compared against '' rather than empty(), so that the query "0" is searchable
if ('' !== $q) {

    // The query is searched as one FTS5 string. A double quote inside it has to be
    // doubled, otherwise it ends the string early and the MATCH expression fails to parse
    $ftsQuery = '"' . str_replace('"', '""', $q) . '"';

    $results      = $db->searchPages($ftsQuery);
    $resultsTotal = $db->searchPagesTotal($ftsQuery);

} else {

    $results      = [];
    $resultsTotal = 0;
}

?>
<!DOCTYPE html>
<html lang="<?php echo _('en-US'); ?>">
<head>
<!-- Results page: the title repeats the HTML-escaped query -->
<title><?php echo sprintf(_('%s - YGGo!'), htmlentities($q)) ?></title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="Description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="Keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, sqlite, fts5, yggdrasil, js-less, open source') ?>" />
<style>
/* Reset borders, margins and padding for every element */
* {
border: 0;
margin: 0;
padding: 0;
font-family: Sans-serif;
}
body {
background-color: #2e3436;
}
/* Header bar fixed to the top of the viewport */
header {
background-color: #34393b;
position: fixed;
top: 0;
left: 0;
right: 0;
}
/* Vertical margins keep the results clear of the fixed header */
main {
margin-top: 92px;
margin-bottom: 76px;
}
/* Logo fixed in the top-left corner */
h1 {
color: #fff;
font-weight: normal;
font-size: 26px;
margin: 16px 0;
position: fixed;
top: 8px;
left: 24px;
}
/* Result title */
h2 {
display: block;
font-size: 16px;
font-weight: normal;
margin: 4px 0;
color: #fff;
}
form {
display: block;
max-width: 640px;
margin: 0 auto;
text-align: center;
}
input {
width: 100%;
margin: 16px 0;
padding: 14px 0;
border-radius: 32px;
background-color: #000;
color: #fff;
font-size: 16px;
text-align: center;
}
input:hover {
background-color: #111
}
input:focus {
outline: none;
background-color: #111
}
input:focus::placeholder {
color: #090808
}
/* Submit button fixed in the top-right corner */
button {
padding: 12px 16px;
border-radius: 4px;
cursor: pointer;
background-color: #3394fb;
color: #fff;
font-size: 14px;
position: fixed;
top: 18px;
right: 24px;
}
button:hover {
background-color: #4b9df4;
}
a, a:visited, a:active {
color: #3394fb;
display: block;
font-size: 14px;
}
a:hover {
color: #54a3f7;
}
/* One block per result (and for the total counter), separated by a dashed rule */
div {
max-width: 640px;
margin: 0 auto;
padding: 16px 0;
border-top: 1px #000 dashed;
font-size: 14px
}
/* Result description */
span {
display: block;
margin: 8px 0;
}
</style>
</head>
<body>
<header>
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
<a href="<?php echo WEBSITE_DOMAIN; ?>"><h1><?php echo _('YGGo!') ?></h1></a>
<input type="text" name="q" placeholder="<?php echo htmlspecialchars((string) $placeholder, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?>" value="<?php echo htmlentities($q) ?>" />
<button type="submit"><?php echo _('Search'); ?></button>
</form>
</header>
<main>
<?php if ($results) { ?>
<div>
<?php echo sprintf(_('Total found: %s'), $resultsTotal) ?>
</div>
<?php /* Title, description and URL come from crawled third-party pages: escape every value before output */ ?>
<?php foreach ($results as $result) { ?>
<div>
<h2><?php echo htmlspecialchars((string) $result->title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?></h2>
<?php if (!empty($result->description)) { ?>
<span><?php echo htmlspecialchars((string) $result->description, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?></span>
<?php } ?>
<a href="<?php echo htmlspecialchars((string) $result->url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?>"><?php echo htmlspecialchars((string) $result->url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?></a>
</div>
<?php } ?>
<?php } else { ?>
<div style="text-align:center">
<?php echo sprintf(_('Total found: %s'), $resultsTotal) ?>
</div>
<?php } ?>
</main>
</body>
</html>
Loading…
Cancel
Save