implement MySQL/Sphinx data model #3, add basic robots.txt support #2

This commit is contained in:
ghost 2023-04-07 04:04:24 +03:00
parent a14d18fedb
commit 2495a2bbc7
14 changed files with 561 additions and 278 deletions

3
.gitignore vendored
View File

@ -2,5 +2,8 @@
.ftpignore
config/app.php
config/sphinx.conf
database/yggo.mwb.bak
storage

View File

@ -7,7 +7,7 @@ The project goal - simple interface, clear architecture and lightweight server r
#### Online examples
* [http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo](http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo)
* [http://94.140.114.241/yggo/](http://94.140.114.241/yggo)
#### Screenshots
@ -28,7 +28,8 @@ php-dom
php-pdo
php-curl
php-gd
sqlite / fts5
php-mysql
sphinx search server
```
#### Installation
@ -39,12 +40,16 @@ sqlite / fts5
* Set up the `/crontab/crawler.php` script to run every minute (see the crontab example below this list); the optimal interval mostly depends on your configs and the target network volume. There is no debug output implemented yet, so silence it with `/dev/null`
* The script has no MVC model because it is super simple: just 2 files, with everything else encapsulated in the `/library` classes.
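A minimal crontab sketch for the crawler step above; the install path and PHP binary location are assumptions, adjust them for your host. The `cd` matters because the script loads its dependencies via relative `require_once('../...')` paths:
```
* * * * * cd /var/www/yggo/crontab && /usr/bin/php crawler.php > /dev/null 2>&1
```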
#### Configuration
todo
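Until this section is written, a minimal sketch of the deployment-specific values in `config/app.php` as of this commit (the credentials below are placeholders, not project defaults):
```
// config/app.php: values to adjust per deployment
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', 'yggo');       // placeholder: your MySQL database name
define('DB_USERNAME', 'yggo');   // placeholder
define('DB_PASSWORD', 'secret'); // placeholder

define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
```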
#### Roadmap / ideas
* [x] Web pages full text ranking search
* [x] Make search results pagination
* [ ] Blacklist domains (useful for some mirrors)
* [ ] Add robots.txt support (Issue #2)
* [x] Add robots.txt support (Issue #2)
* [ ] Improve yggdrasil links detection, add .ygg domain zone support
* [ ] Make the page description visible based on the cached content dump when the website's description tag is not available; add condition highlights
* [ ] Images search (basically implemented but requires testing and some performance optimization)
@ -57,7 +62,7 @@ sqlite / fts5
* [x] An idea to make unique gravatars for sites without favicons, as they are simpler to identify than raw IPv6 addresses
* [ ] An idea to add some visitor counters, like in the good old times?
#### Contributions
Please create a new master branch for each patch in your fork before opening a PR
@ -66,6 +71,8 @@ git checkout master
git checkout -b my-pr-branch-name
```
See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### Donate to contributors
* @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ)

View File

@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
define('DB_NAME', 'database.sqlite');
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
// Crawl settings
define('CRAWL_IMAGE', false); // @TODO
// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Crawl settings
define('CRAWL_PAGE_LIMIT', 10);
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
define('CRAWL_META_ONLY', false);
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_STATUS', true);
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
define('CRAWL_ROBOTS_DEFAULT_RULES', "");
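`CRAWL_ROBOTS_DEFAULT_RULES` takes robots.txt-style text that is applied when a host publishes no robots.txt of its own; a hypothetical example:
```
// Hypothetical default rules for hosts without a robots.txt
define('CRAWL_ROBOTS_DEFAULT_RULES', "User-agent: *\nDisallow: /cgi-bin");
```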

22
config/sphinx.conf.txt Normal file
View File

@ -0,0 +1,22 @@
source hostPage
{
type = mysql
sql_host = localhost
sql_user =
sql_pass =
sql_db =
sql_port = 3306 # optional, default is 3306
sql_query = \
SELECT hostPageId, metaTitle, metaDescription, metaKeywords, data, uri \
FROM hostPage
sql_attr_uint = hostPageId
}
index hostPage
{
source = hostPage
path =
}
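Once `searchd` is running with this config, the index can be smoke-tested from PHP over the SphinxQL protocol; a sketch using the default host/port from `config/app.php`:
```
<?php
// SphinxQL speaks the MySQL wire protocol, so plain PDO works
$sphinx = new PDO('mysql:host=127.0.0.1;port=9306;charset=utf8', null, null);
$sphinx->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// Full-text match against the hostPage index defined above
$query = $sphinx->prepare('SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT 0,10');
$query->execute(['test']);
var_dump($query->fetchAll(PDO::FETCH_OBJ));
```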

View File

@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) {
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
// Debug
$timeStart = microtime(true);
$hostPagesProcessed = 0;
$hostPagesIndexed = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue
foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) {
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
$url = new Curl($queue->url);
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : '') . $queueHostPage->uri;
$db->updatePageQueue($queue->pageId, time(), $url->getCode());
$curl = new Curl($queueHostPageURL);
// Skip processing non 200 code
if (200 != $url->getCode()) {
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip to the next page on a non-200 code
if (200 != $curl->getCode()) {
continue;
}
// Skip processing pages without returned data
if (!$content = $url->getContent()) {
// Skip to the next page when no content was returned
if (!$content = $curl->getContent()) {
continue;
}
// Grab page content
$dom = new DomDocument();
@$dom->loadHTML($content);
@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
}
}
// Index page data
$db->updatePage($queue->pageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description),
Filter::pageKeywords($keywords),
CRAWL_META_ONLY ? '' : Filter::pageData($content),
time());
// Update images
$db->deleteImages($queue->pageId);
if (CRAWL_IMAGE) {
foreach (@$dom->getElementsByTagName('img') as $image) {
// Skip images without required attributes
if (!$src = @$image->getAttribute('src')) {
continue;
}
if (!$alt = @$image->getAttribute('alt')) {
continue;
}
// Add domain to the relative links
if (!parse_url($src, PHP_URL_HOST)) {
$src = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
parse_url($queue->url, PHP_URL_HOST) .
parse_url($queue->url, PHP_URL_PORT) .
$src; // @TODO sometimes wrong URL prefix available
}
// Add page images
$db->addImage($queue->pageId,
Filter::url($src),
crc32($src),
Filter::imageAlt($alt));
}
}
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description),
Filter::pageKeywords($keywords),
$queueHostPage->crawlPageMetaOnly ? null : Filter::pageData($content));
// Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) {
@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
continue;
}
// Add absolute prefixes to the relative links
// Add absolute URL prefixes to the relative links found
if (!parse_url($href, PHP_URL_HOST)) {
$href = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
parse_url($queue->url, PHP_URL_HOST) .
parse_url($queue->url, PHP_URL_PORT) .
$href;
$href = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . ltrim($href, '/');
}
// Filter href URL
$href = Filter::url($href);
// Save valid internal links to the index queue
// Validate formatted link
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
$db->initPage($href, crc32($href), time());
$db->beginTransaction();
try {
// Parse formatted link
$hostURL = Parser::hostURL($href);
$hostPageURI = Parser::uri($href);
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
if ($hostId) {
echo 'hostname ' . $hostURL->string . PHP_EOL;
$hostsAdded++;
} else {
continue;
}
}
// Init robots parser
$robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
$hostPagesAdded++;
}
}
$db->commit();
} catch(Exception $e){
var_dump($e);
$db->rollBack();
}
}
}
}
}
// Debug
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;

BIN
database/yggo.mwb Normal file

Binary file not shown.

View File

@ -4,16 +4,7 @@ class Filter {
static public function url(string $url) {
return trim($url);
}
static public function imageAlt(string $alt) {
$alt = preg_replace('/[\s]+/', ' ', $alt);
$alt = trim($alt);
return $alt;
return trim(urldecode($url));
}
static public function pageTitle(string $title) {

196
library/mysql.php Normal file
View File

@ -0,0 +1,196 @@
<?php
class MySQL {
private PDO $_db;
public function __construct(string $host, int $port, string $database, string $username, string $password) {
$this->_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);
$this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
$this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);
}
// System
public function beginTransaction() {
$this->_db->beginTransaction();
}
public function commit() {
$this->_db->commit();
}
public function rollBack() {
$this->_db->rollBack();
}
// Host
public function getHost(int $crc32url) {
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');
$query->execute([$crc32url]);
return $query->fetch();
}
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots]);
return $this->_db->lastInsertId();
}
// Pages
public function getTotalHostPages(int $hostId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
$query->execute([$hostId]);
return $query->fetch()->total;
}
public function getTotalPagesByHttpCode(mixed $httpCode) {
if (is_null($httpCode)) {
$query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL');
} else {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?');
$query->execute([$httpCode]);
}
return $query->fetch()->total;
}
public function getTotalPages() {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`');
$query->execute();
return $query->fetch()->total;
}
public function getHostPage(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->fetch();
}
public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
`hostPage`.`metaDescription`,
`hostPage`.`data`,
`hostPage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE `hostPage`.`hostPageId` = ?
LIMIT 1');
$query->execute([$hostPageId]);
return $query->fetch();
}
public function addHostPage(int $hostId,
int $crc32uri,
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $httpCode = null,
mixed $rank = null,
mixed $metaTitle = null,
mixed $metaDescription = null,
mixed $metaKeywords = null,
mixed $data = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`,
`uri`,
`timeAdded`,
`timeUpdated`,
`httpCode`,
`rank`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
return $this->_db->lastInsertId();
}
public function updateHostPage( int $hostPageId,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
mixed $data) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?,
`metaKeywords` = ?,
`data` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]);
return $query->rowCount();
}
// Crawl tools
public function getCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
`hostPage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlPageMetaOnly`,
`host`.`robots`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
ORDER BY `hostPage`.`hostPageId`
LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updateCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $hostPageId]);
return $query->rowCount();
}
}
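A usage sketch of the class above, mirroring how `crontab/crawler.php` pulls its queue; it assumes the constants from `config/app.php` and a populated `hostPage` table:
```
<?php
require_once 'config/app.php';
require_once 'library/mysql.php';

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Up to 10 pages never crawled, or not crawled within the last hour
foreach ($db->getCrawlQueue(10, time() - 3600) as $queueHostPage) {
  echo $queueHostPage->scheme . '://' . $queueHostPage->name .
       ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
       $queueHostPage->uri . PHP_EOL;
}
```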

73
library/parser.php Normal file
View File

@ -0,0 +1,73 @@
<?php
class Parser {
static public function hostURL(string $string) {
$result = [
'string' => null,
'scheme' => null,
'name' => null,
'port' => null,
];
if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) {
$result['string'] = $hostScheme . '://';
$result['scheme'] = $hostScheme;
} else {
return false;
}
if ($hostName = parse_url($string, PHP_URL_HOST)) {
$result['string'] .= $hostName;
$result['name'] = $hostName;
} else {
return false;
}
if ($hostPort = parse_url($string, PHP_URL_PORT)) {
$result['string'] .= ':' . $hostPort;
$result['port'] = $hostPort;
}
return (object) $result;
}
static public function uri(string $string) {
$result = [
'string' => '/',
'path' => '/',
'query' => null,
];
if ($path = parse_url($string, PHP_URL_PATH)) {
$result['string'] = $path;
$result['path'] = $path;
}
if ($query = parse_url($string, PHP_URL_QUERY)) {
$result['string'] .= '?' . $query;
$result['query'] = '?' . $query;
}
return (object) $result;
}
}
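A quick sketch of what the two helpers return, using a hypothetical address:
```
<?php
require_once 'library/parser.php';

$url = 'http://[201:23b4::1]:8080/library/index.php?page=1'; // hypothetical

$hostURL = Parser::hostURL($url);
// $hostURL->string = 'http://[201:23b4::1]:8080'
// $hostURL->scheme = 'http', $hostURL->name = '[201:23b4::1]', $hostURL->port = 8080

$hostPageURI = Parser::uri($url);
// $hostPageURI->string = '/library/index.php?page=1'
// $hostPageURI->path = '/library/index.php', $hostPageURI->query = '?page=1'
```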

View File

@ -1,7 +1,5 @@
<?php
// @TODO #2
class Robots {
private $_rule = [];
@ -42,16 +40,10 @@ class Robots {
}
}
public function indexURL(string $url) {
public function uriAllowed(string $uri) {
// Unify case match
$url = strtolower(trim($url));
// Convert to URI
$url = str_replace(parse_url($url, PHP_URL_SCHEME) . '://' .
parse_url($url, PHP_URL_HOST) .
parse_url($url, PHP_URL_PORT),
'', $url);
$uri = strtolower(trim($uri));
// Index by default
$result = true;
@ -59,7 +51,7 @@ class Robots {
// Begin index rules by ASC priority
foreach ($this->_rule as $rule => $value) {
if (preg_match('!^' . $rule . '!', $url)) {
if (preg_match('!^' . $rule . '!', $uri)) {
$result = $value;
}
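How the crawler drives this class, as a sketch; the rule-parsing constructor sits outside this hunk, so whether the sample rule actually blocks anything is an assumption:
```
<?php
require_once 'config/app.php';
require_once 'library/robots.php';

// Hypothetical robots.txt body as stored in the host table
$hostRobots = "User-agent: *\nDisallow: /private";

// Fall back to the configured default rules when the host has none
$robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);

if ($robots->uriAllowed('/index.php')) {
  // URI passed the robots.txt rules, safe to queue for crawling
}
```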

31
library/sphinxql.php Normal file
View File

@ -0,0 +1,31 @@
<?php
class SphinxQL {
private $_sphinx;
public function __construct(string $host, int $port) {
$this->_sphinx = new PDO('mysql:host=' . $host . ';port=' . $port . ';charset=utf8', null, null, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);
$this->_sphinx->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
}
public function searchHostPages(string $keyword, int $start, int $limit) {
$query = $this->_sphinx->prepare('SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT ' . (int) $start . ',' . (int) $limit);
$query->execute([$keyword]);
return $query->fetchAll();
}
public function searchHostPagesTotal(string $keyword) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
$query->execute([$keyword]);
return $query->fetch()->total;
}
}
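A pagination sketch mirroring `public/index.php`, where page numbers are 1-based:
```
<?php
require_once 'config/app.php';
require_once 'library/sphinxql.php';

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

$p = 1; // current page, taken from $_GET['p'] in public/index.php
$limit = WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT;

$results      = $sphinx->searchHostPages('"keyword"', $p * $limit - $limit, $limit);
$resultsTotal = $sphinx->searchHostPagesTotal('"keyword"');
```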

View File

@ -1,170 +0,0 @@
<?php
class SQLite {
private PDO $_db;
public function __construct(string $database, string $username, string $password) {
$this->_db = new PDO('sqlite:' . $database, $username, $password);
$this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
$this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);
$this->_db->query('
CREATE TABLE IF NOT EXISTS "page" (
"pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32url" INTEGER NOT NULL UNIQUE,
"httpCode" INTEGER,
"timeAdded" INTEGER NOT NULL,
"timeUpdated" INTEGER,
"title" TEXT,
"data" TEXT,
"description" TEXT,
"keywords" TEXT,
"url" TEXT NOT NULL
)
');
$this->_db->query('
CREATE TABLE IF NOT EXISTS "image" (
"imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"crc32src" INTEGER NOT NULL UNIQUE,
"pageId" INTEGER NOT NULL,
"alt" TEXT NOT NULL,
"src" TEXT NOT NULL
)
');
// FTS5
$this->_db->query('
CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`)
');
$this->_db->query('
CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN
INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
END
');
$this->_db->query('
CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN
INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
END
');
$this->_db->query('
CREATE TRIGGER IF NOT EXISTS `pageUpdate` UPDATE ON `page` BEGIN
INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
END
');
}
public function getTotalPagesByHttpCode(mixed $httpCode) {
if (is_null($httpCode)) {
$query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` IS NULL');
} else {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` = ?');
$query->execute([$httpCode]);
}
return $query->fetch()->total;
}
public function getTotalPages() {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`');
$query->execute();
return $query->fetch()->total;
}
public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?');
$query->execute([$title, $description, $data, $timeUpdated, $pageId]);
return $query->rowCount();
}
public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `page` (`title`, `description`, `data`, `timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$title, $description, $data, $timeAdded]);
return $this->_db->lastInsertId();
}
public function initPage(string $url, int $crc32url, int $timeAdded) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');
$query->execute([$url, $crc32url, $timeAdded]);
return $this->_db->lastInsertId();
}
public function addImage(int $pageId, string $src, int $crc32src, string $alt) {
$query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');
$query->execute([$pageId, $src, $crc32src, $alt]);
return $this->_db->lastInsertId();
}
public function deleteImages(int $pageId) {
$query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?');
$query->execute([$pageId]);
return $query->rowCount();
}
public function getPageQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $pageId]);
return $query->rowCount();
}
public function searchPages(string $q, int $start = 0, int $limit = 100) {
$query = $this->_db->prepare('SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank` LIMIT ' . (int) $start . ',' . (int) $limit);
$query->execute([$q]);
return $query->fetchAll();
}
public function searchPagesTotal(string $q) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?');
$query->execute([$q]);
return $query->fetch()->total;
}
}

View File

@ -3,10 +3,10 @@
// Load system dependencies
require_once('../config/app.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/mysql.php');
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
$totalPages = $db->getTotalPages();

View File

@ -2,11 +2,18 @@
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
require_once('../library/sphinxql.php');
// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Define page basics
$totalPages = $db->getTotalPages();
@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->initPage($q, crc32($q), time());
$db->beginTransaction();
try {
// Parse host info
if ($hostURL = Parser::hostURL($q)) {
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
}
// Parse page URI
$hostPageURI = Parser::uri($q);
// Init robots parser
$robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
}
$db->commit();
} catch(Exception $e){
$db->rollBack();
}
}
// Search request
if (!empty($q)) {
$results = $db->searchPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
$resultsTotal = $db->searchPagesTotal('"' . $q . '"');
$results = $sphinx->searchHostPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
$resultsTotal = $sphinx->searchHostPagesTotal('"' . $q . '"');
} else {
@ -196,16 +265,19 @@ if (!empty($q)) {
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>
<div>
<h2><?php echo $result->title ?></h2>
<?php if (!empty($result->description)) { ?>
<span><?php echo $result->description ?></span>
<?php } ?>
<a href="<?php echo $result->url ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode(parse_url($result->url, PHP_URL_HOST)) ?>" alt="favicon" width="16" height="16" />
<?php echo $result->url ?>
</a>
</div>
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : '') . $hostPage->uri ?>
<div>
<h2><?php echo $hostPage->metaTitle ?></h2>
<?php if (!empty($hostPage->metaDescription)) { ?>
<span><?php echo $hostPage->metaDescription ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" />
<?php echo $hostPageURL ?>
</a>
</div>
<?php } ?>
<?php } ?>
<?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?>
<div>