Browse Source

implement MySQL/Sphinx data model #3, add basic robots.txt support #2

main
ghost 2 years ago
parent
commit
2495a2bbc7
  1. 3
      .gitignore
  2. 11
      README.md
  3. 16
      config/app.php.txt
  4. 22
      config/sphinx.conf.txt
  5. 180
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 11
      library/filter.php
  8. 196
      library/mysql.php
  9. 73
      library/parser.php
  10. 14
      library/robots.php
  11. 31
      library/sphinxql.php
  12. 170
      library/sqlite.php
  13. 4
      public/index.php
  14. 102
      public/search.php

3
.gitignore vendored

@ -2,5 +2,8 @@
.ftpignore .ftpignore
config/app.php config/app.php
config/sphinx.conf
database/yggo.mwb.bak
storage storage

11
README.md

@ -28,7 +28,8 @@ php-dom
php-pdo php-pdo
php-curl php-curl
php-gd php-gd
sqlite / fts5 php-mysql
sphinx search server
``` ```
#### Installation #### Installation
@ -39,12 +40,16 @@ sqlite / fts5
* Set up the `/crontab/crawler.php` script to run every minute; the right interval mostly depends on the configuration and on the volume of the target network. Debug output is not implemented yet, so silence the script by redirecting it to `/dev/null` * Set up the `/crontab/crawler.php` script to run every minute; the right interval mostly depends on the configuration and on the volume of the target network. Debug output is not implemented yet, so silence the script by redirecting it to `/dev/null`
* The script does not follow the MVC model because it is very simple: it is just 2 files, and everything else is encapsulated in the `/library` classes. * The script does not follow the MVC model because it is very simple: it is just 2 files, and everything else is encapsulated in the `/library` classes.
#### Configuration
todo
#### Roadmap / ideas #### Roadmap / ideas
* [x] Web pages full text ranking search * [x] Web pages full text ranking search
* [x] Make search results pagination * [x] Make search results pagination
* [ ] Blacklist domains (useful for some mirrors) * [ ] Blacklist domains (useful for some mirrors)
* [ ] Add robots.txt support (Issue #2) * [x] Add robots.txt support (Issue #2)
* [ ] Improve yggdrasil links detection, add .ygg domain zone support * [ ] Improve yggdrasil links detection, add .ygg domain zone support
* [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights * [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights
* [ ] Images search (basically implemented but requires testing and some performance optimization) * [ ] Images search (basically implemented but requires testing and some performance optimization)
@ -66,6 +71,8 @@ git checkout master
git checkout -b my-pr-branch-name git checkout -b my-pr-branch-name
``` ```
See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### Donate to contributors #### Donate to contributors
* @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ) * @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ)

16
config/app.php.txt

@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true); define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database // Database
define('DB_NAME', 'database.sqlite'); define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', ''); define('DB_USERNAME', '');
define('DB_PASSWORD', ''); define('DB_PASSWORD', '');
// Crawl settings // Sphinx
define('CRAWL_IMAGE', false); // @TODO define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Crawl settings
define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_LIMIT', 10);
define('CRAWL_PAGE_SECONDS_OFFSET', 3600); define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui' define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
define('CRAWL_META_ONLY', false); define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_STATUS', true);
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
define('CRAWL_ROBOTS_DEFAULT_RULES', "");

22
config/sphinx.conf.txt

@ -0,0 +1,22 @@
# Data source: pulls the rows to index from the `hostPage` table over a MySQL connection.
source hostPage
{
type = mysql
# Connection settings. User, password and database name are left blank in this
# template and have to be filled in before the file is used.
sql_host = localhost
sql_user =
sql_pass =
sql_db =
sql_port = 3306 # optional, default is 3306
# Query that selects the page columns; it spans three physical lines joined by the trailing backslashes.
sql_query = \
SELECT hostPageId, metaTitle, metaDescription, metaKeywords, data, uri \
FROM hostPage
sql_attr_uint = hostPageId
}
# Index definition fed by the `hostPage` source declared in this file.
index hostPage
{
source = hostPage
# Left blank in this template; presumably the filesystem location of the index files - TODO confirm against the Sphinx documentation.
path =
}

180
crontab/crawler.php

@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) {
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
require_once('../library/curl.php'); require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php'); require_once('../library/filter.php');
require_once('../library/sqlite.php'); require_once('../library/parser.php');
require_once('../library/mysql.php');
// Debug
$timeStart = microtime(true);
$hostPagesProcessed = 0;
$hostPagesIndexed = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
// Connect database // Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue // Process crawl queue
foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) { foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
$url = new Curl($queue->url); $curl = new Curl($queueHostPageURL);
$db->updatePageQueue($queue->pageId, time(), $url->getCode()); // Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip processing non 200 code // Skip next page processing non 200 code
if (200 != $url->getCode()) { if (200 != $curl->getCode()) {
continue; continue;
} }
// Skip processing pages without returned data // Skip next page processing pages without returned data
if (!$content = $url->getContent()) { if (!$content = $curl->getContent()) {
continue; continue;
} }
// Grab page content
$dom = new DomDocument(); $dom = new DomDocument();
@$dom->loadHTML($content); @$dom->loadHTML($content);
@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
} }
} }
// Index page data // Update queued page data
$db->updatePage($queue->pageId, $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue), Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description), Filter::pageDescription($description),
Filter::pageKeywords($keywords), Filter::pageKeywords($keywords),
CRAWL_META_ONLY ? '' : Filter::pageData($content), CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
time());
// Update images
$db->deleteImages($queue->pageId);
if (CRAWL_IMAGE) {
foreach (@$dom->getElementsByTagName('img') as $image) {
// Skip images without required attributes
if (!$src = @$image->getAttribute('src')) {
continue;
}
if (!$alt = @$image->getAttribute('alt')) {
continue;
}
// Add domain to the relative links
if (!parse_url($src, PHP_URL_HOST)) {
$src = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
parse_url($queue->url, PHP_URL_HOST) .
parse_url($queue->url, PHP_URL_PORT) .
$src; // @TODO sometimes wrong URL prefix available
}
// Add page images
$db->addImage($queue->pageId,
Filter::url($src),
crc32($src),
Filter::imageAlt($alt));
}
}
// Collect internal links from page content // Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) { foreach(@$dom->getElementsByTagName('a') as $a) {
@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
continue; continue;
} }
// Add absolute prefixes to the relative links // Add absolute URL prefixes to the relative links found
if (!parse_url($href, PHP_URL_HOST)) { if (!parse_url($href, PHP_URL_HOST)) {
$href = parse_url($queue->url, PHP_URL_SCHEME) . '://' . $href = $queueHostPage->scheme . '://' .
parse_url($queue->url, PHP_URL_HOST) . $queueHostPage->name .
parse_url($queue->url, PHP_URL_PORT) . ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
$href; '/' . ltrim($href, '/');
} }
// Filter href URL // Validate formatted link
$href = Filter::url($href);
// Save valid internal links to the index queue
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
$db->initPage($href, crc32($href), time()); $db->beginTransaction();
try {
// Parse formatted link
$hostURL = Parser::hostURL($href);
$hostPageURI = Parser::uri($href);
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
if ($hostId) {
echo 'hostmane ' . $hostURL->string . PHP_EOL;
$hostsAdded++;
} else {
continue;
}
}
// Init robots parser
$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
$hostPagesAdded++;
}
}
$db->commit();
} catch(Exception $e){
var_dump($e);
$db->rollBack();
}
} }
} }
} }
// Debug
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

11
library/filter.php

@ -4,16 +4,7 @@ class Filter {
static public function url(string $url) { static public function url(string $url) {
return trim($url); return trim(urldecode($url));
}
static public function imageAlt(string $alt) {
$alt = preg_replace('/[\s]+/', ' ', $alt);
$alt = trim($alt);
return $alt;
} }
static public function pageTitle(string $title) { static public function pageTitle(string $title) {

196
library/mysql.php

@ -0,0 +1,196 @@
<?php
class MySQL {

    /** PDO handle every query of this class goes through. */
    private PDO $_db;

    /**
     * Open the PDO connection and configure it to throw exceptions and to
     * return rows as objects.
     *
     * @param string $host     server host name
     * @param int    $port     server port
     * @param string $database database (schema) name
     * @param string $username account name
     * @param string $password account password
     */
    public function __construct(string $host, int $port, string $database, string $username, string $password) {

        $dsn = sprintf('mysql:dbname=%s;host=%s;port=%d;charset=utf8', $database, $host, $port);

        $this->_db = new PDO($dsn, $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);

        $attributes = [
            PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
            PDO::ATTR_TIMEOUT            => 600,
        ];

        foreach ($attributes as $attribute => $value) {
            $this->_db->setAttribute($attribute, $value);
        }
    }

    /**
     * Prepare a statement, execute it with the given positional values and
     * hand the executed statement back to the caller.
     *
     * @param string     $sql        statement text with `?` placeholders
     * @param array|null $parameters positional values, or null when there are none
     */
    private function _run(string $sql, ?array $parameters = null) {

        $statement = $this->_db->prepare($sql);
        $statement->execute($parameters);

        return $statement;
    }

    // System

    /** Start a transaction on the underlying connection. */
    public function beginTransaction() {
        $this->_db->beginTransaction();
    }

    /** Commit the current transaction. */
    public function commit() {
        $this->_db->commit();
    }

    /** Roll the current transaction back. */
    public function rollBack() {
        $this->_db->rollBack();
    }

    // Host

    /**
     * Look up one `host` row by its URL checksum.
     *
     * @return mixed the row object, or false when nothing matches
     */
    public function getHost(int $crc32url) {
        return $this->_run('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1', [$crc32url])->fetch();
    }

    /**
     * Insert a new `host` row.
     *
     * @return mixed the value reported by PDO::lastInsertId() after the insert
     */
    public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) {

        $values = [$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots];

        $this->_run('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', $values);

        return $this->_db->lastInsertId();
    }

    // Pages

    /** Count the `hostPage` rows that belong to the given host. */
    public function getTotalHostPages(int $hostId) {
        return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?', [$hostId])->fetch()->total;
    }

    /**
     * Count the `hostPage` rows having the given HTTP code.
     *
     * @param mixed $httpCode code to match; null counts the rows whose code is NULL
     */
    public function getTotalPagesByHttpCode(mixed $httpCode) {

        // NULL needs the IS NULL form, a bound `= ?` would never match it
        if (is_null($httpCode)) {
            return $this->_db->query('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL')->fetch()->total;
        }

        return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?', [$httpCode])->fetch()->total;
    }

    /** Count all `hostPage` rows. */
    public function getTotalPages() {
        return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage`')->fetch()->total;
    }

    /**
     * Look up one page of a host by the checksum of its URI.
     *
     * @return mixed the row object, or false when nothing matches
     */
    public function getHostPage(int $hostId, int $crc32uri) {
        return $this->_run('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1', [$hostId, $crc32uri])->fetch();
    }

    /**
     * Load the page columns together with the scheme, name and port of its
     * host for one page id.
     *
     * @return mixed the joined row object, or false when nothing matches
     */
    public function getFoundHostPage(int $hostPageId) {
        return $this->_run(<<<'SQL'
            SELECT `hostPage`.`metaTitle`,
            `hostPage`.`metaDescription`,
            `hostPage`.`data`,
            `hostPage`.`uri`,
            `host`.`scheme`,
            `host`.`name`,
            `host`.`port`
            FROM `hostPage`
            JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
            WHERE `hostPage`.`hostPageId` = ?
            LIMIT 1
            SQL, [$hostPageId])->fetch();
    }

    /**
     * Insert a new `hostPage` row; every optional column defaults to null.
     *
     * @return mixed the value reported by PDO::lastInsertId() after the insert
     */
    public function addHostPage(int $hostId,
                                int $crc32uri,
                                string $uri,
                                int $timeAdded,
                                mixed $timeUpdated = null,
                                mixed $httpCode = null,
                                mixed $rank = null,
                                mixed $metaTitle = null,
                                mixed $metaDescription = null,
                                mixed $metaKeywords = null,
                                mixed $data = null) {

        $values = [$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data];

        $this->_run(<<<'SQL'
            INSERT INTO `hostPage` (`hostId`,
            `crc32uri`,
            `uri`,
            `timeAdded`,
            `timeUpdated`,
            `httpCode`,
            `rank`,
            `metaTitle`,
            `metaDescription`,
            `metaKeywords`,
            `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            SQL, $values);

        return $this->_db->lastInsertId();
    }

    /**
     * Overwrite the meta columns and the data of one page.
     *
     * @return int row count reported by the statement
     */
    public function updateHostPage( int $hostPageId,
                                    mixed $metaTitle,
                                    mixed $metaDescription,
                                    mixed $metaKeywords,
                                    mixed $data) {

        return $this->_run(<<<'SQL'
            UPDATE `hostPage` SET `metaTitle` = ?,
            `metaDescription` = ?,
            `metaKeywords` = ?,
            `data` = ? WHERE `hostPageId` = ? LIMIT 1
            SQL, [$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId])->rowCount();
    }

    // Crawl tools

    /**
     * Select pages that were never updated, or were last updated before
     * $timeFrom, on hosts whose status is not 0, ordered by page id.
     *
     * @param int $limit    maximum number of rows
     * @param int $timeFrom value compared against `timeUpdated`
     * @return array row objects
     */
    public function getCrawlQueue(int $limit, int $timeFrom) {

        $sql = <<<'SQL'
            SELECT `hostPage`.`hostPageId`,
            `hostPage`.`uri`,
            `host`.`scheme`,
            `host`.`name`,
            `host`.`port`,
            `host`.`crawlPageLimit`,
            `host`.`crawlPageMetaOnly`,
            `host`.`robots`
            FROM `hostPage`
            JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
            WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
            ORDER BY `hostPage`.`hostPageId`
            LIMIT
            SQL;

        // The limit is appended as text instead of being bound; the int type keeps it a plain number
        return $this->_run($sql . ' ' . $limit, [$timeFrom])->fetchAll();
    }

    /**
     * Store the update time and the HTTP code of one page.
     *
     * @return int row count reported by the statement
     */
    public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) {
        return $this->_run('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1', [$timeUpdated, $httpCode, $hostPageId])->rowCount();
    }
}

73
library/parser.php

@ -0,0 +1,73 @@
<?php
class Parser {

    /**
     * Split a URL into its host-level parts.
     *
     * The returned object has four properties: `string` (scheme://host[:port]),
     * `scheme`, `name` (the host) and `port` (null when the URL has none).
     *
     * @param string $string URL to parse
     * @return object|false the parts, or false when the scheme or the host is missing
     */
    static public function hostURL(string $string) {

        // Parse once; parse_url() yields false on seriously malformed input
        $parts = parse_url($string) ?: [];

        // Scheme and host are both mandatory
        if (empty($parts['scheme']) || empty($parts['host'])) {
            return false;
        }

        $result = [
            'string' => $parts['scheme'] . '://' . $parts['host'],
            'scheme' => $parts['scheme'],
            'name'   => $parts['host'],
            'port'   => null,
        ];

        // Port is optional
        if (!empty($parts['port'])) {
            $result['string'] .= ':' . $parts['port'];
            $result['port']    = $parts['port'];
        }

        return (object) $result;
    }

    /**
     * Extract the path and the query of a URL.
     *
     * The returned object has three properties: `string` (path plus `?query`
     * when a query exists), `path` and `query` (with its leading `?`, or null).
     * A missing path is reported as `/`.
     *
     * @param string $string URL to parse
     * @return object
     */
    static public function uri(string $string) {

        // Parse once; parse_url() yields false on seriously malformed input
        $parts = parse_url($string) ?: [];

        $result = [
            'string' => '/',
            'path'   => '/',
            'query'  => null,
        ];

        if (!empty($parts['path'])) {
            $result['string'] = $parts['path'];
            $result['path']   = $parts['path'];
        }

        // Compare against the empty string instead of testing truthiness:
        // the query "0" is falsy in PHP yet it is a real query that must be kept
        $query = $parts['query'] ?? '';

        if ($query !== '') {
            $result['string'] .= '?' . $query;
            $result['query']   = '?' . $query;
        }

        return (object) $result;
    }
}

14
library/robots.php

@ -1,7 +1,5 @@
<?php <?php
// @TODO #2
class Robots { class Robots {
private $_rule = []; private $_rule = [];
@ -42,16 +40,10 @@ class Robots {
} }
} }
public function indexURL(string $url) { public function uriAllowed(string $uri) {
// Unify case match // Unify case match
$url = strtolower(trim($url)); $uri = strtolower(trim($uri));
// Convert to URI
$url = str_replace(parse_url($url, PHP_URL_SCHEME) . '://' .
parse_url($url, PHP_URL_HOST) .
parse_url($url, PHP_URL_PORT),
'', $url);
// Index by default // Index by default
$result = true; $result = true;
@ -59,7 +51,7 @@ class Robots {
// Begin index rules by ASC priority // Begin index rules by ASC priority
foreach ($this->_rule as $rule => $value) { foreach ($this->_rule as $rule => $value) {
if (preg_match('!^' . $rule . '!', $url)) { if (preg_match('!^' . $rule . '!', $uri)) {
$result = $value; $result = $value;
} }

31
library/sphinxql.php

@ -0,0 +1,31 @@
<?php
class SphinxQL {

    /** PDO handle used for every search statement. */
    private $_sphinx;

    /**
     * Open a PDO connection through the mysql driver, without credentials and
     * without a database name, and configure it to throw exceptions and to
     * return rows as objects.
     *
     * @param string $host server host
     * @param int    $port server port
     */
    public function __construct(string $host, int $port) {

        $dsn = sprintf('mysql:host=%s;port=%d;charset=utf8', $host, $port);

        $this->_sphinx = new PDO($dsn, false, false, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);

        $attributes = [
            PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
        ];

        foreach ($attributes as $attribute => $value) {
            $this->_sphinx->setAttribute($attribute, $value);
        }
    }

    /**
     * Run a MATCH query against `hostPage` and return one window of rows.
     *
     * @param string $keyword expression bound to MATCH()
     * @param int    $start   offset of the first row
     * @param int    $limit   maximum number of rows
     * @return array row objects
     */
    public function searchHostPages(string $keyword, int $start, int $limit) {

        // Offset and size are written into the statement text; only the keyword is bound
        $statement = $this->_sphinx->prepare(
            sprintf('SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT %d,%d', $start, $limit)
        );

        $statement->execute([$keyword]);

        return $statement->fetchAll();
    }

    /**
     * Count the `hostPage` rows that match the keyword.
     *
     * @param string $keyword expression bound to MATCH()
     */
    public function searchHostPagesTotal(string $keyword) {

        $statement = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');

        $statement->execute([$keyword]);

        $row = $statement->fetch();

        return $row->total;
    }
}

170
library/sqlite.php

@ -1,170 +0,0 @@
<?php
class SQLite {

    /** PDO handle every query of this class goes through. */
    private PDO $_db;

    /**
     * Open the SQLite database and create the tables, the FTS5 index and its
     * synchronisation triggers when they do not exist yet.
     *
     * @param string $database path placed after the `sqlite:` DSN prefix
     * @param string $username passed to PDO as is
     * @param string $password passed to PDO as is
     */
    public function __construct(string $database, string $username, string $password) {

        $this->_db = new PDO('sqlite:' . $database, $username, $password);
        $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
        $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);

        $this->_db->query(<<<'SQL'
            CREATE TABLE IF NOT EXISTS "page" (
            "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
            "crc32url" INTEGER NOT NULL UNIQUE,
            "httpCode" INTEGER,
            "timeAdded" INTEGER NOT NULL,
            "timeUpdated" INTEGER,
            "title" TEXT,
            "data" TEXT,
            "description" TEXT,
            "keywords" TEXT,
            "url" TEXT NOT NULL
            )
            SQL);

        $this->_db->query(<<<'SQL'
            CREATE TABLE IF NOT EXISTS "image" (
            "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
            "crc32src" INTEGER NOT NULL UNIQUE,
            "pageId" INTEGER NOT NULL,
            "alt" TEXT NOT NULL,
            "src" TEXT NOT NULL
            )
            SQL);

        // FTS5: full text index over `page`, kept in sync by the three triggers below
        $this->_db->query(<<<'SQL'
            CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`)
            SQL);

        $this->_db->query(<<<'SQL'
            CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN
            INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
            END
            SQL);

        $this->_db->query(<<<'SQL'
            CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN
            INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
            END
            SQL);

        $this->_db->query(<<<'SQL'
            CREATE TRIGGER IF NOT EXISTS `pageUpdate` UPDATE ON `page` BEGIN
            INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
            INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
            END
            SQL);
    }

    /**
     * Count the `page` rows having the given HTTP code.
     *
     * @param mixed $httpCode code to match; null counts the rows whose code is NULL
     */
    public function getTotalPagesByHttpCode(mixed $httpCode) {

        // NULL needs the IS NULL form, a bound `= ?` would never match it
        if (is_null($httpCode)) {

            $query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` IS NULL');

        } else {

            $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` = ?');
            $query->execute([$httpCode]);
        }

        return $query->fetch()->total;
    }

    /** Count all `page` rows. */
    public function getTotalPages() {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`');
        $query->execute();

        return $query->fetch()->total;
    }

    /**
     * Store the crawled content of one page.
     *
     * @return int row count reported by the statement
     */
    public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) {

        // `keywords` is written as well: it is a column of `page` and part of the FTS index,
        // leaving it out silently discarded the value passed by the caller
        $query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `keywords` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?');
        $query->execute([$title, $description, $keywords, $data, $timeUpdated, $pageId]);

        return $query->rowCount();
    }

    /**
     * Insert a page row from already crawled content.
     *
     * NOTE(review): the schema created by the constructor declares `url` and
     * `crc32url` NOT NULL without a default and this INSERT supplies neither,
     * so it looks unable to succeed on such a table - confirm whether the
     * method is still in use (initPage() is the insert that provides both).
     *
     * @return mixed the value reported by PDO::lastInsertId() after the insert
     */
    public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded) {

        // `keywords` is stored too, it used to be accepted and then dropped
        $query = $this->_db->prepare('INSERT INTO `page` (`title`, `description`, `keywords`, `data`, `timeAdded`) VALUES (?, ?, ?, ?, ?)');
        $query->execute([$title, $description, $keywords, $data, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /**
     * Register a URL; INSERT OR IGNORE leaves an existing row with the same
     * `crc32url` untouched.
     *
     * @return mixed the value reported by PDO::lastInsertId() after the statement
     */
    public function initPage(string $url, int $crc32url, int $timeAdded) {

        $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');
        $query->execute([$url, $crc32url, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /**
     * Register an image of a page; INSERT OR IGNORE leaves an existing row
     * with the same `crc32src` untouched.
     *
     * @return mixed the value reported by PDO::lastInsertId() after the statement
     */
    public function addImage(int $pageId, string $src, int $crc32src, string $alt) {

        $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');
        $query->execute([$pageId, $src, $crc32src, $alt]);

        return $this->_db->lastInsertId();
    }

    /**
     * Delete every image row of one page.
     *
     * @return int row count reported by the statement
     */
    public function deleteImages(int $pageId) {

        $query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?');
        $query->execute([$pageId]);

        return $query->rowCount();
    }

    /**
     * Select pages that were never updated or were last updated before
     * $timeFrom, ordered by page id.
     *
     * @param int $limit    maximum number of rows
     * @param int $timeFrom value compared against `timeUpdated`
     * @return array row objects
     */
    public function getPageQueue(int $limit, int $timeFrom) {

        $query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit);
        $query->execute([$timeFrom]);

        return $query->fetchAll();
    }

    /**
     * Store the update time and the HTTP code of one page.
     *
     * @return int row count reported by the statement
     */
    public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) {

        // No LIMIT clause: SQLite only parses UPDATE ... LIMIT when built with
        // SQLITE_ENABLE_UPDATE_DELETE_LIMIT, and `pageId` is the primary key,
        // so at most one row can match anyway
        $query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ?');
        $query->execute([$timeUpdated, $httpCode, $pageId]);

        return $query->rowCount();
    }

    /**
     * Full text search over the `data` column of the FTS index, ordered by rank.
     *
     * @param string $q     expression bound to MATCH
     * @param int    $start offset of the first row
     * @param int    $limit maximum number of rows
     * @return array row objects with `title`, `description` and `url`
     */
    public function searchPages(string $q, int $start = 0, int $limit = 100) {

        $query = $this->_db->prepare('SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank` LIMIT ' . (int) $start . ',' . (int) $limit);
        $query->execute([$q]);

        return $query->fetchAll();
    }

    /**
     * Count the FTS rows whose `data` column matches the expression.
     *
     * @param string $q expression bound to MATCH
     */
    public function searchPagesTotal(string $q) {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?');
        $query->execute([$q]);

        return $query->fetch()->total;
    }
}

4
public/index.php

@ -3,10 +3,10 @@
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
require_once('../library/filter.php'); require_once('../library/filter.php');
require_once('../library/sqlite.php'); require_once('../library/mysql.php');
// Connect database // Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
$totalPages = $db->getTotalPages(); $totalPages = $db->getTotalPages();

102
public/search.php

@ -2,11 +2,18 @@
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php'); require_once('../library/filter.php');
require_once('../library/sqlite.php'); require_once('../library/parser.php');
require_once('../library/mysql.php');
require_once('../library/sphinxql.php');
// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
// Connect database // Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Define page basics // Define page basics
$totalPages = $db->getTotalPages(); $totalPages = $db->getTotalPages();
@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
// Crawl request // Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->initPage($q, crc32($q), time()); $db->beginTransaction();
try {
// Parse host info
if ($hostURL = Parser::hostURL($q)) {
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
}
// Parse page URI
$hostPageURI = Parser::uri($q);
// Init robots parser
$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
}
$db->commit();
} catch(Exception $e){
$db->rollBack();
}
} }
// Search request // Search request
if (!empty($q)) { if (!empty($q)) {
$results = $db->searchPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT); $results = $sphinx->searchHostPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
$resultsTotal = $db->searchPagesTotal('"' . $q . '"'); $resultsTotal = $sphinx->searchHostPagesTotal('"' . $q . '"');
} else { } else {
@ -196,16 +265,19 @@ if (!empty($q)) {
<?php } ?> <?php } ?>
</div> </div>
<?php foreach ($results as $result) { ?> <?php foreach ($results as $result) { ?>
<div> <?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<h2><?php echo $result->title ?></h2> <?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<?php if (!empty($result->description)) { ?> <div>
<span><?php echo $result->description ?></span> <h2><?php echo $hostPage->metaTitle ?></h2>
<?php } ?> <?php if (!empty($hostPage->metaDescription)) { ?>
<a href="<?php echo $result->url ?>"> <span><?php echo $hostPage->metaDescription ?></span>
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode(parse_url($result->url, PHP_URL_HOST)) ?>" alt="favicon" width="16" height="16" /> <?php } ?>
<?php echo $result->url ?> <a href="<?php echo $hostPageURL ?>">
</a> <img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" />
</div> <?php echo $hostPageURL ?>
</a>
</div>
<?php } ?>
<?php } ?> <?php } ?>
<?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?> <?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?>
<div> <div>

Loading…
Cancel
Save