Browse Source

implement MySQL/Sphinx data model #3, add basic robots.txt support #2

main
ghost 2 years ago
parent
commit
2495a2bbc7
  1. 3
      .gitignore
  2. 11
      README.md
  3. 16
      config/app.php.txt
  4. 22
      config/sphinx.conf.txt
  5. 180
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 11
      library/filter.php
  8. 196
      library/mysql.php
  9. 73
      library/parser.php
  10. 14
      library/robots.php
  11. 31
      library/sphinxql.php
  12. 170
      library/sqlite.php
  13. 4
      public/index.php
  14. 102
      public/search.php

3
.gitignore vendored

@ -2,5 +2,8 @@ @@ -2,5 +2,8 @@
.ftpignore
config/app.php
config/sphinx.conf
database/yggo.mwb.bak
storage

11
README.md

@ -28,7 +28,8 @@ php-dom @@ -28,7 +28,8 @@ php-dom
php-pdo
php-curl
php-gd
sqlite / fts5
php-mysql
sphinx search server
```
#### Installation
@ -39,12 +40,16 @@ sqlite / fts5 @@ -39,12 +40,16 @@ sqlite / fts5
* Set up the `/crontab/crawler.php` script for execution every minute; the optimal interval mostly depends on the configs and the target network volume. There is no debug output implemented yet, so silence it by redirecting output to `/dev/null`
* The script has no MVC model because it is super simple: it is just 2 files, and everything else is stored encapsulated in `/library` classes.
#### Configuration
todo
#### Roadmap / ideas
* [x] Web pages full text ranking search
* [x] Make search results pagination
* [ ] Blacklist domains (useful for some mirrors)
* [ ] Add robots.txt support (Issue #2)
* [x] Add robots.txt support (Issue #2)
* [ ] Improve yggdrasil links detection, add .ygg domain zone support
* [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights
* [ ] Images search (basically implemented but requires testing and some performance optimization)
@ -66,6 +71,8 @@ git checkout master @@ -66,6 +71,8 @@ git checkout master
git checkout -b my-pr-branch-name
```
See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### Donate to contributors
* @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ)

16
config/app.php.txt

@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100); @@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
define('DB_NAME', 'database.sqlite');
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
// Crawl settings
define('CRAWL_IMAGE', false); // @TODO
// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Crawl settings
define('CRAWL_PAGE_LIMIT', 10);
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
define('CRAWL_META_ONLY', false);
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_STATUS', true);
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
define('CRAWL_ROBOTS_DEFAULT_RULES', "");

22
config/sphinx.conf.txt

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
source hostPage
{
type = mysql
sql_host = localhost
sql_user =
sql_pass =
sql_db =
sql_port = 3306 # optional, default is 3306
sql_query = \
SELECT hostPageId, metaTitle, metaDescription, metaKeywords, data, uri \
FROM hostPage
sql_attr_uint = hostPageId
}
index hostPage
{
source = hostPage
path =
}

180
crontab/crawler.php

@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) { @@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) {
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
// Debug
$timeStart = microtime(true);
$hostPagesProcessed = 0;
$hostPagesIndexed = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue
foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) {
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
$url = new Curl($queue->url);
$curl = new Curl($queueHostPageURL);
$db->updatePageQueue($queue->pageId, time(), $url->getCode());
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip processing non 200 code
if (200 != $url->getCode()) {
// Skip next page processing non 200 code
if (200 != $curl->getCode()) {
continue;
}
// Skip processing pages without returned data
if (!$content = $url->getContent()) {
// Skip next page processing pages without returned data
if (!$content = $curl->getContent()) {
continue;
}
// Grab page content
$dom = new DomDocument();
@$dom->loadHTML($content);
@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) @@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
}
}
// Index page data
$db->updatePage($queue->pageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description),
Filter::pageKeywords($keywords),
CRAWL_META_ONLY ? '' : Filter::pageData($content),
time());
// Update images
$db->deleteImages($queue->pageId);
if (CRAWL_IMAGE) {
foreach (@$dom->getElementsByTagName('img') as $image) {
// Skip images without required attributes
if (!$src = @$image->getAttribute('src')) {
continue;
}
if (!$alt = @$image->getAttribute('alt')) {
continue;
}
// Add domain to the relative links
if (!parse_url($src, PHP_URL_HOST)) {
$src = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
parse_url($queue->url, PHP_URL_HOST) .
parse_url($queue->url, PHP_URL_PORT) .
$src; // @TODO sometimes wrong URL prefix available
}
// Add page images
$db->addImage($queue->pageId,
Filter::url($src),
crc32($src),
Filter::imageAlt($alt));
}
}
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description),
Filter::pageKeywords($keywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) {
@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) @@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
continue;
}
// Add absolute prefixes to the relative links
// Add absolute URL prefixes to the relative links found
if (!parse_url($href, PHP_URL_HOST)) {
$href = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
parse_url($queue->url, PHP_URL_HOST) .
parse_url($queue->url, PHP_URL_PORT) .
$href;
$href = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . ltrim($href, '/');
}
// Filter href URL
$href = Filter::url($href);
// Save valid internal links to the index queue
// Validate formatted link
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
$db->initPage($href, crc32($href), time());
$db->beginTransaction();
try {
// Parse formatted link
$hostURL = Parser::hostURL($href);
$hostPageURI = Parser::uri($href);
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
if ($hostId) {
echo 'hostmane ' . $hostURL->string . PHP_EOL;
$hostsAdded++;
} else {
continue;
}
}
// Init robots parser
$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
$hostPagesAdded++;
}
}
$db->commit();
} catch(Exception $e){
var_dump($e);
$db->rollBack();
}
}
}
}
// Debug
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

11
library/filter.php

@ -4,16 +4,7 @@ class Filter { @@ -4,16 +4,7 @@ class Filter {
static public function url(string $url) {
return trim($url);
}
static public function imageAlt(string $alt) {
$alt = preg_replace('/[\s]+/', ' ', $alt);
$alt = trim($alt);
return $alt;
return trim(urldecode($url));
}
static public function pageTitle(string $title) {

196
library/mysql.php

@ -0,0 +1,196 @@ @@ -0,0 +1,196 @@
<?php
class MySQL {

  // Shared PDO connection handle
  private PDO $_db;

  /**
   * Connect to the MySQL database
   *
   * Uses UTF-8 charset, exception error mode, object fetch mode
   * and a 10 minute connection timeout
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password) {

    $this->_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);
    $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
    $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);
  }

  // System

  public function beginTransaction() {

    $this->_db->beginTransaction();
  }

  public function commit() {

    $this->_db->commit();
  }

  public function rollBack() {

    $this->_db->rollBack();
  }

  // Host

  /**
   * Get a host row by the CRC32 checksum of its URL
   *
   * Returns the row object, or false when not registered
   */
  public function getHost(int $crc32url) {

    $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');

    $query->execute([$crc32url]);

    return $query->fetch();
  }

  /**
   * Register a new host
   *
   * Returns the auto-increment hostId of the inserted row
   */
  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) {

    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');

    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots]);

    return $this->_db->lastInsertId();
  }

  // Pages

  /**
   * Count pages registered for the given host
   */
  public function getTotalHostPages(int $hostId) {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');

    $query->execute([$hostId]);

    return $query->fetch()->total;
  }

  /**
   * Count pages by their crawl HTTP code
   *
   * Pass null to count pages not crawled yet (httpCode IS NULL)
   */
  public function getTotalPagesByHttpCode(mixed $httpCode) {

    if (is_null($httpCode)) {

      $query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL');

    } else {

      $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?');

      $query->execute([$httpCode]);
    }

    return $query->fetch()->total;
  }

  /**
   * Count all registered pages
   */
  public function getTotalPages() {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`');

    $query->execute();

    return $query->fetch()->total;
  }

  /**
   * Get a page row by host id and CRC32 checksum of its URI
   *
   * Returns the row object, or false when not registered
   */
  public function getHostPage(int $hostId, int $crc32uri) {

    $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

    $query->execute([$hostId, $crc32uri]);

    return $query->fetch();
  }

  /**
   * Get page meta joined with its host connection details,
   * used to render a search result found by Sphinx
   */
  public function getFoundHostPage(int $hostPageId) {

    $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
                                         `hostPage`.`metaDescription`,
                                         `hostPage`.`data`,
                                         `hostPage`.`uri`,
                                         `host`.`scheme`,
                                         `host`.`name`,
                                         `host`.`port`

                                  FROM `hostPage`
                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)

                                  WHERE `hostPage`.`hostPageId` = ?

                                  LIMIT 1');

    $query->execute([$hostPageId]);

    return $query->fetch();
  }

  /**
   * Register a new page for a host
   *
   * Meta fields are optional and stay NULL until the crawler
   * visits the page; returns the auto-increment hostPageId
   */
  public function addHostPage(int $hostId,
                              int $crc32uri,
                              string $uri,
                              int $timeAdded,
                              mixed $timeUpdated = null,
                              mixed $httpCode = null,
                              mixed $rank = null,
                              mixed $metaTitle = null,
                              mixed $metaDescription = null,
                              mixed $metaKeywords = null,
                              mixed $data = null) {

    $query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
                                                          `crc32uri`,
                                                          `uri`,
                                                          `timeAdded`,
                                                          `timeUpdated`,
                                                          `httpCode`,
                                                          `rank`,
                                                          `metaTitle`,
                                                          `metaDescription`,
                                                          `metaKeywords`,
                                                          `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');

    $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);

    return $this->_db->lastInsertId();
  }

  /**
   * Store crawled page meta and content dump
   *
   * Returns the affected row count (0 when page not found
   * or values unchanged)
   */
  public function updateHostPage( int $hostPageId,
                                  mixed $metaTitle,
                                  mixed $metaDescription,
                                  mixed $metaKeywords,
                                  mixed $data) {

    $query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle`       = ?,
                                                        `metaDescription` = ?,
                                                        `metaKeywords`    = ?,
                                                        `data`            = ? WHERE `hostPageId` = ? LIMIT 1');

    $query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]);

    return $query->rowCount();
  }

  // Crawl tools

  /**
   * Select up to $limit pages pending crawl: never updated, or
   * last updated before $timeFrom, on enabled hosts only
   */
  public function getCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
                                         `hostPage`.`uri`,
                                         `host`.`scheme`,
                                         `host`.`name`,
                                         `host`.`port`,
                                         `host`.`crawlPageLimit`,
                                         `host`.`crawlPageMetaOnly`,
                                         `host`.`robots`

                                  FROM `hostPage`
                                  JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)

                                  WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0

                                  ORDER BY `hostPage`.`hostPageId`

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom]);

    return $query->fetchAll();
  }

  /**
   * Mark a queued page as processed with its crawl time and HTTP code
   *
   * Fix: $hostPageId is an integer identifier — the previous
   * string type hint was inconsistent with every other method
   */
  public function updateCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {

    $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');

    $query->execute([$timeUpdated, $httpCode, $hostPageId]);

    return $query->rowCount();
  }
}

73
library/parser.php

@ -0,0 +1,73 @@ @@ -0,0 +1,73 @@
<?php
class Parser {

  /**
   * Extract the scheme://host[:port] part of an URL
   *
   * Returns an object with string, scheme, name and port
   * properties, or false when the scheme or host is missing
   */
  static public function hostURL(string $string) {

    $scheme = parse_url($string, PHP_URL_SCHEME);
    $name   = parse_url($string, PHP_URL_HOST);

    // Both scheme and host are required to rebuild the host URL
    if (!$scheme || !$name) {

      return false;
    }

    // Port is optional; parse_url() yields null when absent
    $port = parse_url($string, PHP_URL_PORT);

    return (object) [
      'string' => $scheme . '://' . $name . ($port ? ':' . $port : ''),
      'scheme' => $scheme,
      'name'   => $name,
      'port'   => $port ? $port : null,
    ];
  }

  /**
   * Extract the path + query part of an URL
   *
   * Always returns an object; path and string default to '/'
   * when the URL carries no path, query is null when absent
   */
  static public function uri(string $string) {

    $path  = parse_url($string, PHP_URL_PATH);
    $query = parse_url($string, PHP_URL_QUERY);

    return (object) [
      'string' => ($path ? $path : '/') . ($query ? '?' . $query : ''),
      'path'   => $path ? $path : '/',
      'query'  => $query ? '?' . $query : null,
    ];
  }
}

14
library/robots.php

@ -1,7 +1,5 @@ @@ -1,7 +1,5 @@
<?php
// @TODO #2
class Robots {
private $_rule = [];
@ -42,16 +40,10 @@ class Robots { @@ -42,16 +40,10 @@ class Robots {
}
}
public function indexURL(string $url) {
public function uriAllowed(string $uri) {
// Unify case match
$url = strtolower(trim($url));
// Convert to URI
$url = str_replace(parse_url($url, PHP_URL_SCHEME) . '://' .
parse_url($url, PHP_URL_HOST) .
parse_url($url, PHP_URL_PORT),
'', $url);
$uri = strtolower(trim($uri));
// Index by default
$result = true;
@ -59,7 +51,7 @@ class Robots { @@ -59,7 +51,7 @@ class Robots {
// Begin index rules by ASC priority
foreach ($this->_rule as $rule => $value) {
if (preg_match('!^' . $rule . '!', $url)) {
if (preg_match('!^' . $rule . '!', $uri)) {
$result = $value;
}

31
library/sphinxql.php

@ -0,0 +1,31 @@ @@ -0,0 +1,31 @@
<?php
class SphinxQL {

  // PDO connection to searchd over the MySQL wire protocol
  private $_sphinx;

  /**
   * Connect to the Sphinx search server
   *
   * Fix: pass null (not false) for the username/password
   * arguments — PDO::__construct() expects ?string there
   */
  public function __construct(string $host, int $port) {

    $this->_sphinx = new PDO('mysql:host=' . $host . ';port=' . $port . ';charset=utf8', null, null, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);
    $this->_sphinx->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
  }

  /**
   * Full-text search over the hostPage index with pagination
   *
   * Keyword is bound as a parameter; offset/limit are cast to
   * int before concatenation (LIMIT cannot be a placeholder)
   */
  public function searchHostPages(string $keyword, int $start, int $limit) {

    $query = $this->_sphinx->prepare('SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT ' . (int) $start . ',' . (int) $limit);

    $query->execute([$keyword]);

    return $query->fetchAll();
  }

  /**
   * Count total matches for a keyword in the hostPage index
   */
  public function searchHostPagesTotal(string $keyword) {

    $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');

    $query->execute([$keyword]);

    return $query->fetch()->total;
  }
}

170
library/sqlite.php

@ -1,170 +0,0 @@ @@ -1,170 +0,0 @@
<?php
class SQLite {

  // Shared PDO connection handle
  private PDO $_db;

  /**
   * Open (or create) the SQLite database and ensure the schema:
   * page + image tables, an FTS5 external-content index over
   * page, and the triggers that keep the index in sync
   */
  public function __construct(string $database, string $username, string $password) {

    $this->_db = new PDO('sqlite:' . $database, $username, $password);
    $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
    $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);

    $this->_db->query('
      CREATE TABLE IF NOT EXISTS "page" (
        "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
        "crc32url" INTEGER NOT NULL UNIQUE,
        "httpCode" INTEGER,
        "timeAdded" INTEGER NOT NULL,
        "timeUpdated" INTEGER,
        "title" TEXT,
        "data" TEXT,
        "description" TEXT,
        "keywords" TEXT,
        "url" TEXT NOT NULL
      )
    ');

    $this->_db->query('
      CREATE TABLE IF NOT EXISTS "image" (
        "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
        "crc32src" INTEGER NOT NULL UNIQUE,
        "pageId" INTEGER NOT NULL,
        "alt" TEXT NOT NULL,
        "src" TEXT NOT NULL
      )
    ');

    // FTS5 external-content index over the page table
    $this->_db->query('
      CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`)
    ');

    $this->_db->query('
      CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN
        INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
      END
    ');

    $this->_db->query('
      CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN
        INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
      END
    ');

    // Fix: explicit AFTER timing — the FTS5 external-content sync
    // pattern requires AFTER UPDATE; without a timing keyword
    // SQLite defaults to BEFORE
    $this->_db->query('
      CREATE TRIGGER IF NOT EXISTS `pageUpdate` AFTER UPDATE ON `page` BEGIN
        INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
        INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
      END
    ');
  }

  /**
   * Count pages by crawl HTTP code; pass null for pages
   * not crawled yet (httpCode IS NULL)
   */
  public function getTotalPagesByHttpCode(mixed $httpCode) {

    if (is_null($httpCode)) {

      $query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` IS NULL');

    } else {

      $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` = ?');

      $query->execute([$httpCode]);
    }

    return $query->fetch()->total;
  }

  /**
   * Count all registered pages
   */
  public function getTotalPages() {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`');

    $query->execute();

    return $query->fetch()->total;
  }

  /**
   * Store crawled page meta and content dump
   *
   * Fix: the keywords column was missing from the UPDATE,
   * so the $keywords argument was silently discarded
   */
  public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) {

    $query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `keywords` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?');

    $query->execute([$title, $description, $keywords, $data, $timeUpdated, $pageId]);

    return $query->rowCount();
  }

  /**
   * Insert a page with meta only
   *
   * Fix: the keywords column was missing from the INSERT.
   * NOTE(review): url/crc32url are NOT NULL in the schema, so
   * this insert fails as written — confirm intended usage
   */
  public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded) {

    $query = $this->_db->prepare('INSERT INTO `page` (`title`, `description`, `keywords`, `data`, `timeAdded`) VALUES (?, ?, ?, ?, ?)');

    $query->execute([$title, $description, $keywords, $data, $timeAdded]);

    return $this->_db->lastInsertId();
  }

  /**
   * Queue a page URL for crawling; duplicate URLs are ignored.
   * Note: when the INSERT is ignored, lastInsertId() returns the
   * id of the most recent successful insert, not this row's id
   */
  public function initPage(string $url, int $crc32url, int $timeAdded) {

    $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');

    $query->execute([$url, $crc32url, $timeAdded]);

    return $this->_db->lastInsertId();
  }

  /**
   * Register an image found on a page; duplicate src ignored
   */
  public function addImage(int $pageId, string $src, int $crc32src, string $alt) {

    $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');

    $query->execute([$pageId, $src, $crc32src, $alt]);

    return $this->_db->lastInsertId();
  }

  /**
   * Remove all images registered for a page
   */
  public function deleteImages(int $pageId) {

    $query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?');

    $query->execute([$pageId]);

    return $query->rowCount();
  }

  /**
   * Select up to $limit pages pending crawl: never updated,
   * or last updated before $timeFrom
   */
  public function getPageQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit);

    $query->execute([$timeFrom]);

    return $query->fetchAll();
  }

  /**
   * Mark a queued page as processed with its crawl time and HTTP code
   *
   * Fix: removed "LIMIT 1" — UPDATE ... LIMIT is only available in
   * SQLite builds compiled with SQLITE_ENABLE_UPDATE_DELETE_LIMIT
   * and raises a syntax error elsewhere; pageId is the primary key,
   * so at most one row matches anyway
   */
  public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) {

    $query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ?');

    $query->execute([$timeUpdated, $httpCode, $pageId]);

    return $query->rowCount();
  }

  /**
   * Full-text search over the FTS5 index with pagination
   */
  public function searchPages(string $q, int $start = 0, int $limit = 100) {

    $query = $this->_db->prepare('SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank` LIMIT ' . (int) $start . ',' . (int) $limit);

    $query->execute([$q]);

    return $query->fetchAll();
  }

  /**
   * Count total full-text matches for a query
   */
  public function searchPagesTotal(string $q) {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?');

    $query->execute([$q]);

    return $query->fetch()->total;
  }
}

4
public/index.php

@ -3,10 +3,10 @@ @@ -3,10 +3,10 @@
// Load system dependencies
require_once('../config/app.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/mysql.php');
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
$totalPages = $db->getTotalPages();

102
public/search.php

@ -2,11 +2,18 @@ @@ -2,11 +2,18 @@
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/sqlite.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
require_once('../library/sphinxql.php');
// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
// Connect database
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Define page basics
$totalPages = $db->getTotalPages();
@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1; @@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->initPage($q, crc32($q), time());
$db->beginTransaction();
try {
// Parse host info
if ($hostURL = Parser::hostURL($q)) {
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
$hostRobots);
}
// Parse page URI
$hostPageURI = Parser::uri($q);
// Init robots parser
$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
}
$db->commit();
} catch(Exception $e){
$db->rollBack();
}
}
// Search request
if (!empty($q)) {
$results = $db->searchPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
$resultsTotal = $db->searchPagesTotal('"' . $q . '"');
$results = $sphinx->searchHostPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
$resultsTotal = $sphinx->searchHostPagesTotal('"' . $q . '"');
} else {
@ -196,16 +265,19 @@ if (!empty($q)) { @@ -196,16 +265,19 @@ if (!empty($q)) {
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>
<div>
<h2><?php echo $result->title ?></h2>
<?php if (!empty($result->description)) { ?>
<span><?php echo $result->description ?></span>
<?php } ?>
<a href="<?php echo $result->url ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode(parse_url($result->url, PHP_URL_HOST)) ?>" alt="favicon" width="16" height="16" />
<?php echo $result->url ?>
</a>
</div>
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<div>
<h2><?php echo $hostPage->metaTitle ?></h2>
<?php if (!empty($hostPage->metaDescription)) { ?>
<span><?php echo $hostPage->metaDescription ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" />
<?php echo $hostPageURL ?>
</a>
</div>
<?php } ?>
<?php } ?>
<?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?>
<div>

Loading…
Cancel
Save