mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-08 22:07:56 +00:00
This commit is contained in:
parent
a14d18fedb
commit
2495a2bbc7
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,5 +2,8 @@
|
|||||||
.ftpignore
|
.ftpignore
|
||||||
|
|
||||||
config/app.php
|
config/app.php
|
||||||
|
config/sphinx.conf
|
||||||
|
|
||||||
|
database/yggo.mwb.bak
|
||||||
|
|
||||||
storage
|
storage
|
||||||
|
11
README.md
11
README.md
@ -28,7 +28,8 @@ php-dom
|
|||||||
php-pdo
|
php-pdo
|
||||||
php-curl
|
php-curl
|
||||||
php-gd
|
php-gd
|
||||||
sqlite / fts5
|
php-mysql
|
||||||
|
sphinx search server
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Installation
|
#### Installation
|
||||||
@ -39,12 +40,16 @@ sqlite / fts5
|
|||||||
* Set up the `/crontab/crawler.php` script to run every minute; the best interval mostly depends on your configuration and the size of the target network. No debug mode is implemented yet, so silence the output by redirecting it to `/dev/null`
|
* Set up the `/crontab/crawler.php` script to run every minute; the best interval mostly depends on your configuration and the size of the target network. No debug mode is implemented yet, so silence the output by redirecting it to `/dev/null`
|
||||||
* The script has no MVC model because it is super simple: it is just 2 files, and everything else is encapsulated in the `/library` classes.
|
* The script has no MVC model because it is super simple: it is just 2 files, and everything else is encapsulated in the `/library` classes.
|
||||||
|
|
||||||
|
#### Configuration
|
||||||
|
|
||||||
|
todo
|
||||||
|
|
||||||
#### Roadmap / ideas
|
#### Roadmap / ideas
|
||||||
|
|
||||||
* [x] Web pages full text ranking search
|
* [x] Web pages full text ranking search
|
||||||
* [x] Make search results pagination
|
* [x] Make search results pagination
|
||||||
* [ ] Blacklist domains (useful for some mirrors)
|
* [ ] Blacklist domains (useful for some mirrors)
|
||||||
* [ ] Add robots.txt support (Issue #2)
|
* [x] Add robots.txt support (Issue #2)
|
||||||
* [ ] Improve yggdrasil links detection, add .ygg domain zone support
|
* [ ] Improve yggdrasil links detection, add .ygg domain zone support
|
||||||
* [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights
|
* [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights
|
||||||
* [ ] Images search (basically implemented but requires testing and some performance optimization)
|
* [ ] Images search (basically implemented but requires testing and some performance optimization)
|
||||||
@ -66,6 +71,8 @@ git checkout master
|
|||||||
git checkout -b my-pr-branch-name
|
git checkout -b my-pr-branch-name
|
||||||
```
|
```
|
||||||
|
|
||||||
|
See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
|
||||||
|
|
||||||
#### Donate to contributors
|
#### Donate to contributors
|
||||||
|
|
||||||
* @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ)
|
* @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ)
|
||||||
|
@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100);
|
|||||||
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
|
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
|
||||||
|
|
||||||
// Database
|
// Database
|
||||||
define('DB_NAME', 'database.sqlite');
|
define('DB_HOST', 'localhost');
|
||||||
|
define('DB_PORT', 3306);
|
||||||
|
define('DB_NAME', '');
|
||||||
define('DB_USERNAME', '');
|
define('DB_USERNAME', '');
|
||||||
define('DB_PASSWORD', '');
|
define('DB_PASSWORD', '');
|
||||||
|
|
||||||
// Crawl settings
|
// Sphinx
|
||||||
define('CRAWL_IMAGE', false); // @TODO
|
define('SPHINX_HOST', '127.0.0.1');
|
||||||
|
define('SPHINX_PORT', 9306);
|
||||||
|
|
||||||
|
// Crawl settings
|
||||||
define('CRAWL_PAGE_LIMIT', 10);
|
define('CRAWL_PAGE_LIMIT', 10);
|
||||||
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
|
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
|
||||||
|
|
||||||
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
|
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
|
||||||
|
|
||||||
define('CRAWL_META_ONLY', false);
|
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
|
||||||
|
define('CRAWL_HOST_DEFAULT_STATUS', true);
|
||||||
|
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
||||||
|
|
||||||
|
define('CRAWL_ROBOTS_DEFAULT_RULES', "");
|
||||||
|
22
config/sphinx.conf.txt
Normal file
22
config/sphinx.conf.txt
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
source hostPage
|
||||||
|
{
|
||||||
|
type = mysql
|
||||||
|
|
||||||
|
sql_host = localhost
|
||||||
|
sql_user =
|
||||||
|
sql_pass =
|
||||||
|
sql_db =
|
||||||
|
sql_port = 3306 # optional, default is 3306
|
||||||
|
|
||||||
|
sql_query = \
|
||||||
|
SELECT hostPageId, metaTitle, metaDescription, metaKeywords, data, uri \
|
||||||
|
FROM hostPage
|
||||||
|
|
||||||
|
sql_attr_uint = hostPageId
|
||||||
|
}
|
||||||
|
|
||||||
|
index hostPage
|
||||||
|
{
|
||||||
|
source = hostPage
|
||||||
|
path =
|
||||||
|
}
|
@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) {
|
|||||||
// Load system dependencies
|
// Load system dependencies
|
||||||
require_once('../config/app.php');
|
require_once('../config/app.php');
|
||||||
require_once('../library/curl.php');
|
require_once('../library/curl.php');
|
||||||
|
require_once('../library/robots.php');
|
||||||
require_once('../library/filter.php');
|
require_once('../library/filter.php');
|
||||||
require_once('../library/sqlite.php');
|
require_once('../library/parser.php');
|
||||||
|
require_once('../library/mysql.php');
|
||||||
|
|
||||||
|
// Debug
|
||||||
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
|
$hostPagesProcessed = 0;
|
||||||
|
$hostPagesIndexed = 0;
|
||||||
|
$hostPagesAdded = 0;
|
||||||
|
$hostsAdded = 0;
|
||||||
|
|
||||||
// Connect database
|
// Connect database
|
||||||
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||||
|
|
||||||
// Process crawl queue
|
// Process crawl queue
|
||||||
foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) {
|
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||||
|
|
||||||
$url = new Curl($queue->url);
|
// Build URL from the DB
|
||||||
|
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
||||||
|
|
||||||
$db->updatePageQueue($queue->pageId, time(), $url->getCode());
|
$curl = new Curl($queueHostPageURL);
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Update page index anyway, with the current time and http code
|
||||||
if (200 != $url->getCode()) {
|
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
|
||||||
|
|
||||||
|
// Skip next page processing non 200 code
|
||||||
|
if (200 != $curl->getCode()) {
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing pages without returned data
|
// Skip next page processing pages without returned data
|
||||||
if (!$content = $url->getContent()) {
|
if (!$content = $curl->getContent()) {
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Grab page content
|
||||||
$dom = new DomDocument();
|
$dom = new DomDocument();
|
||||||
|
|
||||||
@$dom->loadHTML($content);
|
@$dom->loadHTML($content);
|
||||||
@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Index page data
|
// Update queued page data
|
||||||
$db->updatePage($queue->pageId,
|
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||||
Filter::pageTitle($title->item(0)->nodeValue),
|
Filter::pageTitle($title->item(0)->nodeValue),
|
||||||
Filter::pageDescription($description),
|
Filter::pageDescription($description),
|
||||||
Filter::pageKeywords($keywords),
|
Filter::pageKeywords($keywords),
|
||||||
CRAWL_META_ONLY ? '' : Filter::pageData($content),
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||||
time());
|
|
||||||
|
|
||||||
// Update images
|
|
||||||
$db->deleteImages($queue->pageId);
|
|
||||||
|
|
||||||
if (CRAWL_IMAGE) {
|
|
||||||
|
|
||||||
foreach (@$dom->getElementsByTagName('img') as $image) {
|
|
||||||
|
|
||||||
// Skip images without required attributes
|
|
||||||
if (!$src = @$image->getAttribute('src')) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$alt = @$image->getAttribute('alt')) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add domain to the relative links
|
|
||||||
if (!parse_url($src, PHP_URL_HOST)) {
|
|
||||||
|
|
||||||
$src = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
|
|
||||||
parse_url($queue->url, PHP_URL_HOST) .
|
|
||||||
parse_url($queue->url, PHP_URL_PORT) .
|
|
||||||
$src; // @TODO sometimes wrong URL prefix available
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add page images
|
|
||||||
$db->addImage($queue->pageId,
|
|
||||||
Filter::url($src),
|
|
||||||
crc32($src),
|
|
||||||
Filter::imageAlt($alt));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect internal links from page content
|
// Collect internal links from page content
|
||||||
foreach(@$dom->getElementsByTagName('a') as $a) {
|
foreach(@$dom->getElementsByTagName('a') as $a) {
|
||||||
@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add absolute prefixes to the relative links
|
// Add absolute URL prefixes to the relative links found
|
||||||
if (!parse_url($href, PHP_URL_HOST)) {
|
if (!parse_url($href, PHP_URL_HOST)) {
|
||||||
|
|
||||||
$href = parse_url($queue->url, PHP_URL_SCHEME) . '://' .
|
$href = $queueHostPage->scheme . '://' .
|
||||||
parse_url($queue->url, PHP_URL_HOST) .
|
$queueHostPage->name .
|
||||||
parse_url($queue->url, PHP_URL_PORT) .
|
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||||
$href;
|
'/' . ltrim($href, '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Filter href URL
|
// Validate formatted link
|
||||||
$href = Filter::url($href);
|
|
||||||
|
|
||||||
// Save valid internal links to the index queue
|
|
||||||
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
|
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
|
||||||
|
|
||||||
$db->initPage($href, crc32($href), time());
|
$db->beginTransaction();
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Parse formatted link
|
||||||
|
$hostURL = Parser::hostURL($href);
|
||||||
|
$hostPageURI = Parser::uri($href);
|
||||||
|
|
||||||
|
// Host exists
|
||||||
|
if ($host = $db->getHost(crc32($hostURL->string))) {
|
||||||
|
|
||||||
|
$hostStatus = $host->status;
|
||||||
|
$hostPageLimit = $host->crawlPageLimit;
|
||||||
|
$hostId = $host->hostId;
|
||||||
|
$hostRobots = $host->robots;
|
||||||
|
|
||||||
|
// Register new host
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// Get robots.txt if exists
|
||||||
|
$curl = new Curl($hostURL->string . '/robots.txt');
|
||||||
|
|
||||||
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
|
$hostRobots = $curl->getContent();
|
||||||
|
} else {
|
||||||
|
$hostRobots = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||||
|
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||||
|
$hostId = $db->addHost($hostURL->scheme,
|
||||||
|
$hostURL->name,
|
||||||
|
$hostURL->port,
|
||||||
|
crc32($hostURL->string),
|
||||||
|
time(),
|
||||||
|
null,
|
||||||
|
$hostPageLimit,
|
||||||
|
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||||
|
(string) $hostStatus,
|
||||||
|
$hostRobots);
|
||||||
|
|
||||||
|
if ($hostId) {
|
||||||
|
|
||||||
|
echo 'hostmane ' . $hostURL->string . PHP_EOL;
|
||||||
|
|
||||||
|
$hostsAdded++;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// The host could not be registered: close the transaction opened for this
// link before skipping it, otherwise the next beginTransaction() call fails
// because a transaction is still active
$db->rollBack();
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init robots parser
|
||||||
|
// Prefer the host's own robots.txt rules; fall back to the configured defaults only when the host has none
$robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
|
||||||
|
|
||||||
|
// Save page info
|
||||||
|
if ($hostStatus && // host enabled
|
||||||
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||||
|
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||||
|
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||||
|
|
||||||
|
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
|
||||||
|
|
||||||
|
$hostPagesAdded++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$db->commit();
|
||||||
|
|
||||||
|
} catch(Exception $e){
|
||||||
|
|
||||||
|
var_dump($e);
|
||||||
|
|
||||||
|
$db->rollBack();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Debug
|
||||||
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||||
|
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||||
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||||
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||||
|
// Parenthesize the subtraction so the elapsed time is computed before concatenation on every PHP version
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;
|
||||||
|
BIN
database/yggo.mwb
Normal file
BIN
database/yggo.mwb
Normal file
Binary file not shown.
@ -4,16 +4,7 @@ class Filter {
|
|||||||
|
|
||||||
static public function url(string $url) {
|
static public function url(string $url) {
|
||||||
|
|
||||||
return trim($url);
|
return trim(urldecode($url));
|
||||||
}
|
|
||||||
|
|
||||||
static public function imageAlt(string $alt) {
|
|
||||||
|
|
||||||
$alt = preg_replace('/[\s]+/', ' ', $alt);
|
|
||||||
|
|
||||||
$alt = trim($alt);
|
|
||||||
|
|
||||||
return $alt;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static public function pageTitle(string $title) {
|
static public function pageTitle(string $title) {
|
||||||
|
196
library/mysql.php
Normal file
196
library/mysql.php
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class MySQL {

  private PDO $_db;

  /**
   * Open the PDO MySQL connection and configure it to throw exceptions
   * and to fetch rows as objects.
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password) {

    $dsn = 'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8';

    $this->_db = new PDO($dsn, $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']);

    $attributes = [
      PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
      PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
      PDO::ATTR_TIMEOUT            => 600,
    ];

    foreach ($attributes as $attribute => $value) {

      $this->_db->setAttribute($attribute, $value);
    }
  }

  /**
   * Prepare the statement, execute it with the given positional values
   * and return the executed statement.
   */
  private function _run(string $sql, array $values = []) {

    $statement = $this->_db->prepare($sql);

    $statement->execute($values);

    return $statement;
  }

  // System

  /** Start a transaction on the underlying connection. */
  public function beginTransaction() {

    $this->_db->beginTransaction();
  }

  /** Commit the current transaction. */
  public function commit() {

    $this->_db->commit();
  }

  /** Roll back the current transaction. */
  public function rollBack() {

    $this->_db->rollBack();
  }

  // Host

  /** Fetch a single `host` row by the CRC32 of its URL (false when no row matches). */
  public function getHost(int $crc32url) {

    return $this->_run('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1', [$crc32url])->fetch();
  }

  /** Insert a `host` row and return the last insert id reported by the connection. */
  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) {

    $this->_run(
      'INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
      [$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots]
    );

    return $this->_db->lastInsertId();
  }

  // Pages

  /** Count the `hostPage` rows that belong to the given host. */
  public function getTotalHostPages(int $hostId) {

    return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?', [$hostId])->fetch()->total;
  }

  /** Count the `hostPage` rows with the given HTTP code; null counts rows whose code is NULL. */
  public function getTotalPagesByHttpCode(mixed $httpCode) {

    if (is_null($httpCode)) {

      // "= NULL" never matches in SQL, so the NULL case needs its own query
      $statement = $this->_db->query('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL');

      return $statement->fetch()->total;
    }

    return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?', [$httpCode])->fetch()->total;
  }

  /** Count all `hostPage` rows. */
  public function getTotalPages() {

    return $this->_run('SELECT COUNT(*) AS `total` FROM `hostPage`')->fetch()->total;
  }

  /** Fetch a single `hostPage` row by host id and CRC32 of its URI (false when no row matches). */
  public function getHostPage(int $hostId, int $crc32uri) {

    return $this->_run('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1', [$hostId, $crc32uri])->fetch();
  }

  /** Fetch the page meta, data and URI joined with its host's scheme, name and port. */
  public function getFoundHostPage(int $hostPageId) {

    $sql = 'SELECT `hostPage`.`metaTitle`,
                   `hostPage`.`metaDescription`,
                   `hostPage`.`data`,
                   `hostPage`.`uri`,
                   `host`.`scheme`,
                   `host`.`name`,
                   `host`.`port`

                   FROM `hostPage`
                   JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)

                   WHERE `hostPage`.`hostPageId` = ?

                   LIMIT 1';

    return $this->_run($sql, [$hostPageId])->fetch();
  }

  /** Insert a `hostPage` row and return the last insert id reported by the connection. */
  public function addHostPage(int $hostId,
                              int $crc32uri,
                              string $uri,
                              int $timeAdded,
                              mixed $timeUpdated = null,
                              mixed $httpCode = null,
                              mixed $rank = null,
                              mixed $metaTitle = null,
                              mixed $metaDescription = null,
                              mixed $metaKeywords = null,
                              mixed $data = null) {

    $sql = 'INSERT INTO `hostPage` (`hostId`,
                                    `crc32uri`,
                                    `uri`,
                                    `timeAdded`,
                                    `timeUpdated`,
                                    `httpCode`,
                                    `rank`,
                                    `metaTitle`,
                                    `metaDescription`,
                                    `metaKeywords`,
                                    `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';

    $this->_run($sql, [$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);

    return $this->_db->lastInsertId();
  }

  /** Update the meta fields and data of one page; returns the affected row count. */
  public function updateHostPage( int $hostPageId,
                                  mixed $metaTitle,
                                  mixed $metaDescription,
                                  mixed $metaKeywords,
                                  mixed $data) {

    $sql = 'UPDATE `hostPage` SET `metaTitle` = ?,
                                  `metaDescription` = ?,
                                  `metaKeywords` = ?,
                                  `data` = ? WHERE `hostPageId` = ? LIMIT 1';

    return $this->_run($sql, [$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId])->rowCount();
  }

  // Crawl tools

  /**
   * Select up to $limit pages that were never updated or were last updated
   * before $timeFrom, on hosts whose status is non-zero, ordered by page id.
   */
  public function getCrawlQueue(int $limit, int $timeFrom) {

    $sql = 'SELECT `hostPage`.`hostPageId`,
                   `hostPage`.`uri`,
                   `host`.`scheme`,
                   `host`.`name`,
                   `host`.`port`,
                   `host`.`crawlPageLimit`,
                   `host`.`crawlPageMetaOnly`,
                   `host`.`robots`

                   FROM `hostPage`
                   JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)

                   WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0

                   ORDER BY `hostPage`.`hostPageId`

                   LIMIT ' . (int) $limit;

    return $this->_run($sql, [$timeFrom])->fetchAll();
  }

  /** Store the crawl time and HTTP code for one page; returns the affected row count. */
  public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) {

    return $this->_run('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1', [$timeUpdated, $httpCode, $hostPageId])->rowCount();
  }
}
|
73
library/parser.php
Normal file
73
library/parser.php
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class Parser {

  /**
   * Split the host part (scheme, host name, optional port) out of an absolute URL.
   *
   * @param string $string URL to parse
   *
   * @return object|false Object with `string` (scheme://name[:port]), `scheme`,
   *                      `name` and `port` (null when absent) properties, or
   *                      false when the URL has no scheme or no host
   */
  static public function hostURL(string $string) {

    // Parse once instead of once per component
    $url = parse_url($string);

    // Malformed URL, or a required component is missing
    if (!is_array($url) || empty($url['scheme']) || empty($url['host'])) {

      return false;
    }

    $result = [
      'string' => $url['scheme'] . '://' . $url['host'],
      'scheme' => $url['scheme'],
      'name'   => $url['host'],
      'port'   => null,
    ];

    if (!empty($url['port'])) {

      $result['string'] .= ':' . $url['port'];
      $result['port']    = $url['port'];
    }

    return (object) $result;
  }

  /**
   * Extract the request URI (path plus optional query) from a URL.
   *
   * @param string $string URL to parse
   *
   * @return object Object with `string` (path[?query]), `path` and `query`
   *                (including the leading "?", null when absent) properties;
   *                the path defaults to "/"
   */
  static public function uri(string $string) {

    $result = [
      'string' => '/',
      'path'   => '/',
      'query'  => null,
    ];

    // Parse once instead of once per component
    $url = parse_url($string);

    // Malformed URL: keep the defaults
    if (!is_array($url)) {

      return (object) $result;
    }

    if (!empty($url['path'])) {

      $result['string'] = $url['path'];
      $result['path']   = $url['path'];
    }

    // Compare against the empty string explicitly: a plain truthiness test
    // would drop the valid query "0" (e.g. "/page?0")
    if (isset($url['query']) && $url['query'] !== '') {

      $result['string'] .= '?' . $url['query'];
      $result['query']   = '?' . $url['query'];
    }

    return (object) $result;
  }
}
|
@ -1,7 +1,5 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
// @TODO #2
|
|
||||||
|
|
||||||
class Robots {
|
class Robots {
|
||||||
|
|
||||||
private $_rule = [];
|
private $_rule = [];
|
||||||
@ -42,16 +40,10 @@ class Robots {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function indexURL(string $url) {
|
public function uriAllowed(string $uri) {
|
||||||
|
|
||||||
// Unify case match
|
// Unify case match
|
||||||
$url = strtolower(trim($url));
|
$uri = strtolower(trim($uri));
|
||||||
|
|
||||||
// Convert to URI
|
|
||||||
$url = str_replace(parse_url($url, PHP_URL_SCHEME) . '://' .
|
|
||||||
parse_url($url, PHP_URL_HOST) .
|
|
||||||
parse_url($url, PHP_URL_PORT),
|
|
||||||
'', $url);
|
|
||||||
|
|
||||||
// Index by default
|
// Index by default
|
||||||
$result = true;
|
$result = true;
|
||||||
@ -59,7 +51,7 @@ class Robots {
|
|||||||
// Begin index rules by ASC priority
|
// Begin index rules by ASC priority
|
||||||
foreach ($this->_rule as $rule => $value) {
|
foreach ($this->_rule as $rule => $value) {
|
||||||
|
|
||||||
if (preg_match('!^' . $rule . '!', $url)) {
|
if (preg_match('!^' . $rule . '!', $uri)) {
|
||||||
|
|
||||||
$result = $value;
|
$result = $value;
|
||||||
}
|
}
|
||||||
|
31
library/sphinxql.php
Normal file
31
library/sphinxql.php
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class SphinxQL {

  private $_sphinx;

  /**
   * Connect to the search daemon through the PDO MySQL driver and configure
   * the connection to throw exceptions and to fetch rows as objects.
   */
  public function __construct(string $host, int $port) {

    $dsn     = 'mysql:host=' . $host . ';port=' . $port . ';charset=utf8';
    $options = [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8'];

    $this->_sphinx = new PDO($dsn, false, false, $options);

    $this->_sphinx->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
  }

  /** Return the `hostPage` index rows matching the keyword, paginated by offset and limit. */
  public function searchHostPages(string $keyword, int $start, int $limit) {

    // Offset and limit are cast to int and inlined; only the keyword is bound
    $sql = 'SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT ' . (int) $start . ',' . (int) $limit;

    $statement = $this->_sphinx->prepare($sql);

    $statement->execute([$keyword]);

    return $statement->fetchAll();
  }

  /** Return the number of `hostPage` index rows matching the keyword. */
  public function searchHostPagesTotal(string $keyword) {

    $statement = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');

    $statement->execute([$keyword]);

    $row = $statement->fetch();

    return $row->total;
  }
}
|
@ -1,170 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
class SQLite {
|
|
||||||
|
|
||||||
private PDO $_db;
|
|
||||||
|
|
||||||
public function __construct(string $database, string $username, string $password) {
|
|
||||||
|
|
||||||
$this->_db = new PDO('sqlite:' . $database, $username, $password);
|
|
||||||
$this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
|
|
||||||
$this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
|
|
||||||
$this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);
|
|
||||||
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE TABLE IF NOT EXISTS "page" (
|
|
||||||
"pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
|
||||||
"crc32url" INTEGER NOT NULL UNIQUE,
|
|
||||||
"httpCode" INTEGER,
|
|
||||||
"timeAdded" INTEGER NOT NULL,
|
|
||||||
"timeUpdated" INTEGER,
|
|
||||||
"title" TEXT,
|
|
||||||
"data" TEXT,
|
|
||||||
"description" TEXT,
|
|
||||||
"keywords" TEXT,
|
|
||||||
"url" TEXT NOT NULL
|
|
||||||
)
|
|
||||||
');
|
|
||||||
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE TABLE IF NOT EXISTS "image" (
|
|
||||||
"imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
|
||||||
"crc32src" INTEGER NOT NULL UNIQUE,
|
|
||||||
"pageId" INTEGER NOT NULL,
|
|
||||||
"alt" TEXT NOT NULL,
|
|
||||||
"src" TEXT NOT NULL
|
|
||||||
)
|
|
||||||
');
|
|
||||||
|
|
||||||
// FTS5
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`)
|
|
||||||
');
|
|
||||||
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN
|
|
||||||
INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
|
|
||||||
END
|
|
||||||
');
|
|
||||||
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN
|
|
||||||
INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
|
|
||||||
END
|
|
||||||
');
|
|
||||||
|
|
||||||
$this->_db->query('
|
|
||||||
CREATE TRIGGER IF NOT EXISTS `pageUpdate` UPDATE ON `page` BEGIN
|
|
||||||
INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`);
|
|
||||||
INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`);
|
|
||||||
END
|
|
||||||
');
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getTotalPagesByHttpCode(mixed $httpCode) {
|
|
||||||
|
|
||||||
if (is_null($httpCode)) {
|
|
||||||
|
|
||||||
$query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` IS NULL');
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` = ?');
|
|
||||||
|
|
||||||
$query->execute([$httpCode]);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return $query->fetch()->total;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getTotalPages() {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`');
|
|
||||||
|
|
||||||
$query->execute();
|
|
||||||
|
|
||||||
return $query->fetch()->total;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?');
|
|
||||||
|
|
||||||
$query->execute([$title, $description, $data, $timeUpdated, $pageId]);
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `page` (`title`, `description`, `data`, `timeAdded`) VALUES (?, ?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([$title, $description, $data, $timeAdded]);
|
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function initPage(string $url, int $crc32url, int $timeAdded) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([$url, $crc32url, $timeAdded]);
|
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function addImage(int $pageId, string $src, int $crc32src, string $alt) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)');
|
|
||||||
|
|
||||||
$query->execute([$pageId, $src, $crc32src, $alt]);
|
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function deleteImages(int $pageId) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?');
|
|
||||||
|
|
||||||
$query->execute([$pageId]);
|
|
||||||
|
|
||||||
return $query->rowCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getPageQueue(int $limit, int $timeFrom) {
|
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit);
|
|
||||||
|
|
||||||
$query->execute([$timeFrom]);
|
|
||||||
|
|
||||||
return $query->fetchAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Records the result of a crawl attempt on a page.
 *
 * The UPDATE carries no trailing LIMIT: SQLite only accepts
 * UPDATE ... LIMIT when built with SQLITE_ENABLE_UPDATE_DELETE_LIMIT,
 * and updatePage() in this class filters on `pageId` the same way.
 * NOTE(review): assumes `pageId` identifies a single row - confirm
 * against the schema.
 *
 * @param string $pageId      Value matched against the `pageId` column
 * @param int    $timeUpdated New value for the `timeUpdated` column
 * @param int    $httpCode    New value for the `httpCode` column
 *
 * @return int Number of rows reported as affected by the statement
 */
public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) {

  $query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ?');

  $query->execute([$timeUpdated, $httpCode, $pageId]);

  return $query->rowCount();
}
|
|
||||||
|
|
||||||
/**
 * Runs a MATCH query against `ftsPage` and returns one window of results.
 *
 * The search expression is bound as a parameter; offset and row count are
 * appended to the SQL as integers. Rows are ordered by `rank`.
 *
 * @param string $q     Expression passed to MATCH
 * @param int    $start Offset of the first row to return
 * @param int    $limit Maximum number of rows to return
 *
 * @return array Rows as produced by PDOStatement::fetchAll()
 */
public function searchPages(string $q, int $start = 0, int $limit = 100) {

  $window = (int) $start . ',' . (int) $limit;

  $statement = $this->_db->prepare(
    'SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank` LIMIT ' . $window
  );

  $statement->execute([$q]);

  return $statement->fetchAll();
}
|
|
||||||
|
|
||||||
/**
 * Counts the `ftsPage` rows matching the given search expression.
 *
 * Reads the `total` property of the fetched row, so the connection is
 * expected to fetch rows as objects.
 *
 * @param string $q Expression passed to MATCH
 *
 * @return mixed Value of the `total` column of the single result row
 */
public function searchPagesTotal(string $q) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?');

  $statement->execute([$q]);

  $row = $statement->fetch();

  return $row->total;
}
|
|
||||||
}
|
|
@ -3,10 +3,10 @@
|
|||||||
// Load system dependencies
|
// Load system dependencies
|
||||||
require_once('../config/app.php');
|
require_once('../config/app.php');
|
||||||
require_once('../library/filter.php');
|
require_once('../library/filter.php');
|
||||||
require_once('../library/sqlite.php');
|
require_once('../library/mysql.php');
|
||||||
|
|
||||||
// Connect database
|
// Connect database
|
||||||
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||||
|
|
||||||
$totalPages = $db->getTotalPages();
|
$totalPages = $db->getTotalPages();
|
||||||
|
|
||||||
|
@ -2,11 +2,18 @@
|
|||||||
|
|
||||||
// Load system dependencies
|
// Load system dependencies
|
||||||
require_once('../config/app.php');
|
require_once('../config/app.php');
|
||||||
|
require_once('../library/curl.php');
|
||||||
|
require_once('../library/robots.php');
|
||||||
require_once('../library/filter.php');
|
require_once('../library/filter.php');
|
||||||
require_once('../library/sqlite.php');
|
require_once('../library/parser.php');
|
||||||
|
require_once('../library/mysql.php');
|
||||||
|
require_once('../library/sphinxql.php');
|
||||||
|
|
||||||
|
// Connect Sphinx search server
|
||||||
|
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
|
||||||
|
|
||||||
// Connect database
|
// Connect database
|
||||||
$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD);
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||||
|
|
||||||
// Define page basics
|
// Define page basics
|
||||||
$totalPages = $db->getTotalPages();
|
$totalPages = $db->getTotalPages();
|
||||||
@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
|
|||||||
// Crawl request
|
// Crawl request
|
||||||
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||||
|
|
||||||
$db->initPage($q, crc32($q), time());
|
$db->beginTransaction();
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Parse host info
|
||||||
|
if ($hostURL = Parser::hostURL($q)) {
|
||||||
|
|
||||||
|
// Host exists
|
||||||
|
if ($host = $db->getHost(crc32($hostURL->string))) {
|
||||||
|
|
||||||
|
$hostStatus = $host->status;
|
||||||
|
$hostPageLimit = $host->crawlPageLimit;
|
||||||
|
$hostId = $host->hostId;
|
||||||
|
$hostRobots = $host->robots;
|
||||||
|
|
||||||
|
// Register new host
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// Get robots.txt if exists
|
||||||
|
$curl = new Curl($hostURL->string . '/robots.txt');
|
||||||
|
|
||||||
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
|
$hostRobots = $curl->getContent();
|
||||||
|
} else {
|
||||||
|
$hostRobots = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||||
|
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||||
|
$hostId = $db->addHost($hostURL->scheme,
|
||||||
|
$hostURL->name,
|
||||||
|
$hostURL->port,
|
||||||
|
crc32($hostURL->string),
|
||||||
|
time(),
|
||||||
|
null,
|
||||||
|
$hostPageLimit,
|
||||||
|
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||||
|
(string) $hostStatus,
|
||||||
|
$hostRobots);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse page URI
|
||||||
|
$hostPageURI = Parser::uri($q);
|
||||||
|
|
||||||
|
// Init robots parser
|
||||||
|
// Use the host's own robots.txt when one is stored; fall back to the default rules only when it is empty
$robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
|
||||||
|
|
||||||
|
// Save page info
|
||||||
|
if ($hostStatus && // host enabled
|
||||||
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||||
|
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||||
|
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||||
|
|
||||||
|
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$db->commit();
|
||||||
|
|
||||||
|
} catch(Exception $e){
|
||||||
|
|
||||||
|
$db->rollBack();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search request
|
// Search request
|
||||||
if (!empty($q)) {
|
if (!empty($q)) {
|
||||||
|
|
||||||
$results = $db->searchPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
|
$results = $sphinx->searchHostPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT);
|
||||||
$resultsTotal = $db->searchPagesTotal('"' . $q . '"');
|
$resultsTotal = $sphinx->searchHostPagesTotal('"' . $q . '"');
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -196,17 +265,20 @@ if (!empty($q)) {
|
|||||||
<?php } ?>
|
<?php } ?>
|
||||||
</div>
|
</div>
|
||||||
<?php foreach ($results as $result) { ?>
|
<?php foreach ($results as $result) { ?>
|
||||||
|
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
|
||||||
|
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
||||||
<div>
|
<div>
|
||||||
<h2><?php echo $result->title ?></h2>
|
<h2><?php /* crawled value: escape for HTML, keeping any entities already present */ echo htmlspecialchars((string) $hostPage->metaTitle, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) ?></h2>
|
||||||
<?php if (!empty($result->description)) { ?>
|
<?php if (!empty($hostPage->metaDescription)) { ?>
|
||||||
<span><?php echo $result->description ?></span>
|
<span><?php /* crawled value: escape for HTML, keeping any entities already present */ echo htmlspecialchars((string) $hostPage->metaDescription, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) ?></span>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
<a href="<?php echo $result->url ?>">
|
<a href="<?php /* attribute context: quotes in the stored URL must not close the attribute */ echo htmlspecialchars($hostPageURL, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) ?>">
|
||||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode(parse_url($result->url, PHP_URL_HOST)) ?>" alt="favicon" width="16" height="16" />
|
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" />
|
||||||
<?php echo $result->url ?>
|
<?php /* crawled value: escape for HTML, keeping any entities already present */ echo htmlspecialchars($hostPageURL, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) ?>
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
|
<?php } ?>
|
||||||
<?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?>
|
<?php if ($p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT <= $resultsTotal) { ?>
|
||||||
<div>
|
<div>
|
||||||
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php /* urlencode() output is already attribute-safe; entity-encoding first would change the query sent to the next page */ echo urlencode($q) ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
|
||||||
|
Loading…
Reference in New Issue
Block a user