/s'
- ];
-
- $filterDataPost = [
- '/[\s]{2,}/',
- ];
-
- $data = preg_replace($filterDataPre, ' ', $data);
-
- $data = html_entity_decode($data);
- $data = strip_tags($data);
-
- $data = preg_replace($filterDataPost, ' ', $data);
-
- return $data;
- }
-
static public function searchQuery(string $query, string $mode = 'default') {
// Create query CRC32
diff --git a/library/helper.php b/library/helper.php
new file mode 100644
index 0000000..e65f0b3
--- /dev/null
+++ b/library/helper.php
@@ -0,0 +1,168 @@
+get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) {
+
+ return $value;
+ }
+
+ if (!$value = $db->findHostSettingValue($hostId, $key)) {
+
+ $value = $defaultValue;
+ }
+
+ $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
+
+ return $value;
+ }
+
+ public static function setHostSetting(MySQL $db,
+ Memcached $memcached,
+ int $hostId,
+ string $key,
+ mixed $value) : int {
+
+ if ($hostSetting = $db->findHostSetting($hostId, $key)) {
+
+ $rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
+
+ } else {
+
+ $rowsAffected = $db->addHostSetting($hostId, $key, $value, time());
+ }
+
+ $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
+
+ return $rowsAffected;
+ }
+
+ public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed {
+
+ // Define variables
+ $result = (object)
+ [
+ 'new' => (object)
+ [
+ 'hostId' => [],
+ 'hostPageId' => [],
+ ],
+ 'old' => (object)
+ [
+ 'hostId' => [],
+ 'hostPageId' => [],
+ ],
+ ];
+
+ // Validate DB connection
+ if (!$db) {
+
+ return false;
+ }
+
+ // Validate link URL
+ if (!$link = URL::parse($link)) {
+
+ return false;
+ }
+
+ // Init host
+ if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {
+
+ // Make sure host URL compatible with this host rules before continue
+ if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
+
+ return false;
+ }
+
+ $hostId = $host->hostId;
+
+ $result->old->hostId[] = $host->hostId;
+
+ } else {
+
+ // Make sure link compatible with default host rules before create new host
+ if (!preg_match(DEFAULT_HOST_URL_REGEXP, $link->host->url)) {
+
+ return false;
+ }
+
+ // Register new host
+ if ($hostId = $db->addHost($link->host->scheme, $link->host->name, $link->host->port, crc32($link->host->url), time())) {
+
+ $result->new->hostId[] = $hostId;
+
+ // Init required for app web root page
+ if ($link->page->uri != '/') {
+
+ if ($hostPageId = $db->addHostPage($hostId, crc32('/'), '/', time())) {
+
+ // Note: commented because of referrer link registration implemented out of this method
+ // $result->new->hostPageId[] = $hostPageId;
+ }
+ }
+
+ } else {
+
+ return false;
+ }
+ }
+
+ // Add host page if not exists
+ if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($link->page->uri))) {
+
+ $result->old->hostPageId[] = $hostPage->hostPageId;
+
+ } else {
+
+ // Make sure host page URL compatible with this host rules before continue
+ if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
+
+ return false;
+ }
+
+ // Validate page limits for this host
+ if ($db->getTotalHostPages($hostId) > self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
+
+ return false;
+ }
+
+ // Validate ROBOTS.TXT
+ $robots = new Robots(
+ self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
+ self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
+ );
+
+ if (!$robots->uriAllowed($link->page->uri)) {
+
+ return false;
+ }
+
+ // Validate host page MIME
+ // Note: passed to the crawl queue to prevent extra-curl requests
+
+ // Add host page
+ if ($hostPageId = $db->addHostPage($hostId, crc32($link->page->uri), $link->page->uri, time())) {
+
+ $result->new->hostPageId[] = $hostPageId;
+
+ } else {
+
+ return false;
+ }
+ }
+
+ return $result;
+ }
+
+ // Cache host setting requests
+}
\ No newline at end of file
diff --git a/library/mysql.php b/library/mysql.php
index aad78d9..1b1fde6 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -60,7 +60,7 @@ class MySQL {
return $query->fetch();
}
- public function getHostByCRC32URL(int $crc32url) {
+ public function findHostByCRC32URL(int $crc32url) {
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');
@@ -78,87 +78,74 @@ class MySQL {
return $query->fetch()->total;
}
- public function addHost(string $scheme,
- string $name,
- mixed $port,
- int $crc32url,
- int $timeAdded,
- mixed $timeUpdated,
- int $crawlPageLimit,
- string $crawlMetaOnly,
- string $status,
- string $nsfw,
- mixed $robots,
- mixed $robotsPostfix) {
+ public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
`name`,
`port`,
`crc32url`,
- `timeAdded`,
- `timeUpdated`,
- `crawlPageLimit`,
- `crawlMetaOnly`,
- `status`,
- `nsfw`,
- `robots`,
- `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
-
- $query->execute([ $scheme,
- $name,
- $port,
- $crc32url,
- $timeAdded,
- $timeUpdated,
- $crawlPageLimit,
- $crawlMetaOnly,
- $status,
- $nsfw,
- $robots,
- $robotsPostfix]);
+ `timeAdded`) VALUES (?, ?, ?, ?, ?)');
+
+ $query->execute([$scheme, $name, $port, $crc32url, $timeAdded]);
return $this->_db->lastInsertId();
}
- public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
+ // Host settings
+ public function findHostSettingValue(int $hostId, string $key) {
- $query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
+ $query = $this->_db->prepare('SELECT `value` FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
- $query->execute([$robots, $timeUpdated, $hostId]);
+ $query->execute([$hostId, $key]);
- return $query->rowCount();
+ return $query->rowCount() ? json_decode($query->fetch()->value) : false;
}
- // Host settings
- public function getHostSetting(int $hostId, mixed $key) {
+ public function findHostSetting(int $hostId, string $key) {
- $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
+ $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query->execute([$hostId, $key]);
- return $query->rowCount() ? $query->fetch()->value : false;
+ return $query->fetch();
}
- public function getHostSettings(int $hostId) {
+ public function addHostSetting(int $hostId, string $key, mixed $value, int $timeAdded) {
- $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');
+ $query = $this->_db->prepare('INSERT INTO `hostSetting` (`hostId`, `key`, `value`, `timeAdded`) VALUES (?, ?, ?, ?)');
- $query->execute([$hostId]);
+ $value = json_encode($value);
- return $query->fetchAll();
+ $query->execute(
+ [
+ $hostId,
+ $key,
+ $value,
+ $timeAdded
+ ]
+ );
+
+ return $query->rowCount();
}
- public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {
+ public function updateHostSetting(int $hostSettingId, mixed $value, int $timeUpdated) {
+
+ $query = $this->_db->prepare('UPDATE `hostSetting` SET `value` = ?,
+ `timeUpdated` = ?
- $query = $this->_db->query('INSERT INTO `hostSetting` SET `hostId` = ?
- `key` = ?,
- `value` = ?,
- `timeAdded = ?
+ WHERE `hostSettingId` = ?
- ON DUPLICATE KEY UPDATE `value` = ?,
- `timeUpdated` = ?');
+ LIMIT 1');
- $query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]);
+ $value = json_encode($value);
+
+ $query->execute(
+ [
+ $value,
+ $timeUpdated,
+ $hostSettingId
+ ]
+ );
return $query->rowCount();
}
@@ -212,20 +199,16 @@ class MySQL {
public function getTopHostPages(int $limit = 100) {
// Get ID (to prevent memory over usage)
- $query = $this->_db->query("SELECT `hostPage`.`hostPageId`
-
- FROM `hostPage`
- JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
+ $query = $this->_db->query("SELECT `hostPageId` FROM `hostPage`
- WHERE `host`.`status` = '1'
- AND `hostPage`.`httpCode` = 200
- AND `hostPage`.`rank` > 0
- AND `hostPage`.`timeBanned` IS NULL
- AND `hostPage`.`mime` IS NOT NULL
+ WHERE `httpCode` = 200
+ AND `rank` > 0
+ AND `timeBanned` IS NULL
+ AND `mime` IS NOT NULL
- ORDER BY `rank` DESC
+ ORDER BY `rank` DESC
- LIMIT " . (int) $limit);
+ LIMIT " . (int) $limit);
// Get required page details
foreach ($query->fetchAll() as $top) {
@@ -387,12 +370,11 @@ class MySQL {
return $query->rowCount();
}
- public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
+ public function setHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
-
}
public function deleteHostPageToHostPage(int $hostPageId) {
@@ -422,6 +404,15 @@ class MySQL {
return $query->fetchAll();
}
+ public function getHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
+
+ $query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? AND `hostPageIdTarget` = ? LIMIT 1');
+
+ $query->execute([$hostPageIdSource, $hostPageIdTarget]);
+
+ return $query->fetch();
+ }
+
public function addHostPageSnap(int $hostPageId, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)');
@@ -560,62 +551,46 @@ class MySQL {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
- $query->execute();
+ $query->execute();
return $query->rowCount();
}
- // Crawler tools
- public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
+ public function resetBannedHosts(int $timeOffset) {
- $query = $this->_db->prepare("SELECT COUNT(*) AS `total`
+ $query = $this->_db->prepare('UPDATE `host` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ?');
- FROM `hostPage`
- JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
+ $query->execute([$timeOffset]);
- WHERE (
- `hostPage`.`timeUpdated` IS NULL OR
- `hostPage`.`timeUpdated` < ? OR (
- `hostPage`.`uri` = '/' AND
- `hostPage`.`timeUpdated` < ?
- )
- )
+ return $query->rowCount();
+ }
+
+ // Crawler tools
+ public function getHostPageCrawlQueueTotal(int $timeFrom) {
- AND `host`.`status` <> ?
- AND `hostPage`.`timeBanned` IS NULL");
+ $query = $this->_db->prepare("SELECT COUNT(*) AS `total` FROM `hostPage`
- $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
+ WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `hostPage`.`timeBanned` IS NULL");
+
+ $query->execute([$timeFrom]);
return $query->fetch()->total;
}
- public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
+ public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
$result = [];
// Get ID (to prevent memory over usage)
- $query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`
-
- FROM `hostPage`
- JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
-
- WHERE (
- `hostPage`.`timeUpdated` IS NULL OR
- `hostPage`.`timeUpdated` < ?
- OR (
- `hostPage`.`uri` = '/' AND
- `hostPage`.`timeUpdated` < ?
- )
- )
+ $query = $this->_db->prepare("SELECT `hostPageId` FROM `hostPage`
- AND `host`.`status` <> ?
- AND `hostPage`.`timeBanned` IS NULL
+ WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
- ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
+ ORDER BY LENGTH(`uri`) ASC, RAND()
- LIMIT " . (int) $limit);
+ LIMIT " . (int) $limit);
- $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
+ $query->execute([$timeFrom]);
// Get required page details
foreach ($query->fetchAll() as $queue) {
@@ -627,10 +602,6 @@ class MySQL {
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
- `host`.`crawlPageLimit`,
- `host`.`crawlMetaOnly`,
- `host`.`robots`,
- `host`.`robotsPostfix`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
@@ -676,13 +647,13 @@ class MySQL {
FROM `host`
- WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
+ WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
ORDER BY RAND()
LIMIT " . (int) $limit);
- $query->execute([$timeFrom, 0]);
+ $query->execute([$timeFrom]);
// Get required page details
foreach ($query->fetchAll() as $host) {
diff --git a/library/parser.php b/library/parser.php
deleted file mode 100644
index 5149427..0000000
--- a/library/parser.php
+++ /dev/null
@@ -1,73 +0,0 @@
- null,
- 'scheme' => null,
- 'name' => null,
- 'port' => null,
- ];
-
- if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) {
-
- $result['string'] = $hostScheme . '://';
-
- $result['scheme'] = $hostScheme;
-
- } else {
-
- return false;
- }
-
- if ($hostName = parse_url($string, PHP_URL_HOST)) {
-
- $result['string'] .= $hostName;
-
- $result['name'] = $hostName;
-
- } else {
-
- return false;
- }
-
- if ($hostPort = parse_url($string, PHP_URL_PORT)) {
-
- $result['string'] .= ':' . $hostPort;
-
- $result['port'] = $hostPort;
-
- }
-
- return (object) $result;
- }
-
- static public function uri(string $string) {
-
- $result = [
- 'string' => '/',
- 'path' => '/',
- 'query' => null,
- ];
-
- if ($path = parse_url($string, PHP_URL_PATH)) {
-
- $result['string'] = $path;
-
- $result['path'] = $path;
-
- }
-
- if ($query = parse_url($string, PHP_URL_QUERY)) {
-
- $result['string'] .= '?' . $query;
-
- $result['query'] = '?' . $query;
-
- }
-
- return (object) $result;
- }
-}
\ No newline at end of file
diff --git a/library/url.php b/library/url.php
new file mode 100644
index 0000000..bada461
--- /dev/null
+++ b/library/url.php
@@ -0,0 +1,82 @@
+ (object)
+ [
+ 'url' => null,
+ 'scheme' => null,
+ 'name' => null,
+ 'port' => null,
+ ],
+ 'page' => (object)
+ [
+ 'url' => null,
+ 'uri' => null,
+ 'path' => null,
+ 'query' => null,
+ ]
+ ];
+
+ // Validate URL
+ if (!self::is($url)) {
+
+ return false;
+ }
+
+ // Parse host
+ if ($scheme = parse_url($url, PHP_URL_SCHEME)) {
+
+ $result->host->url = $scheme . '://';
+ $result->host->scheme = $scheme;
+
+ } else {
+
+ return false;
+ }
+
+ if ($host = parse_url($url, PHP_URL_HOST)) {
+
+ $result->host->url .= $host;
+ $result->host->name = $host;
+
+ } else {
+
+ return false;
+ }
+
+ if ($port = parse_url($url, PHP_URL_PORT)) {
+
+ $result->host->url .= ':' . $port;
+ $result->host->port = $port;
+
+ // port is optional
+ }
+
+ // Parse page
+ if ($path = parse_url($url, PHP_URL_PATH)) {
+
+ $result->page->uri = $path;
+ $result->page->path = $path;
+ }
+
+ if ($query = parse_url($url, PHP_URL_QUERY)) {
+
+ $result->page->uri .= '?' . $query;
+ $result->page->query = '?' . $query;
+ }
+
+ $result->page->url = $result->host->url . $result->page->uri;
+
+ return $result;
+ }
+}
\ No newline at end of file
diff --git a/media/db-prototype.png b/media/db-prototype.png
index e8c3b75..92bf9d1 100644
Binary files a/media/db-prototype.png and b/media/db-prototype.png differ
diff --git a/public/api.php b/public/api.php
index 636c3b3..8014212 100644
--- a/public/api.php
+++ b/public/api.php
@@ -1,14 +1,11 @@
true,
'result' => [
'config' => [
- 'websiteDomain' => WEBSITE_DOMAIN,
- 'crawlUrlRegexp' => CRAWL_URL_REGEXP,
- 'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
- 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
- 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
- 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
- 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
- 'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
- 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
- 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
- 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
+ 'WEBSITE_DOMAIN' => WEBSITE_DOMAIN,
+ 'DEFAULT_HOST_URL_REGEXP' => DEFAULT_HOST_URL_REGEXP,
+ // @TODO
],
'api' => [
'version' => (string) API_VERSION,
diff --git a/public/explore.php b/public/explore.php
index 8e34fe3..10fa63d 100644
--- a/public/explore.php
+++ b/public/explore.php
@@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Filter request data
$hp = !empty($_GET['hp']) ? Filter::url($_GET['hp']) : 0;
@@ -283,7 +301,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
diff --git a/public/index.php b/public/index.php
index 4ab8e03..b8f3486 100644
--- a/public/index.php
+++ b/public/index.php
@@ -6,7 +6,16 @@ require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
$totalPages = $sphinx->getHostPagesTotal();
diff --git a/public/search.php b/public/search.php
index 71537ae..84878cf 100644
--- a/public/search.php
+++ b/public/search.php
@@ -2,18 +2,48 @@
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
-require_once(__DIR__ . '/../library/curl.php');
-require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
-require_once(__DIR__ . '/../library/parser.php');
+require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/mysql.php');
+require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
+
+// Connect memcached
+try {
+
+ $memcached = new Memcached();
+ $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text';
@@ -36,82 +66,34 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
+// Define alert message
+$alertMessages = [];
-// Crawl request
-if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
-
- $db->beginTransaction();
+// Register new host/page on search request contains the link
+if (URL::is($q)) {
try {
- // Parse host info
- if ($hostURL = Parser::hostURL($q)) {
+ $db->beginTransaction();
+
+ if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {
- // Host exists
- if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
+ if (count($linkToDBresult->new->hostPageId)) {
- $hostStatus = $host->status;
- $hostNsfw = $host->nsfw;
- $hostPageLimit = $host->crawlPageLimit;
- $hostMetaOnly = $host->crawlMetaOnly;
- $hostId = $host->hostId;
- $hostRobots = $host->robots;
- $hostRobotsPostfix = $host->robotsPostfix;
+ $alertMessages[] = _('Link successfully registered in the crawl queue!');
- // Register new host
} else {
- // Disk quota not reached
- if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
-
- // Get robots.txt if exists
- $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-
- if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
- $hostRobots = $curl->getContent();
- } else {
- $hostRobots = null;
- }
-
- $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-
- $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
- $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
- $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
- $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-
- $hostId = $db->addHost( $hostURL->scheme,
- $hostURL->name,
- $hostURL->port,
- crc32($hostURL->string),
- time(),
- null,
- $hostPageLimit,
- (string) $hostMetaOnly,
- (string) $hostStatus,
- (string) $hostNsfw,
- $hostRobots,
- $hostRobotsPostfix);
-
- // Add web root host page to make host visible in the crawl queue
- $db->addHostPage($hostId, crc32('/'), '/', time());
- }
- }
+ if ($resultsTotal == 0) {
- // Parse page URI
- $hostPageURI = Parser::uri($q);
+ $alertMessages[] = _('This link already registered in the crawl queue.');
+ }
- // Init robots parser
- $robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);
+ }
- // Save page info
- if ($hostStatus && // host enabled
- $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
- $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
- !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
+ } else {
- $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
- }
+ $alertMessages[] = _('Link address not supported on this host!');
}
$db->commit();
@@ -124,6 +106,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
}
}
+// Count pages in the crawl queue
+if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) {
+
+ $alertMessages[] = sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal);
+}
+
?>
@@ -313,8 +301,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
-
+
+
@@ -352,7 +340,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
diff --git a/public/top.php b/public/top.php
index 4ccabd4..2f01c1b 100644
--- a/public/top.php
+++ b/public/top.php
@@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+ var_dump($e);
+
+ exit;
+}
// Define page basics
$totalPages = $sphinx->getHostPagesTotal();
@@ -271,7 +289,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+ getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>