
implement unlimited settings customization for each host

main
ghost committed 1 year ago
commit d024ffd770
  1. README.md (2)
  2. cli/yggo.php (7)
  3. config/app.php.example (219)
  4. crontab/cleaner.php (6)
  5. crontab/crawler.php (444)
  6. database/yggo.mwb (BIN)
  7. library/filter.php (28)
  8. library/helper.php (168)
  9. library/mysql.php (181)
  10. library/parser.php (73)
  11. library/url.php (82)
  12. media/db-prototype.png (BIN)
  13. public/api.php (19)
  14. public/explore.php (20)
  15. public/index.php (9)
  16. public/search.php (122)
  17. public/top.php (20)

README.md (2)

@@ -86,7 +86,7 @@ GET action=hosts - required
##### Application manifest
Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `CRAWL_URL_REGEXP` conditions.
Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `DEFAULT_HOST_URL_REGEXP` conditions.
Could be enabled or disabled by `API_MANIFEST_ENABLED` option
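For orientation, this is roughly the payload shape the updated crawler expects from a remote manifest endpoint; only the fields checked in crontab/crawler.php and exposed in public/api.php below are shown, and the concrete values are hypothetical:

// Hypothetical manifest payload (values are examples, not taken from a real node)
[
  'status' => true,
  'result' => [
    'config' => [
      'WEBSITE_DOMAIN'          => 'example.ygg',                 // assumption
      'DEFAULT_HOST_URL_REGEXP' => '/^http:\/\/\[[\w:]+\].*$/ui',
    ],
    'api' => [
      'version' => '0.13',
      'hosts'   => 'http://example.ygg/api.php?action=hosts',     // assumption
    ],
  ],
];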

cli/yggo.php (7)

@@ -390,7 +390,7 @@ if (!empty($argv[1])) {
$selectors = [];
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) {
if (!empty($selector)) {
@@ -428,8 +428,7 @@ if (!empty($argv[1])) {
$db->addHostPageDom($hostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
@@ -447,7 +446,7 @@ if (!empty($argv[1])) {
exit;
}
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file'));
CLI::break();
exit;

config/app.php.example (219)

@@ -64,7 +64,7 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
define('DB_HOST', '127.0.0.1');
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
@@ -75,7 +75,7 @@ define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Memcached
define('MEMCACHED_HOST', '127.0.0.1');
define('MEMCACHED_HOST', 'localhost');
define('MEMCACHED_PORT', 11211);
// Snaps
@@ -92,19 +92,19 @@ define('MEMCACHED_PORT', 11211);
*/
define('SNAP_STORAGE', json_encode((object)
[
'localhost' => [ // @TODO see https://github.com/YGGverse/YGGo#roadmap
'localhost' => [
'storage-1' => [
'directory' => __DIR__ . '/../storage/snap/hps/',
'quota' => [
'mime' => false,
'size' => 10000000024, // @TODO
'request' => [ // @TODO
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
]
]
]
],
],
// ...
],
@@ -118,9 +118,9 @@ define('SNAP_STORAGE', json_encode((object)
'timeout' => 30,
'passive' => true,
'quota' => [
'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
'size' => 10000000024, // @TODO
'request' => [ // @TODO
'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
@@ -133,6 +133,7 @@ define('SNAP_STORAGE', json_encode((object)
]
));
// Proxy settings
/*
@@ -143,141 +144,124 @@ define('SNAP_STORAGE', json_encode((object)
*/
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
// Crawl settings
// Host defaults
/*
* Crawler / Bot User Agent name
*
* Shared to other hosts through CURL requests by crawler
* Only URL addresses match this rule will be crawled
*
*/
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only
/*
* Skip curl download on response data size reached
* Default robots.txt rules (will be overwritten when remote rules are available)
*
* See also: CURLOPT_TIMEOUT (library/curl.php)
* string|null
*
*/
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
define('DEFAULT_HOST_ROBOTS_TXT', null);
/*
* Stop crawler on disk quota reached (Mb)
* These rules are forcibly appended to the remote robots.txt file
*
* string|null
*
*/
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);
/*
* Pages (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_PAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
* Pages limit per new host by default
*
* Set 0 to disable
* Crawler stops indexing on this limit reach to prevent disk overuse
*
*/
define('CRAWL_PAGE_LIMIT', 20);
define('DEFAULT_HOST_PAGES_LIMIT', 100000);
/*
* Renew page index by timing offset provided
*
* This option works with CRAWL_PAGE_LIMIT step queue
*
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enough value to crawl all pages collected in the DB index
* Index pages match MIME types
*
* or the crawler can stuck in queue
* comma separated
*
*/
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');
/*
* Renew home page index by timing offset provided
*
* Used for new pages scanning with higher priority
*
* This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
*
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enough value to crawl all pages collected in the DB index
* Index only meta tags
* or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
*
* or the crawler can stuck in queue
* Warning!
* this option requires huge disk storage,
* it's experimental feature, oriented for index operations
*
*/
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
define('DEFAULT_HOST_PAGES_DATA', false);
/*
* Index pages match MIME types
* Generates hostPageDom index based on hostPage.data field
*
* comma separated
* Could be useful for building semantical index query (config/sphinx.conf.txt)
*
* At this moment feature available in the CLI only (cli/yggo.php)
*
*/
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated
/*
* Only URL addresses match this rule will be auto-crawled
* Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content
*
*/
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);
// Crawl queue
/*
* Pages limit per new host by default
*
* Crawler stops indexing on this limit reach to prevent disk overuse
* Crawler / Bot User Agent name
*
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
* Shared to other hosts through CURL requests by crawler
*
*/
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
/*
* Set default auto-crawl status for new host added
*
* true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
* false - requires manual validation by the moderator in the DB `host`.`status` field
* Skip curl download on response data size reached
*
* This option also disable host in the search results
* See also: CURLOPT_TIMEOUT (library/curl.php)
*
*/
define('CRAWL_HOST_DEFAULT_STATUS', true);
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);
/*
* Index only meta tags
* or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
*
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* Warning!
* this option disabled requires huge disk storage,
* it's experimental feature, oriented for index operations
* Stop crawler on disk quota reached (Mb)
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', true);
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);
/*
* Not suitable/safe for work status for new host by default
* Pages (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Could be filtered in search results
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
* Set 0 to disable
*
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);
/*
* Collect sitemap index when available
* Renew page index by timing offset provided
*
* At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
* This option works with CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
*
* When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
* Pay attention, that CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
* must have enough value to crawl all pages collected in the DB index
*
* true|false
* or the crawler can stuck in queue
*
*/
define('CRAWL_SITEMAPS', true);
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Re-calculate page rank on page update
@@ -287,7 +271,7 @@ define('CRAWL_SITEMAPS', true);
* true|false
*
*/
define('CRAWL_PAGE_RANK_UPDATE', true);
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);
/*
* Renew hosts index by timing offset provided
@@ -304,53 +288,28 @@ define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
define('CRAWL_HOST_LIMIT', 1);
/*
* Crawl robots.txt
*/
define('CRAWL_ROBOTS', true); // true|false
/*
* Default robots.txt rules on remote file not exists
* The crawler able to overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robots.md
*
*/
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
/*
* Permanent rules that append to the robots.txt if exists else CRAWL_ROBOTS_DEFAULT_RULES
* The crawler does not overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
* Collect sitemap index when available
*
*/
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
/*
* Generates hostPageDom index based on hostPage.data field
* At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
*
* Could be useful for building semantical index query (config/sphinx.conf.txt)
* When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
*
* At this moment feature available in the CLI only (cli/yggo.php)
* true|false
*
*/
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
define('CRAWL_SITEMAPS', true);
/*
* Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
*
* Crawl robots.txt
*/
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);
define('CRAWL_ROBOTS', true); // true|false
/*
* Look for third-party manifests to collect distributed index
*
* API address provided in yggo meta tag
* will be stored in the `manifest` DB table
*
* Collecting URL that match CRAWL_URL_REGEXP condition
* Collecting URL that match DEFAULT_HOST_URL_REGEXP condition
*
*/
define('CRAWL_MANIFEST', true);
@@ -359,10 +318,17 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.12);
define('CRAWL_MANIFEST_API_VERSION', 0.13);
// Cleaner settings
/*
* Remove host ban after following time
*
* This option used in crawler and search page
* to prevent extra http requests to unavailable or not condition resources
*
*/
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove page ban after following time
*
@@ -370,7 +336,7 @@ define('CRAWL_MANIFEST_API_VERSION', 0.12);
* to prevent extra http requests to unavailable or not condition resources
*
*/
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Database tables optimization
@@ -382,7 +348,7 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
* When enabled - requires enough of RAM
*
*/
define('CLEAN_DB_TABLES_OPTIMIZATION', false);
define('CLEAN_DB_TABLES_OPTIMIZATION', true);
// API settings
@@ -420,17 +386,12 @@ define('API_HOSTS_ENABLED', true);
* Database host fields comma separated or * to share all the fields
*
*/
define('API_HOSTS_FIELDS',
'`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
CONCAT(`scheme`, '://', `name`, ':', `port`),
CONCAT(`scheme`, '://', `name`)
) AS `url`,
`timeAdded`,
`timeUpdated`");
/*
* Manifest API
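A note on how the new DEFAULT_HOST_* constants above are meant to be read: they are only fallbacks, and any per-host value stored through the new `hostSetting` machinery takes precedence. A minimal sketch of the lookup pattern used throughout this commit, assuming $db, $memcached and $hostId are already available as in crontab/crawler.php:

// Read a setting: a per-host override from the `hostSetting` table wins, otherwise the default constant
$pagesLimit = Helper::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT);

// Write a per-host override (500 is an arbitrary example value); the cached copy is refreshed as well
Helper::setHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', 500);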

crontab/cleaner.php (6)

@@ -28,8 +28,11 @@ require_once(__DIR__ . '/../library/mysql.php');
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Reset banned hosts
$hostsBansRemoved = $db->resetBannedHostPages(time() - CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET);
// Reset banned pages
$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
$hostPagesBansRemoved = $db->resetBannedHosts(time() - CLEAN_HOST_BAN_SECONDS_OFFSET);
// Optimize tables
if (CLEAN_DB_TABLES_OPTIMIZATION) {
@@ -45,6 +48,7 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) {
}
// Debug
echo 'Host bans removed: ' . $hostsBansRemoved . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;

crontab/crawler.php (444)

@@ -24,9 +24,10 @@ require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/sitemap.php');
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// Check disk quota
@@ -62,27 +63,38 @@ try {
} catch(Exception $e) {
// Debug std
var_dump($e);
exit;
}
// Process hosts crawl queue
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
// Connect memcached
try {
$db->beginTransaction();
$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Process hosts crawl queue
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {
try {
$db->beginTransaction();
// Update host crawl queue
$hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
$hostsProcessed += $db->updateHostCrawlQueue($queueHost->hostId, time());
// Crawl robots.txt
// Update host robots.txt settings from remote host
if (CRAWL_ROBOTS) {
// Update robots
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
$curl = new Curl($queueHost->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
@@ -90,61 +102,63 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Sitemap provided in robots.txt
if (200 == $curl->getCode()) {
$hostRobots = $curl->getContent();
// Update robots.txt rules
if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) {
} else {
$hostRobots = $host->robots;
Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
}
// Update host index
$db->updateHostRobots($host->hostId, $hostRobots, time());
}
// Process sitemaps when enabled
if (CRAWL_SITEMAPS) {
// Look for custom sitemap URL served in robots.txt
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
$robots = new Robots(
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
);
if ($hostSitemapPath = $robots->getSitemap()) {
if ($sitemapLink = $robots->getSitemap()) {
// Replace relative paths
$hostSitemapPath = trim($hostSitemapPath, '/');
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
$sitemapURL = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($hostCrawlQueue->url, '', $sitemapLink), '/'));
// Set default path when not exists
// Set default path
} else {
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
$sitemapURL = sprintf('%s/sitemap.xml', $queueHost->url);
}
// Init sitemap data
$sitemap = new Sitemap($hostSitemapPath);
// Init sitemap
$sitemap = new Sitemap($sitemapURL);
if ($sitemapLinks = $sitemap->getLinks()) {
$sitemapsProcessed++;
// Process collected sitemap links
foreach ($sitemapLinks as $link => $attributes) {
foreach ($sitemapLinks as $loc => $attributes) {
// Replace relative paths
$loc = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($queueHost->url, '', $loc), '/'));
// Validate link
if (!$link = URL::parse($loc)) {
continue;
}
// Collect this host links only
if ($link->host->url != $queueHost->url) {
// Parse formatted link
$linkURI = Parser::uri($link);
$linkHostURL = Parser::hostURL($link);
continue;
}
// Add host page
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
$linkHostURL->string == $host->url && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) {
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
}
}
}
@@ -152,8 +166,11 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Update manifests
if (CRAWL_MANIFEST) {
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
// Host have manifest provided
if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
// Get remote manifest
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
@@ -165,42 +182,32 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifest = $curl->getContent()) {
$db->commit();
continue;
}
// Skip processing on json encoding error
if (!$remoteManifest = @json_decode($remoteManifest)) {
$db->commit();
continue;
}
// Skip processing on required fields missed
if (empty($remoteManifest->status) ||
empty($remoteManifest->result->config->crawlUrlRegexp) ||
empty($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP) ||
empty($remoteManifest->result->api->version) ||
empty($remoteManifest->result->api->hosts)) {
$db->commit();
continue;
}
// Skip processing on API version not compatible
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
$db->commit();
if ($remoteManifest->result->api->version !== API_VERSION) {
continue;
}
@@ -208,28 +215,24 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Skip processing on host API not available
if (!$remoteManifest->result->api->hosts) {
$db->commit();
continue;
}
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
$db->commit();
// Skip processing on remote host URL does not match local condition
if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP !=
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
continue;
}
// Skip processing on host link does not match condition
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
$db->commit();
// Skip processing on remote host link does not match local condition
if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
$remoteManifest->result->api->hosts)) {
continue;
}
// Begin hosts collection
// Grab host URLs
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
@@ -241,32 +244,23 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifestHosts = $curl->getContent()) {
$db->commit();
if (!$remoteManifest = $curl->getContent()) {
continue;
}
// Skip processing on json encoding error
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
$db->commit();
if (!$remoteManifestHosts = @json_decode($remoteManifest)) {
continue;
}
// Skip processing on required fields missed
if (empty($remoteManifestHosts->status) ||
empty($remoteManifestHosts->result)) {
$db->commit();
if (empty($remoteManifestHosts->result)) {
continue;
}
@@ -275,64 +269,16 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
foreach ($remoteManifestHosts->result as $remoteManifestHost) {
// Skip processing on required fields missed
if (empty($remoteManifestHost->scheme) ||
empty($remoteManifestHost->name)) {
if (empty($remoteManifestHost->url)) {
continue;
}
$hostURL = $remoteManifestHost->scheme . '://' .
$remoteManifestHost->name .
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
// Validate formatted link
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
// Host not exists
if (!$db->getHostByCRC32URL(crc32($hostURL))) {
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) {
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
$remoteManifestHosts->result->name,
$remoteManifestHosts->result->port,
crc32($hostURL),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
}
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
}
}
}
@@ -354,7 +300,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction();
@@ -370,9 +316,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update page rank
if (CRAWL_PAGE_RANK_UPDATE) {
// @TODO add common method
if (CRAWL_HOST_PAGE_RANK_UPDATE) {
$hostPageRank = 0;
@@ -432,113 +376,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$url = trim($match[1]);
//Make relative links absolute
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
if (!parse_url($url, PHP_URL_HOST)) {
$url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
}
// Validate formatted link
if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {
// Parse formatted link
$hostURL = Parser::hostURL($url);
$hostPageURI = Parser::uri($url);
$hostsAdded += count($linkToDBresult->new->hostId);
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
// Host exists
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
// Register referrer
if ($linkToDBresult->old->hostPageId) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
$db->commit();
continue;
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
}
}
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($linkToDBresult->new->hostPageId) {
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
$hostPageId = $hostPage->hostPageId;
} else {
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
// Apply referer meta description to the target page before indexing it
if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
$db->addHostPageDescription($hostPageId,
$lastHostPageDescription->title,
$lastHostPageDescription->description,
$lastHostPageDescription->keywords,
$hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
time());
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
}
$hostPagesAdded++;
}
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
}
}
}
@@ -567,7 +430,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Check for MIME
$hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
// Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
@@ -804,16 +668,16 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$metaTitle,
$metaDescription ? Filter::pageDescription($metaDescription) : null,
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
$content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
time());
// Collect page DOM elements data on enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
// Begin selectors extraction
$html = str_get_html($content);
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
foreach ((array) explode(';', $hostPageDomSelectors) as $selector) {
foreach($html->find($selector) as $element) {
@@ -822,8 +686,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$db->addHostPageDom($queueHostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
@@ -851,7 +714,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (CRAWL_MANIFEST &&
!empty($metaYggoManifestURL) &&
filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {
preg_match(DEFAULT_HOST_URL_REGEXP, $metaYggoManifestURL)) {
$manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
}
@@ -891,7 +754,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
'data' => null,
'mime' => null,
'ref' => $src,
'href' => $src,
];
}
@@ -923,7 +786,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => null,
'data' => null,
'mime' => Filter::mime($type),
'ref' => $src,
'href' => $src,
];
}
@@ -953,7 +816,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => null,
'data' => null,
'mime' => Filter::mime($type),
'ref' => $src,
'href' => $src,
];
}
@@ -983,7 +846,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => null,
'data' => null,
'mime' => Filter::mime($type),
'ref' => $src,
'href' => $src,
];
}
@@ -1002,7 +865,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => null,
'data' => null,
'mime' => null,
'ref' => $src,
'href' => $src,
];
}
@@ -1021,7 +884,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => null,
'data' => null,
'mime' => null,
'ref' => $href,
'href' => $href,
];
}
@@ -1084,7 +947,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
'keywords' => Filter::pageKeywords($title),
'data' => null,
'mime' => null,
'ref' => $href,
'href' => $href,
];
}
@@ -1092,107 +955,40 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
foreach ($links as $link) {
// Make relative links absolute
if (!parse_url($link['ref'], PHP_URL_HOST)) {
if (!parse_url($link['href'], PHP_URL_HOST)) {
$link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
$link['href'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['href']), '/'), '.');
}
// Validate formatted link
if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
// Register new link
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) {
// Parse formatted link
$hostURL = Parser::hostURL($link['ref']);
$hostPageURI = Parser::uri($link['ref']);
// Host exists
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Increase new hosts counters
if ($linkToDBresult->new->hostId) {
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
$hostsAdded += count($linkToDBresult->new->hostId);
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
if ($linkToDBresult->new->hostPageId) {
$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
}
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
// Register referrer
if ($linkToDBresult->old->hostPageId) {
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
continue;
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
}
}
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
if ($linkToDBresult->new->hostPageId) {
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
} else {
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
$db->addHostPageDescription($hostPageId,
$link['title'],
$link['description'],
$link['keywords'],
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
time());
$hostPagesAdded++;
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
}
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
}
}
}
@@ -1236,7 +1032,7 @@ $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
// Debug output
echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
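The heavily interleaved hunks above boil down to one repeated pattern: every discovered URL goes through Helper::addLinkToDB(), and the returned ID lists drive both the counters and the referrer graph. A condensed sketch, assuming the same $db, $memcached and $queueHostPage objects as in the code above and a $url extracted from the crawled page:

// Register the link; hosts/pages are created only when missing and allowed by per-host settings
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {

  // Count what was actually created
  $hostsAdded     += count($linkToDBresult->new->hostId);
  $hostPagesAdded += count($linkToDBresult->new->hostPageId);

  // Store referrer edges from the crawled page to both new and already-known target pages
  foreach (array_merge($linkToDBresult->new->hostPageId, $linkToDBresult->old->hostPageId) as $hostPageIdTarget) {
    $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
  }
}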

database/yggo.mwb (BIN)

Binary file not shown.

library/filter.php (28)

@@ -2,11 +2,6 @@
class Filter {
static public function string(mixed $data) {
return (string) $data;
}
static public function url(mixed $url) {
$url = (string) $url;
@@ -54,29 +49,6 @@ class Filter {
return $keywords;
}
static public function pageData(mixed $data) {
$data = (string) $data;
$filterDataPre = [
'/<script.*?\/script>/s',
'/<style.*?\/style>/s'
];
$filterDataPost = [
'/[\s]{2,}/',
];
$data = preg_replace($filterDataPre, ' ', $data);
$data = html_entity_decode($data);
$data = strip_tags($data);
$data = preg_replace($filterDataPost, ' ', $data);
return $data;
}
static public function searchQuery(string $query, string $mode = 'default') {
// Create query CRC32

library/helper.php (168)

@@ -0,0 +1,168 @@
<?php
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/robots.php');
class Helper {
public static function getHostSetting(MySQL $db,
Memcached $memcached,
int $hostId,
string $key,
mixed $defaultValue) : mixed {
if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) {
return $value;
}
if (!$value = $db->findHostSettingValue($hostId, $key)) {
$value = $defaultValue;
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $value;
}
public static function setHostSetting(MySQL $db,
Memcached $memcached,
int $hostId,
string $key,
mixed $value) : int {
if ($hostSetting = $db->findHostSetting($hostId, $key)) {
$rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
} else {
$rowsAffected = $db->addHostSetting($hostId, $key, $value, time());
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $rowsAffected;
}
public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed {
// Define variables
$result = (object)
[
'new' => (object)
[
'hostId' => [],
'hostPageId' => [],
],
'old' => (object)
[
'hostId' => [],
'hostPageId' => [],
],
];
// Validate DB connection
if (!$db) {
return false;
}
// Validate link URL
if (!$link = URL::parse($link)) {
return false;
}
// Init host
if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {
// Make sure host URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
return false;
}
$hostId = $host->hostId;
$result->old->hostId[] = $host->hostId;
} else {
// Make sure link compatible with default host rules before create new host
if (!preg_match(DEFAULT_HOST_URL_REGEXP, $link->host->url)) {
return false;
}
// Register new host
if ($hostId = $db->addHost($link->host->scheme, $link->host->name, $link->host->port, crc32($link->host->url), time())) {
$result->new->hostId[] = $hostId;
// Init required for app web root page
if ($link->page->uri != '/') {
if ($hostPageId = $db->addHostPage($hostId, crc32('/'), '/', time())) {
// Note: commented because of referrer link registration implemented out of this method
// $result->new->hostPageId[] = $hostPageId;
}
}
} else {
return false;
}
}
// Add host page if not exists
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($link->page->uri))) {
$result->old->hostPageId[] = $hostPage->hostPageId;
} else {
// Make sure host page URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
return false;
}
// Validate page limits for this host
if ($db->getTotalHostPages($hostId) > self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
return false;
}
// Validate ROBOTS.TXT
$robots = new Robots(
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
);
if (!$robots->uriAllowed($link->page->uri)) {
return false;
}
// Validate host page MIME
// Note: passed to the crawl queue to prevent extra-curl requests
// Add host page
if ($hostPageId = $db->addHostPage($hostId, crc32($link->page->uri), $link->page->uri, time())) {
$result->new->hostPageId[] = $hostPageId;
} else {
return false;
}
}
return $result;
}
// Cache host setting requests
}
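Two details of the helper above worth keeping in mind: setting values are JSON-encoded by the MySQL layer, and reads are cached in Memcached for about an hour under a per-host, per-key name, so a row edited directly in the `hostSetting` table is not picked up immediately. A small sketch (the key format is copied from the code above; the delete call is just one hypothetical way to invalidate the cache):

// Cache key used by Helper::getHostSetting() / setHostSetting() for a given host and setting
$cacheKey = sprintf('Helper.getHostSetting.%s.%s', $hostId, 'PAGES_LIMIT');

// Hypothetical manual invalidation so the next getHostSetting() call re-reads the database
$memcached->delete($cacheKey);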

library/mysql.php (181)

@@ -60,7 +60,7 @@ class MySQL {
return $query->fetch();
}
public function getHostByCRC32URL(int $crc32url) {
public function findHostByCRC32URL(int $crc32url) {
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');
@@ -78,87 +78,74 @@ class MySQL {
return $query->fetch()->total;
}
public function addHost(string $scheme,
string $name,
mixed $port,
int $crc32url,
int $timeAdded,
mixed $timeUpdated,
int $crawlPageLimit,
string $crawlMetaOnly,
string $status,
string $nsfw,
mixed $robots,
mixed $robotsPostfix) {
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
`name`,
`port`,
`crc32url`,
`timeAdded`,
`timeUpdated`,
`crawlPageLimit`,
`crawlMetaOnly`,
`status`,
`nsfw`,
`robots`,
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $scheme,
$name,
$port,
$crc32url,
$timeAdded,
$timeUpdated,
$crawlPageLimit,
$crawlMetaOnly,
$status,
$nsfw,
$robots,
$robotsPostfix]);
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded]);
return $this->_db->lastInsertId();
}
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
// Host settings
public function findHostSettingValue(int $hostId, string $key) {
$query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
$query = $this->_db->prepare('SELECT `value` FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query->execute([$robots, $timeUpdated, $hostId]);
$query->execute([$hostId, $key]);
return $query->rowCount();
return $query->rowCount() ? json_decode($query->fetch()->value) : false;
}
// Host settings
public function getHostSetting(int $hostId, mixed $key) {
public function findHostSetting(int $hostId, string $key) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query->execute([$hostId, $key]);
return $query->rowCount() ? $query->fetch()->value : false;
return $query->fetch();
}
public function getHostSettings(int $hostId) {
public function addHostSetting(int $hostId, string $key, mixed $value, int $timeAdded) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');
$query = $this->_db->prepare('INSERT INTO `hostSetting` (`hostId`, `key`, `value`, `timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$hostId]);
$value = json_encode($value);
return $query->fetchAll();
$query->execute(
[
$hostId,
$key,
$value,
$timeAdded
]
);
return $query->rowCount();
}
public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {
public function updateHostSetting(int $hostSettingId, mixed $value, int $timeUpdated) {
$query = $this->_db->query('UPDATE `hostSetting` SET `value` = ?,
`timeUpdated` = ?
$query = $this->_db->query('INSERT INTO `hostSetting` SET `hostId` = ?
`key` = ?,
`value` = ?,
`timeAdded = ?
WHERE `hostSettingId` = ?
ON DUPLICATE KEY UPDATE `value` = ?,
`timeUpdated` = ?');
LIMIT 1');
$query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]);
$value = json_encode($value);
$query->execute(
[
$value,
$timeUpdated,
$hostSettingId
]
);
return $query->rowCount();
}
@@ -212,16 +199,12 @@ class MySQL {
public function getTopHostPages(int $limit = 100) {
// Get ID (to prevent memory over usage)
$query = $this->_db->query("SELECT `hostPage`.`hostPageId`
FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
$query = $this->_db->query("SELECT `hostPageId` FROM `hostPage`
WHERE `host`.`status` = '1'
AND `hostPage`.`httpCode` = 200
AND `hostPage`.`rank` > 0
AND `hostPage`.`timeBanned` IS NULL
AND `hostPage`.`mime` IS NOT NULL
WHERE `httpCode` = 200
AND `rank` > 0
AND `timeBanned` IS NULL
AND `mime` IS NOT NULL
ORDER BY `rank` DESC
@@ -387,12 +370,11 @@ class MySQL {
return $query->rowCount();
}
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
public function setHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
}
public function deleteHostPageToHostPage(int $hostPageId) {
@@ -422,6 +404,15 @@ class MySQL {
return $query->fetchAll();
}
public function getHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? AND `hostPageIdTarget` = ? LIMIT 1');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
return $query->fetch();
}
public function addHostPageSnap(int $hostPageId, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)');
@@ -560,62 +551,46 @@ class MySQL {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute();
$query->execute([$timeOffset]);
return $query->rowCount();
}
// Crawler tools
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
public function resetBannedHosts(int $timeOffset) {
$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
$query = $this->_db->prepare('UPDATE `host` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
$query->execute([$timeOffset]);
return $query->rowCount();
}
// Crawler tools
public function getHostPageCrawlQueueTotal(int $timeFrom) {
WHERE (
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ? OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)
$query = $this->_db->prepare("SELECT COUNT(*) AS `total` FROM `hostPage`
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL");
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `hostPage`.`timeBanned` IS NULL");
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
$query->execute([$timeFrom]);
return $query->fetch()->total;
}
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
$result = [];
// Get ID (to prevent memory over usage)
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ?
OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)
$query = $this->_db->prepare("SELECT `hostPageId` FROM `hostPage`
AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
ORDER BY LENGTH(`uri`) ASC, RAND()
LIMIT " . (int) $limit);
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
$query->execute([$timeFrom]);
// Get required page details
foreach ($query->fetchAll() as $queue) {
@@ -627,10 +602,6 @@ class MySQL {
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
@@ -676,13 +647,13 @@ class MySQL {
FROM `host`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
ORDER BY RAND()
LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]);
$query->execute([$timeFrom]);
// Get required page details
foreach ($query->fetchAll() as $host) {

library/parser.php (73)

@@ -1,73 +0,0 @@
<?php
class Parser {
static public function hostURL(string $string) {
$result = [
'string' => null,
'scheme' => null,
'name' => null,
'port' => null,
];
if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) {
$result['string'] = $hostScheme . '://';
$result['scheme'] = $hostScheme;
} else {
return false;
}
if ($hostName = parse_url($string, PHP_URL_HOST)) {
$result['string'] .= $hostName;
$result['name'] = $hostName;
} else {
return false;
}
if ($hostPort = parse_url($string, PHP_URL_PORT)) {
$result['string'] .= ':' . $hostPort;
$result['port'] = $hostPort;
}
return (object) $result;
}
static public function uri(string $string) {
$result = [
'string' => '/',
'path' => '/',
'query' => null,
];
if ($path = parse_url($string, PHP_URL_PATH)) {
$result['string'] = $path;
$result['path'] = $path;
}
if ($query = parse_url($string, PHP_URL_QUERY)) {
$result['string'] .= '?' . $query;
$result['query'] = '?' . $query;
}
return (object) $result;
}
}

library/url.php (82)

@@ -0,0 +1,82 @@
<?php
class URL {
public static function is(string $url) : bool {
return filter_var($url, FILTER_VALIDATE_URL);
}
public static function parse(string $url) : mixed {
$result = (object)
[
'host' => (object)
[
'url' => null,
'scheme' => null,
'name' => null,
'port' => null,
],
'page' => (object)
[
'url' => null,
'uri' => null,
'path' => null,
'query' => null,
]
];
// Validate URL
if (!self::is($url)) {
return false;
}
// Parse host
if ($scheme = parse_url($url, PHP_URL_SCHEME)) {
$result->host->url = $scheme . '://';
$result->host->scheme = $scheme;
} else {
return false;
}
if ($host = parse_url($url, PHP_URL_HOST)) {
$result->host->url .= $host;
$result->host->name = $host;
} else {
return false;
}
if ($port = parse_url($url, PHP_URL_PORT)) {
$result->host->url .= ':' . $port;
$result->host->port = $port;
// port is optional
}
// Parse page
if ($path = parse_url($url, PHP_URL_PATH)) {
$result->page->uri = $path;
$result->page->path = $path;
}
if ($query = parse_url($url, PHP_URL_QUERY)) {
$result->page->uri .= '?' . $query;
$result->page->query = '?' . $query;
}
$result->page->url = $result->host->url . $result->page->uri;
return $result;
}
}
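A quick usage sketch of the new URL parser; the address is a made-up Yggdrasil-style example:

// Hypothetical IPv6 address used for illustration only
if ($link = URL::parse('http://[201:23b4::1]:8080/library/index.php?page=2')) {
  echo $link->host->url;  // http://[201:23b4::1]:8080
  echo $link->page->uri;  // /library/index.php?page=2
  echo $link->page->url;  // http://[201:23b4::1]:8080/library/index.php?page=2
}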

media/db-prototype.png (BIN)

Binary file not shown.

Before: 144 KiB | After: 138 KiB

public/api.php (19)

@@ -1,14 +1,11 @@
<?php
// Current version
define('API_VERSION', 0.12);
define('API_VERSION', 0.13);
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');
@@ -107,17 +104,9 @@ if (API_ENABLED) {
'status' => true,
'result' => [
'config' => [
'websiteDomain' => WEBSITE_DOMAIN,
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
'WEBSITE_DOMAIN' => WEBSITE_DOMAIN,
'DEFAULT_HOST_URL_REGEXP' => DEFAULT_HOST_URL_REGEXP,
// @TODO
],
'api' => [
'version' => (string) API_VERSION,
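A hedged sketch of how another node could read this slimmed-down manifest and check compatibility. Only the response keys come from the hunk above; the endpoint query string, remote address and user agent string are assumptions.

<?php
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');

// Assumption: manifest endpoint address of the remote node
$curl = new Curl('http://remote-node.example/api.php?action=manifest', 'YGGo Search Bot');

if (200 == $curl->getCode() && $manifest = json_decode($curl->getContent())) {

  // Nodes are considered compatible when they share the same URL filter
  if (!empty($manifest->result->config->DEFAULT_HOST_URL_REGEXP) &&
             $manifest->result->config->DEFAULT_HOST_URL_REGEXP == DEFAULT_HOST_URL_REGEXP) {

    // safe to exchange host lists with this node
  }
}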

20
public/explore.php

@@ -7,11 +7,29 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
try {
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect database
try {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Filter request data
$hp = !empty($_GET['hp']) ? Filter::url($_GET['hp']) : 0;
@@ -283,7 +301,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?>
<div style="text-align:center">
<span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>
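The same Sphinx/MySQL bootstrap is repeated across explore.php, index.php, search.php and top.php in this commit; below is a condensed sketch of the shared pattern. Credential constants come from config/app.php.example, and the queue call uses the single offset argument shown in the hunk above.

<?php
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');

// Connect Sphinx search server
try {
  $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
} catch (Exception $e) {
  var_dump($e);
  exit;
}

// Connect database
try {
  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
} catch (Exception $e) {
  var_dump($e);
  exit;
}

// Pages still waiting in the crawl queue
$queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET);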

9
public/index.php

@@ -6,8 +6,17 @@ require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
try {
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
$totalPages = $sphinx->getHostPagesTotal();
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),

122
public/search.php

@@ -2,19 +2,49 @@
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
try {
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect database
try {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect memcached
try {
$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text';
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
@@ -36,82 +66,34 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
// Define alert message
$alertMessages = [];
// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$db->beginTransaction();
// Register new host/page on search request contains the link
if (URL::is($q)) {
try {
// Parse host info
if ($hostURL = Parser::hostURL($q)) {
// Host exists
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
$db->beginTransaction();
// Register new host
} else {
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {
// Disk quota not reached
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
if (count($linkToDBresult->new->hostPageId)) {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
$alertMessages[] = _('Link successfully registered in the crawl queue!');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
if ($resultsTotal == 0) {
$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
}
$alertMessages[] = _('This link already registered in the crawl queue.');
}
// Parse page URI
$hostPageURI = Parser::uri($q);
// Init robots parser
$robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);
}
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
} else {
$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
$alertMessages[] = _('Link address not supported on this host!');
}
$db->commit();
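Condensed, a hedged sketch of the new registration flow that replaces the manual host/robots bootstrap above. It assumes `Helper::addLinkToDB()` returns an object whose `new->hostPageId` array lists the pages it just queued, as the hunk suggests; `$db`, `$memcached` and `$resultsTotal` come from earlier in search.php.

<?php
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/helper.php');

$alertMessages = [];
$resultsTotal  = 0;                     // in search.php: Sphinx result count for the query
$q             = 'http://example.ygg/'; // user-submitted query containing a link

if (URL::is($q)) {

  if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {

    if (count($linkToDBresult->new->hostPageId)) {

      $alertMessages[] = _('Link successfully registered in the crawl queue!');

    } else if ($resultsTotal == 0) { // nothing indexed yet for this link

      $alertMessages[] = _('This link already registered in the crawl queue.');
    }

  } else {

    $alertMessages[] = _('Link address not supported on this host!');
  }
}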
@@ -124,6 +106,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
}
}
// Count pages in the crawl queue
if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) {
$alertMessages[] = sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal);
}
?>
<!DOCTYPE html>
@@ -313,8 +301,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($results) { ?>
<div>
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php foreach ($alertMessages as $alertMessage) { ?>
<span><?php echo $alertMessage ?></span>
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>
@@ -352,7 +340,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php } else { ?>
<div style="text-align:center">
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>

20
public/top.php

@@ -7,11 +7,29 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');
// Connect Sphinx search server
try {
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Connect database
try {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
} catch(Exception $e) {
var_dump($e);
exit;
}
// Define page basics
$totalPages = $sphinx->getHostPagesTotal();
@@ -271,7 +289,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?>
<div style="text-align:center">
<span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>
