
implement unlimited settings customization for each host

Branch: main
Author: ghost, 1 year ago
parent commit d024ffd770
Changed files (changed lines per file):

   1. README.md (2)
   2. cli/yggo.php (7)
   3. config/app.php.example (219)
   4. crontab/cleaner.php (6)
   5. crontab/crawler.php (446)
   6. database/yggo.mwb (BIN)
   7. library/filter.php (28)
   8. library/helper.php (168)
   9. library/mysql.php (181)
  10. library/parser.php (73)
  11. library/url.php (82)
  12. media/db-prototype.png (BIN)
  13. public/api.php (19)
  14. public/explore.php (24)
  15. public/index.php (11)
  16. public/search.php (126)
  17. public/top.php (24)

README.md (2)

@@ -86,7 +86,7 @@ GET action=hosts - required
 ##### Application manifest
-Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `CRAWL_URL_REGEXP` conditions.
+Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `DEFAULT_HOST_URL_REGEXP` conditions.
 Could be enabled or disabled by `API_MANIFEST_ENABLED` option
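A sketch for reference, not part of the commit: the compatibility gate described above boils down to two checks in the updated crawler. Names follow crontab/crawler.php further down; $remoteManifest is the decoded JSON of the remote manifest.

<?php
// Remote node is federated with only when both conditions hold
$compatible =
  $remoteManifest->result->api->version === API_VERSION &&
  $remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP ==
    Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP);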

cli/yggo.php (7)

@@ -390,7 +390,7 @@ if (!empty($argv[1])) {
       $selectors = [];
-      foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
+      foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) {
         if (!empty($selector)) {
@@ -428,8 +428,7 @@ if (!empty($argv[1])) {
           $db->addHostPageDom($hostPage->hostPageId,
                               time(),
                               $selector,
-                              trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
-                                preg_replace('/[\s]+/',
+                              trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
                                 ' ',
                                 str_replace(['<br />', '<br/>', '<br>', '</'],
                                             [' ', ' ', ' ', ' </'],
@@ -447,7 +446,7 @@ if (!empty($argv[1])) {
     exit;
   }
-  CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
+  CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file'));
   CLI::break();
   exit;
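For orientation, not part of the commit: what the normalization chain above does to element content when the strip-tags setting is enabled. The input string is a made-up example; the transform order matches the code.

<?php
$value = '<h1>Hello<br/>world</h1>';
$value = str_replace(['<br />', '<br/>', '<br>', '</'], [' ', ' ', ' ', ' </'], $value); // '<h1>Hello world </h1>'
$value = preg_replace('/[\s]+/', ' ', $value);                                           // collapse repeated whitespace
echo trim(strip_tags($value));                                                           // 'Hello world'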

config/app.php.example (219)

@@ -64,7 +64,7 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
 define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
 // Database
-define('DB_HOST', '127.0.0.1');
+define('DB_HOST', 'localhost');
 define('DB_PORT', 3306);
 define('DB_NAME', '');
 define('DB_USERNAME', '');
@@ -75,7 +75,7 @@ define('SPHINX_HOST', '127.0.0.1');
 define('SPHINX_PORT', 9306);
 // Memcached
-define('MEMCACHED_HOST', '127.0.0.1');
+define('MEMCACHED_HOST', 'localhost');
 define('MEMCACHED_PORT', 11211);
 // Snaps
@@ -92,19 +92,19 @@ define('MEMCACHED_PORT', 11211);
  */
 define('SNAP_STORAGE', json_encode((object)
   [
-    'localhost' => [ // @TODO see https://github.com/YGGverse/YGGo#roadmap
+    'localhost' => [
       'storage-1' => [
         'directory' => __DIR__ . '/../storage/snap/hps/',
         'quota' => [
           'mime' => false,
-          'size' => 10000000024, // @TODO
-          'request' => [ // @TODO
+          'size' => 10000000024,
+          'request' => [
             'download' => [
               'size' => 10000024,
               'seconds' => 60*60
             ]
           ]
-        ]
+        ],
       ],
       // ...
     ],
@@ -118,9 +118,9 @@ define('SNAP_STORAGE', json_encode((object)
         'timeout' => 30,
         'passive' => true,
         'quota' => [
-          'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
-          'size' => 10000000024, // @TODO
-          'request' => [ // @TODO
+          'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
+          'size' => 10000000024,
+          'request' => [
             'download' => [
               'size' => 10000024,
               'seconds' => 60*60
@@ -133,6 +133,7 @@ define('SNAP_STORAGE', json_encode((object)
   ]
 ));
 // Proxy settings
 /*
@@ -143,141 +144,124 @@ define('SNAP_STORAGE', json_encode((object)
  */
 define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
-// Crawl settings
+// Host defaults
 /*
- * Crawler / Bot User Agent name
- *
- * Shared to other hosts through CURL requests by crawler
+ * Only URL addresses match this rule will be crawled
  *
  */
-define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
+define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only
 /*
- * Skip curl download on response data size reached
- *
- * See also: CURLOPT_TIMEOUT (library/curl.php)
+ * Default robots.txt rules (will be overwriten on remote rules available)
+ *
+ * string|null
  *
  */
-define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
+define('DEFAULT_HOST_ROBOTS_TXT', null);
 /*
- * Stop crawler on disk quota reached (Mb)
+ * These rules forcely appending to the remote robots.txt file
+ *
+ * string|null
  *
  */
-define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
+define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);
 /*
- * Pages (URI) processing limit in the crawler.php queue
- *
- * This option related to CRAWL_PAGE_SECONDS_OFFSET value
- * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
- *
- * Usually up to 20 pages per minute,
- * to prevent websites overload by sending GET crawling requests
- *
- * Set 0 to disable
+ * Pages limit per new host by default
+ *
+ * Crawler stops indexing on this limit reach to prevent disk overuse
  *
  */
-define('CRAWL_PAGE_LIMIT', 20);
+define('DEFAULT_HOST_PAGES_LIMIT', 100000);
 /*
- * Renew page index by timing offset provided
- *
- * This option works with CRAWL_PAGE_LIMIT step queue
- *
- * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
- * must have enough value to crawl all pages collected in the DB index
- *
- * or the crawler can stuck in queue
+ * Index pages match MIME types
+ *
+ * comma separated
  *
  */
-define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
+define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');
 /*
- * Renew home page index by timing offset provided
- *
- * Used for new pages scanning in highter priority
- *
- * This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
- *
- * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
- * must have enough value to crawl all pages collected in the DB index
- *
- * or the crawler can stuck in queue
+ * Index only meta tags
+ * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
+ *
+ * Warning!
+ * this option requires huge disk storage,
+ * it's experimental feature, oriented for index operations
  *
  */
-define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
+define('DEFAULT_HOST_PAGES_DATA', false);
 /*
- * Index pages match MIME types
- *
- * comma separated
+ * Generates hostPageDom index based on hostPage.data field
+ *
+ * Could be useful for building semantical index query (config/sphinx.conf.txt)
+ *
+ * At this moment feature available in the CLI only (cli/yggo.php)
  *
  */
-define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
+define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated
 /*
- * Only URL addresses match this rule will be auto-crawled
+ * Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content
  *
  */
-define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
+define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);
+// Crawl queue
 /*
- * Pages limit per new host by default
- *
- * Crawler stops indexing on this limit reach to prevent disk overuse
- *
- * Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
+ * Crawler / Bot User Agent name
+ *
+ * Shared to other hosts through CURL requests by crawler
  *
  */
-define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
+define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
 /*
- * Set default auto-crawl status for new host added
- *
- * true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
- * false - requires manual validation by the moderator in the DB `host`.`status` field
- *
- * This option also disable host in the search results
+ * Skip curl download on response data size reached
+ *
+ * See also: CURLOPT_TIMEOUT (library/curl.php)
  *
  */
-define('CRAWL_HOST_DEFAULT_STATUS', true);
+define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);
 /*
- * Index only meta tags
- * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
- *
- * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
- *
- * Warning!
- * this option disabled requires huge disk storage,
- * it's experimental feature, oriented for index operations
+ * Stop crawler on disk quota reached (Mb)
  *
  */
-define('CRAWL_HOST_DEFAULT_META_ONLY', true);
+define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);
 /*
- * Not suitable/safe for work status for new host by default
- *
- * Could be filtered in search results
- *
- * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
+ * Pages (URI) processing limit in the crawler.php queue
+ *
+ * This option related to CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
+ * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
+ *
+ * Usually up to 20 pages per minute,
+ * to prevent websites overload by sending GET crawling requests
+ *
+ * Set 0 to disable
  *
  */
-define('CRAWL_HOST_DEFAULT_NSFW', false);
+define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);
 /*
- * Collect sitemap index when available
- *
- * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
- *
- * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
- *
- * true|false
+ * Renew page index by timing offset provided
+ *
+ * This option works with CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
+ *
+ * Pay attention, that CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
+ * must have enough value to crawl all pages collected in the DB index
+ *
+ * or the crawler can stuck in queue
  *
  */
-define('CRAWL_SITEMAPS', true);
+define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);
 /*
  * Re-calculate page rank on page update
@@ -287,7 +271,7 @@ define('CRAWL_SITEMAPS', true);
  * true|false
  *
  */
-define('CRAWL_PAGE_RANK_UPDATE', true);
+define('CRAWL_HOST_PAGE_RANK_UPDATE', false);
 /*
  * Renew hosts index by timing offset provided
@@ -304,53 +288,28 @@ define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
 define('CRAWL_HOST_LIMIT', 1);
 /*
- * Crawl robots.txt
- */
-define('CRAWL_ROBOTS', true); // true|false
-/*
- * Default robots.txt rules on remote file not exists
- * The crawler able to overwrite these rules
- *
- * Presets
- * yggdrasil: /database/yggdrasil/host.robots.md
- *
- */
-define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
-/*
- * Permanent rules that append to the robots.txt if exists else CRAWL_ROBOTS_DEFAULT_RULES
- * The crawler does not overwrite these rules
- *
- * Presets
- * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
- *
- */
-define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
-/*
- * Generates hostPageDom index based on hostPage.data field
- *
- * Could be useful for building semantical index query (config/sphinx.conf.txt)
- *
- * At this moment feature available in the CLI only (cli/yggo.php)
- *
- */
-define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
-/*
- * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
- *
- */
-define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);
+ * Collect sitemap index when available
+ *
+ * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
+ *
+ * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
+ *
+ * true|false
+ *
+ */
+define('CRAWL_SITEMAPS', true);
+/*
+ * Crawl robots.txt
+ *
+ */
+define('CRAWL_ROBOTS', true); // true|false
 /*
  * Look for third-party manifests to collect distributed index
  *
  * API address provided in yggo meta tag
- * will be stored in the `manifest` DB table
  *
- * Collecting URL that match CRAWL_URL_REGEXP condition
+ * Collecting URL that match DEFAULT_HOST_URL_REGEXP condition
  *
  */
 define('CRAWL_MANIFEST', true);
@@ -359,10 +318,17 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.12);
-// Cleaner settings
+define('CRAWL_MANIFEST_API_VERSION', 0.13);
+/*
+ * Remove host ban after following time
+ *
+ * This option used in crawler and search page
+ * to prevent extra http requests to unavailable or not condition resources
+ *
+ */
+define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);
 /*
  * Remove page ban after following time
  *
@@ -370,7 +336,7 @@ define('CRAWL_MANIFEST_API_VERSION', 0.12);
  * to prevent extra http requests to unavailable or not condition resources
  *
  */
-define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
+define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
 /*
  * Database tables optimization
@@ -382,7 +348,7 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
  * When enabled - requires enough of RAM
  *
  */
-define('CLEAN_DB_TABLES_OPTIMIZATION', false);
+define('CLEAN_DB_TABLES_OPTIMIZATION', true);
 // API settings
@@ -420,17 +386,12 @@ define('API_HOSTS_ENABLED', true);
  * Database host fields comma separated or * to share all the fields
  *
  */
-define('API_HOSTS_FIELDS',
-       '`host`.`scheme`,
-        `host`.`name`,
-        `host`.`port`,
-        `host`.`crawlPageLimit`,
-        `host`.`robots`,
-        `host`.`robotsPostfix`,
-        `host`.`nsfw`,
-        `host`.`timeAdded`,
-        `host`.`timeUpdated`,
-        (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
+define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
+                                CONCAT(`scheme`, '://', `name`, ':', `port`),
+                                CONCAT(`scheme`, '://', `name`)
+                            ) AS `url`,
+                            `timeAdded`,
+                            `timeUpdated`");
 /*
  * Manifest API
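A sketch for reference, not part of the commit: how a DEFAULT_HOST_* constant interacts with the per-host settings introduced by this change. It assumes $db and $memcached are connected as in crontab/crawler.php below; the host id and the override value are examples.

<?php
// Every host starts with the global default from config/app.php.example...
$limit = Helper::getHostSetting($db, $memcached, 42, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT); // 100000

// ...until an override is stored for that particular host
Helper::setHostSetting($db, $memcached, 42, 'PAGES_LIMIT', 5000);
$limit = Helper::getHostSetting($db, $memcached, 42, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT); // 5000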

crontab/cleaner.php (6)

@@ -28,8 +28,11 @@ require_once(__DIR__ . '/../library/mysql.php');
 // Connect database
 $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+// Reset banned hosts
+$hostsBansRemoved = $db->resetBannedHostPages(time() - CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET);
 // Reset banned pages
-$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
+$hostPagesBansRemoved = $db->resetBannedHosts(time() - CLEAN_HOST_BAN_SECONDS_OFFSET);
 // Optimize tables
 if (CLEAN_DB_TABLES_OPTIMIZATION) {
@@ -45,6 +48,7 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) {
 }
 // Debug
+echo 'Host bans removed: ' . $hostsBansRemoved . PHP_EOL;
 echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
 echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
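For orientation, not part of the commit: both offsets default to 60*60*24*30 seconds in config/app.php.example, so a ban is lifted once it is about 30 days old. A minimal sketch of the condition behind the reset calls; the timestamp is an example.

<?php
$timeBanned = 1700000000;                       // when the host or page was banned
$offset     = 60 * 60 * 24 * 30;                // CLEAN_HOST_BAN_SECONDS_OFFSET
$liftBan    = $timeBanned < (time() - $offset); // true once the ban is older than the offset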

crontab/crawler.php (446)

@@ -24,9 +24,10 @@ require_once(__DIR__ . '/../library/ftp.php');
 require_once(__DIR__ . '/../library/curl.php');
 require_once(__DIR__ . '/../library/robots.php');
 require_once(__DIR__ . '/../library/sitemap.php');
+require_once(__DIR__ . '/../library/url.php');
 require_once(__DIR__ . '/../library/filter.php');
-require_once(__DIR__ . '/../library/parser.php');
 require_once(__DIR__ . '/../library/mysql.php');
+require_once(__DIR__ . '/../library/helper.php');
 require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
 // Check disk quota
@@ -62,27 +63,38 @@ try {
 } catch(Exception $e) {
-  // Debug std
   var_dump($e);
   exit;
 }
-// Process hosts crawl queue
-foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
-  $db->beginTransaction();
+// Connect memcached
+try {
+  $memcached = new Memcached();
+  $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
+} catch(Exception $e) {
+  var_dump($e);
+  exit;
+}
+// Process hosts crawl queue
+foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {
   try {
+    $db->beginTransaction();
     // Update host crawl queue
-    $hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
-    // Crawl robots.txt
+    $hostsProcessed += $db->updateHostCrawlQueue($queueHost->hostId, time());
+    // Update host robots.txt settings from remote host
     if (CRAWL_ROBOTS) {
-      // Update robots
-      $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+      $curl = new Curl($queueHost->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
       // Update curl stats
       $httpRequestsTotal++;
@@ -90,61 +102,63 @@
      $httpDownloadSizeTotal += $curl->getSizeDownload();
      $httpRequestsTimeTotal += $curl->getTotalTime();
-      // Sitemap provided in robots.txt
-      if (200 == $curl->getCode()) {
-        $hostRobots = $curl->getContent();
-      } else {
-        $hostRobots = $host->robots;
+      // Update robots.txt rules
+      if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) {
+        Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
       }
-      // Update host index
-      $db->updateHostRobots($host->hostId, $hostRobots, time());
     }
     // Process sitemaps when enabled
     if (CRAWL_SITEMAPS) {
       // Look for custom sitemap URL served in robots.txt
-      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
+      $robots = new Robots(
+        Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
+        Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
+      );
-      if ($hostSitemapPath = $robots->getSitemap()) {
+      if ($sitemapLink = $robots->getSitemap()) {
         // Replace relative paths
-        $hostSitemapPath = trim($hostSitemapPath, '/');
-        $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
-        $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
-      // Set default path when not exists
+        $sitemapURL = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($hostCrawlQueue->url, '', $sitemapLink), '/'));
+      // Set default path
       } else {
-        $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
+        $sitemapURL = sprintf('%s/sitemap.xml', $queueHost->url);
       }
-      // Init sitemap data
-      $sitemap = new Sitemap($hostSitemapPath);
+      // Init sitemap
+      $sitemap = new Sitemap($sitemapURL);
       if ($sitemapLinks = $sitemap->getLinks()) {
         $sitemapsProcessed++;
         // Process collected sitemap links
-        foreach ($sitemapLinks as $link => $attributes) {
-          // Parse formatted link
-          $linkURI     = Parser::uri($link);
-          $linkHostURL = Parser::hostURL($link);
-          // Add host page
-          if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
-              $linkHostURL->string == $host->url && // this host links only
-              $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
-              $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
-              !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
-            $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
+        foreach ($sitemapLinks as $loc => $attributes) {
+          // Replace relative paths
+          $loc = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($queueHost->url, '', $loc), '/'));
+          // Validate link
+          if (!$link = URL::parse($loc)) {
+            continue;
+          }
+          // Collect this host links only
+          if ($link->host->url != $queueHost->url) {
+            continue;
+          }
+          // Register new link
+          if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) {
+            $hostsAdded     += count($linkToDBresult->new->hostId);
+            $hostPagesAdded += count($linkToDBresult->new->hostPageId);
           }
         }
       }
@@ -152,8 +166,11 @@
     // Update manifests
     if (CRAWL_MANIFEST) {
-      if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
+      // Host have manifest provided
+      if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
+        // Get remote manifest
         $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
         // Update curl stats
@@ -165,42 +182,32 @@
         // Skip processing non 200 code
         if (200 != $curl->getCode()) {
-          $db->commit();
           continue;
         }
         // Skip processing without returned data
         if (!$remoteManifest = $curl->getContent()) {
-          $db->commit();
           continue;
         }
         // Skip processing on json encoding error
         if (!$remoteManifest = @json_decode($remoteManifest)) {
-          $db->commit();
           continue;
         }
         // Skip processing on required fields missed
         if (empty($remoteManifest->status) ||
-            empty($remoteManifest->result->config->crawlUrlRegexp) ||
+            empty($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP) ||
             empty($remoteManifest->result->api->version) ||
             empty($remoteManifest->result->api->hosts)) {
-          $db->commit();
           continue;
         }
         // Skip processing on API version not compatible
-        if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
-          $db->commit();
+        if ($remoteManifest->result->api->version !== API_VERSION) {
           continue;
         }
@@ -208,28 +215,24 @@
         // Skip processing on host API not available
         if (!$remoteManifest->result->api->hosts) {
-          $db->commit();
           continue;
         }
-        // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
-        if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
-          $db->commit();
+        // Skip processing on remote host URL does not match local condition
+        if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP !=
+            Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
           continue;
         }
-        // Skip processing on host link does not match condition
-        if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
-          $db->commit();
+        // Skip processing on remote host link does not match local condition
+        if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
+                                 $remoteManifest->result->api->hosts)) {
           continue;
         }
-        // Begin hosts collection
+        // Grab host URLs
         $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
         // Update curl stats
@@ -241,32 +244,23 @@
         // Skip processing non 200 code
         if (200 != $curl->getCode()) {
-          $db->commit();
           continue;
         }
         // Skip processing without returned data
-        if (!$remoteManifestHosts = $curl->getContent()) {
-          $db->commit();
+        if (!$remoteManifest = $curl->getContent()) {
           continue;
         }
         // Skip processing on json encoding error
-        if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
-          $db->commit();
+        if (!$remoteManifestHosts = @json_decode($remoteManifest)) {
          continue;
        }
         // Skip processing on required fields missed
-        if (empty($remoteManifestHosts->status) ||
-            empty($remoteManifestHosts->result)) {
-          $db->commit();
+        if (empty($remoteManifestHosts->result)) {
          continue;
        }
@@ -275,64 +269,16 @@
         foreach ($remoteManifestHosts->result as $remoteManifestHost) {
           // Skip processing on required fields missed
-          if (empty($remoteManifestHost->scheme) ||
-              empty($remoteManifestHost->name)) {
+          if (empty($remoteManifestHost->url)) {
             continue;
           }
-          $hostURL = $remoteManifestHost->scheme . '://' .
-                     $remoteManifestHost->name .
-                     (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
-          // Validate formatted link
-          if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
-            // Host not exists
-            if (!$db->getHostByCRC32URL(crc32($hostURL))) {
-              // Get robots.txt if exists
-              $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-              // Update curl stats
-              $httpRequestsTotal++;
-              $httpRequestsSizeTotal += $curl->getSizeRequest();
-              $httpDownloadSizeTotal += $curl->getSizeDownload();
-              $httpRequestsTimeTotal += $curl->getTotalTime();
-              if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-                $hostRobots = $curl->getContent();
-              } else {
-                $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
-              }
-              $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-              $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
-              $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
-              $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
-              $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-              $hostId = $db->addHost( $remoteManifestHosts->result->scheme,
-                                      $remoteManifestHosts->result->name,
-                                      $remoteManifestHosts->result->port,
-                                      crc32($hostURL),
-                                      time(),
-                                      null,
-                                      $hostPageLimit,
-                                      (string) $hostMetaOnly,
-                                      (string) $hostStatus,
-                                      (string) $hostNsfw,
-                                      $hostRobots,
-                                      $hostRobotsPostfix);
-              // Add web root host page to make host visible in the crawl queue
-              $db->addHostPage($hostId, crc32('/'), '/', time());
-              // Increase counters
-              $hostPagesAdded++;
-              $hostsAdded++;
-            }
+          // Register new link
+          if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) {
+            $hostsAdded     += count($linkToDBresult->new->hostId);
+            $hostPagesAdded += count($linkToDBresult->new->hostPageId);
           }
         }
       }
@@ -354,7 +300,7 @@
 }
 // Process pages crawl queue
-foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
+foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET) as $queueHostPage) {
   $db->beginTransaction();
@@ -370,9 +316,7 @@
   $httpRequestsTimeTotal += $curl->getTotalTime();
   // Update page rank
-  if (CRAWL_PAGE_RANK_UPDATE) {
-    // @TODO add common method
+  if (CRAWL_HOST_PAGE_RANK_UPDATE) {
     $hostPageRank = 0;
@@ -432,113 +376,32 @@
     $url = trim($match[1]);
     //Make relative links absolute
-    if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
+    if (!parse_url($url, PHP_URL_HOST)) {
       $url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
     }
-    // Validate formatted link
-    if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
-      // Parse formatted link
-      $hostURL     = Parser::hostURL($url);
-      $hostPageURI = Parser::uri($url);
-      // Host exists
-      if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
-        $hostStatus        = $host->status;
-        $hostNsfw          = $host->nsfw;
-        $hostPageLimit     = $host->crawlPageLimit;
-        $hostMetaOnly      = $host->crawlMetaOnly;
-        $hostId            = $host->hostId;
-        $hostRobots        = $host->robots;
-        $hostRobotsPostfix = $host->robotsPostfix;
-      // Register new host
-      } else {
-        // Get robots.txt if exists
-        $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-        // Update curl stats
-        $httpRequestsTotal++;
-        $httpRequestsSizeTotal += $curl->getSizeRequest();
-        $httpDownloadSizeTotal += $curl->getSizeDownload();
-        $httpRequestsTimeTotal += $curl->getTotalTime();
-        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-          $hostRobots = $curl->getContent();
-        } else {
-          $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
-        }
-        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-        $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
-        $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
-        $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
-        $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-        $hostId = $db->addHost( $hostURL->scheme,
-                                $hostURL->name,
-                                $hostURL->port,
-                                crc32($hostURL->string),
-                                time(),
-                                null,
-                                $hostPageLimit,
-                                (string) $hostMetaOnly,
-                                (string) $hostStatus,
-                                (string) $hostNsfw,
-                                $hostRobots,
-                                $hostRobotsPostfix);
-        // Add web root host page to make host visible in the crawl queue
-        $db->addHostPage($hostId, crc32('/'), '/', time());
-        // Increase counters
-        $hostPagesAdded++;
-        $hostsAdded++;
-        // When page is root, skip next operations
-        if ($hostPageURI->string == '/') {
-          $db->commit();
-          continue;
-        }
-      }
-      // Init robots parser
-      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-      // Save page info
-      if ($hostStatus && // host enabled
-          $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
-          $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
-        if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
-          $hostPageId = $hostPage->hostPageId;
-        } else {
-          $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
-          // Apply referer meta description to the target page before indexing it
-          if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
-            $db->addHostPageDescription($hostPageId,
-                                        $lastHostPageDescription->title,
-                                        $lastHostPageDescription->description,
-                                        $lastHostPageDescription->keywords,
-                                        $hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
-                                        time());
-          }
-          $hostPagesAdded++;
-        }
-        $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
-      }
-    }
+    // Register new link
+    if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {
+      $hostsAdded     += count($linkToDBresult->new->hostId);
+      $hostPagesAdded += count($linkToDBresult->new->hostPageId);
+      // Register referrer
+      if ($linkToDBresult->old->hostPageId) {
+        foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
+          $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
+        }
+      }
+      if ($linkToDBresult->new->hostPageId) {
+        foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
+          $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
+        }
+      }
+    }
@@ -567,7 +430,8 @@
   // Check for MIME
   $hostPageInMime = false;
-  foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
+  foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
     // Ban page on MIME type not allowed in settings
     if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
@@ -804,16 +668,16 @@
                               $metaTitle,
                               $metaDescription ? Filter::pageDescription($metaDescription) : null,
                               $metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
-                              $content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
+                              $content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
                               time());
   // Collect page DOM elements data on enabled
-  if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
+  if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
     // Begin selectors extraction
     $html = str_get_html($content);
-    foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
+    foreach ((array) explode(';', $hostPageDomSelectors) as $selector) {
      foreach($html->find($selector) as $element) {
@@ -822,8 +686,7 @@
        $db->addHostPageDom($queueHostPage->hostPageId,
                            time(),
                            $selector,
-                            trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
-                              preg_replace('/[\s]+/',
+                            trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
                               ' ',
                               str_replace(['<br />', '<br/>', '<br>', '</'],
                                           [' ', ' ', ' ', ' </'],
@@ -851,7 +714,7 @@
   if (CRAWL_MANIFEST &&
       !empty($metaYggoManifestURL) &&
       filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
-      preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {
+      preg_match(DEFAULT_HOST_URL_REGEXP, $metaYggoManifestURL)) {
       $manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
   }
@@ -891,7 +754,7 @@
                 'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
                 'data'     => null,
                 'mime'     => null,
-                'ref'      => $src,
+                'href'     => $src,
               ];
             }
@@ -923,7 +786,7 @@
                 'keywords' => null,
                 'data'     => null,
                 'mime'     => Filter::mime($type),
-                'ref'      => $src,
+                'href'     => $src,
               ];
             }
@@ -953,7 +816,7 @@
                 'keywords' => null,
                 'data'     => null,
                 'mime'     => Filter::mime($type),
-                'ref'      => $src,
+                'href'     => $src,
               ];
             }
@@ -983,7 +846,7 @@
                 'keywords' => null,
                 'data'     => null,
                 'mime'     => Filter::mime($type),
-                'ref'      => $src,
+                'href'     => $src,
               ];
             }
@@ -1002,7 +865,7 @@
                 'keywords' => null,
                 'data'     => null,
                 'mime'     => null,
-                'ref'      => $src,
+                'href'     => $src,
               ];
             }
@@ -1021,7 +884,7 @@
                 'keywords' => null,
                 'data'     => null,
                 'mime'     => null,
-                'ref'      => $href,
+                'href'     => $href,
              ];
            }
@@ -1084,115 +947,48 @@
                 'keywords' => Filter::pageKeywords($title),
                 'data'     => null,
                 'mime'     => null,
-                'ref'      => $href,
+                'href'     => $href,
              ];
            }
   // Process links collected
   foreach ($links as $link) {
-    //Make relative links absolute
-    if (!parse_url($link['ref'], PHP_URL_HOST)) {
-      $link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
+    // Make relative links absolute
+    if (!parse_url($link['href'], PHP_URL_HOST)) {
+      $link['href'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['href']), '/'), '.');
     }
-    // Validate formatted link
-    if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
-      // Parse formatted link
-      $hostURL     = Parser::hostURL($link['ref']);
-      $hostPageURI = Parser::uri($link['ref']);
-      // Host exists
-      if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
-        $hostStatus        = $host->status;
-        $hostNsfw          = $host->nsfw;
-        $hostPageLimit     = $host->crawlPageLimit;
-        $hostMetaOnly      = $host->crawlMetaOnly;
-        $hostId            = $host->hostId;
-        $hostRobots        = $host->robots;
-        $hostRobotsPostfix = $host->robotsPostfix;
-      // Register new host
-      } else {
-        // Get robots.txt if exists
-        $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-        // Update curl stats
-        $httpRequestsTotal++;
-        $httpRequestsSizeTotal += $curl->getSizeRequest();
-        $httpDownloadSizeTotal += $curl->getSizeDownload();
-        $httpRequestsTimeTotal += $curl->getTotalTime();
-        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-          $hostRobots = $curl->getContent();
-        } else {
-          $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
-        }
-        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-        $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
-        $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
-        $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
-        $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-        $hostId = $db->addHost( $hostURL->scheme,
-                                $hostURL->name,
-                                $hostURL->port,
-                                crc32($hostURL->string),
-                                time(),
-                                null,
-                                $hostPageLimit,
-                                (string) $hostMetaOnly,
-                                (string) $hostStatus,
-                                (string) $hostNsfw,
-                                $hostRobots,
-                                $hostRobotsPostfix);
-        // Add web root host page to make host visible in the crawl queue
-        $db->addHostPage($hostId, crc32('/'), '/', time());
-        // Increase counters
-        $hostPagesAdded++;
-        $hostsAdded++;
-        // When page is root, skip next operations
-        if ($hostPageURI->string == '/') {
-          continue;
-        }
-      }
-      // Init robots parser
-      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-      // Save page info
-      if ($hostStatus && // host enabled
-          $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
-          $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
-        if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
-          $hostPageId = $hostPage->hostPageId;
-        } else {
-          $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
-          $db->addHostPageDescription($hostPageId,
-                                      $link['title'],
-                                      $link['description'],
-                                      $link['keywords'],
-                                      $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
-                                      time());
-          $hostPagesAdded++;
-        }
-        $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
-      }
-    }
+    // Register new link
+    if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) {
+      // Increase new hosts counters
+      if ($linkToDBresult->new->hostId) {
+        $hostsAdded += count($linkToDBresult->new->hostId);
+      }
+      if ($linkToDBresult->new->hostPageId) {
+        $hostPagesAdded += count($linkToDBresult->new->hostPageId);
+      }
+      // Register referrer
+      if ($linkToDBresult->old->hostPageId) {
+        foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
+          $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
+        }
+      }
+      if ($linkToDBresult->new->hostPageId) {
+        foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
+          $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
        }
      }
    }
  }
@@ -1236,7 +1032,7 @@ $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
 // Debug output
 echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
-echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;
+echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
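A sketch for reference, not part of the commit: the host and page bookkeeping removed above is now concentrated in Helper::addLinkToDB(). This condenses the pattern the crawler repeats for every discovered URL ($db, $memcached, $queueHostPage and $url as in crontab/crawler.php; the two referrer loops of the original are merged here for brevity).

<?php
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {
  // Count hosts and pages created by this link
  $hostsAdded     += count($linkToDBresult->new->hostId);
  $hostPagesAdded += count($linkToDBresult->new->hostPageId);
  // Remember which page referred to which, for both known and newly added target pages
  foreach (array_merge($linkToDBresult->old->hostPageId, $linkToDBresult->new->hostPageId) as $hostPageIdTarget) {
    $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
  }
}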

database/yggo.mwb (BIN)

Binary file not shown.

library/filter.php (28)

@@ -2,11 +2,6 @@
 class Filter {
-  static public function string(mixed $data) {
-    return (string) $data;
-  }
   static public function url(mixed $url) {
     $url = (string) $url;
@@ -54,29 +49,6 @@ class Filter {
     return $keywords;
   }
-  static public function pageData(mixed $data) {
-    $data = (string) $data;
-    $filterDataPre = [
-      '/<script.*?\/script>/s',
-      '/<style.*?\/style>/s'
-    ];
-    $filterDataPost = [
-      '/[\s]{2,}/',
-    ];
-    $data = preg_replace($filterDataPre, ' ', $data);
-    $data = html_entity_decode($data);
-    $data = strip_tags($data);
-    $data = preg_replace($filterDataPost, ' ', $data);
-    return $data;
-  }
   static public function searchQuery(string $query, string $mode = 'default') {
     // Create query CRC32

library/helper.php (168 lines, new file)

@@ -0,0 +1,168 @@
<?php
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/robots.php');
class Helper {
public static function getHostSetting(MySQL $db,
Memcached $memcached,
int $hostId,
string $key,
mixed $defaultValue) : mixed {
if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) {
return $value;
}
if (!$value = $db->findHostSettingValue($hostId, $key)) {
$value = $defaultValue;
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $value;
}
public static function setHostSetting(MySQL $db,
Memcached $memcached,
int $hostId,
string $key,
mixed $value) : int {
if ($hostSetting = $db->findHostSetting($hostId, $key)) {
$rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time());
} else {
$rowsAffected = $db->addHostSetting($hostId, $key, $value, time());
}
$memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);
return $rowsAffected;
}
public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed {
// Define variables
$result = (object)
[
'new' => (object)
[
'hostId' => [],
'hostPageId' => [],
],
'old' => (object)
[
'hostId' => [],
'hostPageId' => [],
],
];
// Validate DB connection
if (!$db) {
return false;
}
// Validate link URL
if (!$link = URL::parse($link)) {
return false;
}
// Init host
if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {
// Make sure host URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {
return false;
}
$hostId = $host->hostId;
$result->old->hostId[] = $host->hostId;
} else {
// Make sure link compatible with default host rules before create new host
if (!preg_match(DEFAULT_HOST_URL_REGEXP, $link->host->url)) {
return false;
}
// Register new host
if ($hostId = $db->addHost($link->host->scheme, $link->host->name, $link->host->port, crc32($link->host->url), time())) {
$result->new->hostId[] = $hostId;
// Init required for app web root page
if ($link->page->uri != '/') {
if ($hostPageId = $db->addHostPage($hostId, crc32('/'), '/', time())) {
// Note: commented because of referrer link registration implemented out of this method
// $result->new->hostPageId[] = $hostPageId;
}
}
} else {
return false;
}
}
// Add host page if not exists
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($link->page->uri))) {
$result->old->hostPageId[] = $hostPage->hostPageId;
} else {
// Make sure host page URL compatible with this host rules before continue
if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {
return false;
}
// Validate page limits for this host
if ($db->getTotalHostPages($hostId) > self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {
return false;
}
// Validate ROBOTS.TXT
$robots = new Robots(
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
);
if (!$robots->uriAllowed($link->page->uri)) {
return false;
}
// Validate host page MIME
// Note: passed to the crawl queue to prevent extra-curl requests
// Add host page
if ($hostPageId = $db->addHostPage($hostId, crc32($link->page->uri), $link->page->uri, time())) {
$result->new->hostPageId[] = $hostPageId;
} else {
return false;
}
}
return $result;
}
// Cache host setting requests
}
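A usage sketch for the helpers above, not part of the commit: getHostSetting() answers from Memcached when possible (key "Helper.getHostSetting.<hostId>.<key>", kept for one hour) and otherwise falls back to the hostSetting table, then to the supplied default; setHostSetting() inserts or updates the row and refreshes the cached copy. $hostId is an example value.

<?php
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);

// Read with fallback to the global default from config/app.php.example
$mime = Helper::getHostSetting($db, $memcached, $hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME);

// Write (insert or update) and refresh the cached copy
Helper::setHostSetting($db, $memcached, $hostId, 'PAGES_DATA', true);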

181
library/mysql.php

@ -60,7 +60,7 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function getHostByCRC32URL(int $crc32url) { public function findHostByCRC32URL(int $crc32url) {
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');
@ -78,87 +78,74 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function addHost(string $scheme, public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded) {
string $name,
mixed $port,
int $crc32url,
int $timeAdded,
mixed $timeUpdated,
int $crawlPageLimit,
string $crawlMetaOnly,
string $status,
string $nsfw,
mixed $robots,
mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, $query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
`name`, `name`,
`port`, `port`,
`crc32url`, `crc32url`,
`timeAdded`, `timeAdded`) VALUES (?, ?, ?, ?, ?)');
`timeUpdated`,
`crawlPageLimit`, $query->execute([$scheme, $name, $port, $crc32url, $timeAdded]);
`crawlMetaOnly`,
`status`,
`nsfw`,
`robots`,
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $scheme,
$name,
$port,
$crc32url,
$timeAdded,
$timeUpdated,
$crawlPageLimit,
$crawlMetaOnly,
$status,
$nsfw,
$robots,
$robotsPostfix]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) { // Host settings
public function findHostSettingValue(int $hostId, string $key) {
$query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT `value` FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query->execute([$robots, $timeUpdated, $hostId]); $query->execute([$hostId, $key]);
return $query->rowCount(); return $query->rowCount() ? json_decode($query->fetch()->value) : false;
} }
// Host settings public function findHostSetting(int $hostId, string $key) {
public function getHostSetting(int $hostId, mixed $key) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `key` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
$query->execute([$hostId, $key]); $query->execute([$hostId, $key]);
return $query->rowCount() ? $query->fetch()->value : false; return $query->fetch();
} }
public function getHostSettings(int $hostId) { public function addHostSetting(int $hostId, string $key, mixed $value, int $timeAdded) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?'); $query = $this->_db->prepare('INSERT INTO `hostSetting` (`hostId`, `key`, `value`, `timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$hostId]); $value = json_encode($value);
return $query->fetchAll(); $query->execute(
[
$hostId,
$key,
$value,
$timeAdded
]
);
return $query->rowCount();
} }
public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) { public function updateHostSetting(int $hostSettingId, mixed $value, int $timeUpdated) {
$query = $this->_db->query('UPDATE `hostSetting` SET `value` = ?,
`timeUpdated` = ?
$query = $this->_db->query('INSERT INTO `hostSetting` SET `hostId` = ? WHERE `hostSettingId` = ?
`key` = ?,
`value` = ?,
`timeAdded = ?
ON DUPLICATE KEY UPDATE `value` = ?, LIMIT 1');
`timeUpdated` = ?');
$query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]); $value = json_encode($value);
$query->execute(
[
$value,
$timeUpdated,
$hostSettingId
]
);
return $query->rowCount(); return $query->rowCount();
} }
@ -212,16 +199,12 @@ class MySQL {
public function getTopHostPages(int $limit = 100) { public function getTopHostPages(int $limit = 100) {
// Get ID (to prevent memory over usage) // Get ID (to prevent memory over usage)
$query = $this->_db->query("SELECT `hostPage`.`hostPageId` $query = $this->_db->query("SELECT `hostPageId` FROM `hostPage`
FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
WHERE `host`.`status` = '1' WHERE `httpCode` = 200
AND `hostPage`.`httpCode` = 200 AND `rank` > 0
AND `hostPage`.`rank` > 0 AND `timeBanned` IS NULL
AND `hostPage`.`timeBanned` IS NULL AND `mime` IS NOT NULL
AND `hostPage`.`mime` IS NOT NULL
ORDER BY `rank` DESC ORDER BY `rank` DESC
@ -387,12 +370,11 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) { public function setHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)'); $query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)');
$query->execute([$hostPageIdSource, $hostPageIdTarget]); $query->execute([$hostPageIdSource, $hostPageIdTarget]);
} }
public function deleteHostPageToHostPage(int $hostPageId) { public function deleteHostPageToHostPage(int $hostPageId) {
@ -422,6 +404,15 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function getHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? AND `hostPageIdTarget` = ? LIMIT 1');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
return $query->fetch();
}
public function addHostPageSnap(int $hostPageId, int $timeAdded) { public function addHostPageSnap(int $hostPageId, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)'); $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)');
@ -560,62 +551,46 @@ class MySQL {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset); $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute(); $query->execute([$timeOffset]);
return $query->rowCount(); return $query->rowCount();
} }
// Crawler tools public function resetBannedHosts(int $timeOffset) {
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
$query = $this->_db->prepare("SELECT COUNT(*) AS `total` $query = $this->_db->prepare('UPDATE `host` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
FROM `hostPage` $query->execute([$timeOffset]);
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
return $query->rowCount();
}
// Crawler tools
public function getHostPageCrawlQueueTotal(int $timeFrom) {
WHERE ( $query = $this->_db->prepare("SELECT COUNT(*) AS `total` FROM `hostPage`
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ? OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)
AND `host`.`status` <> ? WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `hostPage`.`timeBanned` IS NULL");
AND `hostPage`.`timeBanned` IS NULL");
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); $query->execute([$timeFrom]);
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
$result = []; $result = [];
// Get ID (to prevent memory over usage) // Get ID (to prevent memory over usage)
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId` $query = $this->_db->prepare("SELECT `hostPageId` FROM `hostPage`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ?
OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)
AND `host`.`status` <> ? WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
AND `hostPage`.`timeBanned` IS NULL
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() ORDER BY LENGTH(`uri`) ASC, RAND()
LIMIT " . (int) $limit); LIMIT " . (int) $limit);
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); $query->execute([$timeFrom]);
// Get required page details // Get required page details
foreach ($query->fetchAll() as $queue) { foreach ($query->fetchAll() as $queue) {
@@ -627,10 +602,6 @@ class MySQL {

                                    `host`.`scheme`,
                                    `host`.`name`,
                                    `host`.`port`,
-                                   `host`.`crawlPageLimit`,
-                                   `host`.`crawlMetaOnly`,
-                                   `host`.`robots`,
-                                   `host`.`robotsPostfix`,
                                    IF (`host`.`port` IS NOT NULL,
                                        CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
@@ -676,13 +647,13 @@ class MySQL {

                                  FROM `host`
-                                 WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?
+                                 WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL
                                  ORDER BY RAND()
                                  LIMIT " . (int) $limit);

-   $query->execute([$timeFrom, 0]);
+   $query->execute([$timeFrom]);

    // Get required page details
    foreach ($query->fetchAll() as $host) {
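The queue selection above no longer joins `host` or gives home pages a separate freshness window: a single `timeFrom` offset plus the page-level `timeBanned` flag decides what is due, and the new `resetBannedHosts()` lifts expired host bans the same way the page variant does. A minimal sketch of calling the new queue signatures, not part of the commit; the batch size of 20 is an illustrative assumption, while `CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET` is the offset `public/search.php` uses later in this diff:

<?php
// Sketch only: exercises the MySQL queue methods shown above
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/mysql.php');

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Pages become due again once older than the queue offset
$timeFrom = time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET;

echo sprintf("Pages in queue: %d\n", $db->getHostPageCrawlQueueTotal($timeFrom));

// Fetch a batch of due pages, shortest URI first, ties in random order
foreach ($db->getHostPageCrawlQueue(20, $timeFrom) as $hostPage) {

  // Each item also carries host scheme/name/port details (see the joined SELECT in the next hunk)
  // ... crawl $hostPage here
}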

73
library/parser.php

@@ -1,73 +0,0 @@
<?php
class Parser {
static public function hostURL(string $string) {
$result = [
'string' => null,
'scheme' => null,
'name' => null,
'port' => null,
];
if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) {
$result['string'] = $hostScheme . '://';
$result['scheme'] = $hostScheme;
} else {
return false;
}
if ($hostName = parse_url($string, PHP_URL_HOST)) {
$result['string'] .= $hostName;
$result['name'] = $hostName;
} else {
return false;
}
if ($hostPort = parse_url($string, PHP_URL_PORT)) {
$result['string'] .= ':' . $hostPort;
$result['port'] = $hostPort;
}
return (object) $result;
}
static public function uri(string $string) {
$result = [
'string' => '/',
'path' => '/',
'query' => null,
];
if ($path = parse_url($string, PHP_URL_PATH)) {
$result['string'] = $path;
$result['path'] = $path;
}
if ($query = parse_url($string, PHP_URL_QUERY)) {
$result['string'] .= '?' . $query;
$result['query'] = '?' . $query;
}
return (object) $result;
}
}

82
library/url.php

@@ -0,0 +1,82 @@
<?php
class URL {
public static function is(string $url) : bool {
return filter_var($url, FILTER_VALIDATE_URL);
}
public static function parse(string $url) : mixed {
$result = (object)
[
'host' => (object)
[
'url' => null,
'scheme' => null,
'name' => null,
'port' => null,
],
'page' => (object)
[
'url' => null,
'uri' => null,
'path' => null,
'query' => null,
]
];
// Validate URL
if (!self::is($url)) {
return false;
}
// Parse host
if ($scheme = parse_url($url, PHP_URL_SCHEME)) {
$result->host->url = $scheme . '://';
$result->host->scheme = $scheme;
} else {
return false;
}
if ($host = parse_url($url, PHP_URL_HOST)) {
$result->host->url .= $host;
$result->host->name = $host;
} else {
return false;
}
if ($port = parse_url($url, PHP_URL_PORT)) {
$result->host->url .= ':' . $port;
$result->host->port = $port;
// port is optional
}
// Parse page
if ($path = parse_url($url, PHP_URL_PATH)) {
$result->page->uri = $path;
$result->page->path = $path;
}
if ($query = parse_url($url, PHP_URL_QUERY)) {
$result->page->uri .= '?' . $query;
$result->page->query = '?' . $query;
}
$result->page->url = $result->host->url . $result->page->uri;
return $result;
}
}
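Taken together, `URL::parse()` replaces the `Parser::hostURL()` / `Parser::uri()` pair removed above and returns both parts from one call. A usage sketch (the address is illustrative):

<?php
// Sketch only: demonstrates the URL helper added above
require_once(__DIR__ . '/../library/url.php');

if ($url = URL::parse('http://example.ygg/forum/index.php?topic=1')) {

  echo $url->host->url; // http://example.ygg       (was Parser::hostURL()->string)
  echo $url->page->uri; // /forum/index.php?topic=1 (was Parser::uri()->string)
  echo $url->page->url; // full page address: host URL and page URI combined
}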

BIN
media/db-prototype.png

Binary file not shown (before: 144 KiB, after: 138 KiB).

19
public/api.php

@@ -1,14 +1,11 @@

 <?php

 // Current version
-define('API_VERSION', 0.12);
+define('API_VERSION', 0.13);

 // Load system dependencies
 require_once(__DIR__ . '/../config/app.php');
-require_once(__DIR__ . '/../library/curl.php');
-require_once(__DIR__ . '/../library/robots.php');
 require_once(__DIR__ . '/../library/filter.php');
-require_once(__DIR__ . '/../library/parser.php');
 require_once(__DIR__ . '/../library/mysql.php');
 require_once(__DIR__ . '/../library/sphinxql.php');

@@ -107,17 +104,9 @@ if (API_ENABLED) {

         'status' => true,
         'result' => [
           'config' => [
-            'websiteDomain' => WEBSITE_DOMAIN,
-            'crawlUrlRegexp' => CRAWL_URL_REGEXP,
-            'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
-            'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
-            'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
-            'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
-            'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
-            'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
-            'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
-            'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
-            'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
+            'WEBSITE_DOMAIN' => WEBSITE_DOMAIN,
+            'DEFAULT_HOST_URL_REGEXP' => DEFAULT_HOST_URL_REGEXP,
+            // @TODO
           ],
           'api' => [
             'version' => (string) API_VERSION,
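The manifest `config` payload now carries only `WEBSITE_DOMAIN` and `DEFAULT_HOST_URL_REGEXP`; the per-host crawl defaults it used to expose are dropped and marked `@TODO`. A hedged consumer sketch, not part of the commit; the `action=manifest` query string and the node address are assumptions for illustration:

<?php
// Sketch of a consumer reading the reduced manifest payload (endpoint and domain are illustrative)
$response = json_decode(file_get_contents('http://yggo.example/api.php?action=manifest'));

if ($response && $response->status) {

  // Only these two config fields remain as of API_VERSION 0.13
  echo $response->result->config->WEBSITE_DOMAIN . PHP_EOL;
  echo $response->result->config->DEFAULT_HOST_URL_REGEXP . PHP_EOL;
  echo $response->result->api->version . PHP_EOL;
}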

24
public/explore.php

@@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');

 require_once(__DIR__ . '/../library/sphinxql.php');

 // Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+  $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Filter request data
 $hp = !empty($_GET['hp']) ? Filter::url($_GET['hp']) : 0;
@@ -283,7 +301,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the

 <?php } else { ?>
   <div style="text-align:center">
     <span><?php echo _('Not found') ?></span>
-    <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+    <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
       <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
     <?php } ?>
   </div>
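From here on, explore.php, index.php, search.php and top.php all repeat the same guarded-connection block, so a failed Sphinx or MySQL connection dumps the exception and exits instead of fataling mid-page. Purely as an illustration and not part of this commit (the helper name is hypothetical), the guard could be shared:

<?php
// Hypothetical helper, not in the commit: one place for the repeated try/catch guard
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');

function connectOrExit(callable $connect) {

  try {

    return $connect();

  } catch (Exception $e) {

    var_dump($e);

    exit;
  }
}

// Usage with the same constants the pages above rely on
$sphinx = connectOrExit(fn () => new SphinxQL(SPHINX_HOST, SPHINX_PORT));
$db     = connectOrExit(fn () => new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD));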

11
public/index.php

@@ -6,7 +6,16 @@ require_once(__DIR__ . '/../library/filter.php');

 require_once(__DIR__ . '/../library/sphinxql.php');

 // Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+  $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 $totalPages = $sphinx->getHostPagesTotal();

126
public/search.php

@@ -2,18 +2,48 @@

 // Load system dependencies
 require_once(__DIR__ . '/../config/app.php');
-require_once(__DIR__ . '/../library/curl.php');
-require_once(__DIR__ . '/../library/robots.php');
 require_once(__DIR__ . '/../library/filter.php');
-require_once(__DIR__ . '/../library/parser.php');
+require_once(__DIR__ . '/../library/url.php');
 require_once(__DIR__ . '/../library/mysql.php');
+require_once(__DIR__ . '/../library/helper.php');
 require_once(__DIR__ . '/../library/sphinxql.php');

 // Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+  $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}
+
+// Connect memcached
+try {
+
+  $memcached = new Memcached();
+  $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Filter request data
 $t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text';
@@ -36,82 +66,34 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the

   sprintf(_('Over %s pages or enter the new one...'), $totalPages),
   sprintf(_('Over %s pages or enter the new one...'), $totalPages),
 ]);

+// Define alert message
+$alertMessages = [];
+
-// Crawl request
-if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
+// Register new host/page on search request contains the link
+if (URL::is($q)) {

-  $db->beginTransaction();
-
   try {

-    // Parse host info
-    if ($hostURL = Parser::hostURL($q)) {
-
-      // Host exists
-      if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
-
-        $hostStatus = $host->status;
-        $hostNsfw = $host->nsfw;
-        $hostPageLimit = $host->crawlPageLimit;
-        $hostMetaOnly = $host->crawlMetaOnly;
-        $hostId = $host->hostId;
-        $hostRobots = $host->robots;
-        $hostRobotsPostfix = $host->robotsPostfix;
-
-      // Register new host
-      } else {
-
-        // Disk quota not reached
-        if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
-
-          // Get robots.txt if exists
-          $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-
-          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
-            $hostRobots = $curl->getContent();
-          } else {
-            $hostRobots = null;
-          }
-
-          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-
-          $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
-          $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
-          $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
-          $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
-
-          $hostId = $db->addHost( $hostURL->scheme,
-                                  $hostURL->name,
-                                  $hostURL->port,
-                                  crc32($hostURL->string),
-                                  time(),
-                                  null,
-                                  $hostPageLimit,
-                                  (string) $hostMetaOnly,
-                                  (string) $hostStatus,
-                                  (string) $hostNsfw,
-                                  $hostRobots,
-                                  $hostRobotsPostfix);
-
-          // Add web root host page to make host visible in the crawl queue
-          $db->addHostPage($hostId, crc32('/'), '/', time());
-        }
-      }
-
-      // Parse page URI
-      $hostPageURI = Parser::uri($q);
-
-      // Init robots parser
-      $robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);
-
-      // Save page info
-      if ($hostStatus && // host enabled
-          $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
-          $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
-          !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists
-
-          $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
-      }
-    }
+    $db->beginTransaction();
+
+    if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {
+
+      if (count($linkToDBresult->new->hostPageId)) {
+
+        $alertMessages[] = _('Link successfully registered in the crawl queue!');
+
+      } else {
+
+        if ($resultsTotal == 0) {
+
+          $alertMessages[] = _('This link already registered in the crawl queue.');
+        }
+      }
+
+    } else {
+
+      $alertMessages[] = _('Link address not supported on this host!');
+    }

     $db->commit();
@@ -124,6 +106,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

   }
 }

+// Count pages in the crawl queue
+if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) {
+
+  $alertMessages[] = sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal);
+}

 ?>
 <!DOCTYPE html>
@@ -313,8 +301,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

 <?php if ($results) { ?>
   <div>
     <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
-    <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
-      <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
+    <?php foreach ($alertMessages as $alertMessage) { ?>
+      <span><?php echo $alertMessage ?></span>
     <?php } ?>
   </div>
 <?php foreach ($results as $result) { ?>
@@ -352,7 +340,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

 <?php } else { ?>
   <div style="text-align:center">
     <span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
-    <?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+    <?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
       <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
     <?php } ?>
   </div>
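search.php now hands the whole host-and-page registration over to `Helper::addLinkToDB()`, passing the database handle, the memcached connection and the submitted link, and reports the outcome through `$alertMessages`. The helper lives in library/helper.php, which this commit changes but which is not shown in this excerpt, so the following is only a sketch of the calling contract as it reads from the page above; apart from `new->hostPageId`, names and behaviour are assumptions:

<?php
// Sketch of the calling contract inferred from public/search.php above;
// everything beyond $result->new->hostPageId is an assumption.
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);

$link = 'http://example.ygg/library/index.php'; // illustrative address

if (URL::is($link) && ($result = Helper::addLinkToDB($db, $memcached, $link))) {

  // false returned            -> link rejected (e.g. it does not match the host URL rules)
  // $result->new->hostPageId  -> IDs of pages registered by this call (empty if already known)
  echo count($result->new->hostPageId) ? 'registered in the crawl queue' : 'already known';
}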

24
public/top.php

@@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');

 require_once(__DIR__ . '/../library/sphinxql.php');

 // Connect Sphinx search server
-$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+try {
+
+  $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Connect database
-$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+try {
+
+  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
+
+} catch(Exception $e) {
+
+  var_dump($e);
+
+  exit;
+}

 // Define page basics
 $totalPages = $sphinx->getHostPagesTotal();
@@ -271,7 +289,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the

 <?php } else { ?>
   <div style="text-align:center">
     <span><?php echo _('Not found') ?></span>
-    <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
+    <?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
       <span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
     <?php } ?>
   </div>
