Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-02-03 10:25:52 +00:00)

Commit d024ffd770: implement unlimited settings customization for each host
Parent: ab6c0379c8
@@ -86,7 +86,7 @@ GET action=hosts - required

##### Application manifest

Returns node information for other nodes that have the same `CRAWL_MANIFEST_API_VERSION` and `CRAWL_URL_REGEXP` conditions.
Returns node information for other nodes that have the same `CRAWL_MANIFEST_API_VERSION` and `DEFAULT_HOST_URL_REGEXP` conditions.

Could be enabled or disabled by the `API_MANIFEST_ENABLED` option
cli/yggo.php (15 lines changed)
@@ -390,7 +390,7 @@ if (!empty($argv[1])) {

$selectors = [];

foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) {

if (!empty($selector)) {

@@ -428,12 +428,11 @@ if (!empty($argv[1])) {

$db->addHostPageDom($hostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
}
}
}

@@ -447,7 +446,7 @@ if (!empty($argv[1])) {

exit;
}

CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file'));
CLI::break();
exit;
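For context, a minimal sketch of the selector resolution this hunk introduces: an explicit third CLI argument wins, otherwise the per-host `PAGES_DOM_SELECTORS` setting is read from the database, and only then does the new global `DEFAULT_HOST_PAGES_DOM_SELECTORS` constant apply. Variable names outside the diff are illustrative.

<?php
// Sketch only: assumes $db->getHostSetting($hostId, $key, $default) returns the
// per-host value when present, or $default otherwise, as used in the hunk above.
$raw = !empty($argv[3])
  ? $argv[3]                                         // explicit CLI override
  : (string) $db->getHostSetting(                    // per-host setting, if any
      $hostPage->hostId,
      'PAGES_DOM_SELECTORS',
      DEFAULT_HOST_PAGES_DOM_SELECTORS               // global fallback (may be false)
    );

// Selectors are ';' separated, e.g. 'h1;h2;h3'; empty chunks are skipped.
foreach (array_filter(explode(';', $raw)) as $selector) {
  // ... run $html->find($selector) and store each match via addHostPageDom()
}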
@@ -64,7 +64,7 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);

define('WEBSITE_IDENTICON_IMAGE_CACHE', true);

// Database
define('DB_HOST', '127.0.0.1');
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');

@@ -75,7 +75,7 @@ define('SPHINX_HOST', '127.0.0.1');

define('SPHINX_PORT', 9306);

// Memcached
define('MEMCACHED_HOST', '127.0.0.1');
define('MEMCACHED_HOST', 'localhost');
define('MEMCACHED_PORT', 11211);

// Snaps

@@ -92,19 +92,19 @@ define('MEMCACHED_PORT', 11211);

*/
define('SNAP_STORAGE', json_encode((object)
[
  'localhost' => [ // @TODO see https://github.com/YGGverse/YGGo#roadmap
  'localhost' => [
    'storage-1' => [
      'directory' => __DIR__ . '/../storage/snap/hps/',
      'quota' => [
        'mime' => false,
        'size' => 10000000024, // @TODO
        'request' => [ // @TODO
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ]
    ],
  ],
  // ...
],

@@ -118,9 +118,9 @@ define('SNAP_STORAGE', json_encode((object)

      'timeout' => 30,
      'passive' => true,
      'quota' => [
        'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
        'size' => 10000000024, // @TODO
        'request' => [ // @TODO
        'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60

@@ -133,6 +133,7 @@ define('SNAP_STORAGE', json_encode((object)

  ]
));

// Proxy settings

/*
@@ -143,7 +144,75 @@ define('SNAP_STORAGE', json_encode((object)
*/
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
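Since SNAP_STORAGE is stored as a JSON string, any consumer has to decode it before use. A minimal sketch of that read path, with assumed variable names (the actual snap-writing code is not part of this diff):

<?php
// Sketch only: the decode step mirrors the json_encode() call in the config above.
$snapStorage = json_decode(SNAP_STORAGE, true);

foreach ($snapStorage['localhost'] as $name => $storage) {
  // e.g. 'storage-1' => target directory plus quota limits
  $directory = $storage['directory'];
  $sizeLimit = $storage['quota']['size'];   // bytes allowed in this storage
  // ... write the compressed snap only while the quota is not exceeded
}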
// Crawl settings
// Host defaults

/*
 * Only URL addresses that match this rule will be crawled
 *
 */
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only

/*
 * Default robots.txt rules (overwritten when remote rules are available)
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT', null);

/*
 * These rules are forcibly appended to the remote robots.txt file
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);

/*
 * Pages limit per new host by default
 *
 * Crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 */
define('DEFAULT_HOST_PAGES_LIMIT', 100000);

/*
 * Index pages that match these MIME types
 *
 * comma separated
 *
 */
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');

/*
 * Save meta tags + base64 encoded page content in the `hostPage`.`data` field,
 * or false to index meta tags only
 *
 * Warning!
 * when enabled, this option requires huge disk storage;
 * it's an experimental feature, oriented to index operations
 *
 */
define('DEFAULT_HOST_PAGES_DATA', false);

/*
 * Generates the hostPageDom index based on the hostPage.data field
 *
 * Could be useful for building semantic index queries (config/sphinx.conf.txt)
 *
 * At this moment the feature is available in the CLI only (cli/yggo.php)
 *
 */
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated

/*
 * Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content
 *
 */
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);
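Each DEFAULT_HOST_* value above is only a fallback: the per-host override lives in the new `hostSetting` table and is resolved through the Helper::getHostSetting() method added in library/helper.php further below. A short sketch of that lookup chain (memcached cache, then DB row, then the constant); the host id is illustrative:

<?php
// Sketch of the per-host override chain introduced by this commit.
$pagesLimit = Helper::getHostSetting(
  $db,
  $memcached,
  $hostId,                     // any registered host id (illustrative)
  'PAGES_LIMIT',               // per-host key stored in the `hostSetting` table
  DEFAULT_HOST_PAGES_LIMIT     // global default applied when no row exists
);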
// Crawl queue

/*
 * Crawler / Bot User Agent name
@@ -159,18 +228,18 @@ define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.co
 * See also: CURLOPT_TIMEOUT (library/curl.php)
 *
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);

/*
 * Stop crawler on disk quota reached (Mb)
 *
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
 * This option is related to the CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
@@ -179,105 +248,20 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
 * Set 0 to disable
 *
 */
define('CRAWL_PAGE_LIMIT', 20);
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);

/*
 * Renew page index by the timing offset provided
 *
 * This option works with the CRAWL_PAGE_LIMIT step queue
 * This option works with the CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
 *
 * Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * Note that the CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 *
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);

/*
 * Renew home page index by the timing offset provided
 *
 * Used to scan new pages with higher priority
 *
 * This option works with the CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
 *
 * Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 *
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
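To make the "must be large enough" constraint concrete, a rough back-of-the-envelope check with assumed numbers (the crontab period is not part of this diff and is only a typical setup):

<?php
// Rough capacity check for the queue settings above.
$queueLimit  = 10;              // CRAWL_HOST_PAGE_QUEUE_LIMIT
$cronPeriod  = 60;              // seconds between crawler.php runs (assumed: every minute)
$renewOffset = 60*60*24*30*12;  // CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET = 31,104,000 s

// Pages the crawler can revisit within one renew window:
$capacity = $queueLimit * ($renewOffset / $cronPeriod);  // 10 * 518,400 = 5,184,000

// If the hostPage table grows beyond $capacity, pages start waiting longer than
// the offset and the queue effectively never drains.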
/*
 * Index pages that match these MIME types
 *
 * comma separated
 *
 */
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');

/*
 * Only URL addresses that match this rule will be auto-crawled
 *
 */
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');

/*
 * Pages limit per new host by default
 *
 * Crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host could be provided in the DB `host`.`crawlPageLimit` field
 *
 */
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);

/*
 * Default auto-crawl status for newly added hosts
 *
 * true - the crawler automatically starts the page indexer, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
 * false - requires manual validation by the moderator in the DB `host`.`status` field
 *
 * This option also disables the host in the search results
 *
 */
define('CRAWL_HOST_DEFAULT_STATUS', true);

/*
 * Index meta tags only,
 * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
 *
 * A custom rule for a specific host could be provided in the DB `host`.`crawlMetaOnly` field
 *
 * Warning!
 * disabling this option requires huge disk storage;
 * it's an experimental feature, oriented to index operations
 *
 */
define('CRAWL_HOST_DEFAULT_META_ONLY', true);

/*
 * Not-safe-for-work status for new hosts by default
 *
 * Could be filtered in search results
 *
 * A custom rule for a specific host could be provided in the DB `host`.`nsfw` field
 *
 */
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
 * Collect the sitemap index when available
 *
 * At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled
 *
 * When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);

/*
 * Re-calculate page rank on page update
@@ -287,7 +271,7 @@ define('CRAWL_SITEMAPS', true);
 * true|false
 *
 */
define('CRAWL_PAGE_RANK_UPDATE', true);
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);

/*
 * Renew hosts index by the timing offset provided
@@ -303,54 +287,29 @@ define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
 */
define('CRAWL_HOST_LIMIT', 1);

/*
 * Collect the sitemap index when available
 *
 * At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled
 *
 * When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Crawl robots.txt
 */
define('CRAWL_ROBOTS', true); // true|false

/*
 * Default robots.txt rules when the remote file does not exist
 * The crawler is able to overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robots.md
 *
 */
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null

/*
 * Permanent rules appended to robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
 * The crawler does not overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
 *
 */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null

/*
 * Generates the hostPageDom index based on the hostPage.data field
 *
 * Could be useful for building semantic index queries (config/sphinx.conf.txt)
 *
 * At this moment the feature is available in the CLI only (cli/yggo.php)
 *
 */
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');

/*
 * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
 *
 */
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);

/*
 * Look for third-party manifests to collect a distributed index
 *
 * The API address provided in the yggo meta tag
 * will be stored in the `manifest` DB table
 *
 * Collects URLs that match the CRAWL_URL_REGEXP condition
 * Collects URLs that match the DEFAULT_HOST_URL_REGEXP condition
 *
 */
define('CRAWL_MANIFEST', true);
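The manifest address itself is no longer tied to a dedicated column: the crawler.php hunk later in this commit validates the URL found in the yggo meta tag against the host URL rule and stores it as a per-host setting. A condensed sketch of that path, mirroring the hunk (variable names come from crawler.php):

<?php
// Sketch following the crawler.php hunk below; $metaYggoManifestURL is whatever
// URL the crawled page exposed in its yggo meta tag.
if (CRAWL_MANIFEST &&
    !empty($metaYggoManifestURL) &&
    filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
    preg_match(DEFAULT_HOST_URL_REGEXP, $metaYggoManifestURL)) {

  // Saved as the per-host MANIFEST_URL setting instead of a `manifest` table row
  $manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
}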
@@ -359,10 +318,17 @@ define('CRAWL_MANIFEST', true);

 * Manifest API version compatibility
 *
 */
define('CRAWL_MANIFEST_API_VERSION', 0.12);
define('CRAWL_MANIFEST_API_VERSION', 0.13);

// Cleaner settings

/*
 * Remove host ban after the following time
 *
 * This option is used in the crawler and the search page
 * to prevent extra HTTP requests to unavailable or non-matching resources
 *
 */
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Remove page ban after the following time
 *
@@ -370,7 +336,7 @@ define('CRAWL_MANIFEST_API_VERSION', 0.12);
 * to prevent extra HTTP requests to unavailable or non-matching resources
 *
 */
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Database tables optimization
@@ -382,7 +348,7 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
 * When enabled, requires enough RAM
 *
 */
define('CLEAN_DB_TABLES_OPTIMIZATION', false);
define('CLEAN_DB_TABLES_OPTIMIZATION', true);

// API settings

@@ -420,17 +386,12 @@ define('API_HOSTS_ENABLED', true);

 * Database host fields, comma separated, or * to share all the fields
 *
 */
define('API_HOSTS_FIELDS',
'`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
                                CONCAT(`scheme`, '://', `name`, ':', `port`),
                                CONCAT(`scheme`, '://', `name`)
                            ) AS `url`,
                            `timeAdded`,
                            `timeUpdated`");

/*
 * Manifest API
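With the trimmed field list above, one row returned by the hosts API carries only the reconstructed URL and the two timestamps. A rough illustration of the row shape; the values are invented for the example and the surrounding status/result envelope is an assumption taken from how crawler.php consumes remote manifests, not from api.php itself:

<?php
// Illustrative shape of a single hosts-API row after this change.
$exampleRow = [
  'url'         => 'http://[200:1111:2222:3333:4444:5555:6666:7777]',  // scheme://name[:port]
  'timeAdded'   => 1688729461,
  'timeUpdated' => 1688729519,
];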
@@ -28,8 +28,11 @@ require_once(__DIR__ . '/../library/mysql.php');

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Reset banned hosts
$hostsBansRemoved = $db->resetBannedHostPages(time() - CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET);

// Reset banned pages
$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
$hostPagesBansRemoved = $db->resetBannedHosts(time() - CLEAN_HOST_BAN_SECONDS_OFFSET);

// Optimize tables
if (CLEAN_DB_TABLES_OPTIMIZATION) {

@@ -45,6 +48,7 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) {

}

// Debug
echo 'Host bans removed: ' . $hostsBansRemoved . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;

echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
@ -24,9 +24,10 @@ require_once(__DIR__ . '/../library/ftp.php');
|
||||
require_once(__DIR__ . '/../library/curl.php');
|
||||
require_once(__DIR__ . '/../library/robots.php');
|
||||
require_once(__DIR__ . '/../library/sitemap.php');
|
||||
require_once(__DIR__ . '/../library/url.php');
|
||||
require_once(__DIR__ . '/../library/filter.php');
|
||||
require_once(__DIR__ . '/../library/parser.php');
|
||||
require_once(__DIR__ . '/../library/mysql.php');
|
||||
require_once(__DIR__ . '/../library/helper.php');
|
||||
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
||||
|
||||
// Check disk quota
|
||||
@ -62,27 +63,38 @@ try {
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Connect memcached
|
||||
try {
|
||||
|
||||
$memcached = new Memcached();
|
||||
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
var_dump($e);
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Process hosts crawl queue
|
||||
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
|
||||
|
||||
$db->beginTransaction();
|
||||
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {
|
||||
|
||||
try {
|
||||
|
||||
// Update host crawl queue
|
||||
$hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
|
||||
$db->beginTransaction();
|
||||
|
||||
// Crawl robots.txt
|
||||
// Update host crawl queue
|
||||
$hostsProcessed += $db->updateHostCrawlQueue($queueHost->hostId, time());
|
||||
|
||||
// Update host robots.txt settings from remote host
|
||||
if (CRAWL_ROBOTS) {
|
||||
|
||||
// Update robots
|
||||
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
$curl = new Curl($queueHost->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
@ -90,61 +102,63 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
// Sitemap provided in robots.txt
|
||||
if (200 == $curl->getCode()) {
|
||||
// Update robots.txt rules
|
||||
if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) {
|
||||
|
||||
$hostRobots = $curl->getContent();
|
||||
|
||||
} else {
|
||||
|
||||
$hostRobots = $host->robots;
|
||||
Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent());
|
||||
}
|
||||
|
||||
// Update host index
|
||||
$db->updateHostRobots($host->hostId, $hostRobots, time());
|
||||
}
|
||||
|
||||
// Process sitemaps when enabled
|
||||
if (CRAWL_SITEMAPS) {
|
||||
|
||||
// Look for custom sitemap URL served in robots.txt
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
$robots = new Robots(
|
||||
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
|
||||
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
|
||||
);
|
||||
|
||||
if ($hostSitemapPath = $robots->getSitemap()) {
|
||||
if ($sitemapLink = $robots->getSitemap()) {
|
||||
|
||||
// Replace relative paths
|
||||
$hostSitemapPath = trim($hostSitemapPath, '/');
|
||||
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
|
||||
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
|
||||
// Replace relative paths
|
||||
$sitemapURL = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($hostCrawlQueue->url, '', $sitemapLink), '/'));
|
||||
|
||||
// Set default path when not exists
|
||||
// Set default path
|
||||
} else {
|
||||
|
||||
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
|
||||
$sitemapURL = sprintf('%s/sitemap.xml', $queueHost->url);
|
||||
}
|
||||
|
||||
// Init sitemap data
|
||||
$sitemap = new Sitemap($hostSitemapPath);
|
||||
// Init sitemap
|
||||
$sitemap = new Sitemap($sitemapURL);
|
||||
|
||||
if ($sitemapLinks = $sitemap->getLinks()) {
|
||||
|
||||
$sitemapsProcessed++;
|
||||
|
||||
// Process collected sitemap links
|
||||
foreach ($sitemapLinks as $link => $attributes) {
|
||||
foreach ($sitemapLinks as $loc => $attributes) {
|
||||
|
||||
// Parse formatted link
|
||||
$linkURI = Parser::uri($link);
|
||||
$linkHostURL = Parser::hostURL($link);
|
||||
// Replace relative paths
|
||||
$loc = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($queueHost->url, '', $loc), '/'));
|
||||
|
||||
// Add host page
|
||||
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
||||
$linkHostURL->string == $host->url && // this host links only
|
||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||
// Validate link
|
||||
if (!$link = URL::parse($loc)) {
|
||||
|
||||
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Collect this host links only
|
||||
if ($link->host->url != $queueHost->url) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Register new link
|
||||
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) {
|
||||
|
||||
$hostsAdded += count($linkToDBresult->new->hostId);
|
||||
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -152,8 +166,11 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
|
||||
// Update manifests
|
||||
if (CRAWL_MANIFEST) {
|
||||
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
|
||||
|
||||
// Host have manifest provided
|
||||
if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) {
|
||||
|
||||
// Get remote manifest
|
||||
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
@ -165,42 +182,32 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
// Skip processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifest = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on json encoding error
|
||||
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on required fields missed
|
||||
if (empty($remoteManifest->status) ||
|
||||
empty($remoteManifest->result->config->crawlUrlRegexp) ||
|
||||
empty($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP) ||
|
||||
empty($remoteManifest->result->api->version) ||
|
||||
empty($remoteManifest->result->api->hosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on API version not compatible
|
||||
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
||||
|
||||
$db->commit();
|
||||
if ($remoteManifest->result->api->version !== API_VERSION) {
|
||||
|
||||
continue;
|
||||
}
|
||||
@ -208,28 +215,24 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
// Skip processing on host API not available
|
||||
if (!$remoteManifest->result->api->hosts) {
|
||||
|
||||
$db->commit();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on remote host URL does not match local condition
|
||||
if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP !=
|
||||
Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
|
||||
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
||||
|
||||
$db->commit();
|
||||
// Skip processing on remote host link does not match local condition
|
||||
if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP),
|
||||
$remoteManifest->result->api->hosts)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on host link does not match condition
|
||||
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Begin hosts collection
|
||||
// Grab host URLs
|
||||
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
@ -241,32 +244,23 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
// Skip processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifestHosts = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
if (!$remoteManifest = $curl->getContent()) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on json encoding error
|
||||
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
|
||||
|
||||
$db->commit();
|
||||
if (!$remoteManifestHosts = @json_decode($remoteManifest)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on required fields missed
|
||||
if (empty($remoteManifestHosts->status) ||
|
||||
empty($remoteManifestHosts->result)) {
|
||||
|
||||
$db->commit();
|
||||
if (empty($remoteManifestHosts->result)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
@ -275,64 +269,16 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
foreach ($remoteManifestHosts->result as $remoteManifestHost) {
|
||||
|
||||
// Skip processing on required fields missed
|
||||
if (empty($remoteManifestHost->scheme) ||
|
||||
empty($remoteManifestHost->name)) {
|
||||
if (empty($remoteManifestHost->url)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$hostURL = $remoteManifestHost->scheme . '://' .
|
||||
$remoteManifestHost->name .
|
||||
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
||||
// Register new link
|
||||
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) {
|
||||
|
||||
// Validate formatted link
|
||||
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
|
||||
|
||||
// Host not exists
|
||||
if (!$db->getHostByCRC32URL(crc32($hostURL))) {
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||
$hostRobots = $curl->getContent();
|
||||
} else {
|
||||
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
||||
}
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
|
||||
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
|
||||
$remoteManifestHosts->result->name,
|
||||
$remoteManifestHosts->result->port,
|
||||
crc32($hostURL),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
|
||||
// Increase counters
|
||||
$hostPagesAdded++;
|
||||
$hostsAdded++;
|
||||
}
|
||||
$hostsAdded += count($linkToDBresult->new->hostId);
|
||||
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -354,7 +300,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
@ -370,9 +316,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
// Update page rank
|
||||
if (CRAWL_PAGE_RANK_UPDATE) {
|
||||
|
||||
// @TODO add common method
|
||||
if (CRAWL_HOST_PAGE_RANK_UPDATE) {
|
||||
|
||||
$hostPageRank = 0;
|
||||
|
||||
@ -432,113 +376,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$url = trim($match[1]);
|
||||
|
||||
//Make relative links absolute
|
||||
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
|
||||
if (!parse_url($url, PHP_URL_HOST)) {
|
||||
|
||||
$url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted link
|
||||
if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
|
||||
// Register new link
|
||||
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) {
|
||||
|
||||
// Parse formatted link
|
||||
$hostURL = Parser::hostURL($url);
|
||||
$hostPageURI = Parser::uri($url);
|
||||
$hostsAdded += count($linkToDBresult->new->hostId);
|
||||
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
|
||||
|
||||
// Host exists
|
||||
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
|
||||
// Register referrer
|
||||
if ($linkToDBresult->old->hostPageId) {
|
||||
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostMetaOnly = $host->crawlMetaOnly;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
|
||||
|
||||
// Register new host
|
||||
} else {
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||
$hostRobots = $curl->getContent();
|
||||
} else {
|
||||
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
||||
}
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
|
||||
$hostId = $db->addHost( $hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
|
||||
// Increase counters
|
||||
$hostPagesAdded++;
|
||||
$hostsAdded++;
|
||||
|
||||
// When page is root, skip next operations
|
||||
if ($hostPageURI->string == '/') {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
|
||||
}
|
||||
}
|
||||
|
||||
// Init robots parser
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
if ($linkToDBresult->new->hostPageId) {
|
||||
|
||||
// Save page info
|
||||
if ($hostStatus && // host enabled
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||
foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
|
||||
|
||||
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||
|
||||
$hostPageId = $hostPage->hostPageId;
|
||||
|
||||
} else {
|
||||
|
||||
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||
|
||||
// Apply referer meta description to the target page before indexing it
|
||||
if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
|
||||
|
||||
$db->addHostPageDescription($hostPageId,
|
||||
$lastHostPageDescription->title,
|
||||
$lastHostPageDescription->description,
|
||||
$lastHostPageDescription->keywords,
|
||||
$hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
|
||||
time());
|
||||
}
|
||||
|
||||
$hostPagesAdded++;
|
||||
}
|
||||
|
||||
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
|
||||
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -567,7 +430,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
|
||||
// Check for MIME
|
||||
$hostPageInMime = false;
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
|
||||
|
||||
foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) {
|
||||
|
||||
// Ban page on MIME type not allowed in settings
|
||||
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
||||
@ -622,8 +486,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
// Insert compressed snap data into the tmp storage
|
||||
if (true === $zip->addFromString('DATA', $content) &&
|
||||
true === $zip->addFromString('META', sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url($queueHostPage->hostPageURL)) . PHP_EOL .
|
||||
sprintf('TIMESTAMP: %s', time()))) {
|
||||
sprintf('SOURCE: %s', Filter::url($queueHostPage->hostPageURL)) . PHP_EOL .
|
||||
sprintf('TIMESTAMP: %s', time()))) {
|
||||
}
|
||||
}
|
||||
|
||||
@ -802,18 +666,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
// Add queued page description if not exists
|
||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||
$metaTitle,
|
||||
$metaDescription ? Filter::pageDescription($metaDescription) : null,
|
||||
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
|
||||
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
||||
$metaDescription ? Filter::pageDescription($metaDescription) : null,
|
||||
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
|
||||
$content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null,
|
||||
time());
|
||||
|
||||
// Collect page DOM elements data on enabled
|
||||
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
|
||||
if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
|
||||
|
||||
// Begin selectors extraction
|
||||
$html = str_get_html($content);
|
||||
|
||||
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
||||
foreach ((array) explode(';', $hostPageDomSelectors) as $selector) {
|
||||
|
||||
foreach($html->find($selector) as $element) {
|
||||
|
||||
@ -822,12 +686,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$db->addHostPageDom($queueHostPage->hostPageId,
|
||||
time(),
|
||||
$selector,
|
||||
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
||||
preg_replace('/[\s]+/',
|
||||
' ',
|
||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
||||
[' ', ' ', ' ', ' </'],
|
||||
$element->innertext))) : $element->innertext));
|
||||
trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/',
|
||||
' ',
|
||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
||||
[' ', ' ', ' ', ' </'],
|
||||
$element->innertext))) : $element->innertext));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -851,7 +714,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
if (CRAWL_MANIFEST &&
|
||||
!empty($metaYggoManifestURL) &&
|
||||
filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
|
||||
preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {
|
||||
preg_match(DEFAULT_HOST_URL_REGEXP, $metaYggoManifestURL)) {
|
||||
|
||||
$manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
|
||||
}
|
||||
@ -891,7 +754,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
|
||||
'data' => null,
|
||||
'mime' => null,
|
||||
'ref' => $src,
|
||||
'href' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
@ -923,7 +786,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => null,
|
||||
'data' => null,
|
||||
'mime' => Filter::mime($type),
|
||||
'ref' => $src,
|
||||
'href' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
@ -953,7 +816,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => null,
|
||||
'data' => null,
|
||||
'mime' => Filter::mime($type),
|
||||
'ref' => $src,
|
||||
'href' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
@ -983,7 +846,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => null,
|
||||
'data' => null,
|
||||
'mime' => Filter::mime($type),
|
||||
'ref' => $src,
|
||||
'href' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
@ -1002,7 +865,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => null,
|
||||
'data' => null,
|
||||
'mime' => null,
|
||||
'ref' => $src,
|
||||
'href' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
@ -1021,7 +884,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => null,
|
||||
'data' => null,
|
||||
'mime' => null,
|
||||
'ref' => $href,
|
||||
'href' => $href,
|
||||
];
|
||||
}
|
||||
|
||||
@ -1084,115 +947,48 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
'keywords' => Filter::pageKeywords($title),
|
||||
'data' => null,
|
||||
'mime' => null,
|
||||
'ref' => $href,
|
||||
'href' => $href,
|
||||
];
|
||||
}
|
||||
|
||||
// Process links collected
|
||||
foreach ($links as $link) {
|
||||
|
||||
//Make relative links absolute
|
||||
if (!parse_url($link['ref'], PHP_URL_HOST)) {
|
||||
// Make relative links absolute
|
||||
if (!parse_url($link['href'], PHP_URL_HOST)) {
|
||||
|
||||
$link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
|
||||
$link['href'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['href']), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted link
|
||||
if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
|
||||
// Register new link
|
||||
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) {
|
||||
|
||||
// Parse formatted link
|
||||
$hostURL = Parser::hostURL($link['ref']);
|
||||
$hostPageURI = Parser::uri($link['ref']);
|
||||
// Increase new hosts counters
|
||||
if ($linkToDBresult->new->hostId) {
|
||||
|
||||
// Host exists
|
||||
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
|
||||
$hostsAdded += count($linkToDBresult->new->hostId);
|
||||
}
|
||||
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostMetaOnly = $host->crawlMetaOnly;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
if ($linkToDBresult->new->hostPageId) {
|
||||
|
||||
// Register new host
|
||||
} else {
|
||||
$hostPagesAdded += count($linkToDBresult->new->hostPageId);
|
||||
}
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
// Register referrer
|
||||
if ($linkToDBresult->old->hostPageId) {
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) {
|
||||
|
||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||
$hostRobots = $curl->getContent();
|
||||
} else {
|
||||
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
||||
}
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
|
||||
$hostId = $db->addHost( $hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
|
||||
// Increase counters
|
||||
$hostPagesAdded++;
|
||||
$hostsAdded++;
|
||||
|
||||
// When page is root, skip next operations
|
||||
if ($hostPageURI->string == '/') {
|
||||
|
||||
continue;
|
||||
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
|
||||
}
|
||||
}
|
||||
|
||||
// Init robots parser
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
if ($linkToDBresult->new->hostPageId) {
|
||||
|
||||
// Save page info
|
||||
if ($hostStatus && // host enabled
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||
foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) {
|
||||
|
||||
if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) {
|
||||
|
||||
$hostPageId = $hostPage->hostPageId;
|
||||
|
||||
} else {
|
||||
|
||||
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||
|
||||
$db->addHostPageDescription($hostPageId,
|
||||
$link['title'],
|
||||
$link['description'],
|
||||
$link['keywords'],
|
||||
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
|
||||
time());
|
||||
|
||||
$hostPagesAdded++;
|
||||
}
|
||||
|
||||
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
|
||||
$db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1236,7 +1032,7 @@ $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
||||
|
||||
// Debug output
|
||||
echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
|
||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;
|
||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||
|
||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||
|
Binary file not shown.
@ -2,11 +2,6 @@
|
||||
|
||||
class Filter {
|
||||
|
||||
static public function string(mixed $data) {
|
||||
|
||||
return (string) $data;
|
||||
}
|
||||
|
||||
static public function url(mixed $url) {
|
||||
|
||||
$url = (string) $url;
|
||||
@ -54,29 +49,6 @@ class Filter {
|
||||
return $keywords;
|
||||
}
|
||||
|
||||
static public function pageData(mixed $data) {
|
||||
|
||||
$data = (string) $data;
|
||||
|
||||
$filterDataPre = [
|
||||
'/<script.*?\/script>/s',
|
||||
'/<style.*?\/style>/s'
|
||||
];
|
||||
|
||||
$filterDataPost = [
|
||||
'/[\s]{2,}/',
|
||||
];
|
||||
|
||||
$data = preg_replace($filterDataPre, ' ', $data);
|
||||
|
||||
$data = html_entity_decode($data);
|
||||
$data = strip_tags($data);
|
||||
|
||||
$data = preg_replace($filterDataPost, ' ', $data);
|
||||
|
||||
return $data;
|
||||
}
|
||||
|
||||
static public function searchQuery(string $query, string $mode = 'default') {
|
||||
|
||||
// Create query CRC32
|
||||
|
library/helper.php (new file, 168 lines)
@@ -0,0 +1,168 @@
<?php

require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/robots.php');

class Helper {

  public static function getHostSetting(MySQL $db,
                                        Memcached $memcached,
                                        int $hostId,
                                        string $key,
                                        mixed $defaultValue) : mixed {

    if ($value = $memcached->get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) {

      return $value;
    }

    if (!$value = $db->findHostSettingValue($hostId, $key)) {

      $value = $defaultValue;
    }

    $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);

    return $value;
  }

  public static function setHostSetting(MySQL $db,
                                        Memcached $memcached,
                                        int $hostId,
                                        string $key,
                                        mixed $value) : int {

    if ($hostSetting = $db->findHostSetting($hostId, $key)) {

      $rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time());

    } else {

      $rowsAffected = $db->addHostSetting($hostId, $key, $value, time());
    }

    $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600);

    return $rowsAffected;
  }

  public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed {

    // Define variables
    $result = (object)
    [
      'new' => (object)
      [
        'hostId' => [],
        'hostPageId' => [],
      ],
      'old' => (object)
      [
        'hostId' => [],
        'hostPageId' => [],
      ],
    ];

    // Validate DB connection
    if (!$db) {

      return false;
    }

    // Validate link URL
    if (!$link = URL::parse($link)) {

      return false;
    }

    // Init host
    if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) {

      // Make sure the host URL is compatible with this host's rules before continuing
      if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) {

        return false;
      }

      $hostId = $host->hostId;

      $result->old->hostId[] = $host->hostId;

    } else {

      // Make sure the link is compatible with the default host rules before creating a new host
      if (!preg_match(DEFAULT_HOST_URL_REGEXP, $link->host->url)) {

        return false;
      }

      // Register new host
      if ($hostId = $db->addHost($link->host->scheme, $link->host->name, $link->host->port, crc32($link->host->url), time())) {

        $result->new->hostId[] = $hostId;

        // Init the web root page required by the application
        if ($link->page->uri != '/') {

          if ($hostPageId = $db->addHostPage($hostId, crc32('/'), '/', time())) {

            // Note: commented out because referrer link registration is implemented outside of this method
            // $result->new->hostPageId[] = $hostPageId;
          }
        }

      } else {

        return false;
      }
    }

    // Add host page if not exists
    if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($link->page->uri))) {

      $result->old->hostPageId[] = $hostPage->hostPageId;

    } else {

      // Make sure the host page URL is compatible with this host's rules before continuing
      if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) {

        return false;
      }

      // Validate page limits for this host
      if ($db->getTotalHostPages($hostId) > self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) {

        return false;
      }

      // Validate robots.txt
      $robots = new Robots(
        self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL .
        self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX)
      );

      if (!$robots->uriAllowed($link->page->uri)) {

        return false;
      }

      // Validate host page MIME
      // Note: passed to the crawl queue to prevent extra curl requests

      // Add host page
      if ($hostPageId = $db->addHostPage($hostId, crc32($link->page->uri), $link->page->uri, time())) {

        $result->new->hostPageId[] = $hostPageId;

      } else {

        return false;
      }
    }

    return $result;
  }

  // Cache host setting requests
}
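A short usage sketch of the new helper; the key names come from this commit, while the link URL and host id are illustrative placeholders:

<?php
// Usage sketch: $db and $memcached are the connections opened by the crawler.
require_once __DIR__ . '/../library/helper.php';

// Read a per-host setting, falling back to the global default constant
$mimeList = Helper::getHostSetting($db, $memcached, $hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME);

// Override the same setting for this host only (stored in `hostSetting`, cached for an hour)
Helper::setHostSetting($db, $memcached, $hostId, 'PAGES_MIME', 'text/html,text/plain');

// Register a link: creates the host and/or host page rows when they do not exist yet
if ($result = Helper::addLinkToDB($db, $memcached, 'http://[301:23de:7645:1abc::789]/index.html')) {
  $newHosts = count($result->new->hostId);
  $newPages = count($result->new->hostPageId);
}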
@ -60,7 +60,7 @@ class MySQL {
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
public function getHostByCRC32URL(int $crc32url) {
|
||||
public function findHostByCRC32URL(int $crc32url) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');
|
||||
|
||||
@ -78,87 +78,74 @@ class MySQL {
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function addHost(string $scheme,
|
||||
string $name,
|
||||
mixed $port,
|
||||
int $crc32url,
|
||||
int $timeAdded,
|
||||
mixed $timeUpdated,
|
||||
int $crawlPageLimit,
|
||||
string $crawlMetaOnly,
|
||||
string $status,
|
||||
string $nsfw,
|
||||
mixed $robots,
|
||||
mixed $robotsPostfix) {
|
||||
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
|
||||
`name`,
|
||||
`port`,
|
||||
`crc32url`,
|
||||
`timeAdded`,
|
||||
`timeUpdated`,
|
||||
`crawlPageLimit`,
|
||||
`crawlMetaOnly`,
|
||||
`status`,
|
||||
`nsfw`,
|
||||
`robots`,
|
||||
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([ $scheme,
|
||||
$name,
|
||||
$port,
|
||||
$crc32url,
|
||||
$timeAdded,
|
||||
$timeUpdated,
|
||||
$crawlPageLimit,
|
||||
$crawlMetaOnly,
|
||||
$status,
|
||||
$nsfw,
|
||||
$robots,
|
||||
$robotsPostfix]);
|
||||
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
|
||||
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
|
||||
// Host settings
|
||||
public function findHostSettingValue(int $hostId, string $key) {
|
||||
|
||||
$query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
|
||||
$query = $this->_db->prepare('SELECT `value` FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$robots, $timeUpdated, $hostId]);
|
||||
$query->execute([$hostId, $key]);
|
||||
|
||||
return $query->rowCount() ? json_decode($query->fetch()->value) : false;
|
||||
}
|
||||
|
||||
public function findHostSetting(int $hostId, string $key) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$hostId, $key]);
|
||||
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
public function addHostSetting(int $hostId, string $key, mixed $value, int $timeAdded) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `hostSetting` (`hostId`, `key`, `value`, `timeAdded`) VALUES (?, ?, ?, ?)');
|
||||
|
||||
$value = json_encode($value);
|
||||
|
||||
$query->execute(
|
||||
[
|
||||
$hostId,
|
||||
$key,
|
||||
$value,
|
||||
$timeAdded
|
||||
]
|
||||
);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
// Host settings
|
||||
public function getHostSetting(int $hostId, mixed $key) {
|
||||
public function updateHostSetting(int $hostSettingId, mixed $value, int $timeUpdated) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `key` = ? LIMIT 1');
|
||||
$query = $this->_db->query('UPDATE `hostSetting` SET `value` = ?,
|
||||
`timeUpdated` = ?
|
||||
|
||||
$query->execute([$hostId, $key]);
|
||||
WHERE `hostSettingId` = ?
|
||||
|
||||
return $query->rowCount() ? $query->fetch()->value : false;
|
||||
}
|
||||
LIMIT 1');
|
||||
|
||||
public function getHostSettings(int $hostId) {
|
||||
$value = json_encode($value);
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');
|
||||
|
||||
$query->execute([$hostId]);
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {
|
||||
|
||||
$query = $this->_db->query('INSERT INTO `hostSetting` SET `hostId` = ?
|
||||
`key` = ?,
|
||||
`value` = ?,
|
||||
`timeAdded = ?
|
||||
|
||||
ON DUPLICATE KEY UPDATE `value` = ?,
|
||||
`timeUpdated` = ?');
|
||||
|
||||
$query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]);
|
||||
$query->execute(
|
||||
[
|
||||
$value,
|
||||
$timeUpdated,
|
||||
$hostSettingId
|
||||
]
|
||||
);
|
||||
|
||||
return $query->rowCount();
|
||||
}
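Worth noting from the hunk above: setting values are JSON-encoded on write (addHostSetting, updateHostSetting) and decoded on read (findHostSettingValue), so booleans, strings, numbers, and arrays all survive the round trip through the single `value` column. A minimal sketch of that round trip, with an illustrative key:

<?php
// Round trip through the `hostSetting` storage added in this commit.
$db->addHostSetting($hostId, 'PAGES_DOM_SELECTORS', 'h1;h2;h3', time());  // stored as the JSON string "h1;h2;h3"

$selectors = $db->findHostSettingValue($hostId, 'PAGES_DOM_SELECTORS');   // json_decode() returns 'h1;h2;h3'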
|
||||
@ -212,20 +199,16 @@ class MySQL {

public function getTopHostPages(int $limit = 100) {

// Get ID (to prevent memory over usage)
$query = $this->_db->query("SELECT `hostPage`.`hostPageId`
$query = $this->_db->query("SELECT `hostPageId` FROM `hostPage`

FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
WHERE `httpCode` = 200
AND `rank` > 0
AND `timeBanned` IS NULL
AND `mime` IS NOT NULL

WHERE `host`.`status` = '1'
AND `hostPage`.`httpCode` = 200
AND `hostPage`.`rank` > 0
AND `hostPage`.`timeBanned` IS NULL
AND `hostPage`.`mime` IS NOT NULL
ORDER BY `rank` DESC

ORDER BY `rank` DESC

LIMIT " . (int) $limit);
LIMIT " . (int) $limit);

// Get required page details
foreach ($query->fetchAll() as $top) {
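For orientation, a caller-side sketch of the method touched above; the fields available on each returned row come from a per-page details query outside this hunk, so treat that part as an assumption:

<?php

// Usage sketch only: the limit is cast to int inside the method
foreach ($db->getTopHostPages(20) as $hostPage) {

  // the exact fields hydrated per row depend on the details query not shown in this hunk
}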
@ -387,12 +370,11 @@ class MySQL {

return $query->rowCount();
}

public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
public function setHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {

$query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)');

$query->execute([$hostPageIdSource, $hostPageIdTarget]);

}

public function deleteHostPageToHostPage(int $hostPageId) {

@ -422,6 +404,15 @@ class MySQL {

return $query->fetchAll();
}

public function getHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {

$query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? AND `hostPageIdTarget` = ? LIMIT 1');

$query->execute([$hostPageIdSource, $hostPageIdTarget]);

return $query->fetch();
}
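A brief usage sketch of the renamed page-to-page link methods, assuming two existing `hostPageId` values (the literal IDs below are placeholders):

<?php

// Usage sketch only: record that page 1 links to page 2, then read the relation back
$db->setHostPageToHostPage(1, 2);             // INSERT IGNORE keeps repeated calls harmless
$relation = $db->getHostPageToHostPage(1, 2); // row object, or false when no relation exists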
public function addHostPageSnap(int $hostPageId, int $timeAdded) {

$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)');
@ -560,62 +551,46 @@ class MySQL {

$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);

$query->execute();
$query->execute([$timeOffset]);

return $query->rowCount();
}

public function resetBannedHosts(int $timeOffset) {

$query = $this->_db->prepare('UPDATE `host` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);

$query->execute([$timeOffset]);

return $query->rowCount();
}

// Crawler tools
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
public function getHostPageCrawlQueueTotal(int $timeFrom) {

$query = $this->_db->prepare("SELECT COUNT(*) AS `total`
$query = $this->_db->prepare("SELECT COUNT(*) AS `total` FROM `hostPage`

FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `hostPage`.`timeBanned` IS NULL");

WHERE (
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ? OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)

AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL");

$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
$query->execute([$timeFrom]);

return $query->fetch()->total;
}

public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {

$result = [];

// Get ID (to prevent memory over usage)
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`
$query = $this->_db->prepare("SELECT `hostPageId` FROM `hostPage`

FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL

WHERE (
`hostPage`.`timeUpdated` IS NULL OR
`hostPage`.`timeUpdated` < ?
OR (
`hostPage`.`uri` = '/' AND
`hostPage`.`timeUpdated` < ?
)
)
ORDER BY LENGTH(`uri`) ASC, RAND()

AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL
LIMIT " . (int) $limit);

ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()

LIMIT " . (int) $limit);

$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
$query->execute([$timeFrom]);

// Get required page details
foreach ($query->fetchAll() as $queue) {
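Both crawl-queue helpers now take a single `$timeFrom` threshold instead of separate page and homepage offsets. A usage sketch, assuming `$db` is a connected `MySQL` instance and reusing the `CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET` constant that appears later in this diff (the limit value is illustrative):

<?php

// Sketch of how a crawler process might consume the simplified queue API
$timeFrom = time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET; // single re-crawl threshold

echo sprintf('%d pages in queue', $db->getHostPageCrawlQueueTotal($timeFrom)) . PHP_EOL;

foreach ($db->getHostPageCrawlQueue(10, $timeFrom) as $hostPage) {

  // each queued row is hydrated with its host details inside the method above
}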
@ -627,10 +602,6 @@ class MySQL {

`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`,

IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),

@ -676,13 +647,13 @@ class MySQL {

FROM `host`

WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL

ORDER BY RAND()

LIMIT " . (int) $limit);

$query->execute([$timeFrom, 0]);
$query->execute([$timeFrom]);

// Get required page details
foreach ($query->fetchAll() as $host) {
@ -1,73 +0,0 @@

<?php

class Parser {

static public function hostURL(string $string) {

$result = [
'string' => null,
'scheme' => null,
'name' => null,
'port' => null,
];

if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) {

$result['string'] = $hostScheme . '://';

$result['scheme'] = $hostScheme;

} else {

return false;
}

if ($hostName = parse_url($string, PHP_URL_HOST)) {

$result['string'] .= $hostName;

$result['name'] = $hostName;

} else {

return false;
}

if ($hostPort = parse_url($string, PHP_URL_PORT)) {

$result['string'] .= ':' . $hostPort;

$result['port'] = $hostPort;

}

return (object) $result;
}

static public function uri(string $string) {

$result = [
'string' => '/',
'path' => '/',
'query' => null,
];

if ($path = parse_url($string, PHP_URL_PATH)) {

$result['string'] = $path;

$result['path'] = $path;

}

if ($query = parse_url($string, PHP_URL_QUERY)) {

$result['string'] .= '?' . $query;

$result['query'] = '?' . $query;

}

return (object) $result;
}
}
82
library/url.php
Normal file

@ -0,0 +1,82 @@

<?php

class URL {

public static function is(string $url) : bool {

return filter_var($url, FILTER_VALIDATE_URL);
}

public static function parse(string $url) : mixed {

$result = (object)
[
'host' => (object)
[
'url' => null,
'scheme' => null,
'name' => null,
'port' => null,
],
'page' => (object)
[
'url' => null,
'uri' => null,
'path' => null,
'query' => null,
]
];

// Validate URL
if (!self::is($url)) {

return false;
}

// Parse host
if ($scheme = parse_url($url, PHP_URL_SCHEME)) {

$result->host->url = $scheme . '://';
$result->host->scheme = $scheme;

} else {

return false;
}

if ($host = parse_url($url, PHP_URL_HOST)) {

$result->host->url .= $host;
$result->host->name = $host;

} else {

return false;
}

if ($port = parse_url($url, PHP_URL_PORT)) {

$result->host->url .= ':' . $port;
$result->host->port = $port;

// port is optional
}

// Parse page
if ($path = parse_url($url, PHP_URL_PATH)) {

$result->page->uri = $path;
$result->page->path = $path;
}

if ($query = parse_url($url, PHP_URL_QUERY)) {

$result->page->uri .= '?' . $query;
$result->page->query = '?' . $query;
}

$result->page->url = $result->host->url . $result->page->uri;

return $result;
}
}
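The new `URL` helper above replaces the deleted `Parser` class. A short usage sketch (the example address and require path are illustrative):

<?php

require_once(__DIR__ . '/../library/url.php'); // adjust the path to your layout

// URL::parse() returns false for anything FILTER_VALIDATE_URL rejects
if ($url = URL::parse('http://example.com:8080/path/page?query=1')) {

  echo $url->host->url . PHP_EOL; // http://example.com:8080
  echo $url->page->uri . PHP_EOL; // /path/page?query=1
  echo $url->page->url . PHP_EOL; // http://example.com:8080/path/page?query=1
}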
Binary file not shown (image changed: 144 KiB before, 138 KiB after).
@ -1,14 +1,11 @@

<?php

// Current version
define('API_VERSION', 0.12);
define('API_VERSION', 0.13);

// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');

@ -107,17 +104,9 @@ if (API_ENABLED) {

'status' => true,
'result' => [
'config' => [
'websiteDomain' => WEBSITE_DOMAIN,
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
'WEBSITE_DOMAIN' => WEBSITE_DOMAIN,
'DEFAULT_HOST_URL_REGEXP' => DEFAULT_HOST_URL_REGEXP,
// @TODO
],
'api' => [
'version' => (string) API_VERSION,
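After this change the `config` block of the manifest response advertises only the two remaining global options; the previous `crawl*` keys are dropped behind a `// @TODO` placeholder. Roughly, the response shape looks like the sketch below (key names as in the hunk above, concrete values are illustrative only):

<?php

// Illustrative response shape only; actual values depend on the node configuration
$response = [
  'status' => true,
  'result' => [
    'config' => [
      'WEBSITE_DOMAIN' => 'http://yggo.example',      // example value
      'DEFAULT_HOST_URL_REGEXP' => '/^http:\/\//i',   // example value
    ],
    'api' => [
      'version' => '0.13',
    ],
  ],
];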
@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');

// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
try {

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try {

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Filter request data
$hp = !empty($_GET['hp']) ? Filter::url($_GET['hp']) : 0;

@ -283,7 +301,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?>
<div style="text-align:center">
<span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>
@ -6,7 +6,16 @@ require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/sphinxql.php');

// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
try {

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

} catch(Exception $e) {

var_dump($e);

exit;
}

$totalPages = $sphinx->getHostPagesTotal();
@ -2,18 +2,48 @@

// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/sphinxql.php');

// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
try {

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try {

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Connect memcached
try {

$memcached = new Memcached();
$memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text';
@ -36,82 +66,34 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
// Define alert message
$alertMessages = [];

// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

$db->beginTransaction();
// Register new host/page on search request contains the link
if (URL::is($q)) {

try {

// Parse host info
if ($hostURL = Parser::hostURL($q)) {
$db->beginTransaction();

// Host exists
if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) {
if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) {

$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
if (count($linkToDBresult->new->hostPageId)) {

$alertMessages[] = _('Link successfully registered in the crawl queue!');

// Register new host
} else {

// Disk quota not reached
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {
if ($resultsTotal == 0) {

// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}

$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);

// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
$alertMessages[] = _('This link already registered in the crawl queue.');
}

}

// Parse page URI
$hostPageURI = Parser::uri($q);
} else {

// Init robots parser
$robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);

// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists

$db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
}
$alertMessages[] = _('Link address not supported on this host!');
}

$db->commit();

@ -124,6 +106,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
}
}

// Count pages in the crawl queue
if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) {

$alertMessages[] = sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal);
}

?>

<!DOCTYPE html>
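The search page now delegates host/page registration to `Helper::addLinkToDB($db, $memcached, $q)`; `library/helper.php` is required above but its body is not part of this diff. Judging only from the calling code, the return value exposes the newly created page IDs. A hypothetical sketch for orientation, assuming `$db` and `$memcached` are the instances connected above:

<?php

// Hypothetical result shape inferred from the calling code; the real
// Helper::addLinkToDB() implementation is not shown in this diff.
$linkToDBresult = Helper::addLinkToDB($db, $memcached, 'http://example.com/');

if ($linkToDBresult && count($linkToDBresult->new->hostPageId)) {

  // at least one new hostPage row was queued for crawling
}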
@ -313,8 +301,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($results) { ?>
<div>
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php foreach ($alertMessages as $alertMessage) { ?>
<span><?php echo $alertMessage ?></span>
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>

@ -352,7 +340,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php } else { ?>
<div style="text-align:center">
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($q && $queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>
@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/sphinxql.php');

// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
try {

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try {

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

} catch(Exception $e) {

var_dump($e);

exit;
}

// Define page basics
$totalPages = $sphinx->getHostPagesTotal();

@ -271,7 +289,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php } else { ?>
<div style="text-align:center">
<span><?php echo _('Not found') ?></span>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?>
<?php if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
<?php } ?>
</div>