diff --git a/README.md b/README.md index 453eb2d..fb72b82 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ GET action=hosts - required ##### Application manifest -Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `CRAWL_URL_REGEXP` conditions. +Returns node information for other nodes that have same `CRAWL_MANIFEST_API_VERSION` and `DEFAULT_HOST_URL_REGEXP` conditions. Could be enabled or disabled by `API_MANIFEST_ENABLED` option diff --git a/cli/yggo.php b/cli/yggo.php index bb3b38c..1003ec3 100644 --- a/cli/yggo.php +++ b/cli/yggo.php @@ -390,7 +390,7 @@ if (!empty($argv[1])) { $selectors = []; - foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { + foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) { if (!empty($selector)) { @@ -428,12 +428,11 @@ if (!empty($argv[1])) { $db->addHostPageDom($hostPage->hostPageId, time(), $selector, - trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( - preg_replace('/[\s]+/', - ' ', - str_replace(['
<br />', '<br/>', '<br>', '</'], - [' ', ' ', ' ', ' </'], - $element->innertext))) : $element->innertext)); + trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/', + ' ', + str_replace(['<br />', '<br/>', '<br>', '</'], + [' ', ' ', ' ', ' </'], + $element->
', 'innertext))) : $element->innertext)); } } } @@ -447,7 +446,7 @@ if (!empty($argv[1])) { exit; } - CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file')); + CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file')); CLI::break(); exit; diff --git a/config/app.php.example b/config/app.php.example index aad771b..37c1ad3 100644 --- a/config/app.php.example +++ b/config/app.php.example @@ -64,7 +64,7 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100); define('WEBSITE_IDENTICON_IMAGE_CACHE', true); // Database -define('DB_HOST', '127.0.0.1'); +define('DB_HOST', 'localhost'); define('DB_PORT', 3306); define('DB_NAME', ''); define('DB_USERNAME', ''); @@ -75,7 +75,7 @@ define('SPHINX_HOST', '127.0.0.1'); define('SPHINX_PORT', 9306); // Memcached -define('MEMCACHED_HOST', '127.0.0.1'); +define('MEMCACHED_HOST', 'localhost'); define('MEMCACHED_PORT', 11211); // Snaps @@ -92,19 +92,19 @@ define('MEMCACHED_PORT', 11211); */ define('SNAP_STORAGE', json_encode((object) [ - 'localhost' => [ // @TODO see https://github.com/YGGverse/YGGo#roadmap + 'localhost' => [ 'storage-1' => [ 'directory' => __DIR__ . '/../storage/snap/hps/', 'quota' => [ 'mime' => false, - 'size' => 10000000024, // @TODO - 'request' => [ // @TODO + 'size' => 10000000024, + 'request' => [ 'download' => [ 'size' => 10000024, 'seconds' => 60*60 ] ] - ] + ], ], // ... ], @@ -118,9 +118,9 @@ define('SNAP_STORAGE', json_encode((object) 'timeout' => 30, 'passive' => true, 'quota' => [ - 'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico', - 'size' => 10000000024, // @TODO - 'request' => [ // @TODO + 'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico', + 'size' => 10000000024, + 'request' => [ 'download' => [ 'size' => 10000024, 'seconds' => 60*60 @@ -133,6 +133,7 @@ define('SNAP_STORAGE', json_encode((object) ] )); + // Proxy settings /* @@ -143,141 +144,124 @@ define('SNAP_STORAGE', json_encode((object) */ define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )'); -// Crawl settings +// Host defaults /* - * Crawler / Bot User Agent name - * - * Shared to other hosts through CURL requests by crawler + * Only URL addresses match this rule will be crawled * */ -define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )'); +define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only /* - * Skip curl download on response data size reached + * Default robots.txt rules (will be overwriten on remote rules available) * - * See also: CURLOPT_TIMEOUT (library/curl.php) + * string|null * */ -define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760); +define('DEFAULT_HOST_ROBOTS_TXT', null); /* - * Stop crawler on disk quota reached (Mb) + * These rules forcely appending to the remote robots.txt file + * + * string|null * */ -define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500); +define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null); /* - * Pages (URI) processing limit in the crawler.php queue - * - * This option related to CRAWL_PAGE_SECONDS_OFFSET value - * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) - * - * Usually up to 20 pages per minute, - * to prevent websites overload by sending GET crawling requests + * Pages limit per new host by default * - * Set 0 to disable + * Crawler stops indexing on this limit 
reach to prevent disk overuse * */ -define('CRAWL_PAGE_LIMIT', 20); +define('DEFAULT_HOST_PAGES_LIMIT', 100000); /* - * Renew page index by timing offset provided - * - * This option works with CRAWL_PAGE_LIMIT step queue - * - * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair - * must have enough value to crawl all pages collected in the DB index + * Index pages match MIME types * - * or the crawler can stuck in queue + * comma separated * */ -define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); +define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf'); /* - * Renew home page index by timing offset provided - * - * Used for new pages scanning in highter priority - * - * This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue - * - * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair - * must have enough value to crawl all pages collected in the DB index + * Index only meta tags + * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field * - * or the crawler can stuck in queue + * Warning! + * this option requires huge disk storage, + * it's experimental feature, oriented for index operations * */ -define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30); +define('DEFAULT_HOST_PAGES_DATA', false); /* - * Index pages match MIME types + * Generates hostPageDom index based on hostPage.data field * - * comma separated + * Could be useful for building semantical index query (config/sphinx.conf.txt) + * + * At this moment feature available in the CLI only (cli/yggo.php) * */ -define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac'); +define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated /* - * Only URL addresses match this rule will be auto-crawled + * Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content * */ -define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); +define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false); + + +// Crawl queue /* - * Pages limit per new host by default - * - * Crawler stops indexing on this limit reach to prevent disk overuse + * Crawler / Bot User Agent name * - * Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field + * Shared to other hosts through CURL requests by crawler * */ -define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000); +define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )'); /* - * Set default auto-crawl status for new host added - * - * true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT - * false - requires manual validation by the moderator in the DB `host`.`status` field + * Skip curl download on response data size reached * - * This option also disable host in the search results + * See also: CURLOPT_TIMEOUT (library/curl.php) * */ -define('CRAWL_HOST_DEFAULT_STATUS', true); +define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760); /* - * Index only meta tags - * or false to save meta tags + base64 encoded page content 
in the `hostPage`.`data` field - * - * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field - * - * Warning! - * this option disabled requires huge disk storage, - * it's experimental feature, oriented for index operations + * Stop crawler on disk quota reached (Mb) * */ -define('CRAWL_HOST_DEFAULT_META_ONLY', true); +define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128); /* - * Not suitable/safe for work status for new host by default + * Pages (URI) processing limit in the crawler.php queue + * + * This option related to CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value + * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) * - * Could be filtered in search results + * Usually up to 20 pages per minute, + * to prevent websites overload by sending GET crawling requests * - * Custom rule for specified host could be provided in the DB `host`.`nsfw` field + * Set 0 to disable * */ -define('CRAWL_HOST_DEFAULT_NSFW', false); +define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10); /* - * Collect sitemap index when available + * Renew page index by timing offset provided * - * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only + * This option works with CRAWL_HOST_PAGE_QUEUE_LIMIT step queue * - * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml + * Pay attention, that CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair + * must have enough value to crawl all pages collected in the DB index * - * true|false + * or the crawler can stuck in queue * */ -define('CRAWL_SITEMAPS', true); +define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12); /* * Re-calculate page rank on page update @@ -287,7 +271,7 @@ define('CRAWL_SITEMAPS', true); * true|false * */ -define('CRAWL_PAGE_RANK_UPDATE', true); +define('CRAWL_HOST_PAGE_RANK_UPDATE', false); /* * Renew hosts index by timing offset provided @@ -304,53 +288,28 @@ define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7); define('CRAWL_HOST_LIMIT', 1); /* - * Crawl robots.txt - */ -define('CRAWL_ROBOTS', true); // true|false - -/* - * Default robots.txt rules on remote file not exists - * The crawler able to overwrite these rules - * - * Presets - * yggdrasil: /database/yggdrasil/host.robots.md - * - */ -define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null - -/* - * Permanent rules that append to the robots.txt if exists else CRAWL_ROBOTS_DEFAULT_RULES - * The crawler does not overwrite these rules - * - * Presets - * yggdrasil: /database/yggdrasil/host.robotsPostfix.md + * Collect sitemap index when available * - */ -define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null - -/* - * Generates hostPageDom index based on hostPage.data field + * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only * - * Could be useful for building semantical index query (config/sphinx.conf.txt) + * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml * - * At this moment feature available in the CLI only (cli/yggo.php) + * true|false * */ -define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6'); +define('CRAWL_SITEMAPS', true); /* - * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content - * + * Crawl robots.txt */ -define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true); +define('CRAWL_ROBOTS', true); // true|false /* * Look for third-party manifests to collect distributed index * * API address provided in yggo meta tag - * will be stored in the `manifest` DB table * - * Collecting 
URL that match CRAWL_URL_REGEXP condition + * Collecting URL that match DEFAULT_HOST_URL_REGEXP condition * */ define('CRAWL_MANIFEST', true); @@ -359,10 +318,17 @@ define('CRAWL_MANIFEST', true); * Manifest API version compatibility * */ -define('CRAWL_MANIFEST_API_VERSION', 0.12); +define('CRAWL_MANIFEST_API_VERSION', 0.13); -// Cleaner settings +/* + * Remove host ban after following time + * + * This option used in crawler and search page + * to prevent extra http requests to unavailable or not condition resources + * + */ +define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30); /* * Remove page ban after following time * @@ -370,7 +336,7 @@ define('CRAWL_MANIFEST_API_VERSION', 0.12); * to prevent extra http requests to unavailable or not condition resources * */ -define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30); +define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30); /* * Database tables optimization @@ -382,7 +348,7 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30); * When enabled - requires enough of RAM * */ -define('CLEAN_DB_TABLES_OPTIMIZATION', false); +define('CLEAN_DB_TABLES_OPTIMIZATION', true); // API settings @@ -420,17 +386,12 @@ define('API_HOSTS_ENABLED', true); * Database host fields comma separated or * to share all the fields * */ -define('API_HOSTS_FIELDS', - '`host`.`scheme`, - `host`.`name`, - `host`.`port`, - `host`.`crawlPageLimit`, - `host`.`robots`, - `host`.`robotsPostfix`, - `host`.`nsfw`, - `host`.`timeAdded`, - `host`.`timeUpdated`, - (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); +define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL, + CONCAT(`scheme`, '://', `name`, ':', `port`), + CONCAT(`scheme`, '://', `name`) + ) AS `url`, + `timeAdded`, + `timeUpdated`"); /* * Manifest API diff --git a/crontab/cleaner.php b/crontab/cleaner.php index a827285..5e12b81 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -28,8 +28,11 @@ require_once(__DIR__ . '/../library/mysql.php'); // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); +// Reset banned hosts +$hostsBansRemoved = $db->resetBannedHostPages(time() - CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET); + // Reset banned pages -$hostPagesBansRemoved = $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); +$hostPagesBansRemoved = $db->resetBannedHosts(time() - CLEAN_HOST_BAN_SECONDS_OFFSET); // Optimize tables if (CLEAN_DB_TABLES_OPTIMIZATION) { @@ -45,6 +48,7 @@ if (CLEAN_DB_TABLES_OPTIMIZATION) { } // Debug +echo 'Host bans removed: ' . $hostsBansRemoved . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL; \ No newline at end of file diff --git a/crontab/crawler.php b/crontab/crawler.php index 34c32a0..a0570e1 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -24,9 +24,10 @@ require_once(__DIR__ . '/../library/ftp.php'); require_once(__DIR__ . '/../library/curl.php'); require_once(__DIR__ . '/../library/robots.php'); require_once(__DIR__ . '/../library/sitemap.php'); +require_once(__DIR__ . '/../library/url.php'); require_once(__DIR__ . '/../library/filter.php'); -require_once(__DIR__ . '/../library/parser.php'); require_once(__DIR__ . '/../library/mysql.php'); +require_once(__DIR__ . '/../library/helper.php'); require_once(__DIR__ . 
'/../library/vendor/simple_html_dom.php'); // Check disk quota @@ -62,27 +63,38 @@ try { } catch(Exception $e) { - // Debug std var_dump($e); exit; } -// Process hosts crawl queue -foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) { +// Connect memcached +try { - $db->beginTransaction(); + $memcached = new Memcached(); + $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} + +// Process hosts crawl queue +foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) { try { + $db->beginTransaction(); + // Update host crawl queue - $hostsProcessed += $db->updateHostCrawlQueue($host->hostId); + $hostsProcessed += $db->updateHostCrawlQueue($queueHost->hostId, time()); - // Crawl robots.txt + // Update host robots.txt settings from remote host if (CRAWL_ROBOTS) { - // Update robots - $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + $curl = new Curl($queueHost->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; @@ -90,61 +102,63 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF $httpDownloadSizeTotal += $curl->getSizeDownload(); $httpRequestsTimeTotal += $curl->getTotalTime(); - // Sitemap provided in robots.txt - if (200 == $curl->getCode()) { - - $hostRobots = $curl->getContent(); + // Update robots.txt rules + if (200 == $curl->getCode() && false !== stripos(trim(mb_strtolower((string) $curl->getContentType())), 'text/plain')) { - } else { - - $hostRobots = $host->robots; + Helper::setHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', (string) $curl->getContent()); } - - // Update host index - $db->updateHostRobots($host->hostId, $hostRobots, time()); } // Process sitemaps when enabled if (CRAWL_SITEMAPS) { // Look for custom sitemap URL served in robots.txt - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); + $robots = new Robots( + Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . 
+ Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) + ); - if ($hostSitemapPath = $robots->getSitemap()) { + if ($sitemapLink = $robots->getSitemap()) { - // Replace relative paths - $hostSitemapPath = trim($hostSitemapPath, '/'); - $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath); - $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath); + // Replace relative paths + $sitemapURL = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($hostCrawlQueue->url, '', $sitemapLink), '/')); - // Set default path when not exists + // Set default path } else { - $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url); + $sitemapURL = sprintf('%s/sitemap.xml', $queueHost->url); } - // Init sitemap data - $sitemap = new Sitemap($hostSitemapPath); + // Init sitemap + $sitemap = new Sitemap($sitemapURL); if ($sitemapLinks = $sitemap->getLinks()) { $sitemapsProcessed++; // Process collected sitemap links - foreach ($sitemapLinks as $link => $attributes) { + foreach ($sitemapLinks as $loc => $attributes) { - // Parse formatted link - $linkURI = Parser::uri($link); - $linkHostURL = Parser::hostURL($link); + // Replace relative paths + $loc = sprintf('%s/%s', $queueHost->url, trim(str_ireplace($queueHost->url, '', $loc), '/')); - // Add host page - if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format - $linkHostURL->string == $host->url && // this host links only - $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules - $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit - !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists + // Validate link + if (!$link = URL::parse($loc)) { - $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time()); + continue; + } + + // Collect this host links only + if ($link->host->url != $queueHost->url) { + + continue; + } + + // Register new link + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $loc)) { + + $hostsAdded += count($linkToDBresult->new->hostId); + $hostPagesAdded += count($linkToDBresult->new->hostPageId); } } } @@ -152,8 +166,11 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Update manifests if (CRAWL_MANIFEST) { - if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) { + // Host have manifest provided + if ($manifestURL = Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'MANIFEST_URL', NULL)) { + + // Get remote manifest $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT); // Update curl stats @@ -165,42 +182,32 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Skip processing non 200 code if (200 != $curl->getCode()) { - $db->commit(); - continue; } // Skip processing without returned data if (!$remoteManifest = $curl->getContent()) { - $db->commit(); - continue; } // Skip processing on json encoding error if (!$remoteManifest = @json_decode($remoteManifest)) { - $db->commit(); - continue; } // Skip processing on required fields missed if (empty($remoteManifest->status) || - empty($remoteManifest->result->config->crawlUrlRegexp) || + empty($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP) || empty($remoteManifest->result->api->version) || empty($remoteManifest->result->api->hosts)) { - $db->commit(); - continue; } // Skip processing 
on API version not compatible - if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { - - $db->commit(); + if ($remoteManifest->result->api->version !== API_VERSION) { continue; } @@ -208,28 +215,24 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Skip processing on host API not available if (!$remoteManifest->result->api->hosts) { - $db->commit(); - continue; } - // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition - if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { - - $db->commit(); + // Skip processing on remote host URL does not match local condition + if ($remoteManifest->result->config->DEFAULT_HOST_URL_REGEXP != + Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP)) { continue; } - // Skip processing on host link does not match condition - if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) { - - $db->commit(); + // Skip processing on remote host link does not match local condition + if (false === preg_match(Helper::getHostSetting($db, $memcached, $queueHost->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), + $remoteManifest->result->api->hosts)) { continue; } - // Begin hosts collection + // Grab host URLs $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT); // Update curl stats @@ -241,32 +244,23 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF // Skip processing non 200 code if (200 != $curl->getCode()) { - $db->commit(); - continue; } // Skip processing without returned data - if (!$remoteManifestHosts = $curl->getContent()) { - - $db->commit(); + if (!$remoteManifest = $curl->getContent()) { continue; } // Skip processing on json encoding error - if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) { - - $db->commit(); + if (!$remoteManifestHosts = @json_decode($remoteManifest)) { continue; } // Skip processing on required fields missed - if (empty($remoteManifestHosts->status) || - empty($remoteManifestHosts->result)) { - - $db->commit(); + if (empty($remoteManifestHosts->result)) { continue; } @@ -275,64 +269,16 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF foreach ($remoteManifestHosts->result as $remoteManifestHost) { // Skip processing on required fields missed - if (empty($remoteManifestHost->scheme) || - empty($remoteManifestHost->name)) { + if (empty($remoteManifestHost->url)) { continue; } - $hostURL = $remoteManifestHost->scheme . '://' . - $remoteManifestHost->name . - (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); - - // Validate formatted link - if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) { - - // Host not exists - if (!$db->getHostByCRC32URL(crc32($hostURL))) { - - // Get robots.txt if exists - $curl = new Curl($hostURL . 
'/robots.txt', CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; - } + // Register new link + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $remoteManifestHost->url)) { - $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - - $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; - $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - - $hostId = $db->addHost( $remoteManifestHosts->result->scheme, - $remoteManifestHosts->result->name, - $remoteManifestHosts->result->port, - crc32($hostURL), - time(), - null, - $hostPageLimit, - (string) $hostMetaOnly, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - // Add web root host page to make host visible in the crawl queue - $db->addHostPage($hostId, crc32('/'), '/', time()); - - // Increase counters - $hostPagesAdded++; - $hostsAdded++; - } + $hostsAdded += count($linkToDBresult->new->hostId); + $hostPagesAdded += count($linkToDBresult->new->hostPageId); } } } @@ -354,7 +300,7 @@ foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OF } // Process pages crawl queue -foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) { +foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET) as $queueHostPage) { $db->beginTransaction(); @@ -370,9 +316,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $httpRequestsTimeTotal += $curl->getTotalTime(); // Update page rank - if (CRAWL_PAGE_RANK_UPDATE) { - - // @TODO add common method + if (CRAWL_HOST_PAGE_RANK_UPDATE) { $hostPageRank = 0; @@ -432,113 +376,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $url = trim($match[1]); //Make relative links absolute - if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use + if (!parse_url($url, PHP_URL_HOST)) { $url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.'); } - // Validate formatted link - if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) { - - // Parse formatted link - $hostURL = Parser::hostURL($url); - $hostPageURI = Parser::uri($url); - - // Host exists - if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) { - - $hostStatus = $host->status; - $hostNsfw = $host->nsfw; - $hostPageLimit = $host->crawlPageLimit; - $hostMetaOnly = $host->crawlMetaOnly; - $hostId = $host->hostId; - $hostRobots = $host->robots; - $hostRobotsPostfix = $host->robotsPostfix; + // Register new link + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $url)) { - // Register new host - } else { + $hostsAdded += count($linkToDBresult->new->hostId); + $hostPagesAdded += count($linkToDBresult->new->hostPageId); - // Get robots.txt if exists - $curl = new Curl($hostURL->string . 
'/robots.txt', CRAWL_CURLOPT_USERAGENT); + // Register referrer + if ($linkToDBresult->old->hostPageId) { - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); + foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) { - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; - } - - $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; - $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - - $hostId = $db->addHost( $hostURL->scheme, - $hostURL->name, - $hostURL->port, - crc32($hostURL->string), - time(), - null, - $hostPageLimit, - (string) $hostMetaOnly, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - // Add web root host page to make host visible in the crawl queue - $db->addHostPage($hostId, crc32('/'), '/', time()); - - // Increase counters - $hostPagesAdded++; - $hostsAdded++; - - // When page is root, skip next operations - if ($hostPageURI->string == '/') { - - $db->commit(); - - continue; + $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget); } } - // Init robots parser - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - // Save page info - if ($hostStatus && // host enabled - $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules - $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit - - if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { - - $hostPageId = $hostPage->hostPageId; + if ($linkToDBresult->new->hostPageId) { - } else { + foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) { - $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); - - // Apply referer meta description to the target page before indexing it - if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) { - - $db->addHostPageDescription($hostPageId, - $lastHostPageDescription->title, - $lastHostPageDescription->description, - $lastHostPageDescription->keywords, - $hostMetaOnly ? null : ($lastHostPageDescription->data ? 
base64_encode($lastHostPageDescription->data) : null), - time()); - } - - $hostPagesAdded++; - } - - $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId); + $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget); + } } } } @@ -567,7 +430,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Check for MIME $hostPageInMime = false; - foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { + + foreach ((array) explode(',', Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_MIME', DEFAULT_HOST_PAGES_MIME)) as $mime) { // Ban page on MIME type not allowed in settings if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { @@ -622,8 +486,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Insert compressed snap data into the tmp storage if (true === $zip->addFromString('DATA', $content) && true === $zip->addFromString('META', sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . - sprintf('SOURCE: %s', Filter::url($queueHostPage->hostPageURL)) . PHP_EOL . - sprintf('TIMESTAMP: %s', time()))) { + sprintf('SOURCE: %s', Filter::url($queueHostPage->hostPageURL)) . PHP_EOL . + sprintf('TIMESTAMP: %s', time()))) { } } @@ -802,18 +666,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Add queued page description if not exists $db->addHostPageDescription($queueHostPage->hostPageId, $metaTitle, - $metaDescription ? Filter::pageDescription($metaDescription) : null, - $metaKeywords ? Filter::pageKeywords($metaKeywords) : null, - $content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, + $metaDescription ? Filter::pageDescription($metaDescription) : null, + $metaKeywords ? Filter::pageKeywords($metaKeywords) : null, + $content ? (Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DATA', DEFAULT_HOST_PAGES_DATA) ? base64_encode($content) : null) : null, time()); // Collect page DOM elements data on enabled - if (CRAWL_HOST_PAGE_DOM_SELECTORS) { + if ($hostPageDomSelectors = Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) { // Begin selectors extraction $html = str_get_html($content); - foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { + foreach ((array) explode(';', $hostPageDomSelectors) as $selector) { foreach($html->find($selector) as $element) { @@ -822,12 +686,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $db->addHostPageDom($queueHostPage->hostPageId, time(), $selector, - trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( - preg_replace('/[\s]+/', - ' ', - str_replace(['
<br />', '<br/>', '<br>', '</'], - [' ', ' ', ' ', ' </'], - $element->innertext))) : $element->innertext)); + trim(Helper::getHostSetting($db, $memcached, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags( preg_replace('/[\s]+/', + ' ', + str_replace(['<br />', '<br/>', '<br>', '</'], + [' ', ' ', ' ', ' </'], + $element->
', 'innertext))) : $element->innertext)); } } } @@ -851,7 +714,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND if (CRAWL_MANIFEST && !empty($metaYggoManifestURL) && filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) && - preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) { + preg_match(DEFAULT_HOST_URL_REGEXP, $metaYggoManifestURL)) { $manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL); } @@ -891,7 +754,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')), 'data' => null, 'mime' => null, - 'ref' => $src, + 'href' => $src, ]; } @@ -923,7 +786,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => null, 'data' => null, 'mime' => Filter::mime($type), - 'ref' => $src, + 'href' => $src, ]; } @@ -953,7 +816,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => null, 'data' => null, 'mime' => Filter::mime($type), - 'ref' => $src, + 'href' => $src, ]; } @@ -983,7 +846,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => null, 'data' => null, 'mime' => Filter::mime($type), - 'ref' => $src, + 'href' => $src, ]; } @@ -1002,7 +865,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => null, 'data' => null, 'mime' => null, - 'ref' => $src, + 'href' => $src, ]; } @@ -1021,7 +884,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => null, 'data' => null, 'mime' => null, - 'ref' => $href, + 'href' => $href, ]; } @@ -1084,115 +947,48 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND 'keywords' => Filter::pageKeywords($title), 'data' => null, 'mime' => null, - 'ref' => $href, + 'href' => $href, ]; } // Process links collected foreach ($links as $link) { - //Make relative links absolute - if (!parse_url($link['ref'], PHP_URL_HOST)) { + // Make relative links absolute + if (!parse_url($link['href'], PHP_URL_HOST)) { - $link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); + $link['href'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['href']), '/'), '.'); } - // Validate formatted link - if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) { + // Register new link + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $link['href'])) { - // Parse formatted link - $hostURL = Parser::hostURL($link['ref']); - $hostPageURI = Parser::uri($link['ref']); + // Increase new hosts counters + if ($linkToDBresult->new->hostId) { - // Host exists - if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) { + $hostsAdded += count($linkToDBresult->new->hostId); + } - $hostStatus = $host->status; - $hostNsfw = $host->nsfw; - $hostPageLimit = $host->crawlPageLimit; - $hostMetaOnly = $host->crawlMetaOnly; - $hostId = $host->hostId; - $hostRobots = $host->robots; - $hostRobotsPostfix = $host->robotsPostfix; + if ($linkToDBresult->new->hostPageId) { - // Register new host - } else { + $hostPagesAdded += count($linkToDBresult->new->hostPageId); + } - // Get robots.txt if exists - $curl = new Curl($hostURL->string . 
'/robots.txt', CRAWL_CURLOPT_USERAGENT); + // Register referrer + if ($linkToDBresult->old->hostPageId) { - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); + foreach ($linkToDBresult->old->hostPageId as $hostPageIdTarget) { - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; - } - - $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; - $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - - $hostId = $db->addHost( $hostURL->scheme, - $hostURL->name, - $hostURL->port, - crc32($hostURL->string), - time(), - null, - $hostPageLimit, - (string) $hostMetaOnly, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - // Add web root host page to make host visible in the crawl queue - $db->addHostPage($hostId, crc32('/'), '/', time()); - - // Increase counters - $hostPagesAdded++; - $hostsAdded++; - - // When page is root, skip next operations - if ($hostPageURI->string == '/') { - - continue; + $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget); } } - // Init robots parser - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - // Save page info - if ($hostStatus && // host enabled - $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules - $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit - - if ($hostPage = $db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { - - $hostPageId = $hostPage->hostPageId; - - } else { + if ($linkToDBresult->new->hostPageId) { - $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); + foreach ($linkToDBresult->new->hostPageId as $hostPageIdTarget) { - $db->addHostPageDescription($hostPageId, - $link['title'], - $link['description'], - $link['keywords'], - $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null), - time()); - - $hostPagesAdded++; - } - - $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId); + $db->setHostPageToHostPage($queueHostPage->hostPageId, $hostPageIdTarget); + } } } } @@ -1236,7 +1032,7 @@ $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000; // Debug output echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL; -echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL; +echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . 
PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index 2c6cab8..862590a 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/filter.php b/library/filter.php index 0f277b5..d066b80 100644 --- a/library/filter.php +++ b/library/filter.php @@ -2,11 +2,6 @@ class Filter { - static public function string(mixed $data) { - - return (string) $data; - } - static public function url(mixed $url) { $url = (string) $url; @@ -54,29 +49,6 @@ class Filter { return $keywords; } - static public function pageData(mixed $data) { - - $data = (string) $data; - - $filterDataPre = [ - '//s', - '//s' - ]; - - $filterDataPost = [ - '/[\s]{2,}/', - ]; - - $data = preg_replace($filterDataPre, ' ', $data); - - $data = html_entity_decode($data); - $data = strip_tags($data); - - $data = preg_replace($filterDataPost, ' ', $data); - - return $data; - } - static public function searchQuery(string $query, string $mode = 'default') { // Create query CRC32 diff --git a/library/helper.php b/library/helper.php new file mode 100644 index 0000000..e65f0b3 --- /dev/null +++ b/library/helper.php @@ -0,0 +1,168 @@ +get(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key))) { + + return $value; + } + + if (!$value = $db->findHostSettingValue($hostId, $key)) { + + $value = $defaultValue; + } + + $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600); + + return $value; + } + + public static function setHostSetting(MySQL $db, + Memcached $memcached, + int $hostId, + string $key, + mixed $value) : int { + + if ($hostSetting = $db->findHostSetting($hostId, $key)) { + + $rowsAffected = $db->updateHostSetting($hostSetting->hostSettingId, $value, time()); + + } else { + + $rowsAffected = $db->addHostSetting($hostId, $key, $value, time()); + } + + $memcached->set(sprintf('Helper.getHostSetting.%s.%s', $hostId, $key), $value, time() + 3600); + + return $rowsAffected; + } + + public static function addLinkToDB(MySQL $db, Memcached $memcached, string $link) : mixed { + + // Define variables + $result = (object) + [ + 'new' => (object) + [ + 'hostId' => [], + 'hostPageId' => [], + ], + 'old' => (object) + [ + 'hostId' => [], + 'hostPageId' => [], + ], + ]; + + // Validate DB connection + if (!$db) { + + return false; + } + + // Validate link URL + if (!$link = URL::parse($link)) { + + return false; + } + + // Init host + if ($host = $db->findHostByCRC32URL(crc32($link->host->url))) { + + // Make sure host URL compatible with this host rules before continue + if (!preg_match(self::getHostSetting($db, $memcached, $host->hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->host->url)) { + + return false; + } + + $hostId = $host->hostId; + + $result->old->hostId[] = $host->hostId; + + } else { + + // Make sure link compatible with default host rules before create new host + if (!preg_match(DEFAULT_HOST_URL_REGEXP, $link->host->url)) { + + return false; + } + + // Register new host + if ($hostId = $db->addHost($link->host->scheme, $link->host->name, $link->host->port, crc32($link->host->url), time())) { + + $result->new->hostId[] = $hostId; + + // Init required for app web root page + if ($link->page->uri != '/') { + + if ($hostPageId = $db->addHostPage($hostId, crc32('/'), '/', time())) { + + // Note: commented because of referrer link registration implemented out of this method + // $result->new->hostPageId[] = $hostPageId; + } + } + + } else { + + return false; + } + } + + // Add host page if not exists + if ($hostPage = $db->findHostPageByCRC32URI($hostId, 
crc32($link->page->uri))) { + + $result->old->hostPageId[] = $hostPage->hostPageId; + + } else { + + // Make sure host page URL compatible with this host rules before continue + if (!preg_match(self::getHostSetting($db, $memcached, $hostId, 'URL_REGEXP', DEFAULT_HOST_URL_REGEXP), $link->page->url)) { + + return false; + } + + // Validate page limits for this host + if ($db->getTotalHostPages($hostId) > self::getHostSetting($db, $memcached, $hostId, 'PAGES_LIMIT', DEFAULT_HOST_PAGES_LIMIT)) { + + return false; + } + + // Validate ROBOTS.TXT + $robots = new Robots( + self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT', NULL) . PHP_EOL . + self::getHostSetting($db, $memcached, $hostId, 'ROBOTS_TXT_POSTFIX', DEFAULT_HOST_ROBOTS_TXT_POSTFIX) + ); + + if (!$robots->uriAllowed($link->page->uri)) { + + return false; + } + + // Validate host page MIME + // Note: passed to the crawl queue to prevent extra-curl requests + + // Add host page + if ($hostPageId = $db->addHostPage($hostId, crc32($link->page->uri), $link->page->uri, time())) { + + $result->new->hostPageId[] = $hostPageId; + + } else { + + return false; + } + } + + return $result; + } + + // Cache host setting requests +} \ No newline at end of file diff --git a/library/mysql.php b/library/mysql.php index aad78d9..1b1fde6 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -60,7 +60,7 @@ class MySQL { return $query->fetch(); } - public function getHostByCRC32URL(int $crc32url) { + public function findHostByCRC32URL(int $crc32url) { $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1'); @@ -78,87 +78,74 @@ class MySQL { return $query->fetch()->total; } - public function addHost(string $scheme, - string $name, - mixed $port, - int $crc32url, - int $timeAdded, - mixed $timeUpdated, - int $crawlPageLimit, - string $crawlMetaOnly, - string $status, - string $nsfw, - mixed $robots, - mixed $robotsPostfix) { + public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded) { $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, - `timeAdded`, - `timeUpdated`, - `crawlPageLimit`, - `crawlMetaOnly`, - `status`, - `nsfw`, - `robots`, - `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); - - $query->execute([ $scheme, - $name, - $port, - $crc32url, - $timeAdded, - $timeUpdated, - $crawlPageLimit, - $crawlMetaOnly, - $status, - $nsfw, - $robots, - $robotsPostfix]); + `timeAdded`) VALUES (?, ?, ?, ?, ?)'); + + $query->execute([$scheme, $name, $port, $crc32url, $timeAdded]); return $this->_db->lastInsertId(); } - public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) { + // Host settings + public function findHostSettingValue(int $hostId, string $key) { - $query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'); + $query = $this->_db->prepare('SELECT `value` FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1'); - $query->execute([$robots, $timeUpdated, $hostId]); + $query->execute([$hostId, $key]); - return $query->rowCount(); + return $query->rowCount() ? json_decode($query->fetch()->value) : false; } - // Host settings - public function getHostSetting(int $hostId, mixed $key) { + public function findHostSetting(int $hostId, string $key) { - $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `key` = ? LIMIT 1'); + $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? 
LIMIT 1'); $query->execute([$hostId, $key]); - return $query->rowCount() ? $query->fetch()->value : false; + return $query->fetch(); } - public function getHostSettings(int $hostId) { + public function addHostSetting(int $hostId, string $key, mixed $value, int $timeAdded) { - $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?'); + $query = $this->_db->prepare('INSERT INTO `hostSetting` (`hostId`, `key`, `value`, `timeAdded`) VALUES (?, ?, ?, ?)'); - $query->execute([$hostId]); + $value = json_encode($value); - return $query->fetchAll(); + $query->execute( + [ + $hostId, + $key, + $value, + $timeAdded + ] + ); + + return $query->rowCount(); } - public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) { + public function updateHostSetting(int $hostSettingId, mixed $value, int $timeUpdated) { + + $query = $this->_db->query('UPDATE `hostSetting` SET `value` = ?, + `timeUpdated` = ? - $query = $this->_db->query('INSERT INTO `hostSetting` SET `hostId` = ? - `key` = ?, - `value` = ?, - `timeAdded = ? + WHERE `hostSettingId` = ? - ON DUPLICATE KEY UPDATE `value` = ?, - `timeUpdated` = ?'); + LIMIT 1'); - $query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]); + $value = json_encode($value); + + $query->execute( + [ + $value, + $timeUpdated, + $hostSettingId + ] + ); return $query->rowCount(); } @@ -212,20 +199,16 @@ class MySQL { public function getTopHostPages(int $limit = 100) { // Get ID (to prevent memory over usage) - $query = $this->_db->query("SELECT `hostPage`.`hostPageId` - - FROM `hostPage` - JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) + $query = $this->_db->query("SELECT `hostPageId` FROM `hostPage` - WHERE `host`.`status` = '1' - AND `hostPage`.`httpCode` = 200 - AND `hostPage`.`rank` > 0 - AND `hostPage`.`timeBanned` IS NULL - AND `hostPage`.`mime` IS NOT NULL + WHERE `httpCode` = 200 + AND `rank` > 0 + AND `timeBanned` IS NULL + AND `mime` IS NOT NULL - ORDER BY `rank` DESC + ORDER BY `rank` DESC - LIMIT " . (int) $limit); + LIMIT " . (int) $limit); // Get required page details foreach ($query->fetchAll() as $top) { @@ -387,12 +370,11 @@ class MySQL { return $query->rowCount(); } - public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) { + public function setHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) { $query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)'); $query->execute([$hostPageIdSource, $hostPageIdTarget]); - } public function deleteHostPageToHostPage(int $hostPageId) { @@ -422,6 +404,15 @@ class MySQL { return $query->fetchAll(); } + public function getHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) { + + $query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? AND `hostPageIdTarget` = ? LIMIT 1'); + + $query->execute([$hostPageIdSource, $hostPageIdTarget]); + + return $query->fetch(); + } + public function addHostPageSnap(int $hostPageId, int $timeAdded) { $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)'); @@ -560,62 +551,46 @@ class MySQL { $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . 
(int) $timeOffset); - $query->execute(); + $query->execute([$timeOffset]); return $query->rowCount(); } - // Crawler tools - public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { + public function resetBannedHosts(int $timeOffset) { - $query = $this->_db->prepare("SELECT COUNT(*) AS `total` + $query = $this->_db->prepare('UPDATE `host` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset); - FROM `hostPage` - JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + $query->execute([$timeOffset]); - WHERE ( - `hostPage`.`timeUpdated` IS NULL OR - `hostPage`.`timeUpdated` < ? OR ( - `hostPage`.`uri` = '/' AND - `hostPage`.`timeUpdated` < ? - ) - ) + return $query->rowCount(); + } + + // Crawler tools + public function getHostPageCrawlQueueTotal(int $timeFrom) { - AND `host`.`status` <> ? - AND `hostPage`.`timeBanned` IS NULL"); + $query = $this->_db->prepare("SELECT COUNT(*) AS `total` FROM `hostPage` - $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `hostPage`.`timeBanned` IS NULL"); + + $query->execute([$timeFrom]); return $query->fetch()->total; } - public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { + public function getHostPageCrawlQueue(int $limit, int $timeFrom) { $result = []; // Get ID (to prevent memory over usage) - $query = $this->_db->prepare("SELECT `hostPage`.`hostPageId` - - FROM `hostPage` - JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) - - WHERE ( - `hostPage`.`timeUpdated` IS NULL OR - `hostPage`.`timeUpdated` < ? - OR ( - `hostPage`.`uri` = '/' AND - `hostPage`.`timeUpdated` < ? - ) - ) + $query = $this->_db->prepare("SELECT `hostPageId` FROM `hostPage` - AND `host`.`status` <> ? - AND `hostPage`.`timeBanned` IS NULL + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL - ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() + ORDER BY LENGTH(`uri`) ASC, RAND() - LIMIT " . (int) $limit); + LIMIT " . (int) $limit); - $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); + $query->execute([$timeFrom]); // Get required page details foreach ($query->fetchAll() as $queue) { @@ -627,10 +602,6 @@ class MySQL { `host`.`scheme`, `host`.`name`, `host`.`port`, - `host`.`crawlPageLimit`, - `host`.`crawlMetaOnly`, - `host`.`robots`, - `host`.`robotsPostfix`, IF (`host`.`port` IS NOT NULL, CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), @@ -676,13 +647,13 @@ class MySQL { FROM `host` - WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `timeBanned` IS NULL ORDER BY RAND() LIMIT " . (int) $limit); - $query->execute([$timeFrom, 0]); + $query->execute([$timeFrom]); // Get required page details foreach ($query->fetchAll() as $host) { diff --git a/library/parser.php b/library/parser.php deleted file mode 100644 index 5149427..0000000 --- a/library/parser.php +++ /dev/null @@ -1,73 +0,0 @@ - null, - 'scheme' => null, - 'name' => null, - 'port' => null, - ]; - - if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) { - - $result['string'] = $hostScheme . 
'://'; - - $result['scheme'] = $hostScheme; - - } else { - - return false; - } - - if ($hostName = parse_url($string, PHP_URL_HOST)) { - - $result['string'] .= $hostName; - - $result['name'] = $hostName; - - } else { - - return false; - } - - if ($hostPort = parse_url($string, PHP_URL_PORT)) { - - $result['string'] .= ':' . $hostPort; - - $result['port'] = $hostPort; - - } - - return (object) $result; - } - - static public function uri(string $string) { - - $result = [ - 'string' => '/', - 'path' => '/', - 'query' => null, - ]; - - if ($path = parse_url($string, PHP_URL_PATH)) { - - $result['string'] = $path; - - $result['path'] = $path; - - } - - if ($query = parse_url($string, PHP_URL_QUERY)) { - - $result['string'] .= '?' . $query; - - $result['query'] = '?' . $query; - - } - - return (object) $result; - } -} \ No newline at end of file diff --git a/library/url.php b/library/url.php new file mode 100644 index 0000000..bada461 --- /dev/null +++ b/library/url.php @@ -0,0 +1,82 @@ + (object) + [ + 'url' => null, + 'scheme' => null, + 'name' => null, + 'port' => null, + ], + 'page' => (object) + [ + 'url' => null, + 'uri' => null, + 'path' => null, + 'query' => null, + ] + ]; + + // Validate URL + if (!self::is($url)) { + + return false; + } + + // Parse host + if ($scheme = parse_url($url, PHP_URL_SCHEME)) { + + $result->host->url = $scheme . '://'; + $result->host->scheme = $scheme; + + } else { + + return false; + } + + if ($host = parse_url($url, PHP_URL_HOST)) { + + $result->host->url .= $host; + $result->host->name = $host; + + } else { + + return false; + } + + if ($port = parse_url($url, PHP_URL_PORT)) { + + $result->host->url .= ':' . $port; + $result->host->port = $port; + + // port is optional + } + + // Parse page + if ($path = parse_url($url, PHP_URL_PATH)) { + + $result->page->uri = $path; + $result->page->path = $path; + } + + if ($query = parse_url($url, PHP_URL_QUERY)) { + + $result->page->uri .= '?' . $query; + $result->page->query = '?' . $query; + } + + $result->page->url = $result->host->url . $result->page->uri; + + return $result; + } +} \ No newline at end of file diff --git a/media/db-prototype.png b/media/db-prototype.png index e8c3b75..92bf9d1 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/public/api.php b/public/api.php index 636c3b3..8014212 100644 --- a/public/api.php +++ b/public/api.php @@ -1,14 +1,11 @@ true, 'result' => [ 'config' => [ - 'websiteDomain' => WEBSITE_DOMAIN, - 'crawlUrlRegexp' => CRAWL_URL_REGEXP, - 'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW, - 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, - 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, - 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, - 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, - 'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET, - 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX, - 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, - 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, + 'WEBSITE_DOMAIN' => WEBSITE_DOMAIN, + 'DEFAULT_HOST_URL_REGEXP' => DEFAULT_HOST_URL_REGEXP, + // @TODO ], 'api' => [ 'version' => (string) API_VERSION, diff --git a/public/explore.php b/public/explore.php index 8e34fe3..10fa63d 100644 --- a/public/explore.php +++ b/public/explore.php @@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . 
'/../library/sphinxql.php'); // Connect Sphinx search server -$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); +try { + + $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Connect database -$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); +try { + + $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Filter request data $hp = !empty($_GET['hp']) ? Filter::url($_GET['hp']) : 0; @@ -283,7 +301,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?> + getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>
diff --git a/public/index.php b/public/index.php index 4ab8e03..b8f3486 100644 --- a/public/index.php +++ b/public/index.php @@ -6,7 +6,16 @@ require_once(__DIR__ . '/../library/filter.php'); require_once(__DIR__ . '/../library/sphinxql.php'); // Connect Sphinx search server -$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); +try { + + $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} $totalPages = $sphinx->getHostPagesTotal(); diff --git a/public/search.php b/public/search.php index 71537ae..84878cf 100644 --- a/public/search.php +++ b/public/search.php @@ -2,18 +2,48 @@ // Load system dependencies require_once(__DIR__ . '/../config/app.php'); -require_once(__DIR__ . '/../library/curl.php'); -require_once(__DIR__ . '/../library/robots.php'); require_once(__DIR__ . '/../library/filter.php'); -require_once(__DIR__ . '/../library/parser.php'); +require_once(__DIR__ . '/../library/url.php'); require_once(__DIR__ . '/../library/mysql.php'); +require_once(__DIR__ . '/../library/helper.php'); require_once(__DIR__ . '/../library/sphinxql.php'); // Connect Sphinx search server -$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); +try { + + $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Connect database -$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); +try { + + $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} + +// Connect memcached +try { + + $memcached = new Memcached(); + $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Filter request data $t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text'; @@ -36,82 +66,34 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the sprintf(_('Over %s pages or enter the new one...'), $totalPages), sprintf(_('Over %s pages or enter the new one...'), $totalPages), ]); +// Define alert message +$alertMessages = []; -// Crawl request -if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { - - $db->beginTransaction(); +// Register new host/page on search request contains the link +if (URL::is($q)) { try { - // Parse host info - if ($hostURL = Parser::hostURL($q)) { + $db->beginTransaction(); + + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, $q)) { - // Host exists - if ($host = $db->getHostByCRC32URL(crc32($hostURL->string))) { + if (count($linkToDBresult->new->hostPageId)) { - $hostStatus = $host->status; - $hostNsfw = $host->nsfw; - $hostPageLimit = $host->crawlPageLimit; - $hostMetaOnly = $host->crawlMetaOnly; - $hostId = $host->hostId; - $hostRobots = $host->robots; - $hostRobotsPostfix = $host->robotsPostfix; + $alertMessages[] = _('Link successfully registered in the crawl queue!'); - // Register new host } else { - // Disk quota not reached - if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) { - - // Get robots.txt if exists - $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT); - - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = null; - } - - $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - - $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; - $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 
1 : 0; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - - $hostId = $db->addHost( $hostURL->scheme, - $hostURL->name, - $hostURL->port, - crc32($hostURL->string), - time(), - null, - $hostPageLimit, - (string) $hostMetaOnly, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - // Add web root host page to make host visible in the crawl queue - $db->addHostPage($hostId, crc32('/'), '/', time()); - } - } + if ($resultsTotal == 0) { - // Parse page URI - $hostPageURI = Parser::uri($q); + $alertMessages[] = _('This link already registered in the crawl queue.'); + } - // Init robots parser - $robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix); + } - // Save page info - if ($hostStatus && // host enabled - $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules - $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit - !$db->findHostPageByCRC32URI($hostId, crc32($hostPageURI->string))) { // page not exists + } else { - $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); - } + $alertMessages[] = _('Link address not supported on this host!'); } $db->commit(); @@ -124,6 +106,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { } } +// Count pages in the crawl queue +if ($queueTotal = $db->getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { + + $alertMessages[] = sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal); +} + ?> @@ -313,8 +301,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?> - + +
@@ -352,7 +340,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?> + getHostPageCrawlQueueTotal(time() - CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET)) { ?>
diff --git a/public/top.php b/public/top.php index 4ccabd4..2f01c1b 100644 --- a/public/top.php +++ b/public/top.php @@ -7,10 +7,28 @@ require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . '/../library/sphinxql.php'); // Connect Sphinx search server -$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); +try { + + $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Connect database -$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); +try { + + $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); + +} catch(Exception $e) { + + var_dump($e); + + exit; +} // Define page basics $totalPages = $sphinx->getHostPagesTotal(); @@ -271,7 +289,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
- getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET)) { ?> + getHostPageCrawlQueueTotal(time() - CRAWL_PAGE_SECONDS_OFFSET)) { ?>