<?php
/*
* YGGo! - Distributed & Open Source Web Search Engine
*
* MIT License
* Copyright (c) 2023 YGGverse
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Default configuration file example
* Production name: app.php
*
* Project home page
* https://github.com/YGGverse/YGGo
*
* Get support
* https://github.com/YGGverse/YGGo/issues
*
*/
// Debug
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);
// Website
/*
* Project domain, without a trailing slash
*
*/
define('WEBSITE_DOMAIN', '');
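/*
* Example (hypothetical value; adjust the scheme and host to your deployment, no trailing slash):
*
* define('WEBSITE_DOMAIN', 'http://yggo.example');
*
*/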
/*
* Number of search results shown per page before the read more link appears
*
*/
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
/*
* Save identicons to the static webp cache (placed in storage/cache) to reduce CPU load
*
* or false - to generate them on every request
*
*/
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Memcached
define('MEMCACHED_HOST', 'localhost');
define('MEMCACHED_PORT', 11211);
// Snaps
/*
* Storage nodes configuration
*
* Supports an optional single 'localhost' node and multiple 'ftp' servers
*
* Comment out a node to disable that connection
*
* Set an empty array to disable snaps, or set quota.mime = false or quota.size = 0 to disable a specific instance
*
*/
define('SNAP_STORAGE', json_encode((object)
[
  'localhost' => [
    'storage-1' => [
      'directory' => __DIR__ . '/../storage/snap/hps/',
      'quota' => [
        'mime' => false,
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ],
  'ftp' => [
    'storage-1' => [
      'port' => 21,
      'host' => '',
      'username' => '',
      'password' => '',
      'directory' => '/snap',
      'timeout' => 30,
      'passive' => true,
      'quota' => [
        'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ]
]
));
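/*
* A minimal sketch of reading this configuration back at runtime
* (illustration only; the application's own snap reader may differ):
*
* foreach (json_decode(SNAP_STORAGE, true) as $type => $nodes) { // 'localhost' | 'ftp'
*   foreach ($nodes as $name => $node) {
*     // e.g. $node['quota']['size'] holds the per-node byte limit
*   }
* }
*
*/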
// Proxy settings
/*
* Search proxy User-Agent name
*
* Sent to other hosts through cURL requests made by the search proxy
*
*/
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
// Host defaults
/*
* Only URL addresses matching this rule will be crawled
*
*/
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only
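/*
* Examples (hypothetical alternatives; keep exactly one definition active):
*
* // Yggdrasil IPv6 hosts only (the default above):
* // define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
*
* // Any http or https host:
* // define('DEFAULT_HOST_URL_REGEXP', '/^https?:\/\/.+$/ui');
*
*/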
/*
* Default robots.txt rules (overwritten when remote rules are available)
*
* string|null
*
*/
define('DEFAULT_HOST_ROBOTS_TXT', null);
/*
* These rules are forcibly appended to the remote robots.txt file
*
* string|null
*
*/
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);
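/*
* Example (hypothetical rules appended after the remote robots.txt content):
*
* define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', "Disallow: /cgi-bin/\nDisallow: /tmp/");
*
*/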
/*
* Default pages limit per new host
*
* The crawler stops indexing when this limit is reached, to prevent disk overuse
*
*/
define('DEFAULT_HOST_PAGES_LIMIT', 100000);
/*
* Index pages matching these MIME types
*
* comma-separated
*
*/
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');
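/*
* Example (hypothetical narrower set, text content only):
*
* define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,text/plain');
*
*/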
/*
* Index meta tags only,
* or set true to also save meta tags + base64 encoded page content in the `hostPage`.`data` field
*
* Warning!
* this option requires huge disk storage;
* it is an experimental feature, oriented to index operations
*
*/
define('DEFAULT_HOST_PAGES_DATA', false);
/*
* Generates the hostPageDom index based on the hostPage.data field
*
* Could be useful for building a semantic index query (config/sphinx.conf.txt)
*
* At this moment the feature is available in the CLI only (cli/yggo.php)
*
*/
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated
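/*
* Example (hypothetical ";"-separated selector list, assuming CSS-style selectors):
*
* define('DEFAULT_HOST_PAGES_DOM_SELECTORS', 'title;h1;article p');
*
*/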
/*
* Strip HTML tags from the DEFAULT_HOST_PAGES_DOM_SELECTORS content
*
*/
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);
// Crawl queue
/*
* Crawler / Bot User-Agent name
*
* Sent to other hosts through cURL requests made by the crawler
*
*/
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
/*
* Skip the cURL download when the response data reaches this size (bytes)
*
* See also: CURLOPT_TIMEOUT (library/curl.php)
*
*/
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);
/*
* Stop the crawler when less than this amount of free disk space (MB) is left
*
*/
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);
/*
* Pages (URI) processing limit in the crawler.php queue
*
* This option is related to the CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent overloading websites with GET crawling requests
*
* Set 0 to disable
*
*/
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);
/*
* Renew the page index after the timing offset provided
*
* This option works with the CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
*
* Pay attention that the CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
* must allow enough throughput to crawl all pages collected in the DB index
* (see the capacity sketch after this block),
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);
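/*
* A rough capacity sketch for the two options above
* (assumes the crontab task runs crawler.php once per minute):
*
* CRAWL_HOST_PAGE_QUEUE_LIMIT (10) * 60 * 24 = 14400 pages re-crawled per day
* CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET (60*60*24*30*12) = 31104000 seconds ~ 360 days between renewals
* => roughly 14400 * 360 = ~5.2 million pages can be kept fresh with these defaults
*
* If the DB index grows beyond that, increase the limit or the offset accordingly
*
*/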
/*
* Re-calculate page rank on page update
*
* When enabled, may increase execution time
*
* true|false
*
*/
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);
/*
* Renew the hosts index after the timing offset provided
*
*/
define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
/*
* Hosts processing limit in the crawler.php queue
*
* Set 0 to disable
*
*/
define('CRAWL_HOST_LIMIT', 1);
/*
* Collect sitemap index when available
*
* At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET / CRAWL_HOST_LIMIT options enabled
*
* When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
*
* true|false
*
*/
define('CRAWL_SITEMAPS', true);
/*
* Crawl robots.txt
*/
define('CRAWL_ROBOTS', true); // true|false
/*
* Look for third-party manifests to collect the distributed index
*
* The API address is provided in the yggo meta tag
*
* Collects URLs that match the DEFAULT_HOST_URL_REGEXP condition
*
*/
define('CRAWL_MANIFEST', true);
/*
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.13);
// Integrations
/*
* Crawl YGGstate for peers to discover new hosts
*
* Yggdrasil networks only
*
* Read more:
* https://github.com/YGGverse/YGGstate
*
*/
define('CRAWL_YGGSTATE', json_encode((object)
[
  'db' =>
  [
    [
      // Conditions
      'peer_min_last_uptime' => 60*60*24, // skip short-term connections, seconds
      'timeout' => 60*60*24, // these calls run in the crontab/crawler queue; prevents remote server abuse
      // Connection
      'port' => 3306,
      'host' => '',
      'database' => '',
      'username' => '',
      'password' => '',
    ],
    // ...
  ],
])
);
/*
* Remove host ban after the following time
*
* This option is used in the crawler and the search page
* to prevent extra HTTP requests to unavailable or non-conforming resources
*
*/
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove page ban after the following time
*
* This option is used in the crawler and the search page
* to prevent extra HTTP requests to unavailable or non-conforming resources
*
*/
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Database tables optimization
*
* Reorganizes the physical storage of table data and associated index data,
* to reduce storage space and improve I/O efficiency when accessing the tables.
* Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
*
* When enabled, requires enough RAM
*
*/
define('CLEAN_DB_TABLES_OPTIMIZATION', true);
// API settings
/*
* JSON API features
*
* When false - all the API settings below will be ignored
*
*/
define('API_ENABLED', true);
/*
* Search API
*
* When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
*
*/
define('API_SEARCH_ENABLED', true);
/*
* Search results per page
*
*/
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);
/*
* Hosts distribution API
*
* When false - API_HOSTS_FIELDS will be ignored
*
*/
define('API_HOSTS_ENABLED', true);
/*
* Database host fields, comma-separated, or * to share all the fields
*
*/
define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
                                CONCAT(`scheme`, '://', `name`, ':', `port`),
                                CONCAT(`scheme`, '://', `name`)
                            ) AS `url`,
                            `timeAdded`,
                            `timeUpdated`");
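/*
* Example (share every host field instead of the composed subset above):
*
* define('API_HOSTS_FIELDS', '*');
*
*/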
/*
* Manifest API
*
* Application meta sharing between remote YGGo nodes
*
* When true - makes this node public for distributed index sharing
*
*/
define('API_MANIFEST_ENABLED', true);