mirror of https://github.com/YGGverse/YGGo.git
<?php

/*
 * YGGo! - Distributed & Open Source Web Search Engine
 *
 * MIT License
 *
 * Copyright (c) 2023 YGGverse
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Default configuration file example
 * Production name: app.php
 *
 * Project home page
 * https://github.com/YGGverse/YGGo
 *
 * Get support
 * https://github.com/YGGverse/YGGo/issues
 *
 */

// Debug
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website

/*
 * Project domain, without trailing slash
 *
 */
define('WEBSITE_DOMAIN', '');

/*
 * Search results shown per page before the read more link
 *
 */
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);

/*
 * Save identicons to the static webp cache (placed in storage/cache) to prevent CPU overload
 *
 * or false to generate them on every request
 *
 */
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);

// Database
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
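
/*
 * Example (illustrative only, not part of the configuration): a minimal
 * sketch of how the constants above are typically consumed through PDO;
 * the $db variable name is an assumption, not something YGGo defines.
 *
 * $db = new PDO(
 *   sprintf('mysql:host=%s;port=%d;dbname=%s', DB_HOST, DB_PORT, DB_NAME),
 *   DB_USERNAME,
 *   DB_PASSWORD,
 *   [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
 * );
 */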

// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
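
/*
 * Example (illustrative only): Sphinx speaks the MySQL protocol on its
 * SphinxQL port, so the connection can be a plain PDO mysql handle;
 * $sphinx is an assumed name.
 *
 * $sphinx = new PDO(
 *   sprintf('mysql:host=%s;port=%d', SPHINX_HOST, SPHINX_PORT)
 * );
 */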

// Memcached
define('MEMCACHED_HOST', 'localhost');
define('MEMCACHED_PORT', 11211);
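
/*
 * Example (illustrative only): a minimal connection sketch with the
 * pecl Memcached extension; $memcached is an assumed name.
 *
 * $memcached = new Memcached();
 * $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
 */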

// Snaps

/*
 * Storage nodes configuration
 *
 * Supports an optional single 'localhost' and multiple 'ftp' servers
 *
 * Comment out a node to disable its connection
 *
 * Make the array empty to disable snaps, or set quota.mime = false or quota.size = 0 to disable a specified instance
 *
 */
define('SNAP_STORAGE', json_encode((object)
[
  'localhost' => [
    'storage-1' => [
      'directory' => __DIR__ . '/../storage/snap/hps/',
      'quota' => [
        'mime' => false,
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ],
  'ftp' => [
    'storage-1' => [
      'port' => 21,
      'host' => '',
      'username' => '',
      'password' => '',
      'directory' => '/snap',
      'timeout' => 30,
      'passive' => true,
      'quota' => [
        'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ]
]
));
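
/*
 * Example (illustrative only): how an application might read this
 * configuration back; the loop body and variable names are assumptions.
 *
 * foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
 *   foreach ($storages as $name => $storage) {
 *     // e.g. $node = 'ftp', $name = 'storage-1', $storage->quota->size = 10000000024
 *   }
 * }
 */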

// Proxy settings

/*
 * Search proxy User Agent name
 *
 * Shared with other hosts through CURL requests by the search proxy
 *
 */
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');

// Host defaults

/*
 * Only URL addresses matching this rule will be crawled
 *
 */
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only
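
/*
 * Example (illustrative only): what this pattern accepts and rejects;
 * the sample host below is made up.
 *
 * preg_match(DEFAULT_HOST_URL_REGEXP, 'http://[200:1:2:3:4:5:6:7]/index.html'); // 1 - bracketed IPv6
 * preg_match(DEFAULT_HOST_URL_REGEXP, 'https://example.com/');                  // 0 - hostname, not IPv6
 */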

/*
 * Default robots.txt rules (will be overwritten when remote rules are available)
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT', null);

/*
 * These rules are forcibly appended to the remote robots.txt file
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);

/*
 * Pages limit per new host by default
 *
 * Crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 */
define('DEFAULT_HOST_PAGES_LIMIT', 100000);

/*
 * Index pages matching these MIME types
 *
 * comma separated
 *
 */
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');
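
/*
 * Example (illustrative only): a sketch of filtering a response by its
 * Content-Type header against the list above; $contentType is an
 * assumed variable.
 *
 * $mimes = array_map('trim', explode(',', DEFAULT_HOST_PAGES_MIME));
 *
 * if (in_array(strtolower(strtok($contentType, ';')), $mimes)) {
 *   // index this page
 * }
 */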

/*
 * When false, index meta tags only;
 * otherwise save meta tags + base64 encoded page content in the `hostPage`.`data` field
 *
 * Warning!
 * this option requires huge disk storage;
 * it's an experimental feature, oriented at index operations
 *
 */
define('DEFAULT_HOST_PAGES_DATA', false);

/*
 * Generates the hostPageDom index based on the hostPage.data field
 *
 * Could be useful for building a semantic index query (config/sphinx.conf.txt)
 *
 * At this moment the feature is available in the CLI only (cli/yggo.php)
 *
 */
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated

/*
 * Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content
 *
 */
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);

// Crawl queue

/*
 * Crawler / Bot User Agent name
 *
 * Shared with other hosts through CURL requests by the crawler
 *
 */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');

/*
 * Skip curl download when this response data size is reached
 *
 * See also: CURLOPT_TIMEOUT (library/curl.php)
 *
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);
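
/*
 * Example (illustrative only): a sketch of how such a limit can be
 * enforced with CURLOPT_PROGRESSFUNCTION, aborting the transfer by
 * returning a non-zero value once too many bytes have arrived; the
 * $curl handle setup is assumed, see library/curl.php for the actual
 * implementation.
 *
 * curl_setopt($curl, CURLOPT_NOPROGRESS, false);
 * curl_setopt($curl, CURLOPT_PROGRESSFUNCTION,
 *   function ($handle, $downloadSize, $downloaded, $uploadSize, $uploaded) {
 *     return $downloaded > CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT ? 1 : 0;
 *   }
 * );
 */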

/*
 * Stop the crawler when the free disk space left reaches this quota (Mb)
 *
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);
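
/*
 * Example (illustrative only): one way to apply the quota with PHP's
 * disk_free_space(); the path argument is an assumption.
 *
 * if (disk_free_space(__DIR__) / 1000000 < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {
 *   exit; // stop crawling until space is freed
 * }
 */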

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);

/*
 * Renew the page index by the timing offset provided
 *
 * This option works with the CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
 *
 * Pay attention: the CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);
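
/*
 * Worked example (illustrative only): with CRAWL_HOST_PAGE_QUEUE_LIMIT = 10
 * and a one-minute crontab, the crawler covers up to 10 * 60 * 24 = 14400
 * pages per day; within the ~360-day offset above (60*60*24*30*12 seconds)
 * that is roughly 5.2 million pages, so larger indexes need a higher limit
 * or a shorter offset.
 */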

/*
 * Re-calculate the page rank on page update
 *
 * When enabled, may increase execution time
 *
 * true|false
 *
 */
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);

/*
 * Renew the hosts index by the timing offset provided
 *
 */
define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);

/*
 * Hosts processing limit in the crawler.php queue
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_LIMIT', 1);

/*
 * Collect the sitemap index when available
 *
 * At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled
 *
 * When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Crawl robots.txt
 */
define('CRAWL_ROBOTS', true); // true|false

/*
 * Look for third-party manifests to collect the distributed index
 *
 * The API address is provided in the yggo meta tag
 *
 * Collects URLs that match the DEFAULT_HOST_URL_REGEXP condition
 *
 */
define('CRAWL_MANIFEST', true);

/*
 * Manifest API version compatibility
 *
 */
define('CRAWL_MANIFEST_API_VERSION', 0.13);

// Integrations

/*
 * Crawl YGGstate for peers to discover new hosts
 *
 * Yggdrasil networks only
 *
 * Read more:
 * https://github.com/YGGverse/YGGstate
 *
 */
define('CRAWL_YGGSTATE', json_encode((object)
[
  'db' =>
  [
    [
      // Conditions
      'peer_min_last_uptime' => 60*60*24, // skip short-term connections, seconds
      'timeout' => 60*60*24, // these calls run in the crontab/crawler queue; prevents remote server abuse

      // Connection
      'port' => 3306,
      'host' => '',
      'database' => '',
      'username' => '',
      'password' => '',
    ],
    // ...
  ],
])
);

/*
 * Remove host ban after the following time
 *
 * This option is used in the crawler and the search page
 * to prevent extra HTTP requests to unavailable or non-conforming resources
 *
 */
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Remove page ban after the following time
 *
 * This option is used in the crawler and the search page
 * to prevent extra HTTP requests to unavailable or non-conforming resources
 *
 */
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Database tables optimization
 *
 * Reorganizes the physical storage of table data and associated index data,
 * to reduce storage space and improve I/O efficiency when accessing the tables.
 * Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
 *
 * When enabled - requires enough RAM
 *
 */
define('CLEAN_DB_TABLES_OPTIMIZATION', true);

// API settings

/*
 * JSON API features
 *
 * When false - all the action settings below will be ignored
 *
 */
define('API_ENABLED', true);

/*
 * Search API
 *
 * When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
 *
 */
define('API_SEARCH_ENABLED', true);

/*
 * Search results per page
 *
 */
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);
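
/*
 * Example (illustrative only): the usual LIMIT/OFFSET arithmetic such a
 * setting implies; $page is an assumed request parameter.
 *
 * $offset = (max(1, (int) $page) - 1) * API_SEARCH_PAGINATION_RESULTS_LIMIT;
 * // e.g. page 3 -> LIMIT 20 OFFSET 40
 */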

/*
 * Hosts distribution API
 *
 * When false - API_HOSTS_FIELDS will be ignored
 *
 */
define('API_HOSTS_ENABLED', true);

/*
 * Database host fields, comma separated, or * to share all the fields
 *
 */
define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
                                CONCAT(`scheme`, '://', `name`, ':', `port`),
                                CONCAT(`scheme`, '://', `name`)
                            ) AS `url`,
                            `timeAdded`,
                            `timeUpdated`");
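
/*
 * Example (illustrative only): the fields string is raw SQL meant to be
 * interpolated into a SELECT; the `host` table name here is an assumption.
 *
 * $query = 'SELECT ' . API_HOSTS_FIELDS . ' FROM `host`';
 */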

/*
 * Manifest API
 *
 * Application meta sharing between YGGo remote nodes
 *
 * When true - makes this node public for distributed index sharing
 *
 */
define('API_MANIFEST_ENABLED', true);