YGGo/config/app.php.txt

524 lines
13 KiB
Plaintext
Raw Normal View History

2023-04-01 19:29:39 +03:00
<?php
/*
* YGGo! - Distributed & Open Source Web Search Engine
*
* MIT License
* Copyright (c) 2023 YGGverse
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Default configuration file example
* Production name: app.php
*
* Project home page
* https://github.com/YGGverse/YGGo
*
* Get support
* https://github.com/YGGverse/YGGo/issues
*
*/
2023-04-01 19:29:39 +03:00
// Debug
// Print runtime errors and PHP startup errors to the output, and report every error level.
// NOTE(review): these are the defaults of the example config; consider switching the two
// display_* options to '0' on a public node - confirm with the deployment policy.
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);
// Website
2023-04-23 01:54:10 +03:00
/*
* Project domain, without a trailing slash
*
*/
define('WEBSITE_DOMAIN', '');
2023-04-23 01:54:10 +03:00
/*
* Number of search results shown on the page before the "read more" link appears
2023-04-23 01:54:10 +03:00
*
*/
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
2023-04-23 01:54:10 +03:00
/*
* Save identicons to the static webp cache (placed in storage/cache) to prevent CPU overload
*
* Set false to generate them on every request
*
*/
2023-04-03 01:30:09 +03:00
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
2023-04-01 19:29:39 +03:00
2023-05-15 09:18:18 +03:00
/*
* Total size of snap files, in bytes, allowed to download within the
* WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET period
*
* Default: 10485760 bytes = 10 * 1024 * 1024 (10 MiB)
*
* NOTE(review): the constant name suggests the quota is counted per IP address - confirm
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE', 10485760);
/*
* Time offset of the quota applied when WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE is reached
*
* Default: 60*60 = 3600, presumably seconds (one hour) - confirm the unit against the quota check
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET', 60*60);
2023-04-01 19:29:39 +03:00
// Database
define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306);
define('DB_NAME', '');
2023-04-01 19:29:39 +03:00
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
2023-04-01 19:29:39 +03:00
2023-07-27 17:53:36 +03:00
// Memcached
define('MEMCACHED_HOST', '127.0.0.1');
define('MEMCACHED_PORT', 11211);
// Third-party connections (optional)
/*
* Mega.nz remote storage
*
* FTP storage integration through MEGAcmd (https://mega.io/cmd)
*
* The mega-ftp instance is connected when CRAWL_PAGE_MIME_SNAP_MEGA is enabled
*
*/
define('MEGA_FTP_HOST', '127.0.0.1');
define('MEGA_FTP_PORT', 4990);
define('MEGA_FTP_DIRECTORY', '');
// Proxy settings
/*
* Search proxy User Agent name
*
* Shared to other hosts through CURL requests by search proxy
*
*/
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');
// Crawl settings
2023-05-08 11:04:59 +03:00
/*
* Save crawler debug to the `logCrawler` table
*
*/
define('CRAWL_LOG_ENABLED', true);
/*
* Automatically clean `logCrawler` items older than this offset, in seconds
*
* Default: 60*60*24*30 = 2592000 seconds (30 days)
*
*/
define('CRAWL_LOG_SECONDS_OFFSET', 60*60*24*30);
/*
* Crawler / Bot User Agent name
*
* Sent to other hosts through CURL requests made by the crawler
*
*/
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');
2023-04-23 01:32:34 +03:00
/*
* Skip the curl download when the response data size limit is reached
*
* Default: 10485760 = 10 * 1024 * 1024, presumably bytes (10 MiB) - confirm in library/curl.php
*
* See also: CURLOPT_TIMEOUT (library/curl.php)
*
*/
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);
/*
* Stop the crawler when the disk quota is reached (Mb)
*
* NOTE(review): the constant name suggests the value is the free disk space left - confirm
*
*/
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
2023-04-23 01:32:34 +03:00
/*
* Pages (URI) processing limit in the crawler.php queue
*
* This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
2023-04-23 01:32:34 +03:00
*/
define('CRAWL_PAGE_LIMIT', 20);
2023-04-23 01:32:34 +03:00
/*
* Manifest (URI) processing limit in the crawler.php queue
*
* Used to collect distributed data index
* that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
*
* This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
2023-05-04 06:45:04 +03:00
*/
define('CRAWL_MANIFEST_LIMIT', 10);
2023-05-04 06:45:04 +03:00
2023-04-23 01:32:34 +03:00
/*
* Renew page index by timing offset provided
*
* This option works with CRAWL_PAGE_LIMIT step queue
*
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
2023-05-04 06:45:04 +03:00
* must have enough value to crawl all pages collected in the DB index
2023-04-23 01:32:34 +03:00
*
* otherwise the crawler can get stuck in the queue
*
*/
2023-05-03 04:17:13 +03:00
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
2023-04-01 19:29:39 +03:00
2023-06-30 13:28:22 +03:00
/*
* Renew home page index by timing offset provided
*
* Used to scan for new pages with higher priority
*
* This option works with CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
*
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enough value to crawl all pages collected in the DB index
*
* otherwise the crawler can get stuck in the queue
*
*/
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
/*
* Index pages match MIME types
*
* comma separated
*
*/
2023-06-13 21:57:01 +03:00
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
2023-05-13 10:15:07 +03:00
/*
* Snap pages locally match MIME types
*
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html');
/*
* Snap pages to mega.nz match MIME types
*
* comma separated | false to disable
*
* Requires connection:
*
* MEGA_FTP_HOST
* MEGA_FTP_PORT
* MEGA_FTP_DIRECTORY
*
*/
define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico');
2023-05-04 06:45:04 +03:00
/*
* Renew manifests index by timing offset provided
*
* This option works with CRAWL_MANIFEST_LIMIT step queue
*
* Pay attention, that CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
* must have enough value to crawl all manifests collected in the DB index
*
* otherwise the crawler can get stuck in the queue
*
*/
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
2023-04-23 01:32:34 +03:00
/*
* Only URL addresses that match this rule will be auto-crawled
*
* The default pattern accepts plain http:// URLs whose host is a bracketed
* literal made of word characters and colons (IPv6 notation),
* for example http://[200:abcd::1]/path
*
* https:// URLs and named hosts do not match the default pattern
*
* Modifiers: u - UTF-8 mode, i - case-insensitive
*
*/
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
2023-04-03 03:07:54 +03:00
2023-04-23 01:14:31 +03:00
/*
* Pages limit per new host by default
*
* Crawler stops indexing when this limit is reached, to prevent disk overuse
*
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
*
*/
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
2023-04-23 01:14:31 +03:00
/*
* Set default auto-crawl status for new host added
*
* true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
* false - requires manual validation by the moderator in the DB `host`.`status` field
*
* This option also disables the host in the search results
*
*/
define('CRAWL_HOST_DEFAULT_STATUS', true);
2023-04-23 01:14:31 +03:00
/*
2023-05-14 01:45:55 +03:00
* Index only meta tags
* or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
2023-04-23 01:14:31 +03:00
*
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
2023-04-23 01:14:31 +03:00
*
2023-05-14 01:45:55 +03:00
* Warning!
* disabling this option requires a huge amount of disk storage,
* it is an experimental feature, oriented for index operations
*
* see CRAWL_PAGE_MIME_SNAP_LOCAL
* to create compressed data snaps
2023-04-23 01:14:31 +03:00
*
*/
2023-05-14 01:45:55 +03:00
define('CRAWL_HOST_DEFAULT_META_ONLY', true);
2023-05-04 01:04:39 +03:00
/*
2023-05-09 13:26:19 +03:00
* Not suitable/safe for work status for new host by default
2023-05-04 01:04:39 +03:00
*
2023-05-09 13:26:19 +03:00
* Could be filtered in search results
*
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
*
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
2023-07-27 11:44:42 +03:00
/*
* Collect sitemap index when available
*
* Currently works only when the CRAWL_ROBOTS_SECONDS_OFFSET / CRAWL_ROBOTS_LIMIT options are enabled
*
* When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
*
* true|false
*
*/
define('CRAWL_SITEMAPS', true);
/*
* Renew robots.txt index by the timing offset provided, in seconds
*
* Default: 60*60*24*7 = 604800 seconds (7 days)
*
*/
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
/*
* Hosts robots.txt processing limit in the crawler.php queue
*
* Set 0 to disable
*
*/
define('CRAWL_ROBOTS_LIMIT', 1);
/*
* Default robots.txt rules used when the remote file does not exist
* The crawler is able to overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robots.md
*
*/
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
/*
* Permanent rules appended to the robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
* The crawler does not overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
*
*/
2023-04-09 00:06:28 +03:00
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
/*
* Generates hostPageDom index based on hostPage.data field
*
* Could be useful for building semantical index query (config/sphinx.conf.txt)
*
* Currently this feature is available in the CLI only (cli/yggo.php)
*
*/
2023-06-30 13:28:22 +03:00
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');
/*
* Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
*
*/
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);
2023-05-03 09:22:14 +03:00
/*
* Look for third-party manifests to collect distributed index
*
* API address provided in yggo meta tag
* will be stored in the `manifest` DB table
*
2023-05-03 09:31:40 +03:00
* Collecting URL that match CRAWL_URL_REGEXP condition
*
2023-05-03 09:22:14 +03:00
*/
define('CRAWL_MANIFEST', true);
/*
* Manifest API version compatibility
*
*/
2023-06-30 13:28:22 +03:00
// NOTE(review): the value is a float literal, so 0.10 is the same number as 0.1 -
// the trailing zero is not preserved; confirm how callers compare API versions
define('CRAWL_MANIFEST_API_VERSION', 0.10);
2023-05-03 09:22:14 +03:00
/*
* Set default auto-crawl status for new manifest added
*
* true - crawler autostart manifest indexer
* false - requires manual validation by the moderator in the DB `manifest`.`status` field
*
2023-05-03 09:31:40 +03:00
* This option applies when CRAWL_MANIFEST is enabled
*
2023-05-03 09:22:14 +03:00
*/
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
2023-04-09 00:06:28 +03:00
// Cleaner settings
2023-04-23 01:46:34 +03:00
2023-05-08 11:04:59 +03:00
/*
* Save cleaner debug to the `logCleaner` table
*
*/
define('CLEAN_LOG_ENABLED', true);
/*
* Automatically clean `logCleaner` items older than this offset, in seconds
*
* Default: 60*60*24*30 = 2592000 seconds (30 days)
*
*/
define('CLEAN_LOG_SECONDS_OFFSET', 60*60*24*30);
2023-04-23 01:46:34 +03:00
/*
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
*
* This option works with CLEAN_HOST_SECONDS_OFFSET
*
* The value depends on the CPU resources available
*
*/
2023-04-09 00:06:28 +03:00
define('CLEAN_HOST_LIMIT', 20);
2023-04-23 01:46:34 +03:00
/*
* Apply cleaning rules to page older than value provided
*
* This option works with CLEAN_HOST_LIMIT step queue
*
* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
2023-05-04 06:45:04 +03:00
* must have enough value to process all pages in the DB index
2023-04-23 01:46:34 +03:00
*
* otherwise the cleaner can get stuck in the queue
*
*/
2023-05-03 04:17:13 +03:00
define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
2023-04-23 03:01:51 +03:00
/*
* Remove the page ban after the following time, in seconds
*
* This option is used in the crawler and the search page
* to prevent extra http requests to resources that are unavailable or do not meet the conditions
*
* Default: 60*60*24*30 = 2592000 seconds (30 days)
*
*/
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove page description history after the following time
*
* Default: 60*60*24*30*12*10 = 311040000, presumably seconds (3600 days) - confirm the unit
*
*/
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
/*
* Remove page DOM history after the following time
*
* Default: 60*60*24*30*12*10 = 311040000, presumably seconds (3600 days) - confirm the unit
*
*/
define('CLEAN_PAGE_DOM_OFFSET', 60*60*24*30*12*10);
2023-05-29 22:13:41 +03:00
/*
2023-05-29 22:36:13 +03:00
* Database tables optimization
2023-05-29 22:13:41 +03:00
*
2023-05-29 22:36:13 +03:00
* Reorganizes the physical storage of table data and associated index data,
* to reduce storage space and improve I/O efficiency when accessing the tables.
2023-05-30 21:46:52 +03:00
* Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
2023-05-29 22:36:13 +03:00
*
* When enabled - requires enough RAM
2023-05-29 22:13:41 +03:00
*
*/
2023-05-30 21:46:52 +03:00
define('CLEAN_DB_TABLES_OPTIMIZATION', false);
2023-04-23 03:01:51 +03:00
// API settings
2023-04-23 03:16:54 +03:00
/*
* JSON API features
*
* When false - all the API settings below will be ignored
*
*/
2023-04-23 03:01:51 +03:00
define('API_ENABLED', true);
2023-04-23 03:16:54 +03:00
/*
* Search API
*
* When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
*
*/
2023-04-23 03:01:51 +03:00
define('API_SEARCH_ENABLED', true);
2023-04-23 03:16:54 +03:00
/*
* Search results per page
*
*/
2023-04-23 03:01:51 +03:00
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);
2023-04-23 03:16:54 +03:00
/*
* Hosts distribution API
*
* When false - API_HOSTS_FIELDS will be ignored
*
*/
2023-04-23 03:01:51 +03:00
define('API_HOSTS_ENABLED', true);
2023-04-23 03:16:54 +03:00
/*
* Database host fields, comma separated, or * to share all the fields
*
* The value is a list of SQL column references (a sub-select is allowed),
* so nothing except valid SQL may appear between the quotes
*
*/
define('API_HOSTS_FIELDS',
'`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
2023-04-25 19:35:52 +03:00
/*
* Manifest API
*
* Application meta sharing between YGGo remote nodes
*
* When true - make this node public for distributed index sharing
*
*/
define('API_MANIFEST_ENABLED', true);