YGGo! Distributed Web Search Engine
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

269 lines
6.7 KiB

<?php
/*
* YGGo! - Distributed & Open Source Web Search Engine
*
* MIT License
* Copyright (c) 2023 YGGverse
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Default configuration file example
* Production name: app.php
*
* Project home page
* https://github.com/YGGverse/YGGo
*
* Get support
* https://github.com/YGGverse/YGGo/issues
*
*/
// Debug
/*
* Report all error levels and display them,
* including the errors raised during PHP startup
*
*/
error_reporting(E_ALL);
ini_set('display_startup_errors', '1');
ini_set('display_errors', '1');
// Application
/*
* Unique project name
*
* Used to identify this application within the YGGo ecosystem
*
*/
const APPLICATION_NAME = 'My YGGo host';
/*
* Application mode
*
* FEDERATIVE - crawl and share the distributed index from other YGGo nodes running the same API version
* LOCAL      - encapsulated web search portal
*
* See also: API_ENABLED, API_MANIFEST_ENABLED settings
*
*/
const APPLICATION_MODE = 'FEDERATIVE';
// Website
/*
* Project base URL (scheme + host), without a trailing slash
*
* Scheme: "https" when $_SERVER['HTTPS'] holds any non-empty value other than "off"
* (case-insensitive), otherwise "http". Web servers are only required to set HTTPS
* to a non-empty value for secure requests (e.g. "on" or "1"), and some set "off"
* for plain HTTP, so comparing with "on" alone would miss valid HTTPS requests.
*
* Host: taken from $_SERVER['HTTP_HOST']; when it is not set or empty the host part
* stays empty (presumably the case for crontab / CLI runs - confirm).
*
*/
define(
    'WEBSITE_DOMAIN',
    (!empty($_SERVER['HTTPS']) && strtolower((string) $_SERVER['HTTPS']) !== 'off' ? 'https' : 'http')
    . '://'
    . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '')
);
/*
* Number of search results shown per page before the "read more" link appears
*
*/
const WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT = 100;
/*
* Identicon image cache
*
* true  - save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
* false - generate the icon on every request
*
*/
const WEBSITE_IDENTICON_IMAGE_CACHE = true;
// Database
/*
* Database connection: host, port, schema name and credentials
*
* Name, username and password are empty in this example and must be filled in
*
*/
const DB_HOST = 'localhost';
const DB_PORT = 3306;
const DB_NAME = '';
const DB_USERNAME = '';
const DB_PASSWORD = '';
// Sphinx
/*
* Sphinx connection: host and port
*
*/
const SPHINX_HOST = '127.0.0.1';
const SPHINX_PORT = 9306;
// Crawler settings
/*
* Disk quota (Mb) at which the crawler stops
*
* NOTE(review): the constant name suggests this is the free space left - confirm against the crawler
*
*/
const CRAWL_STOP_DISK_QUOTA_MB_LEFT = 500;
/*
* Pages (URI) processing limit in the crawler.php queue
*
* Related to the CRAWL_PAGE_SECONDS_OFFSET value
* and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to avoid overloading websites with GET crawling requests
*
*/
const CRAWL_PAGE_LIMIT = 10;
/*
* Time offset (seconds) used to renew the page index
*
* Works together with the CRAWL_PAGE_LIMIT step queue
*
* Pay attention: the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must be large enough to crawl all the pages collected in the DB index,
* otherwise the crawler can get stuck in the queue
*
*/
const CRAWL_PAGE_SECONDS_OFFSET = 3600;
/*
* Only the URL addresses matching this rule are auto-crawled
*
*/
const CRAWL_URL_REGEXP = '/^.*$/ui'; // ipv6 only: '/^http:\/\/\[[\w:]+\].*$/ui'
/*
* Default pages limit for every new host
*
* The crawler stops indexing once this limit is reached, to prevent disk overuse
*
* A custom rule for a specific host can be provided in the DB `host`.`crawlPageLimit` field
*
*/
const CRAWL_HOST_DEFAULT_PAGES_LIMIT = 1000;
/*
* Default auto-crawl status for every newly added host
*
* true  - the crawler auto-starts the pages indexer, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
* false - manual validation by the moderator is required in the DB `host`.`status` field
*
* This option also disables the host in the search results
*
*/
const CRAWL_HOST_DEFAULT_STATUS = true;
/*
* Default indexing depth for every new host
*
* true  - index the meta tags only, to prevent disk overuse
* false - save the meta tags + the overall plain text page content
*
* A custom rule for a specific host can be provided in the DB `host`.`crawlPageMetaOnly` field
*
* This option is able to change the search results relevance
*
*/
const CRAWL_HOST_DEFAULT_META_ONLY = false;
/*
* Default robots.txt rules, used when the remote file does not exist
* The crawler is able to overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robots.md
*
* Accepted value: string|null
*
*/
const CRAWL_ROBOTS_DEFAULT_RULES = null;
/*
* Permanent rules appended to the robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
* The crawler does not overwrite these rules
*
* Presets
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
*
* Accepted value: string|null
*
*/
const CRAWL_ROBOTS_POSTFIX_RULES = null;
// Cleaner settings
/*
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
*
* Works together with CLEAN_HOST_SECONDS_OFFSET
*
* The value depends on the CPU resources available
*
*/
const CLEAN_HOST_LIMIT = 20;
/*
* Cleaning rules are applied to the pages older than the value provided
*
* Works together with the CLEAN_HOST_LIMIT step queue
*
* Pay attention: the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
* must be large enough to process all the pages in the DB index,
* otherwise the cleaner can get stuck in the queue
*
*/
const CLEAN_HOST_SECONDS_OFFSET = 3600;
// API settings
/*
* JSON API features
*
* When false - all the action settings below are ignored
*
*/
const API_ENABLED = true;
/*
* Search API
*
* When false - API_SEARCH_PAGINATION_RESULTS_LIMIT is ignored
*
*/
const API_SEARCH_ENABLED = true;
/*
* Search API results per page
*
*/
const API_SEARCH_PAGINATION_RESULTS_LIMIT = 20;
/*
* Hosts distribution API
*
* When false - API_HOSTS_FIELDS is ignored
*
*/
const API_HOSTS_ENABLED = true;
/*
* Database host fields shared by the hosts API
*
* Accepted value: string - either * to share all the fields,
* or the field names separated by commas
*
* The nowdoc body below is the exact value: keep the line breaks as they are
*
*/
const API_HOSTS_FIELDS = <<<'SQL'
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`
SQL;
/*
* Manifest API
*
* Shares the application meta between remote YGGo nodes
*
* When true - this node becomes public for distributed index sharing
*
*/
const API_MANIFEST_ENABLED = true;