2023-04-01 19:29:39 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
// Debug
|
|
|
|
// Print errors to the script output.
// NOTE(review): this is a debug setting; consider '0' on a public host - confirm the deployment intent.
ini_set('display_errors', '1');
|
|
|
|
// Also print errors raised during PHP's startup sequence (debug setting).
ini_set('display_startup_errors', '1');
|
|
|
|
// Report every error level (E_ALL).
error_reporting(E_ALL);
|
|
|
|
|
|
|
|
// Website
|
2023-04-23 01:54:10 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Project domain, without a trailing slash
|
|
|
|
*
|
|
|
|
*/
|
2023-04-24 02:32:03 +03:00
|
|
|
// Scheme and host of the current request, e.g. "https://example.com" (no trailing slash).
//
// $_SERVER['HTTPS'] is documented as "a non-empty value" for HTTPS requests (IIS reports
// "off" for plain HTTP), so every non-empty value except "off" selects https, instead of
// only the exact string "on".
//
// NOTE(review): HTTP_HOST comes from the request's Host header, which the client controls;
// consider a fixed host name if this value is used to build trusted links.
//
// define() is used here because the value is computed at run time. When neither key is
// present (for example under the CLI) the result is "http://".
define(
    'WEBSITE_DOMAIN',
    (!empty($_SERVER['HTTPS']) && strtolower((string) $_SERVER['HTTPS']) !== 'off' ? 'https' : 'http')
    . '://'
    . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '')
);
|
2023-04-23 01:54:10 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Search results per page before showing the read more link.
|
|
|
|
*
|
|
|
|
*/
|
2023-04-02 23:36:35 +03:00
|
|
|
// int: search results per page
const WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT = 100;
|
2023-04-23 01:54:10 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
|
|
|
|
*
|
|
|
|
* or false - to generate every time on request
|
|
|
|
*
|
|
|
|
*/
|
2023-04-03 01:30:09 +03:00
|
|
|
// bool: true keeps identicons in the static webp cache, false generates them on every request
const WEBSITE_IDENTICON_IMAGE_CACHE = true;
|
2023-04-01 19:29:39 +03:00
|
|
|
|
|
|
|
// Database
|
2023-04-07 04:04:24 +03:00
|
|
|
// string: database server host
const DB_HOST = 'localhost';
|
|
|
|
// int: database server port (3306 is the MySQL/MariaDB default)
const DB_PORT = 3306;
|
|
|
|
// string: database name (blank in this template)
const DB_NAME = '';
|
2023-04-01 19:29:39 +03:00
|
|
|
// string: database user name (blank in this template)
const DB_USERNAME = '';
|
|
|
|
// string: database password (blank in this template)
const DB_PASSWORD = '';
|
|
|
|
|
2023-04-07 04:04:24 +03:00
|
|
|
// Sphinx
|
|
|
|
// string: Sphinx server host
const SPHINX_HOST = '127.0.0.1';
|
|
|
|
// int: Sphinx server port
// NOTE(review): 9306 is the port Sphinx conventionally uses for SphinxQL - confirm against sphinx.conf
const SPHINX_PORT = 9306;
|
2023-04-01 19:29:39 +03:00
|
|
|
|
2023-04-09 00:06:28 +03:00
|
|
|
// Crawler settings
|
2023-04-23 01:32:34 +03:00
|
|
|
|
2023-04-23 04:05:00 +03:00
|
|
|
/*
|
|
|
|
* Stop crawler on disk quota reached (Mb)
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
// int, megabytes
const CRAWL_STOP_DISK_QUOTA_MB_LEFT = 500;
|
|
|
|
|
2023-04-23 01:32:34 +03:00
|
|
|
/*
|
|
|
|
* Pages (URI) processing limit in the crawler.php queue
|
|
|
|
*
|
|
|
|
* This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
|
|
|
|
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
|
|
|
|
*
|
|
|
|
* Usually up to 20 pages per minute,
|
|
|
|
* to prevent websites overload by sending GET crawling requests
|
|
|
|
*
|
|
|
|
*/
|
2023-04-01 19:29:39 +03:00
|
|
|
// int: pages (URI) processed per crawler queue step
const CRAWL_PAGE_LIMIT = 10;
|
2023-04-23 01:32:34 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Renew page index by timing offset provided
|
|
|
|
*
|
|
|
|
* This option works with CRAWL_PAGE_LIMIT step queue
|
|
|
|
*
|
|
|
|
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
|
|
|
|
* must be large enough to crawl all pages collected in the DB index
|
|
|
|
*
|
|
|
|
* or the crawler can get stuck in the queue
|
|
|
|
*
|
|
|
|
*/
|
2023-04-01 19:29:39 +03:00
|
|
|
// int, seconds (3600 = one hour)
const CRAWL_PAGE_SECONDS_OFFSET = 3600;
|
|
|
|
|
2023-04-23 01:32:34 +03:00
|
|
|
/*
|
|
|
|
* Only URL addresses matching this rule will be auto-crawled
|
|
|
|
*
|
|
|
|
*/
|
2023-04-04 01:39:48 +03:00
|
|
|
// string: PCRE pattern; the default accepts every single-line string
const CRAWL_URL_REGEXP = '/^.*$/ui'; // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
|
2023-04-03 03:07:54 +03:00
|
|
|
|
2023-04-23 01:14:31 +03:00
|
|
|
/*
|
|
|
|
* Pages limit per new host by default
|
|
|
|
*
|
|
|
|
* The crawler stops indexing when this limit is reached, to prevent disk overuse
|
|
|
|
*
|
|
|
|
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
|
|
|
|
*
|
|
|
|
*/
|
2023-04-07 04:04:24 +03:00
|
|
|
// int: default pages limit for a new host
const CRAWL_HOST_DEFAULT_PAGES_LIMIT = 1000;
|
2023-04-23 01:14:31 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set default auto-crawl status for new host added
|
|
|
|
*
|
|
|
|
* true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
|
|
|
|
* false - requires manual validation by the moderator in the DB `host`.`status` field
|
|
|
|
*
|
|
|
|
* This option also disables the host in the search results
|
|
|
|
*
|
|
|
|
*/
|
2023-04-07 04:04:24 +03:00
|
|
|
// bool: default auto-crawl status for a newly added host
const CRAWL_HOST_DEFAULT_STATUS = true;
|
2023-04-23 01:14:31 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Index only meta tags to prevent disk overuse
|
|
|
|
* or false to save meta tags + overall plain text page content
|
|
|
|
*
|
|
|
|
* Custom rule for specified host could be provided in the DB `host`.`crawlPageMetaOnly` field
|
|
|
|
*
|
|
|
|
* This option can change the relevance of search results
|
|
|
|
*
|
|
|
|
*/
|
2023-04-07 04:04:24 +03:00
|
|
|
// bool: true indexes meta tags only, false also saves the plain text page content
const CRAWL_HOST_DEFAULT_META_ONLY = false;
|
|
|
|
|
2023-04-08 22:28:31 +03:00
|
|
|
/*
|
|
|
|
* Default robots.txt rules used when the remote file does not exist
|
|
|
|
* The crawler is able to overwrite these rules
|
|
|
|
*
|
|
|
|
* Presets
|
|
|
|
* yggdrasil: /database/yggdrasil/host.robots.md
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
const CRAWL_ROBOTS_DEFAULT_RULES = null; // string|null
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Permanent rules appended to the robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
|
|
|
|
* The crawler does not overwrite these rules
|
|
|
|
*
|
|
|
|
* Presets
|
|
|
|
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
|
|
|
|
*
|
|
|
|
*/
|
2023-04-09 00:06:28 +03:00
|
|
|
const CRAWL_ROBOTS_POSTFIX_RULES = null; // string|null
|
|
|
|
|
|
|
|
// Cleaner settings
|
2023-04-23 01:46:34 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
|
|
|
|
*
|
|
|
|
* This option works with CLEAN_HOST_SECONDS_OFFSET
|
|
|
|
*
|
|
|
|
* The value depends on the CPU resources available
|
|
|
|
*
|
|
|
|
*/
|
2023-04-09 00:06:28 +03:00
|
|
|
// int: hosts processed per crontab execution step
const CLEAN_HOST_LIMIT = 20;
|
2023-04-23 01:46:34 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Apply cleaning rules to page older than value provided
|
|
|
|
*
|
|
|
|
* This option works with CLEAN_HOST_LIMIT step queue
|
|
|
|
*
|
|
|
|
* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
|
|
|
|
* must be large enough to process all pages in the DB index
|
|
|
|
*
|
|
|
|
* or the cleaner can get stuck in the queue
|
|
|
|
*
|
|
|
|
*/
|
2023-04-23 03:01:51 +03:00
|
|
|
// int, seconds (3600 = one hour)
const CLEAN_HOST_SECONDS_OFFSET = 3600;
|
|
|
|
|
|
|
|
// API settings
|
2023-04-23 03:16:54 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* JSON API features
|
|
|
|
*
|
|
|
|
* When false - all the action settings below will be ignored
|
|
|
|
*
|
|
|
|
*/
|
2023-04-23 03:01:51 +03:00
|
|
|
// bool: master switch for the JSON API
const API_ENABLED = true;
|
|
|
|
|
2023-04-23 03:16:54 +03:00
|
|
|
/*
|
|
|
|
* Search API
|
|
|
|
*
|
|
|
|
* When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
|
|
|
|
*
|
|
|
|
*/
|
2023-04-23 03:01:51 +03:00
|
|
|
// bool: switch for the search API
const API_SEARCH_ENABLED = true;
|
2023-04-23 03:16:54 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Search results per page
|
|
|
|
*
|
|
|
|
*/
|
2023-04-23 03:01:51 +03:00
|
|
|
// int: search API results per page
const API_SEARCH_PAGINATION_RESULTS_LIMIT = 20;
|
|
|
|
|
2023-04-23 03:16:54 +03:00
|
|
|
/*
|
|
|
|
* Hosts distribution API
|
|
|
|
*
|
|
|
|
* When false - API_HOSTS_FIELDS will be ignored
|
|
|
|
*
|
|
|
|
*/
|
2023-04-23 03:01:51 +03:00
|
|
|
// bool: switch for the hosts distribution API
const API_HOSTS_ENABLED = true;
|
2023-04-23 03:16:54 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Database host fields comma separated or * to share all the fields
|
|
|
|
*
|
|
|
|
*/
|
2023-04-25 16:01:55 +03:00
|
|
|
// SQL select-list fragment naming the `host` columns to share: scheme, name, port,
// crawlPageLimit, robots, robotsPostfix, timeAdded and timeUpdated, plus a correlated
// subquery that counts the `hostPage` rows of each host, aliased as `hostPagesTotal`.
// NOTE(review): presumably spliced verbatim into a SELECT on `host` - confirm at the call
// site, and keep this value a trusted constant (never build it from request input).
define('API_HOSTS_FIELDS',
|
|
|
|
'`host`.`scheme`,
|
|
|
|
`host`.`name`,
|
|
|
|
`host`.`port`,
|
|
|
|
`host`.`crawlPageLimit`,
|
|
|
|
`host`.`robots`,
|
|
|
|
`host`.`robotsPostfix`,
|
|
|
|
`host`.`timeAdded`,
|
|
|
|
`host`.`timeUpdated`,
|
|
|
|
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); // string: *|field names comma separated
|