<?php

/*
 * YGGo! - Distributed & Open Source Web Search Engine
 *
 * MIT License
 *
 * Copyright (c) 2023 YGGverse
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Default configuration file example
 * Production name: app.php
 *
 * Project home page
 * https://github.com/YGGverse/YGGo
 *
 * Get support
 * https://github.com/YGGverse/YGGo/issues
 *
 */

// Debug
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website

/*
 * Project domain, without a trailing slash
 *
 */
define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));

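/*
 * Note (illustration, not part of the original config): the default above
 * derives the domain from client-controlled $_SERVER values, and
 * HTTP_HOST can be spoofed by the client, so a hardcoded production
 * value may be safer, e.g. with a hypothetical domain:
 *
 * define('WEBSITE_DOMAIN', 'https://example.com'); // no trailing slash
 *
 */
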
/*
 * Page search results to show before the read more link
 *
 */
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);

/*
 * Image search results to show before the read more link
 *
 */
define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);

/*
 * Number of related pages for each image in the search results
 *
 */
define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);

/*
 * Save identicons to the static webp cache (placed in storage/cache)
 * to prevent CPU overload,
 *
 * or false to generate them on every request
 *
 */
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);

// Database
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');

// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);

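/*
 * Illustration only (not part of the original config): both MySQL and
 * the Sphinx SphinxQL endpoint on port 9306 speak the MySQL protocol,
 * so a PDO client could consume these constants as follows; the
 * variable names are hypothetical:
 *
 * $db = new PDO('mysql:host=' . DB_HOST . ';port=' . DB_PORT . ';dbname=' . DB_NAME, DB_USERNAME, DB_PASSWORD);
 * $sphinx = new PDO('mysql:host=' . SPHINX_HOST . ';port=' . SPHINX_PORT);
 *
 */
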
// Proxy settings

/*
 * Search proxy User Agent name
 *
 * Sent to other hosts in cURL requests made by the search proxy
 *
 */
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');

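/*
 * Illustration only: how a cURL client could apply this constant
 * (the $curl and $url names are hypothetical):
 *
 * $curl = curl_init($url);
 * curl_setopt($curl, CURLOPT_USERAGENT, PROXY_CURLOPT_USERAGENT);
 *
 */
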
// Crawl settings

/*
 * Crawler / Bot User Agent name
 *
 * Sent to other hosts in cURL requests made by the crawler
 *
 */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');

/*
 * Stop the crawler when less than this amount of disk space (MB) is left
 *
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);

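/*
 * Illustration only: a minimal quota check the crawler might perform
 * before each step (the storage path here is an assumption):
 *
 * if (disk_free_space(__DIR__ . '/../storage') / 1000000 < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {
 *   exit; // disk quota reached, stop crawling
 * }
 *
 */
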
/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option relates to the CRAWL_PAGE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to avoid overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_PAGE_LIMIT', 20);

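/*
 * Capacity illustration (assuming a one-minute crontab, as suggested
 * above): 20 pages per step processes about 20 * 60 * 24 = 28,800 pages
 * per day, or roughly 10.4M pages within the default 360-day
 * CRAWL_PAGE_SECONDS_OFFSET window defined below.
 */
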
/*
 * Images (URI) processing limit in the crawler.php queue
 *
 * This option relates to the CRAWL_IMAGE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 images per minute,
 * to avoid overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_IMAGE_LIMIT', 10);

/*
 * Manifest (URI) processing limit in the crawler.php queue
 *
 * Used to collect the distributed data index
 * that matches CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
 *
 * This option relates to the CRAWL_MANIFEST_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 manifests per minute,
 * to avoid overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_MANIFEST_LIMIT', 10);

/*
 * Renew the page index after the time offset provided
 *
 * This option works with the CRAWL_PAGE_LIMIT step queue
 *
 * Note that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 *
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);

/*
 * Renew the image index after the time offset provided
 *
 * This option works with the CRAWL_IMAGE_LIMIT step queue
 *
 * Note that the CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all images collected in the DB index,
 *
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);

/*
 * Renew the manifests index after the time offset provided
 *
 * This option works with the CRAWL_MANIFEST_LIMIT step queue
 *
 * Note that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
 * must be large enough to crawl all manifests collected in the DB index,
 *
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);

/*
 * Only URL addresses that match this rule will be auto-crawled
 *
 */
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only: '/^http:\/\/\[[\w:]+\].*$/ui'

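/*
 * Illustration only: with the ipv6-only preset above, a match check
 * could look like this (the address is a made-up example):
 *
 * preg_match('/^http:\/\/\[[\w:]+\].*$/ui', 'http://[324:71e:281a:9ed3::50]/'); // returns 1
 *
 */
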
/*
 * Default pages limit per new host
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageLimit` field
 *
 */
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);

/*
 * Default auto-crawl status for newly added hosts
 *
 * true - the crawler starts the pages indexer automatically, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
 * false - requires manual validation by the moderator in the DB `host`.`status` field
 *
 * This option also controls host visibility in the search results
 *
 */
define('CRAWL_HOST_DEFAULT_STATUS', true);

/*
 * Index meta tags only, to prevent disk overuse,
 * or false to save meta tags plus the overall plain-text page content
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageMetaOnly` field
 *
 * This option can change search results relevance
 * This option enables image data caching in base64
 *
 */
define('CRAWL_HOST_DEFAULT_META_ONLY', false);

/*
 * Default images limit per new host
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlImageLimit` field
 *
 */
define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);

/*
 * Default robots.txt rules, applied when the remote file does not exist
 *
 * The crawler is able to overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robots.md
 *
 */
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null

/*
 * Permanent rules appended to robots.txt if it exists, else to CRAWL_ROBOTS_DEFAULT_RULES
 *
 * The crawler does not overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
 *
 */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null

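/*
 * Illustration only: these constants presumably accept plain robots.txt
 * syntax as a string, e.g. (the path below is hypothetical):
 *
 * define('CRAWL_ROBOTS_POSTFIX_RULES', "User-agent: *\nDisallow: /private/");
 *
 */
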
/*
 * Look for third-party manifests to collect the distributed index
 *
 * The API address provided in the yggo meta tag
 * will be stored in the `manifest` DB table
 *
 * Only URLs that match the CRAWL_URL_REGEXP condition are collected
 *
 */
define('CRAWL_MANIFEST', true);

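/*
 * Illustration only: based on the comment above, the yggo meta tag on a
 * remote node might look like this (the exact attribute layout and the
 * address are assumptions):
 *
 * <meta name="yggo" content="http://example.com/api.php" />
 *
 */
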
/*
 * Manifest API version compatibility
 *
 */
define('CRAWL_MANIFEST_API_VERSION', 0.4);

/*
 * Default auto-crawl status for newly added manifests
 *
 * true - the crawler starts the manifest indexer automatically
 * false - requires manual validation by the moderator in the DB `manifest`.`status` field
 *
 * This option applies when CRAWL_MANIFEST is enabled
 *
 */
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);

// Cleaner settings

/*
 * Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
 *
 * This option works with CLEAN_HOST_SECONDS_OFFSET
 *
 * The value depends on the CPU resources available
 *
 */
define('CLEAN_HOST_LIMIT', 20);

/*
 * Apply cleaning rules to pages older than the value provided
 *
 * This option works with the CLEAN_HOST_LIMIT step queue
 *
 * Note that the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
 * must be large enough to process all pages in the DB index,
 *
 * or the cleaner can get stuck in the queue
 *
 */
define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);

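/*
 * Capacity illustration (assuming a one-minute crontab): 20 hosts per
 * step covers 20 * 60 * 24 * 30 = 864,000 hosts within the 30-day
 * CLEAN_HOST_SECONDS_OFFSET window above.
 */
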
// API settings

/*
 * JSON API features
 *
 * When false, all the API action settings below are ignored
 *
 */
define('API_ENABLED', true);

/*
 * Search API
 *
 * When false, API_SEARCH_PAGINATION_RESULTS_LIMIT is ignored
 *
 */
define('API_SEARCH_ENABLED', true);

/*
 * Search results per page
 *
 */
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);

/*
 * Hosts distribution API
 *
 * When false, API_HOSTS_FIELDS is ignored
 *
 */
define('API_HOSTS_ENABLED', true);

/*
 * Database host fields, comma-separated, or * to share all fields
 *
 */
define('API_HOSTS_FIELDS',
  '`host`.`scheme`,
   `host`.`name`,
   `host`.`port`,
   `host`.`crawlPageLimit`,
   `host`.`crawlImageLimit`,
   `host`.`robots`,
   `host`.`robotsPostfix`,
   `host`.`timeAdded`,
   `host`.`timeUpdated`,
   (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
   (SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated

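/*
 * Illustration only: the hosts API presumably interpolates these fields
 * into a query of roughly this shape (the exact query is an assumption):
 *
 * SELECT <API_HOSTS_FIELDS> FROM `host`
 *
 */
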
/*
 * Manifest API
 *
 * Application meta sharing between YGGo remote nodes
 *
 * When true, this node becomes public for distributed index sharing
 *
 */
define('API_MANIFEST_ENABLED', true);