YGGo/config/app.php.txt

<?php

/*
 * YGGo! - Distributed & Open Source Web Search Engine
 *
 * MIT License

 * Copyright (c) 2023 YGGverse

 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:

 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.

 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Default configuration file example
 * Production name: app.php
 *
 * Project home page
 * https://github.com/YGGverse/YGGo
 *
 * Get support
 * https://github.com/YGGverse/YGGo/issues
 *
 */

// Debug
// Print runtime and startup errors to the output and report every error level.
// NOTE(review): presumably a development preset - confirm whether display_errors
// should stay enabled on publicly reachable nodes.
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website

/*
 * Project domain (scheme + host), without a trailing slash
 *
 * The scheme is detected from $_SERVER['HTTPS']: any non-empty value except "off"
 * (case-insensitive) means HTTPS, because servers report it as "on", "On", "1" etc.
 * and IIS sets "off" for plain HTTP requests
 *
 * The host part is taken from $_SERVER['HTTP_HOST'] and is empty when that value
 * is not available (e.g. when the script is not started by a web request)
 *
 * NOTE: HTTP_HOST is supplied by the client request, replace this expression
 * with a fixed string if the domain must not depend on the request
 *
 */
define(
    'WEBSITE_DOMAIN',
    (!empty($_SERVER['HTTPS']) && strtolower((string) $_SERVER['HTTPS']) !== 'off' ? 'https' : 'http')
    . '://'
    . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '')
);

/*
 * Number of page search results shown before the "read more" link
 *
 */
const WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT = 100;

/*
 * Number of image search results shown before the "read more" link
 *
 */
const WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT = 10;

/*
 * How many related pages are listed for each image in the search results
 *
 */
const WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT = 5;

/*
 * true  - save identicons to the static webp cache (placed in storage/cache) to prevent CPU overload
 * false - generate the identicon on every request
 *
 */
const WEBSITE_IDENTICON_IMAGE_CACHE = true;

// Database (name, username and password are empty in this example file)
const DB_HOST     = 'localhost';
const DB_PORT     = 3306;
const DB_NAME     = '';
const DB_USERNAME = '';
const DB_PASSWORD = '';

// Sphinx
const SPHINX_HOST = '127.0.0.1';
const SPHINX_PORT = 9306;

// Proxy settings

/*
 * User Agent name of the search proxy
 *
 * Sent to other hosts with the CURL requests made by the search proxy
 *
 */
const PROXY_CURLOPT_USERAGENT = 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )';

// Crawl settings

/*
 * User Agent name of the crawler / bot
 *
 * Sent to other hosts with the CURL requests made by the crawler
 *
 */
const CRAWL_CURLOPT_USERAGENT = 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )';

/*
 * Stop the crawler when the disk quota is reached (Mb left)
 *
 */
const CRAWL_STOP_DISK_QUOTA_MB_LEFT = 500;

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
 * and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
const CRAWL_PAGE_LIMIT = 20;

/*
 * Images (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_IMAGE_SECONDS_OFFSET value
 * and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 images per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
const CRAWL_IMAGE_LIMIT = 10;

/*
 * Manifests (URI) processing limit in the crawler.php queue
 *
 * Used to collect the distributed data index
 * that matches CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
 *
 * This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
 * and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 manifests per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
const CRAWL_MANIFEST_LIMIT = 10;

/*
 * Renew the page index after the time offset provided (seconds)
 *
 * This option works with the CRAWL_PAGE_LIMIT step queue
 *
 * Pay attention that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 *
 * otherwise the crawler can get stuck in the queue
 *
 */
const CRAWL_PAGE_SECONDS_OFFSET = 60 * 60 * 24 * 30 * 12; // 12 x 30 days

/*
 * MIME types of the pages to index
 *
 * comma separated
 *
 */
const CRAWL_PAGE_MIME_TYPE = 'text/html';

/*
 * MIME types of the images to index
 *
 * comma separated
 *
 * NOTE(review): image/ico is not a registered media type, icons are commonly
 * served as image/x-icon or image/vnd.microsoft.icon - confirm the intended value
 *
 */
const CRAWL_IMAGE_MIME_TYPE = 'image/webp,image/png,image/gif,image/jpeg,image/ico';

/*
 * Renew the image index after the time offset provided (seconds)
 *
 * This option works with the CRAWL_IMAGE_LIMIT step queue
 *
 * Pay attention that the CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all images collected in the DB index,
 *
 * otherwise the crawler can get stuck in the queue
 *
 */
const CRAWL_IMAGE_SECONDS_OFFSET = 60 * 60 * 24 * 30 * 12; // 12 x 30 days

/*
 * Renew the manifests index after the time offset provided (seconds)
 *
 * This option works with the CRAWL_MANIFEST_LIMIT step queue
 *
 * Pay attention that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
 * must be large enough to crawl all manifests collected in the DB index,
 *
 * otherwise the crawler can get stuck in the queue
 *
 */
const CRAWL_MANIFEST_SECONDS_OFFSET = 60 * 60 * 24 * 30; // 30 days

/*
 * Only URL addresses that match this rule will be auto-crawled
 *
 */
const CRAWL_URL_REGEXP = '/^.*$/ui'; // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'

/*
 * Default pages limit for every new host
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageLimit` field
 *
 */
const CRAWL_HOST_DEFAULT_PAGES_LIMIT = 1000;

/*
 * Default auto-crawl status for every new host added
 *
 * true  - the crawler auto-starts the pages indexer, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
 * false - requires manual validation by the moderator in the DB `host`.`status` field
 *
 * This option also disables the host in the search results
 *
 */
const CRAWL_HOST_DEFAULT_STATUS = true;

/*
 * true  - index only meta tags, to prevent disk overuse
 * false - save meta tags + overall plain text page content
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageMetaOnly` field
 *
 * This option can change the search results relevance
 * This option enables image data caching in base64
 *
 */
const CRAWL_HOST_DEFAULT_META_ONLY = false;

/*
 * Default images limit for every new host
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlImageLimit` field
 *
 */
const CRAWL_HOST_DEFAULT_IMAGES_LIMIT = 1000;

/*
 * Default robots.txt rules, used when the remote file does not exist
 * The crawler is able to overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robots.md
 *
 */
const CRAWL_ROBOTS_DEFAULT_RULES = null; // string|null

/*
 * Permanent rules appended to the robots.txt if it exists, else to CRAWL_ROBOTS_DEFAULT_RULES
 * The crawler does not overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
 *
 */
const CRAWL_ROBOTS_POSTFIX_RULES = null; // string|null

/*
 * Look for third-party manifests to collect the distributed index
 *
 * The API address provided in the yggo meta tag
 * will be stored in the `manifest` DB table
 *
 * Only URLs that match the CRAWL_URL_REGEXP condition are collected
 *
 */
const CRAWL_MANIFEST = true;

/*
 * Manifest API version this node is compatible with
 *
 */
const CRAWL_MANIFEST_API_VERSION = 0.4;

/*
 * Default auto-crawl status for every new manifest added
 *
 * true  - the crawler auto-starts the manifest indexer
 * false - requires manual validation by the moderator in the DB `manifest`.`status` field
 *
 * This option applies only when CRAWL_MANIFEST is enabled
 *
 */
const CRAWL_MANIFEST_DEFAULT_STATUS = true;

// Cleaner settings

/*
 * Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
 *
 * This option works with CLEAN_HOST_SECONDS_OFFSET
 *
 * The value depends on the CPU resources available
 *
 */
const CLEAN_HOST_LIMIT = 20;

/*
 * Apply cleaning rules to pages older than the value provided (seconds)
 *
 * This option works with the CLEAN_HOST_LIMIT step queue
 *
 * Pay attention that the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
 * must be large enough to process all pages in the DB index,
 *
 * otherwise the cleaner can get stuck in the queue
 *
 */
const CLEAN_HOST_SECONDS_OFFSET = 60 * 60 * 24 * 30; // 30 days

/*
 * Remove the page ban after the following time (seconds)
 *
 * This option is used by the crawler and the search page
 * to prevent extra http requests to unavailable or non-matching resources
 *
 */
const CLEAN_PAGE_BAN_SECONDS_OFFSET = 60 * 60 * 24 * 30; // 30 days

/*
 * Remove the image ban after the following time (seconds)
 *
 * This option is used by the crawler and the search page
 * to prevent extra http requests to unavailable or non-matching resources
 *
 */
const CLEAN_IMAGE_BAN_SECONDS_OFFSET = 60 * 60 * 24 * 30; // 30 days

// API settings

/*
 * JSON API features
 *
 * When false - all the action settings below will be ignored
 *
 */
const API_ENABLED = true;

/*
 * Search API
 *
 * When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
 *
 */
const API_SEARCH_ENABLED = true;

/*
 * Search API results per page
 *
 */
const API_SEARCH_PAGINATION_RESULTS_LIMIT = 20;

/*
 * Hosts distribution API
 *
 * When false - API_HOSTS_FIELDS will be ignored
 *
 */
const API_HOSTS_ENABLED = true;

/*
 * Database host fields shared by the hosts API:
 * a comma separated list of field names, or * to share all the fields
 *
 * The preset below also contains two sub-queries that count
 * the `hostPage` and `hostImage` rows of each host
 *
 */
const API_HOSTS_FIELDS =
       '`host`.`scheme`,
        `host`.`name`,
        `host`.`port`,
        `host`.`crawlPageLimit`,
        `host`.`crawlImageLimit`,
        `host`.`robots`,
        `host`.`robotsPostfix`,
        `host`.`timeAdded`,
        `host`.`timeUpdated`,
        (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
        (SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'; // string: *|field names comma separated

/*
 * Manifest API
 *
 * Shares the application meta between YGGo remote nodes
 *
 * When true - makes this node public for distributed index sharing
 *
 */
const API_MANIFEST_ENABLED = true;
initial commit 2 years ago			`<?php`

add project description and support links 2 years ago			`/*`
			`* YGGo! - Distributed & Open Source Web Search Engine`
			`*`
			`* MIT License`

			`* Copyright (c) 2023 YGGverse`

			`* Permission is hereby granted, free of charge, to any person obtaining a copy`
			`* of this software and associated documentation files (the "Software"), to deal`
			`* in the Software without restriction, including without limitation the rights`
			`* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`* copies of the Software, and to permit persons to whom the Software is`
			`* furnished to do so, subject to the following conditions:`

			`* The above copyright notice and this permission notice shall be included in all`
			`* copies or substantial portions of the Software.`

			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*`
			`* Default configuration file example`
			`* Production name: app.php`
			`*`
			`* Project home page`
			`* https://github.com/YGGverse/YGGo`
			`*`
			`* Get support`
			`* https://github.com/YGGverse/YGGo/issues`
			`*`
			`*/`

initial commit 2 years ago			`// Debug`
			`ini_set('display_errors', '1');`
			`ini_set('display_startup_errors', '1');`
			`error_reporting(E_ALL);`

			`// Website`
add options documentation 2 years ago
			`/*`
			`* Project domain, without slash on postfix`
			`*`
			`*/`
make protocol settings adaptive 2 years ago			`define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));`
add options documentation 2 years ago
			`/*`
display related pages in priority to the unique host by rank, rand() order 2 years ago			`* Page search results before show the read more link`
add options documentation 2 years ago			`*`
			`*/`
create separated pagination settings for page/image search types 2 years ago			`define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);`

			`/*`
display related pages in priority to the unique host by rank, rand() order 2 years ago			`* Image search results before show the read more link`
create separated pagination settings for page/image search types 2 years ago			`*`
			`*/`
			`define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);`
add options documentation 2 years ago
display related pages in priority to the unique host by rank, rand() order 2 years ago			`/*`
			`* Quantity of related pages for each image in the search results`
			`*`
			`*/`
			`define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);`

add options documentation 2 years ago			`/*`
			`* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload`
			`*`
			`* or false - to generate every time on request`
			`*`
			`*/`
implement hostname identicons 2 years ago			`define('WEBSITE_IDENTICON_IMAGE_CACHE', true);`
initial commit 2 years ago
			`// Database`
implement MySQL/Sphinx data model #3, add basical robots.txt support #2 2 years ago			`define('DB_HOST', 'localhost');`
			`define('DB_PORT', 3306);`
			`define('DB_NAME', '');`
initial commit 2 years ago			`define('DB_USERNAME', '');`
			`define('DB_PASSWORD', '');`

implement MySQL/Sphinx data model #3, add basical robots.txt support #2 2 years ago			`// Sphinx`
			`define('SPHINX_HOST', '127.0.0.1');`
			`define('SPHINX_PORT', 9306);`
initial commit 2 years ago
add crawler / proxy user agent settings 2 years ago			`// Proxy settings`

			`/*`
			`* Search proxy User Agent name`
			`*`
			`* Shared to other hosts through CURL requests by search proxy`
			`*`
			`*/`
			`define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');`

			`// Crawl settings`

			`/*`
			`* Crawler / Bot User Agent name`
			`*`
			`* Shared to other hosts through CURL requests by crawler`
			`*`
			`*/`
			`define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');`
add options documentation 2 years ago
add disk quota validation 2 years ago			`/*`
			`* Stop crawler on disk quota reached (Mb)`
			`*`
			`*/`
			`define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);`

add options documentation 2 years ago			`/*`
			`* Pages (URI) processing limit in the crawler.php queue`
			`*`
			`* This option related to CRAWL_PAGE_SECONDS_OFFSET value`
			`* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)`
			`*`
			`* Usually up to 20 pages per minute,`
			`* to prevent websites overload by sending GET crawling requests`
			`*`
add distributed hosts crawling using yggo nodes manifest 2 years ago			`* Set 0 to disable`
			`*`
add options documentation 2 years ago			`*/`
add distributed hosts crawling using yggo nodes manifest 2 years ago			`define('CRAWL_PAGE_LIMIT', 20);`
add options documentation 2 years ago
add image queue crawler 2 years ago			`/*`
			`* Images (URI) processing limit in the crawler.php queue`
			`*`
			`* This option related to CRAWL_IMAGE_SECONDS_OFFSET value`
			`* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)`
			`*`
			`* Usually up to 20 pages per minute,`
			`* to prevent websites overload by sending GET crawling requests`
			`*`
add distributed hosts crawling using yggo nodes manifest 2 years ago			`* Set 0 to disable`
			`*`
			`*/`
			`define('CRAWL_IMAGE_LIMIT', 10);`

			`/*`
			`* Manifest (URI) processing limit in the crawler.php queue`
			`*`
			`* Used to collect distributed data index`
			`* that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION`
			`*`
			`* This option related to CRAWL_MANIFEST_SECONDS_OFFSET value`
			`* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)`
			`*`
			`* Usually up to 20 pages per minute,`
			`* to prevent websites overload by sending GET crawling requests`
			`*`
			`* Set 0 to disable`
			`*`
add image queue crawler 2 years ago			`*/`
add distributed hosts crawling using yggo nodes manifest 2 years ago			`define('CRAWL_MANIFEST_LIMIT', 10);`
add image queue crawler 2 years ago
add options documentation 2 years ago			`/*`
			`* Renew page index by timing offset provided`
			`*`
			`* This option works with CRAWL_PAGE_LIMIT step queue`
			`*`
			`* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair`
add image queue crawler 2 years ago			`* must have enough value to crawl all pages collected in the DB index`
add options documentation 2 years ago			`*`
			`* or the crawler can stuck in queue`
			`*`
			`*/`
update default settings preset 2 years ago			`define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);`
initial commit 2 years ago
implement MIME content-type crawler filter 2 years ago			`/*`
			`* Index pages match MIME types`
			`*`
			`* comma separated`
			`*`
			`*/`
			`define('CRAWL_PAGE_MIME_TYPE', 'text/html');`

			`/*`
			`* Index images match MIME types`
			`*`
			`* comma separated`
			`*`
			`*/`
implement not reachable resources ban feature with timeout to prevent extra http requests 2 years ago			`define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpeg,image/ico');`
implement MIME content-type crawler filter 2 years ago
add image queue crawler 2 years ago			`/*`
			`* Renew image index by timing offset provided`
			`*`
			`* This option works with CRAWL_IMAGE_LIMIT step queue`
			`*`
			`* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair`
			`* must have enough value to crawl all images collected in the DB index`
			`*`
			`* or the crawler can stuck in queue`
			`*`
			`*/`
			`define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);`

add distributed hosts crawling using yggo nodes manifest 2 years ago			`/*`
			`* Renew manifests index by timing offset provided`
			`*`
			`* This option works with CRAWL_MANIFEST_LIMIT step queue`
			`*`
			`* Pay attention, that CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair`
			`* must have enough value to crawl all manifests collected in the DB index`
			`*`
			`* or the crawler can stuck in queue`
			`*`
			`*/`
			`define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);`

add options documentation 2 years ago			`/*`
			`* Only URL addresses match this rule will be auto-crawled`
			`*`
			`*/`
add ipv6 example 2 years ago			`define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'`
add CRAWL_META_ONLY option 2 years ago
add options documentation 2 years ago			`/*`
			`* Pages limit per new host by default`
			`*`
			`* Crawler stops indexing on this limit reach to prevent disk overuse`
			`*`
			* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
			`*`
			`*/`
implement MySQL/Sphinx data model #3, add basical robots.txt support #2 2 years ago			`define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);`
add options documentation 2 years ago
			`/*`
			`* Set default auto-crawl status for new host added`
			`*`
			`* true - crawler autostart pages indexer limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT`
			* false - requires manual validation by the moderator in the DB `host`.`status` field
			`*`
			`* This option also disable host in the search results`
			`*`
			`*/`
implement MySQL/Sphinx data model #3, add basical robots.txt support #2 2 years ago			`define('CRAWL_HOST_DEFAULT_STATUS', true);`
add options documentation 2 years ago
			`/*`
			`* Index only meta tags to prevent disk overuse`
			`* or false to save meta tags + overall plain text page content`
			`*`
			* Custom rule for specified host could be provided in the DB `host`.`crawlPageMetaOnly` field
			`*`
			`* This option able to change search results relevance`
update host images info on search requests 2 years ago			`* This option enables image data caching in base64`
add options documentation 2 years ago			`*`
			`*/`
implement MySQL/Sphinx data model #3, add basical robots.txt support #2 2 years ago			`define('CRAWL_HOST_DEFAULT_META_ONLY', false);`

implement image crawler 2 years ago			`/*`
			`* Images limit per new host by default`
			`*`
			`* Crawler stops indexing on this limit reach to prevent disk overuse`
			`*`
			* Custom rule for specified host could be provided in the DB `host`.`crawlImageLimit` field
			`*`
			`*/`
			`define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);`

implement CRAWL_ROBOTS_POSTFIX_RULES configuration #5 2 years ago			`/*`
			`* Default robots.txt rules on remote file not exists`
			`* The crawler able to overwrite these rules`
			`*`
			`* Presets`
			`* yggdrasil: /database/yggdrasil/host.robots.md`
			`*`
			`*/`
			`define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null`

			`/*`
			`* Permanent rules that append to the robots.txt if exists else CRAWL_ROBOTS_DEFAULT_RULES`
			`* The crawler does not overwrite these rules`
			`*`
			`* Presets`
			`* yggdrasil: /database/yggdrasil/host.robotsPostfix.md`
			`*`
			`*/`
implement database cleaner 2 years ago			`define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null`

create manifests registry 2 years ago			`/*`
			`* Look for third-party manifests to collect distributed index`
			`*`
			`* API address provided in yggo meta tag`
			* will be stored in the `manifest` DB table
			`*`
update config documentation 2 years ago			`* Collecting URL that match CRAWL_URL_REGEXP condition`
			`*`
create manifests registry 2 years ago			`*/`
			`define('CRAWL_MANIFEST', true);`

add distributed hosts crawling using yggo nodes manifest 2 years ago			`/*`
			`* Manifest API version compatibility`
			`*`
			`*/`
			`define('CRAWL_MANIFEST_API_VERSION', 0.4);`

create manifests registry 2 years ago			`/*`
			`* Set default auto-crawl status for new manifest added`
			`*`
			`* true - crawler autostart manifest indexer`
			* false - requires manual validation by the moderator in the DB `manifest`.`status` field
			`*`
update config documentation 2 years ago			`* This option applying on CRAWL_MANIFEST enabled`
			`*`
create manifests registry 2 years ago			`*/`
			`define('CRAWL_MANIFEST_DEFAULT_STATUS', true);`

implement database cleaner 2 years ago			`// Cleaner settings`
add options documentation 2 years ago
			`/*`
			`* Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)`
			`*`
			`* This option works with CLEAN_HOST_SECONDS_OFFSET`
			`*`
			`* The value depends of CPU resources available`
			`*`
			`*/`
implement database cleaner 2 years ago			`define('CLEAN_HOST_LIMIT', 20);`
add options documentation 2 years ago
			`/*`
			`* Apply cleaning rules to page older than value provided`
			`*`
			`* This option works with CLEAN_HOST_LIMIT step queue`
			`*`
			`* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair`
add image queue crawler 2 years ago			`* must have enough value to process all pages in the DB index`
add options documentation 2 years ago			`*`
			`* or the cleaner can stuck in queue`
			`*`
			`*/`
update default settings preset 2 years ago			`define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);`
implement basic api 2 years ago
implement not reachable resources ban feature with timeout to prevent extra http requests 2 years ago			`/*`
			`* Remove page ban after following time`
			`*`
			`* This option used in crawler and search page`
			`* to prevent extra http requests to unavailable or not condition resources`
			`*`
			`*/`
			`define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);`

			`/*`
			`* Remove image ban after following time`
			`*`
			`* This option used in crawler and search page`
			`* to prevent extra http requests to unavailable or not condition resources`
			`*`
			`*/`
			`define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);`

implement basic api 2 years ago			`// API settings`
add options documentation 2 years ago
			`/*`
			`* JSON API features`
			`*`
			`* When false - every the actions settings below will be ignored`
			`*`
			`*/`
implement basic api 2 years ago			`define('API_ENABLED', true);`

add options documentation 2 years ago			`/*`
			`* Search API`
			`*`
			`* When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored`
			`*`
			`*/`
implement basic api 2 years ago			`define('API_SEARCH_ENABLED', true);`
add options documentation 2 years ago
			`/*`
			`* Search results per page`
			`*`
			`*/`
implement basic api 2 years ago			`define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);`

add options documentation 2 years ago			`/*`
			`* Hosts distribution API`
			`*`
			`* When false - API_HOSTS_FIELDS will be ignored`
			`*`
			`*/`
implement basic api 2 years ago			`define('API_HOSTS_ENABLED', true);`
add options documentation 2 years ago
			`/*`
			`* Database host fields comma separated or * to share all the fields`
			`*`
			`*/`
add hostPagesTotal info to the hosts API 2 years ago			`define('API_HOSTS_FIELDS',`
			'`host`.`scheme`,
implement image crawler 2 years ago			`host`.`name`,
			`host`.`port`,
			`host`.`crawlPageLimit`,
			`host`.`crawlImageLimit`,
			`host`.`robots`,
			`host`.`robotsPostfix`,
			`host`.`timeAdded`,
			`host`.`timeUpdated`,
			(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
			(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
implement manifest API 2 years ago
			`/*`
			`* Manifest API`
			`*`
			`* Application meta sharing between YGGo remote nodes`
			`*`
			`* When true - make this node public for distributed index sharing`
			`*`
			`*/`
			`define('API_MANIFEST_ENABLED', true);`