<?php

// Debug
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website

/*
 * Project domain, without trailing slash
 *
 */
define('WEBSITE_DOMAIN', (isset($_SERVER['HTTP_HOST']) ? 'http://' . $_SERVER['HTTP_HOST'] : ''));

/*
 * Search results per page before showing the read more link.
 *
 */
define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100);

/*
 * Save identicons to the static webp cache (placed in storage/cache) to prevent CPU overload,
 *
 * or false - to generate them on every request
 *
 */
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
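
/*
 * Example (sketch): how this flag could gate the identicon cache; the cache path
 * and the generator function below are hypothetical, for illustration only.
 *
 *   $file = __DIR__ . '/../storage/cache/' . md5($query) . '.webp';
 *
 *   if (WEBSITE_IDENTICON_IMAGE_CACHE && file_exists($file)) {
 *       $webp = file_get_contents($file);             // reuse the cached image
 *   } else {
 *       $webp = generateIdenticonWebp($query);        // hypothetical generator
 *       if (WEBSITE_IDENTICON_IMAGE_CACHE) {
 *           file_put_contents($file, $webp);          // warm the cache
 *       }
 *   }
 */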

// Database
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
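
/*
 * Example (sketch): the constants above could be consumed like this to open a
 * PDO connection; the DSN options shown (charset, error mode) are assumptions
 * for illustration, not the project's actual bootstrap code.
 *
 *   $db = new PDO(
 *       sprintf('mysql:host=%s;port=%d;dbname=%s;charset=utf8mb4', DB_HOST, DB_PORT, DB_NAME),
 *       DB_USERNAME,
 *       DB_PASSWORD,
 *       [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
 *   );
 */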

// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
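
/*
 * Example (sketch): port 9306 is the SphinxQL listener, which speaks the MySQL
 * protocol, so the same PDO driver can reach it; the index name `page` below is
 * a hypothetical placeholder.
 *
 *   $sphinx  = new PDO(sprintf('mysql:host=%s;port=%d', SPHINX_HOST, SPHINX_PORT));
 *   $results = $sphinx->query("SELECT * FROM page WHERE MATCH('yggdrasil') LIMIT 10")
 *                     ->fetchAll();
 */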

// Crawler settings

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 */
define('CRAWL_PAGE_LIMIT', 10);

/*
 * Renew the page index by the timing offset provided
 *
 * This option works with the CRAWL_PAGE_LIMIT step queue
 *
 * Pay attention that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must have enough capacity to crawl all pages collected in the DB index,
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
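
/*
 * Worked example (assuming the crontab task runs crawler.php once per minute):
 * CRAWL_PAGE_LIMIT = 10 re-crawls at most 10 * 60 = 600 pages per hour, so with
 * CRAWL_PAGE_SECONDS_OFFSET = 3600 the queue only keeps up while the page index
 * stays within roughly that many rows:
 *
 *   $pagesPerOffset = CRAWL_PAGE_LIMIT * (CRAWL_PAGE_SECONDS_OFFSET / 60); // 600 with the defaults above
 */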

/*
 * Only URL addresses matching this rule will be auto-crawled
 *
 */
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
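
/*
 * Example (sketch): how a URL could be checked against this rule before entering
 * the crawl queue; the address below is made up for illustration.
 *
 *   $url = 'http://[201:23b4:991a:634d:8359:4521:5576:15b7]/index.html';
 *
 *   if (preg_match(CRAWL_URL_REGEXP, $url)) {
 *       // allowed: with the default rule everything matches,
 *       // with the ipv6-only preset only [bracketed] hosts do
 *   }
 */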

/*
 * Pages limit per new host by default
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageLimit` field
 *
 */
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);

/*
 * Default auto-crawl status for a newly added host
 *
 * true  - the crawler automatically starts the page indexer, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
 * false - requires manual validation by the moderator in the DB `host`.`status` field
 *
 * This option also controls whether the host appears in the search results
 *
 */
define('CRAWL_HOST_DEFAULT_STATUS', true);

/*
 * Index only meta tags to prevent disk overuse,
 * or false to save meta tags plus the overall plain-text page content
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageMetaOnly` field
 *
 * This option can affect search results relevance
 *
 */
define('CRAWL_HOST_DEFAULT_META_ONLY', false);

/*
 * Default robots.txt rules applied when the remote file does not exist
 * The crawler is able to overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robots.md
 *
 */
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null

/*
 * Permanent rules appended to robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
 * The crawler does not overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
 *
 */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
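
/*
 * Example (sketch): one possible way the two constants combine with a fetched
 * robots.txt; $remoteRobotsTxt is hypothetical, the real merge logic lives in
 * the crawler.
 *
 *   $rules = $remoteRobotsTxt !== false ? $remoteRobotsTxt                       // remote file found
 *                                       : (string) CRAWL_ROBOTS_DEFAULT_RULES;   // fallback preset
 *
 *   $rules .= PHP_EOL . CRAWL_ROBOTS_POSTFIX_RULES;                              // always appended
 */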

// Cleaner settings

/*
 * Hosts limit per crontab execution step (https://github.com/YGGverse/YGGo#crontab)
 *
 * This option works with CLEAN_HOST_SECONDS_OFFSET
 *
 * The value depends on the available CPU resources
 *
 */
define('CLEAN_HOST_LIMIT', 20);

/*
 * Apply cleaning rules to pages older than the value provided
 *
 * This option works with the CLEAN_HOST_LIMIT step queue
 *
 * Pay attention that the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
 * must have enough capacity to process all pages in the DB index,
 * or the cleaner can get stuck in the queue
 *
 */
define('CLEAN_HOST_SECONDS_OFFSET', 3600);
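
/*
 * Worked example (assuming the cleaner crontab also runs once per minute):
 * CLEAN_HOST_LIMIT = 20 covers at most 20 * 60 = 1200 hosts per 3600-second
 * CLEAN_HOST_SECONDS_OFFSET window; keep that estimate above the number of rows
 * in the `host` table, or some hosts will be cleaned later than intended.
 */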

// API settings
define('API_ENABLED', true);

define('API_SEARCH_ENABLED', true);
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);

define('API_HOSTS_ENABLED', true);
define('API_HOSTS_FIELDS', '`scheme`,`name`,`port`,`crawlPageLimit`,`robots`,`robotsPostfix`,`timeAdded`,`timeUpdated`'); // string: *|field names comma separated
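
/*
 * Example (sketch): the field list above is intended to be dropped into the
 * SELECT clause of the hosts API query; the table name and surrounding query
 * are assumptions for illustration.
 *
 *   $hosts = $db->query('SELECT ' . API_HOSTS_FIELDS . ' FROM `host`')
 *               ->fetchAll(PDO::FETCH_ASSOC);
 */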