mirror of https://github.com/YGGverse/YGGo.git
<?php

/*
 * YGGo! - Distributed & Open Source Web Search Engine
 *
 * MIT License
 *
 * Copyright (c) 2023 YGGverse
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Default configuration file example
 * Production name: app.php
 *
 * Project home page
 * https://github.com/YGGverse/YGGo
 *
 * Get support
 * https://github.com/YGGverse/YGGo/issues
 *
 */

// Debug
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

// Website

/*
 * Project domain, without trailing slash
 *
 */
define('WEBSITE_DOMAIN', '');

/*
 * Search results shown per page before the read more link
 *
 */
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);

/*
 * Save identicons to the static webp cache (placed in storage/cache) to prevent CPU overload
 *
 * or false to generate them on every request
 *
 */
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);

// Database
define('DB_HOST', 'localhost');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
define('DB_PASSWORD', '');
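
/*
 * Example (illustrative only, not part of the configuration): a minimal
 * sketch of how the constants above are typically consumed through PDO;
 * the $db variable name is an assumption, not something YGGo defines.
 *
 * $db = new PDO(
 *   sprintf('mysql:host=%s;port=%d;dbname=%s', DB_HOST, DB_PORT, DB_NAME),
 *   DB_USERNAME,
 *   DB_PASSWORD,
 *   [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
 * );
 */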

// Sphinx
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
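
/*
 * Example (illustrative only): Sphinx speaks the MySQL protocol on its
 * SphinxQL port, so the connection can be a plain PDO mysql handle;
 * $sphinx is an assumed name.
 *
 * $sphinx = new PDO(
 *   sprintf('mysql:host=%s;port=%d', SPHINX_HOST, SPHINX_PORT)
 * );
 */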

// Memcached
define('MEMCACHED_HOST', 'localhost');
define('MEMCACHED_PORT', 11211);
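
/*
 * Example (illustrative only): a minimal connection sketch with the
 * pecl Memcached extension; $memcached is an assumed name.
 *
 * $memcached = new Memcached();
 * $memcached->addServer(MEMCACHED_HOST, MEMCACHED_PORT);
 */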

// Snaps

/*
 * Storage nodes configuration
 *
 * Supports an optional single 'localhost' and multiple 'ftp' servers
 *
 * Comment out a node to disable its connection
 *
 * Make the array empty to disable snaps, or set quota.mime = false or quota.size = 0 to disable a specified instance
 *
 */
define('SNAP_STORAGE', json_encode((object)
[
  'localhost' => [
    'storage-1' => [
      'directory' => __DIR__ . '/../storage/snap/hps/',
      'quota' => [
        'mime' => false,
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ],
  'ftp' => [
    'storage-1' => [
      'port' => 21,
      'host' => '',
      'username' => '',
      'password' => '',
      'directory' => '/snap',
      'timeout' => 30,
      'passive' => true,
      'quota' => [
        'mime' => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
        'size' => 10000000024,
        'request' => [
          'download' => [
            'size' => 10000024,
            'seconds' => 60*60
          ]
        ]
      ],
    ],
    // ...
  ]
]
));
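
/*
 * Example (illustrative only): how an application might read this
 * configuration back; the loop body and variable names are assumptions.
 *
 * foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
 *   foreach ($storages as $name => $storage) {
 *     // e.g. $node = 'ftp', $name = 'storage-1', $storage->quota->size = 10000000024
 *   }
 * }
 */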

// Proxy settings

/*
 * Search proxy User Agent name
 *
 * Shared with other hosts through CURL requests by the search proxy
 *
 */
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');

// Host defaults

/*
 * Only URL addresses matching this rule will be crawled
 *
 */
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only
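
/*
 * Example (illustrative only): what this pattern accepts and rejects;
 * the sample host below is made up.
 *
 * preg_match(DEFAULT_HOST_URL_REGEXP, 'http://[200:1:2:3:4:5:6:7]/index.html'); // 1 - bracketed IPv6
 * preg_match(DEFAULT_HOST_URL_REGEXP, 'https://example.com/');                  // 0 - hostname, not IPv6
 */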

/*
 * Default robots.txt rules (will be overwritten when remote rules are available)
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT', null);

/*
 * These rules are forcibly appended to the remote robots.txt file
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);

/*
 * Pages limit per new host by default
 *
 * Crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 */
define('DEFAULT_HOST_PAGES_LIMIT', 100000);

/*
 * Index pages matching these MIME types
 *
 * comma separated
 *
 */
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');
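
/*
 * Example (illustrative only): a sketch of filtering a response by its
 * Content-Type header against the list above; $contentType is an
 * assumed variable.
 *
 * $mimes = array_map('trim', explode(',', DEFAULT_HOST_PAGES_MIME));
 *
 * if (in_array(strtolower(strtok($contentType, ';')), $mimes)) {
 *   // index this page
 * }
 */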

/*
 * When false, index meta tags only;
 * otherwise save meta tags + base64 encoded page content in the `hostPage`.`data` field
 *
 * Warning!
 * this option requires huge disk storage;
 * it's an experimental feature, oriented at index operations
 *
 */
define('DEFAULT_HOST_PAGES_DATA', false);

/*
 * Generates the hostPageDom index based on the hostPage.data field
 *
 * Could be useful for building a semantic index query (config/sphinx.conf.txt)
 *
 * At this moment the feature is available in the CLI only (cli/yggo.php)
 *
 */
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated

/*
 * Strip HTML in the DEFAULT_HOST_PAGES_DOM_SELECTORS content
 *
 */
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);

// Crawl queue

/*
 * Crawler / Bot User Agent name
 *
 * Shared with other hosts through CURL requests by the crawler
 *
 */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');

/*
 * Skip curl download when this response data size is reached
 *
 * See also: CURLOPT_TIMEOUT (library/curl.php)
 *
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);
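
/*
 * Example (illustrative only): a sketch of how such a limit can be
 * enforced with CURLOPT_PROGRESSFUNCTION, aborting the transfer by
 * returning a non-zero value once too many bytes have arrived; the
 * $curl handle setup is assumed, see library/curl.php for the actual
 * implementation.
 *
 * curl_setopt($curl, CURLOPT_NOPROGRESS, false);
 * curl_setopt($curl, CURLOPT_PROGRESSFUNCTION,
 *   function ($handle, $downloadSize, $downloaded, $uploadSize, $uploaded) {
 *     return $downloaded > CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT ? 1 : 0;
 *   }
 * );
 */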

/*
 * Stop the crawler when the free disk space left reaches this quota (Mb)
 *
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);
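
/*
 * Example (illustrative only): one way to apply the quota with PHP's
 * disk_free_space(); the path argument is an assumption.
 *
 * if (disk_free_space(__DIR__) / 1000000 < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {
 *   exit; // stop crawling until space is freed
 * }
 */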

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);

/*
 * Renew the page index by the timing offset provided
 *
 * This option works with the CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
 *
 * Pay attention: the CRAWL_HOST_PAGE_QUEUE_LIMIT + CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);
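
/*
 * Worked example (illustrative only): with CRAWL_HOST_PAGE_QUEUE_LIMIT = 10
 * and a one-minute crontab, the crawler covers up to 10 * 60 * 24 = 14400
 * pages per day; within the ~360-day offset above (60*60*24*30*12 seconds)
 * that is roughly 5.2 million pages, so larger indexes need a higher limit
 * or a shorter offset.
 */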

/*
 * Re-calculate the page rank on page update
 *
 * When enabled, may increase execution time
 *
 * true|false
 *
 */
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);

/*
 * Renew the hosts index by the timing offset provided
 *
 */
define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);

/*
 * Hosts processing limit in the crawler.php queue
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_LIMIT', 1);

/*
 * Collect the sitemap index when available
 *
 * At this moment, works only with the CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled
 *
 * When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Crawl robots.txt
 */
define('CRAWL_ROBOTS', true); // true|false

/*
 * Look for third-party manifests to collect the distributed index
 *
 * The API address is provided in the yggo meta tag
 *
 * Collects URLs that match the DEFAULT_HOST_URL_REGEXP condition
 *
 */
define('CRAWL_MANIFEST', true);

/*
 * Manifest API version compatibility
 *
 */
define('CRAWL_MANIFEST_API_VERSION', 0.13);

// Integrations

/*
 * Crawl YGGstate for peers to discover new hosts
 *
 * Yggdrasil networks only
 *
 * Read more:
 * https://github.com/YGGverse/YGGstate
 *
 */
define('CRAWL_YGGSTATE', json_encode((object)
[
  'db' =>
  [
    [
      // Conditions
      'peer_min_last_uptime' => 60*60*24, // skip short-term connections, seconds
      'timeout' => 60*60*24, // these calls run in the crontab/crawler queue; prevents remote server abuse

      // Connection
      'port' => 3306,
      'host' => '',
      'database' => '',
      'username' => '',
      'password' => '',
    ],
    // ...
  ],
])
);

/*
 * Remove host ban after the following time
 *
 * This option is used in the crawler and the search page
 * to prevent extra HTTP requests to unavailable or non-conforming resources
 *
 */
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Remove page ban after the following time
 *
 * This option is used in the crawler and the search page
 * to prevent extra HTTP requests to unavailable or non-conforming resources
 *
 */
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Database tables optimization
 *
 * Reorganizes the physical storage of table data and associated index data,
 * to reduce storage space and improve I/O efficiency when accessing the tables.
 * Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
 *
 * When enabled - requires enough RAM
 *
 */
define('CLEAN_DB_TABLES_OPTIMIZATION', true);

// API settings

/*
 * JSON API features
 *
 * When false - all the action settings below will be ignored
 *
 */
define('API_ENABLED', true);

/*
 * Search API
 *
 * When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
 *
 */
define('API_SEARCH_ENABLED', true);

/*
 * Search results per page
 *
 */
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);
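
/*
 * Example (illustrative only): the usual LIMIT/OFFSET arithmetic such a
 * setting implies; $page is an assumed request parameter.
 *
 * $offset = (max(1, (int) $page) - 1) * API_SEARCH_PAGINATION_RESULTS_LIMIT;
 * // e.g. page 3 -> LIMIT 20 OFFSET 40
 */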

/*
 * Hosts distribution API
 *
 * When false - API_HOSTS_FIELDS will be ignored
 *
 */
define('API_HOSTS_ENABLED', true);

/*
 * Database host fields, comma separated, or * to share all the fields
 *
 */
define('API_HOSTS_FIELDS', "IF (`port` IS NOT NULL,
                                CONCAT(`scheme`, '://', `name`, ':', `port`),
                                CONCAT(`scheme`, '://', `name`)
                            ) AS `url`,
                            `timeAdded`,
                            `timeUpdated`");
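
/*
 * Example (illustrative only): the fields string is raw SQL meant to be
 * interpolated into a SELECT; the `host` table name here is an assumption.
 *
 * $query = 'SELECT ' . API_HOSTS_FIELDS . ' FROM `host`';
 */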

/*
 * Manifest API
 *
 * Application meta sharing between YGGo remote nodes
 *
 * When true - makes this node public for distributed index sharing
 *
 */
define('API_MANIFEST_ENABLED', true);