[
        'storage-1' => [
            'directory' => __DIR__ . '/../storage/snap/hps/',
            'quota'     => [
                'mime'    => false, // false — no MIME restriction for this local storage
                'size'    => 10000000024,
                'request' => [
                    'download' => [
                        'size'    => 10000024,
                        'seconds' => 60*60
                    ]
                ]
            ],
        ],
        // ...
    ],
    'ftp' => [
        'storage-1' => [
            'port'      => 21,
            'host'      => '',
            'username'  => '',
            'password'  => '',
            'directory' => '/snap',
            'timeout'   => 30,
            'passive'   => true,
            'quota'     => [
                // NOTE(review): 'image/ico' is not an IANA-registered type; servers commonly
                // send image/x-icon or image/vnd.microsoft.icon — confirm against matcher
                'mime'    => 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico',
                'size'    => 10000000024,
                'request' => [
                    'download' => [
                        'size'    => 10000024,
                        'seconds' => 60*60
                    ]
                ]
            ],
        ],
        // ...
    ]
]
));

// Proxy settings

/*
 * Search proxy User Agent name
 *
 * Sent to remote hosts in CURL requests issued by the search proxy
 *
 */
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');

// Host defaults

/*
 * Only URL addresses matching this pattern will be crawled
 *
 * Current pattern restricts crawling to bracketed-IPv6 http:// links
 *
 */
define('DEFAULT_HOST_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); // ipv6 links only

/*
 * Default robots.txt rules, applied until remote rules become available
 * (overwritten once the host's own robots.txt is fetched)
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT', null);

/*
 * Rules forcibly appended to the remote robots.txt content
 *
 * string|null
 *
 */
define('DEFAULT_HOST_ROBOTS_TXT_POSTFIX', null);

/*
 * Default per-host page limit
 *
 * The crawler stops indexing a host once this limit is reached,
 * to prevent disk overuse
 *
 */
define('DEFAULT_HOST_PAGES_LIMIT', 100000);

/*
 * MIME types eligible for page indexing (comma separated)
 *
 * NOTE(review): 'image/ico' is non-standard (see IANA registry);
 * confirm whether the crawler needs image/x-icon instead
 *
 */
define('DEFAULT_HOST_PAGES_MIME', 'text/html,application/xhtml+xml,application/javascript,text/plain,text/css,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac,font/ttf');

/*
 * false — index meta tags only
 * true  — also store meta tags + base64-encoded page content in `hostPage`.`data`
 *
 * Warning!
 * Enabling this requires huge disk storage;
 * experimental feature, oriented at index operations
 *
 */
define('DEFAULT_HOST_PAGES_DATA', false);

/*
 * Generate a hostPageDom index based on the hostPage.data field
 *
 * Useful for building semantic index queries (config/sphinx.conf.txt)
 *
 * Currently available in the CLI only (cli/yggo.php)
 *
 */
define('DEFAULT_HOST_PAGES_DOM_SELECTORS', false); // ";" separated

/*
 * Strip HTML tags from DEFAULT_HOST_PAGES_DOM_SELECTORS content
 *
 */
define('DEFAULT_HOST_PAGE_DOM_STRIP_TAGS', false);

// Crawl queue

/*
 * Crawler / Bot User Agent name
 *
 * Sent to remote hosts in CURL requests issued by the crawler
 *
 */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');

/*
 * Abort the curl download once the response body reaches this size (bytes)
 *
 * NOTE(review): 50485760 is ~48.1 MiB, not exactly 50 MiB (52428800) — verify intent
 *
 * See also: CURLOPT_TIMEOUT (library/curl.php)
 *
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 50485760);

/*
 * Stop the crawler when this little disk space (Mb) remains
 *
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 128);

/*
 * Pages (URI) processed per crawler.php queue run
 *
 * Related to the CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to avoid overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_LIMIT', 10);

/*
 * Re-index a page after this time offset has elapsed
 *
 * Works together with the CRAWL_HOST_PAGE_QUEUE_LIMIT step queue
 *
 * Note that the CRAWL_HOST_PAGE_QUEUE_LIMIT / CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET pair
 * must be large enough to crawl every page collected in the DB index,
 * otherwise the crawler can get stuck in the queue
 *
 */
define('CRAWL_HOST_PAGE_QUEUE_SECONDS_OFFSET', 60*60*24*30*12);

/*
 * Re-calculate page rank on page update
 *
 * May enlarge execution time when enabled
 *
 * true|false
 *
 */
define('CRAWL_HOST_PAGE_RANK_UPDATE', false);

/*
 * Re-index a host after this time offset has elapsed
 *
 */
define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);

/*
 * Hosts processed per crawler.php queue run
 *
 * Set 0 to disable
 *
 */
define('CRAWL_HOST_LIMIT', 1);

/*
 * Collect the sitemap index when available
 *
 * Currently works only with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT enabled
 *
 * When no sitemap path is provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 *
 */
define('CRAWL_SITEMAPS', true);

/*
 * Crawl robots.txt
 */
define('CRAWL_ROBOTS', true); // true|false

/*
 * Look for third-party manifests to collect a distributed index
 *
 * API address is provided in the yggo meta tag
 *
 * Collected URLs must match the DEFAULT_HOST_URL_REGEXP condition
 *
 */
define('CRAWL_MANIFEST', true);

/*
 * Manifest API version compatibility
 *
 */
define('CRAWL_MANIFEST_API_VERSION', 0.13);

/*
 * Remove a host ban after this time offset
 *
 * Used by the crawler and the search page to prevent extra
 * HTTP requests to unavailable or out-of-condition resources
 *
 */
define('CLEAN_HOST_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Remove a page ban after this time offset
 *
 * Used by the crawler and the search page to prevent extra
 * HTTP requests to unavailable or out-of-condition resources
 *
 */
define('CLEAN_HOST_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
 * Database tables optimization
 *
 * Reorganizes the physical storage of table data and associated index data,
 * reducing storage space and improving I/O efficiency when accessing the tables.
 * Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
 *
 * Requires enough RAM when enabled
 *
 */
define('CLEAN_DB_TABLES_OPTIMIZATION', true);

// API settings

/*
 * JSON API features
 *
 * When false, every API action setting below is ignored
 *
 */
define('API_ENABLED', true);

/*
 * Search API
 *
 * When false, API_SEARCH_PAGINATION_RESULTS_LIMIT is ignored
 *
 */
define('API_SEARCH_ENABLED', true);

/*
 * Search results per page
 *
 */
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);

/*
 * Hosts distribution API
 *
 * When false, API_HOSTS_FIELDS is ignored
 *
 */
define('API_HOSTS_ENABLED', true);

/*
 * Database host fields to expose: comma separated, or * to share all fields
 *
 * Value below is a raw SQL select-list fragment (MySQL backtick quoting)
 *
 */
define('API_HOSTS_FIELDS',
       "IF (`port` IS NOT NULL, CONCAT(`scheme`, '://', `name`, ':', `port`), CONCAT(`scheme`, '://', `name`) ) AS `url`, `timeAdded`, `timeUpdated`");

/*
 * Manifest API
 *
 * Shares application meta between remote YGGo nodes
 *
 * When true, this node becomes public for distributed index sharing
 *
 */
define('API_MANIFEST_ENABLED', true);