[ // @TODO see https://github.com/YGGverse/YGGo#roadmap
      'storage-1' => [
        'directory' => __DIR__ . '/../storage/snap/hps/',
        'quota' => [
          'mime' => false,
          'size' => 10000000024, // @TODO
          'request' => [ // @TODO
            'download' => [
              'size' => 10000024,
              'seconds' => 60*60
            ]
          ]
        ]
      ],
      // ...
    ],
    'ftp' => [
      'storage-1' => [
        'port' => 21,
        'host' => '',
        'username' => '',
        'password' => '',
        'directory' => '/snap',
        'timeout' => 30,
        'passive' => true,
        'quota' => [
          'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
          'size' => 10000000024, // @TODO
          'request' => [ // @TODO
            'download' => [
              'size' => 10000024,
              'seconds' => 60*60
            ]
          ]
        ],
      ],
      // ...
    ]
  ]
));

// Proxy settings

/*
 * User Agent name of the search proxy
 *
 * Shared with other hosts through the CURL requests made by the search proxy
 */
define('PROXY_CURLOPT_USERAGENT', 'YGGo Search Proxy ( https://github.com/YGGverse/YGGo )');

// Crawl settings

/*
 * User Agent name of the crawler / bot
 *
 * Shared with other hosts through the CURL requests made by the crawler
 */
define('CRAWL_CURLOPT_USERAGENT', 'YGGo Search Crawler / Bot ( https://github.com/YGGverse/YGGo )');

/*
 * Skip the curl download once the response data reaches this size
 *
 * See also: CURLOPT_TIMEOUT (library/curl.php)
 */
define('CRAWL_CURLOPT_PROGRESSFUNCTION_DOWNLOAD_SIZE_LIMIT', 10485760);

/*
 * Stop the crawler once the disk quota is reached (Mb)
 */
define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);

/*
 * Pages (URI) processing limit in the crawler.php queue
 *
 * This option is related to the CRAWL_PAGE_SECONDS_OFFSET value
 * and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 */
define('CRAWL_PAGE_LIMIT', 20);

/*
 * Manifest (URI) processing limit in the crawler.php queue
 *
 * Used to collect the distributed data index
 * that matches CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
 *
 * This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
 * and to the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to prevent overloading websites with GET crawling requests
 *
 * Set 0 to disable
 */
define('CRAWL_MANIFEST_LIMIT', 10);

/*
 * Renew the page index after this time offset, in seconds (evaluates to 360 days)
 *
 * This option works with the CRAWL_PAGE_LIMIT step queue
 *
 * Pay attention: the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 * otherwise the crawler can get stuck in the queue
 */
define('CRAWL_PAGE_SECONDS_OFFSET', 60 * 60 * 24 * 30 * 12);

/*
 * Renew the home page index after this time offset, in seconds (evaluates to 210 days)
 *
 * Used to scan for new pages with higher priority
 *
 * This option works with the CRAWL_PAGE_SECONDS_OFFSET and CRAWL_PAGE_LIMIT step queue
 *
 * Pay attention: the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
 * must be large enough to crawl all pages collected in the DB index,
 * otherwise the crawler can get stuck in the queue
 */
define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60 * 60 * 24 * 7 * 30);

/*
 * MIME types of the pages to index
 *
 * Comma separated
 */
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');

/*
 * Renew the manifests index after this time offset, in seconds (evaluates to 30 days)
 *
 * This option works with the CRAWL_MANIFEST_LIMIT step queue
 *
 * Pay attention: the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
 * must be large enough to crawl all manifests collected in the DB index,
 * otherwise the crawler can get stuck in the queue
 */
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60 * 60 * 24 * 30);

/*
 * Only URL addresses that match this rule are auto-crawled
 */
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');

/*
 * Default pages limit for a new host
 *
 * The crawler stops indexing when this limit is reached, to prevent disk overuse
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlPageLimit` field
 */
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);

/*
 * Default auto-crawl status for a newly added host
 *
 * true  - the crawler autostarts the pages indexer, limited by CRAWL_HOST_DEFAULT_PAGES_LIMIT
 * false - requires manual validation by the moderator in the DB `host`.`status` field
 *
 * This option also disables the host in the search results
 */
define('CRAWL_HOST_DEFAULT_STATUS', true);

/*
 * Index meta tags only,
 * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
 *
 * A custom rule for a specific host can be provided in the DB `host`.`crawlMetaOnly` field
 *
 * Warning!
 * Disabling this option requires huge disk storage;
 * it is an experimental feature, oriented for index operations
 */
define('CRAWL_HOST_DEFAULT_META_ONLY', true);

/*
 * Default "not suitable/safe for work" status for a new host
 *
 * Could be filtered in the search results
 *
 * A custom rule for a specific host can be provided in the DB `host`.`nsfw` field
 */
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
 * Collect the sitemap index when available
 *
 * At this moment, works only when the CRAWL_ROBOTS_SECONDS_OFFSET/CRAWL_ROBOTS_LIMIT options are enabled
 *
 * When the sitemap path is not provided in robots.txt, the crawler scans the default /sitemap.xml
 *
 * true|false
 */
define('CRAWL_SITEMAPS', true);

/*
 * Re-calculate the page rank on page update
 *
 * When enabled, may increase the execution time
 *
 * true|false
 */
define('CRAWL_PAGE_RANK_UPDATE', true);

/*
 * Renew the robots.txt index after this time offset, in seconds (evaluates to 7 days)
 */
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60 * 60 * 24 * 7);

/*
 * Hosts robots.txt processing limit in the crawler.php queue
 *
 * Set 0 to disable
 */
define('CRAWL_ROBOTS_LIMIT', 1);

/*
 * Default robots.txt rules, used when the remote file does not exist
 * The crawler is able to overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robots.md
 */
define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null

/*
 * Permanent rules appended to the robots.txt if it exists, else to CRAWL_ROBOTS_DEFAULT_RULES
 * The crawler does not overwrite these rules
 *
 * Presets
 * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
 */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null

/*
 * Generates the hostPageDom index based on the hostPage.data field
 *
 * Could be useful for building a semantic index query (config/sphinx.conf.txt)
 *
 * At this moment the feature is available in the CLI only (cli/yggo.php)
 */
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1;h2;h3;h4;h5;h6');

/*
 * Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
 */
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);

/*
 * Look for third-party manifests to collect the distributed index
 *
 * The API address provided in the yggo meta tag
 * will be stored in the `manifest` DB table
 *
 * Collects URLs that match the CRAWL_URL_REGEXP condition
 */
define('CRAWL_MANIFEST', true);

/*
 * Manifest API version compatibility
 */
define('CRAWL_MANIFEST_API_VERSION', 0.12);

/*
 * Default auto-crawl status for a newly added manifest
 *
 * true  - the crawler autostarts the manifest indexer
 * false - requires manual validation by the moderator in the DB `manifest`.`status` field
 *
 * This option applies when CRAWL_MANIFEST is enabled
 */
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);

// Cleaner settings

/*
 * Remove the page ban after this time, in seconds (evaluates to 30 days)
 *
 * This option is used by the crawler and the search page
 * to prevent extra http requests to unavailable or non-matching resources
 */
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60 * 60 * 24 * 30);

/*
 * Database tables optimization
 *
 * Reorganizes the physical storage of table data and associated index data,
 * to reduce storage space and improve I/O efficiency when accessing the tables.
 * Read more: https://www.forknerds.com/reduce-the-size-of-mysql/#Shrink_and_Optimize_MySQL
 *
 * When enabled - requires enough RAM
 */
define('CLEAN_DB_TABLES_OPTIMIZATION', false);

// API settings

/*
 * JSON API features
 *
 * When false - all the action settings below will be ignored
 */
define('API_ENABLED', true);

/*
 * Search API
 *
 * When false - API_SEARCH_PAGINATION_RESULTS_LIMIT will be ignored
 */
define('API_SEARCH_ENABLED', true);

/*
 * Search results per page
 */
define('API_SEARCH_PAGINATION_RESULTS_LIMIT', 20);

/*
 * Hosts distribution API
 *
 * When false - API_HOSTS_FIELDS will be ignored
 */
define('API_HOSTS_ENABLED', true);

/*
 * Database host fields, comma separated, or * to share all the fields
 */
define('API_HOSTS_FIELDS', '`host`.`scheme`, `host`.`name`, `host`.`port`, `host`.`crawlPageLimit`, `host`.`robots`, `host`.`robotsPostfix`, `host`.`nsfw`, `host`.`timeAdded`, `host`.`timeUpdated`, (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');

/*
 * Manifest API
 *
 * Application meta sharing between YGGo remote nodes
 *
 * When true - makes this node public for distributed index sharing
 */
define('API_MANIFEST_ENABLED', true);