diff --git a/config/app.php.txt b/config/app.php.txt
index c5af62a..8469469 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -270,14 +270,24 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
 define('CRAWL_HOST_DEFAULT_META_ONLY', false);
 
 /*
+ * Not suitable/safe for work status for new host by default
+ *
+ * Could be filtered in crawl conditions or search results
+ *
+ * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
+ *
+ */
+define('CRAWL_HOST_DEFAULT_NSFW', false);
+
+/*
  * Images limit per new host by default
  *
  * Crawler stops indexing on this limit reach to prevent disk overuse
  *
  * Custom rule for specified host could be provided in the DB `host`.`crawlImageLimit` field
  *
  */
 define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);
 
 /*
  * Default robots.txt rules on remote file not exists
@@ -314,7 +324,7 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.6);
+define('CRAWL_MANIFEST_API_VERSION', 0.7);
 
 /*
  * Set default auto-crawl status for new manifest added
@@ -438,6 +448,7 @@ define('API_HOSTS_FIELDS',
          `host`.`crawlImageLimit`,
          `host`.`robots`,
          `host`.`robotsPostfix`,
+         `host`.`nsfw`,
          `host`.`timeAdded`,
          `host`.`timeUpdated`,
          (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
diff --git a/crontab/crawler.php b/crontab/crawler.php
index fea424b..a390572 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -171,6 +171,7 @@ try {
     if ($host = $db->getHost(crc32($hostURL))) {
 
       $hostStatus = $host->status;
+      $hostNsfw = $host->nsfw;
       $hostPageLimit = $host->crawlPageLimit;
       $hostImageLimit = $host->crawlImageLimit;
       $hostId = $host->hostId;
@@ -198,6 +199,7 @@ try {
       $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
 
       $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+      $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
       $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
       $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
 
@@ -211,6 +213,7 @@ try {
                              $hostImageLimit,
                              (string) CRAWL_HOST_DEFAULT_META_ONLY,
                              (string) $hostStatus,
+                             (string) $hostNsfw,
                              $hostRobots,
                              $hostRobotsPostfix);
 
@@ -534,6 +537,7 @@ try {
         if ($host = $db->getHost(crc32($hostImageURL->string))) {
 
           $hostStatus = $host->status;
+          $hostNsfw = $host->nsfw;
           $hostPageLimit = $host->crawlPageLimit;
           $hostImageLimit = $host->crawlImageLimit;
           $hostId = $host->hostId;
@@ -561,6 +565,7 @@ try {
           $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
 
           $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+          $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
           $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
           $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
           $hostId = $db->addHost($hostImageURL->scheme,
@@ -573,6 +578,7 @@ try {
                                  $hostImageLimit,
                                  (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                  (string) $hostStatus,
+                                 (string) $hostNsfw,
                                  $hostRobots,
                                  $hostRobotsPostfix);
 
@@ -692,6 +698,7 @@ try {
       if ($host = $db->getHost(crc32($hostURL->string))) {
 
         $hostStatus = $host->status;
+        $hostNsfw = $host->nsfw;
         $hostPageLimit = $host->crawlPageLimit;
         $hostImageLimit = $host->crawlImageLimit;
         $hostId = $host->hostId;
@@ -719,6 +726,7 @@ try {
        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
 
        $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+       $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
       $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
       $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
       $hostId = $db->addHost($hostURL->scheme,
@@ -731,6 +739,7 @@ try {
                              $hostImageLimit,
                              (string) CRAWL_HOST_DEFAULT_META_ONLY,
                              (string) $hostStatus,
+                             (string) $hostNsfw,
                              $hostRobots,
                              $hostRobotsPostfix);
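
The database/yggo.mwb change below is a binary Workbench model, so the DDL for the new column is not visible in this patch. A minimal sketch of the migration it implies, assuming the flag is stored as a boolean-style column alongside the other per-host crawl settings (the exact type and position in the model may differ):

<?php

// Hypothetical one-off migration for the `host`.`nsfw` field referenced
// by API_HOSTS_FIELDS and MySQL::addHost(); credentials are placeholders.
$db = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');

// BOOLEAN is an alias of TINYINT(1) in MySQL; DEFAULT FALSE matches
// CRAWL_HOST_DEFAULT_NSFW for host rows that existed before this patch.
$db->query('ALTER TABLE `host`
              ADD COLUMN `nsfw` BOOLEAN NOT NULL DEFAULT FALSE
            AFTER `robotsPostfix`');
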
diff --git a/database/yggo.mwb b/database/yggo.mwb
index 330c005..fd58c14 100644
Binary files a/database/yggo.mwb and b/database/yggo.mwb differ
diff --git a/library/mysql.php b/library/mysql.php
index c21a4a7..9dddd2b 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -102,11 +102,11 @@ class MySQL {
     return $query->fetch()->total;
   }
 
-  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
+  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {
 
-    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
 
-    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $robots, $robotsPostfix]);
+    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);
 
     return $this->_db->lastInsertId();
   }
diff --git a/public/api.php b/public/api.php
index 9639022..becf6ae 100644
--- a/public/api.php
+++ b/public/api.php
@@ ... @@
         [
           'websiteDomain' => WEBSITE_DOMAIN,
           'crawlUrlRegexp' => CRAWL_URL_REGEXP,
+          'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
           'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
           'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
           'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
diff --git a/public/search.php b/public/search.php
index b801fab..8567163 100644
--- a/public/search.php
+++ b/public/search.php
@@ -59,6 +59,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   if ($host = $db->getHost(crc32($hostURL->string))) {
 
     $hostStatus = $host->status;
+    $hostNsfw = $host->nsfw;
     $hostPageLimit = $host->crawlPageLimit;
     $hostId = $host->hostId;
     $hostRobots = $host->robots;
@@ -82,6 +83,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
     $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
 
     $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+    $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
     $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
     $hostId = $db->addHost($hostURL->scheme,
                            $hostURL->name,
@@ -92,6 +94,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
                            $hostPageLimit,
                            (string) CRAWL_HOST_DEFAULT_META_ONLY,
                            (string) $hostStatus,
+                           (string) $hostNsfw,
                            $hostRobots,
                            $hostRobotsPostfix);
 }
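
The patch stores the flag for each host but does not itself add the filtering mentioned in the config comment. A minimal sketch of how `host`.`nsfw` could be applied at query time, in the prepared-statement style of library/mysql.php; the `hostPage` columns, the FULLTEXT index on `metaTitle`, and the connection details are assumptions for illustration:

<?php

// Hypothetical NSFW-aware result query: join pages to their host row
// and skip hosts flagged in the new `nsfw` column.
$db = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');
$db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);

$query = $db->prepare("SELECT `hostPage`.`metaTitle`, `hostPage`.`uri`
                         FROM `hostPage`
                         JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
                        WHERE MATCH (`hostPage`.`metaTitle`) AGAINST (?)
                          AND `host`.`nsfw` = '0'");

$query->execute(['keyword']);

foreach ($query->fetchAll() as $hostPage) {
  echo $hostPage->metaTitle . PHP_EOL;
}

Because the crawler writes the flag through (string) $hostNsfw, a false default is stored as an empty string and coerced to 0 by MySQL, so comparing against '0' keeps both default and explicitly cleared hosts in the results.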