
add host nsfw settings

Branch: main
Author: ghost, 2 years ago
Commit: 28bf526d53
6 changed files:

  1. config/app.php.txt (21 lines changed)
  2. crontab/crawler.php (9 lines changed)
  3. database/yggo.mwb (binary)
  4. library/mysql.php (6 lines changed)
  5. public/api.php (3 lines changed)
  6. public/search.php (3 lines changed)

config/app.php.txt (21 lines changed)

@@ -270,14 +270,24 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
 define('CRAWL_HOST_DEFAULT_META_ONLY', false);

 /*
- * Images limit per new host by default
+ * Not suitable/safe for work status for new host by default
  *
- * Crawler stops indexing on this limit reach to prevent disk overuse
+ * Could be filtered in search results
+ *
+ * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
+ *
+ */
+define('CRAWL_HOST_DEFAULT_NSFW', false);
+
+/*
+ * Not suitable/safe for work status for new host by default
+ *
+ * Could be filtered in crawl conditions or search results
  *
- * Custom rule for specified host could be provided in the DB `host`.`crawlImageLimit` field
+ * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
  *
  */
-define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);
+define('CRAWL_HOST_DEFAULT_NSFW', false);

 /*
  * Default robots.txt rules on remote file not exists
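As the new comment notes, the config constant only sets the default for hosts the crawler has not seen yet; a per-host override lives in the `host`.`nsfw` column. A minimal sketch of such an override, assuming a local database named yggo with placeholder credentials; the host name is invented, only the table and column come from this commit:

<?php
// Flag a single known host as NSFW, overriding CRAWL_HOST_DEFAULT_NSFW
$db = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');

$query = $db->prepare('UPDATE `host` SET `nsfw` = ? WHERE `name` = ?');

$query->execute(['1', 'example-host']); // flag values travel as strings elsewhere in this codebase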
@@ -314,7 +324,7 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.6);
+define('CRAWL_MANIFEST_API_VERSION', 0.7);

 /*
  * Set default auto-crawl status for new manifest added
@@ -438,6 +448,7 @@ define('API_HOSTS_FIELDS',
 `host`.`crawlImageLimit`,
 `host`.`robots`,
 `host`.`robotsPostfix`,
+`host`.`nsfw`,
 `host`.`timeAdded`,
 `host`.`timeUpdated`,
 (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
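With `host`.`nsfw` added to API_HOSTS_FIELDS, every record the hosts endpoint returns now carries the flag. A sketch of what a consumer could do with it; the instance address, query string, and response envelope are assumptions, only the per-host nsfw field itself comes from the change above:

<?php
// Hypothetical consumer of the hosts API: list hosts flagged NSFW
$hosts = json_decode(file_get_contents('http://example-instance/api.php?action=hosts'));

foreach ($hosts->result as $host) {

  if ($host->nsfw) {

    echo 'NSFW host: ' . $host->name . PHP_EOL;
  }
}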

crontab/crawler.php (9 lines changed)

@@ -171,6 +171,7 @@ try {
 if ($host = $db->getHost(crc32($hostURL))) {

   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;

@@ -198,6 +199,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

@@ -211,6 +213,7 @@ try {
     $hostImageLimit,
     (string) CRAWL_HOST_DEFAULT_META_ONLY,
     (string) $hostStatus,
+    (string) $hostNsfw,
     $hostRobots,
     $hostRobotsPostfix);

@@ -534,6 +537,7 @@ try {
 if ($host = $db->getHost(crc32($hostImageURL->string))) {

   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;

@@ -561,6 +565,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
   $hostId = $db->addHost($hostImageURL->scheme,

@@ -573,6 +578,7 @@ try {
     $hostImageLimit,
     (string) CRAWL_HOST_DEFAULT_META_ONLY,
     (string) $hostStatus,
+    (string) $hostNsfw,
     $hostRobots,
     $hostRobotsPostfix);

@@ -692,6 +698,7 @@ try {
 if ($host = $db->getHost(crc32($hostURL->string))) {

   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;

@@ -719,6 +726,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
   $hostId = $db->addHost($hostURL->scheme,

@@ -731,6 +739,7 @@ try {
     $hostImageLimit,
     (string) CRAWL_HOST_DEFAULT_META_ONLY,
     (string) $hostStatus,
+    (string) $hostNsfw,
     $hostRobots,
     $hostRobotsPostfix);
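All three discovery branches above (host from the queued URL, host from an image URL, host from an extracted link) resolve the flag the same way: reuse the stored per-host value when getHost() finds a row, otherwise fall back to the config default, which addHost() then persists. Condensed, the repeated pattern is (shown here out of its queue-processing context; $db, $hostURL, and getHost() are the crawler's own):

<?php
if ($host = $db->getHost(crc32($hostURL))) {

  // Known host: keep the operator's per-host setting from `host`.`nsfw`
  $hostNsfw = $host->nsfw;

} else {

  // New host: start from the config default; addHost() writes it to the DB
  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
}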

database/yggo.mwb (binary)

Binary file not shown.

library/mysql.php (6 lines changed)

@@ -102,11 +102,11 @@ class MySQL {
     return $query->fetch()->total;
   }

-  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
+  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {

-    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');

-    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $robots, $robotsPostfix]);
+    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);

     return $this->_db->lastInsertId();
   }
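The flag travels as the eleventh parameter, between $status and $robots, and the prepared INSERT gains a matching thirteenth column and placeholder. A call against the widened signature might look as follows; the argument order comes from this diff, every concrete value is a placeholder, and $db stands for a connected instance of this MySQL class:

<?php
$hostId = $db->addHost('http',                           // $scheme
                       'example-host',                   // $name
                       80,                               // $port
                       crc32('http://example-host:80'),  // $crc32url
                       time(),                           // $timeAdded
                       null,                             // $timeUpdated
                       CRAWL_HOST_DEFAULT_PAGES_LIMIT,
                       CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
                       (string) CRAWL_HOST_DEFAULT_META_ONLY,
                       (string) CRAWL_HOST_DEFAULT_STATUS,
                       (string) CRAWL_HOST_DEFAULT_NSFW, // new 11th argument
                       null,                             // $robots
                       null);                            // $robotsPostfix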

public/api.php (3 lines changed)

@@ -1,7 +1,7 @@
 <?php

 // Current version
-define('API_VERSION', 0.6);
+define('API_VERSION', 0.7);

 // Load system dependencies
 require_once('../config/app.php');

@@ -127,6 +127,7 @@ if (API_ENABLED) {
   'config' => [
     'websiteDomain' => WEBSITE_DOMAIN,
     'crawlUrlRegexp' => CRAWL_URL_REGEXP,
+    'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
     'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
     'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
     'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
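The API_VERSION bump from 0.6 to 0.7 moves in step with CRAWL_MANIFEST_API_VERSION in the config, so peers exchanging manifests can tell whether a remote instance already knows about the nsfw field, and the config block of the API response starts advertising the instance default. A hypothetical peer-side check; the instance URL and the exact JSON envelope are assumptions, only the 0.7 version and the crawlHostDefaultNsfw key come from this diff:

<?php
// Skip peers whose API predates the nsfw field
$api = json_decode(file_get_contents('http://example-instance/api.php'));

if ($api->result->api->version >= 0.7) {

  var_dump($api->result->config->crawlHostDefaultNsfw);
}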

public/search.php (3 lines changed)

@@ -59,6 +59,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
 if ($host = $db->getHost(crc32($hostURL->string))) {

   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostId = $host->hostId;
   $hostRobots = $host->robots;

@@ -82,6 +83,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostId = $db->addHost($hostURL->scheme,
     $hostURL->name,

@@ -92,6 +94,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   $hostPageLimit,
   (string) CRAWL_HOST_DEFAULT_META_ONLY,
   (string) $hostStatus,
+  (string) $hostNsfw,
   $hostRobots,
   $hostRobotsPostfix);
 }
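As in the crawler, search.php only records the flag when a submitted URL registers a new host; the "filtered in search results" half of the config comment is not implemented by this commit. One way the stored column could back such a filter later, sketched against the tables shown above (database name and credentials are placeholders, table and column names are real):

<?php
// Hypothetical safe-search filter on the new `host`.`nsfw` column
$db = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');

$query = $db->prepare('SELECT `hostPage`.* FROM `hostPage`
                       JOIN  `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
                       WHERE `host`.`nsfw` = ?');

$query->execute(['0']); // keep pages from hosts not flagged NSFW only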
