
add host nsfw settings

branch: main
ghost committed 2 years ago
commit 28bf526d53
  config/app.php.txt  | 21
  crontab/crawler.php |  9
  database/yggo.mwb   | BIN
  library/mysql.php   |  6
  public/api.php      |  3
  public/search.php   |  3

config/app.php.txt (21 lines changed)

@@ -270,14 +270,24 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
 define('CRAWL_HOST_DEFAULT_META_ONLY', false);
 
 /*
+ * Not suitable/safe for work status for new host by default
+ *
+ * Could be filtered in crawl conditions or search results
+ *
+ * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
+ *
+ */
+define('CRAWL_HOST_DEFAULT_NSFW', false);
+
+/*
  * Images limit per new host by default
  *
  * Crawler stops indexing on this limit reach to prevent disk overuse
  *
  * Custom rule for specified host could be provided in the DB `host`.`crawlImageLimit` field
  *
  */
 define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);
 
 /*
  * Default robots.txt rules on remote file not exists
@@ -314,7 +324,7 @@ define('CRAWL_MANIFEST', true);
  * Manifest API version compatibility
  *
  */
-define('CRAWL_MANIFEST_API_VERSION', 0.6);
+define('CRAWL_MANIFEST_API_VERSION', 0.7);
 
 /*
  * Set default auto-crawl status for new manifest added
@@ -438,6 +448,7 @@ define('API_HOSTS_FIELDS',
 `host`.`crawlImageLimit`,
 `host`.`robots`,
 `host`.`robotsPostfix`,
+`host`.`nsfw`,
 `host`.`timeAdded`,
 `host`.`timeUpdated`,
 (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
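
The new comment block states that CRAWL_HOST_DEFAULT_NSFW only seeds newly discovered hosts, with per-host overrides kept in the DB `host`.`nsfw` field. A minimal sketch of such an override, assuming a plain PDO connection and the crc32-keyed host rows used elsewhere in this commit (credentials and the host URL are placeholders):

    <?php
    // Sketch only: mark one already-indexed host as NSFW, overriding
    // the CRAWL_HOST_DEFAULT_NSFW seed value. Connection details and
    // the URL are made up, not part of this commit.
    $pdo = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');

    // Hosts are keyed by crc32 of their URL, as in crontab/crawler.php
    $query = $pdo->prepare('UPDATE `host` SET `nsfw` = ? WHERE `crc32url` = ?');
    $query->execute(['1', crc32('http://example.ygg')]);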

crontab/crawler.php (9 lines changed)

@@ -171,6 +171,7 @@ try {
 if ($host = $db->getHost(crc32($hostURL))) {
   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;
@@ -198,6 +199,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
@@ -211,6 +213,7 @@ try {
   $hostImageLimit,
   (string) CRAWL_HOST_DEFAULT_META_ONLY,
   (string) $hostStatus,
+  (string) $hostNsfw,
   $hostRobots,
   $hostRobotsPostfix);
@@ -534,6 +537,7 @@ try {
 if ($host = $db->getHost(crc32($hostImageURL->string))) {
   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;
@@ -561,6 +565,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
   $hostId = $db->addHost($hostImageURL->scheme,
@@ -573,6 +578,7 @@ try {
   $hostImageLimit,
   (string) CRAWL_HOST_DEFAULT_META_ONLY,
   (string) $hostStatus,
+  (string) $hostNsfw,
   $hostRobots,
   $hostRobotsPostfix);
@@ -692,6 +698,7 @@ try {
 if ($host = $db->getHost(crc32($hostURL->string))) {
   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostImageLimit = $host->crawlImageLimit;
   $hostId = $host->hostId;
@@ -719,6 +726,7 @@ try {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
   $hostId = $db->addHost($hostURL->scheme,
@@ -731,6 +739,7 @@ try {
   $hostImageLimit,
   (string) CRAWL_HOST_DEFAULT_META_ONLY,
   (string) $hostStatus,
+  (string) $hostNsfw,
   $hostRobots,
   $hostRobotsPostfix);

database/yggo.mwb (binary)

Binary file not shown.
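
The schema change itself travels in the binary Workbench model, so the diff cannot show it. The DDL below is an assumption of the equivalent change: only the column name is confirmed by the rest of this commit, while the type and default are guesses consistent with CRAWL_HOST_DEFAULT_NSFW being false and the `(string)` casts at the call sites:

    <?php
    // Assumed equivalent of the yggo.mwb change: a `nsfw` flag on `host`.
    // BOOLEAN with a false default is a guess, not visible in this diff.
    $pdo = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');
    $pdo->exec('ALTER TABLE `host` ADD COLUMN `nsfw` BOOLEAN NOT NULL DEFAULT FALSE');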

library/mysql.php (6 lines changed)

@@ -102,11 +102,11 @@ class MySQL {
     return $query->fetch()->total;
   }
 
-  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
-    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
-    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $robots, $robotsPostfix]);
+  public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {
+    $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+    $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);
     return $this->_db->lastInsertId();
   }
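
addHost() now takes thirteen arguments, with $nsfw slotted between $status and $robots, so every caller has to pass it explicitly. A hedged usage sketch (the argument order comes from the signature above; the wrapper construction and the sample values are assumptions):

    <?php
    require_once('../config/app.php');
    require_once('../library/mysql.php');

    // Constructor arguments are assumed; adjust to the actual wrapper.
    $db = new MySQL('localhost', 3306, 'yggo', 'user', 'password');

    $url = 'http://example.ygg'; // hypothetical host

    $hostId = $db->addHost('http',          // scheme
                           'example.ygg',   // name
                           null,            // port
                           crc32($url),     // crc32url
                           time(),          // timeAdded
                           null,            // timeUpdated
                           CRAWL_HOST_DEFAULT_PAGES_LIMIT,
                           CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
                           (string) CRAWL_HOST_DEFAULT_META_ONLY,
                           (string) CRAWL_HOST_DEFAULT_STATUS,
                           (string) CRAWL_HOST_DEFAULT_NSFW, // new argument
                           null,            // robots
                           null);           // robotsPostfix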

public/api.php (3 lines changed)

@@ -1,7 +1,7 @@
 <?php
 
 // Current version
-define('API_VERSION', 0.6);
+define('API_VERSION', 0.7);
 
 // Load system dependencies
 require_once('../config/app.php');
@@ -127,6 +127,7 @@ if (API_ENABLED) {
   'config' => [
     'websiteDomain' => WEBSITE_DOMAIN,
     'crawlUrlRegexp' => CRAWL_URL_REGEXP,
+    'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
     'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
     'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
     'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
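
Bumping API_VERSION to 0.7 signals the new field to manifest consumers. A sketch of reading it from a remote instance (the URL and the exact response nesting are assumptions; only the crawlHostDefaultNsfw key comes from this diff):

    <?php
    // Sketch: fetch a remote instance's API output and inspect the
    // new default. URL and response shape are assumed, not confirmed.
    $json = json_decode(file_get_contents('http://example.ygg/api.php'));

    var_dump($json?->result?->config?->crawlHostDefaultNsfw ?? null);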

public/search.php (3 lines changed)

@@ -59,6 +59,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
 if ($host = $db->getHost(crc32($hostURL->string))) {
   $hostStatus = $host->status;
+  $hostNsfw = $host->nsfw;
   $hostPageLimit = $host->crawlPageLimit;
   $hostId = $host->hostId;
   $hostRobots = $host->robots;
@@ -82,6 +83,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
   $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
   $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+  $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
   $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
   $hostId = $db->addHost($hostURL->scheme,
     $hostURL->name,
@@ -92,6 +94,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
     $hostPageLimit,
     (string) CRAWL_HOST_DEFAULT_META_ONLY,
     (string) $hostStatus,
+    (string) $hostNsfw,
     $hostRobots,
     $hostRobotsPostfix);
 }
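
As with the crawler, search.php only records the flag when registering a newly submitted host; the config comment's "could be filtered in crawl conditions or search results" is left to the queries. One hypothetical shape for such a filter, with the `host`/`hostPage` relation inferred from the API_HOSTS_FIELDS hunk above (not code from this commit):

    <?php
    // Hypothetical NSFW filter: join pages to their host row and keep
    // only hosts whose nsfw flag is unset. Schema details are inferred.
    $pdo = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');

    $query = $pdo->prepare('SELECT `hostPage`.* FROM `hostPage`
                            JOIN `host` ON `host`.`hostId` = `hostPage`.`hostId`
                            WHERE `host`.`nsfw` = ?');
    $query->execute(['0']);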
