implement CRAWL_ROBOTS_POSTFIX_RULES configuration #5

ghost 2023-04-08 22:28:31 +03:00
parent 505544c8c9
commit df6f2a1869
5 changed files with 43 additions and 17 deletions


@@ -31,4 +31,22 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_STATUS', true);
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
-define('CRAWL_ROBOTS_DEFAULT_RULES', "");
+/*
+ * Default robots.txt rules, applied when the remote file does not exist
+ * The crawler is able to overwrite these rules
+ *
+ * Presets
+ * yggdrasil: /database/yggdrasil/host.robots.md
+ *
+ */
+define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
+/*
+ * Permanent rules, appended to robots.txt if it exists, otherwise to CRAWL_ROBOTS_DEFAULT_RULES
+ * The crawler does not overwrite these rules
+ *
+ * Presets
+ * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
+ *
+ */
+define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
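
Taken together, the two constants act as a fallback plus a permanent suffix: CRAWL_ROBOTS_DEFAULT_RULES is used only when no robots.txt was fetched, while CRAWL_ROBOTS_POSTFIX_RULES is appended in every case. A minimal sketch of these semantics; the sample rule strings and the buildRobotsRules() helper are illustrative, not part of the codebase:

<?php
// Illustrative values; both constants ship as null in the default config
define('CRAWL_ROBOTS_DEFAULT_RULES', 'User-agent: *' . PHP_EOL . 'Disallow: /cgi-bin');
define('CRAWL_ROBOTS_POSTFIX_RULES', 'User-agent: *' . PHP_EOL . 'Disallow: /private');

// Hypothetical helper: a fetched robots.txt takes precedence over the
// defaults, and the postfix rules are always appended afterwards
function buildRobotsRules(?string $fetchedRobots): string
{
    $rules = $fetchedRobots ?? (string) CRAWL_ROBOTS_DEFAULT_RULES;

    return $rules . PHP_EOL . (string) CRAWL_ROBOTS_POSTFIX_RULES;
}

echo buildRobotsRules(null); // remote file missing: defaults + postfix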


@@ -143,10 +143,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
+$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
@@ -157,9 +158,11 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
-$hostRobots = null;
+$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
}
+$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
@@ -171,7 +174,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
-$hostRobots);
+$hostRobots,
+$hostRobotsPostfix);
if ($hostId) {
@@ -184,7 +188,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
}
// Init robots parser
-$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
+$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);
// Save page info
if ($hostStatus && // host enabled
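
After this change, the parser sees one combined rule set per host. A short sketch of how the crawler might consult it; the uriAllowed() method name and the include path are assumptions, as neither appears in this diff:

<?php
// Assumed include path and Robots API; neither is shown in this diff
require_once 'library/robots.php';

$hostRobots = 'User-agent: *' . PHP_EOL . 'Disallow: /admin';          // fetched or default rules
$hostRobotsPostfix = 'User-agent: *' . PHP_EOL . 'Disallow: /private'; // permanent postfix

// Same concatenation the crawler now uses: host rules, newline, postfix
$robots = new Robots((string) $hostRobots . PHP_EOL . (string) $hostRobotsPostfix);

// Hypothetical check before saving page info
if ($robots->uriAllowed('/some/page')) {
    // page may be crawled
}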

Binary file not shown.


@@ -38,11 +38,11 @@ class MySQL {
return $query->fetch();
}
-public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) {
+public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
-$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
-$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots]);
+$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]);
return $this->_db->lastInsertId();
}
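
The updated INSERT assumes a robotsPostfix column on the host table. The schema change itself is not shown in this diff (one changed file is binary), so the following is only a guess that the new column mirrors the nullable robots column; the DSN and credentials are placeholders:

<?php
// Hypothetical migration; assumes `robotsPostfix` is nullable text, like `robots`
$db = new PDO('mysql:host=localhost;dbname=yggo', 'user', 'password');
$db->exec('ALTER TABLE `host` ADD COLUMN `robotsPostfix` TEXT NULL AFTER `robots`');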


@@ -40,10 +40,11 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
$hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
+$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
@@ -57,6 +58,8 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostRobots = null;
}
+$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
@@ -68,14 +71,15 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
-$hostRobots);
+$hostRobots,
+$hostRobotsPostfix);
}
// Parse page URI
$hostPageURI = Parser::uri($q);
// Init robots parser
-$robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);
+$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);
// Save page info
if ($hostStatus && // host enabled