diff --git a/config/app.php.txt b/config/app.php.txt index e9db210..431faa6 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -145,6 +145,16 @@ define('CRAWL_HOST_DEFAULT_STATUS', true); */ define('CRAWL_HOST_DEFAULT_META_ONLY', false); +/* + * Images limit per new host by default + * + * Crawler stops indexing on this limit reach to prevent disk overuse + * + * Custom rule for specified host could be provided in the DB `host`.`crawlImageLimit` field + * + */ +define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000); + /* * Default robots.txt rules on remote file not exists * The crawler able to overwrite these rules @@ -250,14 +260,16 @@ define('API_HOSTS_ENABLED', true); */ define('API_HOSTS_FIELDS', '`host`.`scheme`, - `host`.`name`, - `host`.`port`, - `host`.`crawlPageLimit`, - `host`.`robots`, - `host`.`robotsPostfix`, - `host`.`timeAdded`, - `host`.`timeUpdated`, - (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); // string: *|field names comma separated + `host`.`name`, + `host`.`port`, + `host`.`crawlPageLimit`, + `host`.`crawlImageLimit`, + `host`.`robots`, + `host`.`robotsPostfix`, + `host`.`timeAdded`, + `host`.`timeUpdated`, + (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`, + (SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated /* * Manifest API diff --git a/crontab/cleaner.php b/crontab/cleaner.php index f3f9ca2..8512ab6 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -21,9 +21,10 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); // Debug $timeStart = microtime(true); -$hostsTotal = $db->getTotalHosts(); -$hostsUpdated = 0; -$hostsPagesDeleted = 0; +$hostsTotal = $db->getTotalHosts(); +$hostsUpdated = 0; +$hostsPagesDeleted = 0; +$hostsImagesDeleted = 0; // Get host queue foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, 
time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { @@ -48,25 +49,76 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS // Update host data $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); + // Apply host images limits + $totalHostImages = $db->getTotalHostImages($host->hostId); + + if ($totalHostImages > $host->crawlImageLimit) { + + foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) { + + // Delete foreign key relations + $db->deleteHostImageDescription($hostImage->hostImageId); + $db->deleteHostImageToHostPage($hostImage->hostImageId); + + // Delete host image + $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); + } + } + // Apply host pages limits $totalHostPages = $db->getTotalHostPages($host->hostId); if ($totalHostPages > $host->crawlPageLimit) { - $hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit); + foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { + + // Delete foreign key relations + $db->deleteHostPageToHostImage($hostPage->hostPageId); + + // Delete host page + $hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + } } // Apply new robots.txt rules $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? 
(string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); + foreach ($db->getHostImages($host->hostId) as $hostImage) { + + if (!$robots->uriAllowed($hostImage->uri)) { + + // Delete foreign key relations + $db->deleteHostImageDescription($hostImage->hostImageId); + $db->deleteHostImageToHostPage($hostImage->hostImageId); + + // Delete host image + $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); + } + } + foreach ($db->getHostPages($host->hostId) as $hostPage) { if (!$robots->uriAllowed($hostPage->uri)) { + // Delete foreign key relations + $db->deleteHostPageToHostImage($hostPage->hostPageId); + + // Delete host page $hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); } } + // Clean up host images unrelated to host pages + foreach ($db->getUnrelatedHostImages() as $hostImage) { + + // Delete foreign key relations + $db->deleteHostImageDescription($hostImage->hostImageId); + $db->deleteHostImageToHostPage($hostImage->hostImageId); + + // Delete host image + $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); + } + $db->commit(); } catch(Exception $e){ @@ -81,4 +133,5 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS echo 'Hosts total: ' . $hostsTotal . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL; +echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL; echo 'Execution time: ' . microtime(true) - $timeStart . 
PHP_EOL; \ No newline at end of file diff --git a/crontab/crawler.php b/crontab/crawler.php index 7cbf97f..71fda97 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -30,6 +30,7 @@ $timeStart = microtime(true); $hostPagesProcessed = 0; $hostPagesIndexed = 0; $hostPagesAdded = 0; +$hostImagesAdded = 0; $hostsAdded = 0; // Connect database @@ -127,6 +128,157 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET continue; } + // Collect page images + if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { + + foreach (@$dom->getElementsByTagName('img') as $img) { + + // Skip images without src attribute + if (!$imageSrc = @$img->getAttribute('src')) { + + continue; + } + + // Skip images without alt attribute + if (!$imageAlt = @$img->getAttribute('alt')) { + + continue; + } + + if (!$imageTitle = @$img->getAttribute('title')) { + $imageTitle = null; + } + + // Add domain to the relative src links + if (!parse_url($imageSrc, PHP_URL_HOST)) { + + $imageSrc = $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '') . + '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.'); + } + + // Validate formatted src link + if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { + + $db->beginTransaction(); + + try { + + // Parse formatted src link + $hostImageURL = Parser::hostURL($imageSrc); + $hostImageURI = Parser::uri($imageSrc); + + // Host exists + if ($host = $db->getHost(crc32($hostImageURL->string))) { + + $hostStatus = $host->status; + $hostPageLimit = $host->crawlPageLimit; + $hostImageLimit = $host->crawlImageLimit; + $hostId = $host->hostId; + $hostRobots = $host->robots; + $hostRobotsPostfix = $host->robotsPostfix; + + // Register new host + } else { + + // Get robots.txt if exists + $curl = new Curl($hostImageURL->string . 
'/robots.txt'); + + if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { + $hostRobots = $curl->getContent(); + } else { + $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; + } + + $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; + + $hostStatus = CRAWL_HOST_DEFAULT_STATUS; + $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; + $hostId = $db->addHost($hostImageURL->scheme, + $hostImageURL->name, + $hostImageURL->port, + crc32($hostImageURL->string), // must be the same checksum the getHost() lookup for this image host uses + time(), + null, + $hostPageLimit, + $hostImageLimit, + (string) CRAWL_HOST_DEFAULT_META_ONLY, + (string) $hostStatus, + $hostRobots, + $hostRobotsPostfix); + + if ($hostId) { + + $hostsAdded++; + + } else { + + $db->rollBack(); continue; // close the open transaction before skipping this image + } + } + + // Init robots parser + $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); + + // Save image info + $hostImageId = ($hostImage = $db->getHostImage($hostId, crc32($hostImageURI->string))) ? (int) $hostImage->hostImageId : 0; // getHostImage() returns a row (or false), keep only its id + + if ($hostStatus && // host enabled + $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules + $hostImageLimit > $db->getTotalHostImages($hostId) && // images quantity not reached host limit + !$hostImageId) { // image not exists + + // Add host image + if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time())) { + + $hostImagesAdded++; + + } else { + + $db->rollBack(); continue; // close the open transaction before skipping this image + } + } + + // Add host image description + $hostImageDescriptionCRC32id = crc32(md5((string) $imageAlt . 
(string) $imageTitle)); + + if (!$db->getHostImageDescription($hostImageId, $hostImageDescriptionCRC32id)) { + $db->addHostImageDescription($hostImageId, $hostImageDescriptionCRC32id, (string) $imageAlt, (string) $imageTitle, time()); + } + + // Relate host image with host page was found + if (!$db->getHostImageToHostPage($hostImageId, $queueHostPage->hostPageId)) { + $db->addHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), null, 1); + } else { + $db->updateHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1); + } + + // Increase page rank when link does not match the current host + if ($hostImageURL->scheme . '://' . + $hostImageURL->name . + ($hostImageURL->port ? ':' . $hostImageURL->port : '') + != + $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '')) { + + $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); + } + + $db->commit(); + + } catch(Exception $e) { + + var_dump($e); + + $db->rollBack(); + } + } + } + } + // Collect internal links from page content foreach(@$dom->getElementsByTagName('a') as $a) { @@ -187,6 +339,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET $hostStatus = $host->status; $hostPageLimit = $host->crawlPageLimit; + $hostImageLimit = $host->crawlImageLimit; $hostId = $host->hostId; $hostRobots = $host->robots; $hostRobotsPostfix = $host->robotsPostfix; @@ -207,6 +360,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET $hostStatus = CRAWL_HOST_DEFAULT_STATUS; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; $hostId = $db->addHost($hostURL->scheme, $hostURL->name, $hostURL->port, @@ -214,6 +368,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET time(), null, $hostPageLimit, + $hostImageLimit, (string) CRAWL_HOST_DEFAULT_META_ONLY, (string) $hostStatus, $hostRobots, 
@@ -272,5 +427,6 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; +echo 'Images added: ' . $hostImagesAdded . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index 00b65c4..486a6d5 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/mysql.php b/library/mysql.php index 6f5584f..24e782d 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -40,7 +40,7 @@ class MySQL { public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) { - $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?, ?)'); + $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)'); $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]); @@ -75,11 +75,11 @@ class MySQL { return $query->fetch()->total; } - public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) { + public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) { - $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + $query = 
$this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); - $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]); + $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]); return $this->_db->lastInsertId(); } @@ -93,6 +93,173 @@ class MySQL { return $query->rowCount(); } + // Images + public function getTotalHostImages(int $hostId) { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?'); + + $query->execute([$hostId]); + + return $query->fetch()->total; + } + + public function getHostImage(int $hostId, int $crc32uri) { + + $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); + + $query->execute([$hostId, $crc32uri]); + + return $query->fetch(); + } + + public function getHostImages(int $hostId) { + + $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?'); + + $query->execute([$hostId]); + + return $query->fetchAll(); + } + + public function getUnrelatedHostImages() { + + $query = $this->_db->prepare('SELECT * FROM `hostImage` + WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId` + FROM `hostImageToHostPage` + + WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)'); + + $query->execute(); + + return $query->fetchAll(); + } + + public function getHostImagesByLimit(int $hostId, int $limit) { + + $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . 
(int) $limit); + + $query->execute([$hostId]); + + return $query->fetchAll(); + } + + public function addHostImage(int $hostId, + int $crc32uri, + string $uri, + int $timeAdded, + mixed $timeUpdated = null, + mixed $httpCode = null, + mixed $rank = null) { + + $query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`, + `crc32uri`, + `uri`, + `timeAdded`, + `timeUpdated`, + `httpCode`, + `rank`) VALUES (?, ?, ?, ?, ?, ?, ?)'); + + $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank]); + + return $this->_db->lastInsertId(); + } + + public function updateHostImageRank(int $hostId, + int $crc32uri, + int $increment) { + + $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = COALESCE(`rank`, 0) + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); // COALESCE: addHostImage() stores NULL rank by default and NULL + n stays NULL + + $query->execute([$hostId, $crc32uri]); + + return $query->rowCount(); + } + + public function deleteHostImage(int $hostImageId) { + + $query = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1'); + + $query->execute([$hostImageId]); + + return $query->rowCount(); + } + + public function getHostImageDescription(int $hostImageId, int $crc32id) { + + $query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? AND `crc32id` = ? 
LIMIT 1'); + + $query->execute([$hostImageId, $crc32id]); + + return $query->fetch(); + } + + public function addHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded) { + + $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`, + `crc32id`, + `alt`, + `title`, + `timeAdded`) VALUES (?, ?, ?, ?, ?)'); + + $query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded]); + + return $this->_db->lastInsertId(); + } + + public function deleteHostImageDescription(int $hostImageId) { + + $query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?'); + + $query->execute([$hostImageId]); + + return $query->rowCount(); + } + + public function getHostImageToHostPage(int $hostImageId, int $hostPageId) { + + $query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ? AND `hostPageId` = ? LIMIT 1'); + + $query->execute([$hostImageId, $hostPageId]); + + return $query->fetch(); + } + + public function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) { + + $query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`, + `hostPageId`, + `timeAdded`, + `timeUpdated`, + `quantity`) VALUES (?, ?, ?, ?, ?)'); + + $query->execute([$hostImageId, $hostPageId, $timeAdded, $timeUpdated, $quantity]); + + return $query->rowCount(); // no primary key + } + + public function updateHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, int $quantity) { + + $query = $this->_db->prepare('UPDATE `hostImageToHostPage` SET `quantity` = `quantity` + ' . (int) $quantity . ', `timeUpdated` = ? + + WHERE `hostImageId` = ? + AND `hostPageId` = ? 
+ + LIMIT 1'); + + $query->execute([$timeAdded, $hostImageId, $hostPageId]); + + return $query->rowCount(); + } + + public function deleteHostImageToHostPage(int $hostImageId) { + + $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?'); + + $query->execute([$hostImageId]); + + return $query->rowCount(); + } + // Pages public function getTotalHostPages(int $hostId) { @@ -147,6 +314,15 @@ class MySQL { return $query->fetchAll(); } + public function getHostPagesByLimit(int $hostId, int $limit) { + + $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit); + + $query->execute([$hostId]); + + return $query->fetchAll(); + } + public function getFoundHostPage(int $hostPageId) { $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, @@ -240,11 +416,11 @@ class MySQL { return $query->rowCount(); } - public function deleteHostPages(int $hostId, int $limit) { + public function deleteHostPageToHostImage(int $hostPageId) { - $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . 
(int) $limit); + $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?'); - $query->execute([$hostId]); + $query->execute([$hostPageId]); return $query->rowCount(); } @@ -275,6 +451,7 @@ class MySQL { `host`.`name`, `host`.`port`, `host`.`crawlPageLimit`, + `host`.`crawlImageLimit`, `host`.`crawlPageMetaOnly`, `host`.`robots`, `host`.`robotsPostfix` diff --git a/media/db-prototype.png b/media/db-prototype.png index bdac948..bbedc1a 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/public/api.php b/public/api.php index cd28e33..ded5442 100644 --- a/public/api.php +++ b/public/api.php @@ -1,7 +1,7 @@ true, 'result' => [ 'config' => [ - 'websiteDomain' => WEBSITE_DOMAIN, - 'crawlUrlRegexp' => CRAWL_URL_REGEXP, - 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, - 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, - 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, - 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, - 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, - 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, - 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, + 'websiteDomain' => WEBSITE_DOMAIN, + 'crawlUrlRegexp' => CRAWL_URL_REGEXP, + 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, + 'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT, + 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, + 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, + 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, + 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, + 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, + 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, ], 'api' => [ 'version' => API_VERSION,