Browse Source

implement database cleaner

main
ghost 2 years ago
parent
commit
8e8d89db0e
  1. 8
      config/app.php.txt
  2. 84
      crontab/cleaner.php
  3. 61
      library/mysql.php

8
config/app.php.txt

@ -21,7 +21,7 @@ define('DB_PASSWORD', '');
define('SPHINX_HOST', '127.0.0.1'); define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306); define('SPHINX_PORT', 9306);
// Crawl settings // Crawler settings
define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_LIMIT', 10);
define('CRAWL_PAGE_SECONDS_OFFSET', 3600); define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
@ -49,4 +49,8 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md * yggdrasil: /database/yggdrasil/host.robotsPostfix.md
* *
*/ */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
// Cleaner settings
define('CLEAN_HOST_LIMIT', 20);
define('CLEAN_HOST_SECONDS_OFFSET', 3600);

84
crontab/cleaner.php

@ -0,0 +1,84 @@
<?php
// Database cleaner (crontab entry point): refreshes each host's cached
// robots.txt and deletes pages that exceed the per-host page limit or are
// disallowed by the (possibly updated) robots.txt rules.

// Lock multi-thread execution
$semaphore = sem_get(crc32('crontab.cleaner'), 1);
if (false === sem_acquire($semaphore, true)) {
echo 'Process locked by another thread.' . PHP_EOL;
exit;
}
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/mysql.php');
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug counters
$timeStart = microtime(true);
$hostsTotal = $db->getTotalHosts();
$hostsUpdated = 0;
$hostsPagesDeleted = 0;
// Get host queue: hosts never updated or stale for CLEAN_HOST_SECONDS_OFFSET
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
// Parse host info; omit the port segment when no port is stored
$hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : '');
// Get robots.txt if exists (only accept bodies that look like robots rules)
$curl = new Curl($hostURL . '/robots.txt');
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
// Begin update; transaction keeps robots data and page deletions consistent
$db->beginTransaction();
try {
// Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
// Apply host pages limits: trim the newest pages above crawlPageLimit
$totalHostPages = $db->getTotalHostPages($host->hostId);
if ($totalHostPages > $host->crawlPageLimit) {
$hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit);
}
// Apply new robots.txt rules.
// Bugfix: the ternary condition was inverted — it discarded a freshly
// fetched robots.txt and fell back to an empty string instead of the
// defaults. Use the fetched rules when present, defaults otherwise.
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) {
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
$db->commit();
} catch (Exception $e) {
var_dump($e);
$db->rollBack();
}
}
// Debug
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
// Bugfix: '.' and '-' share precedence and associate left-to-right, so the
// unparenthesized form subtracted a float from a non-numeric string
// (TypeError on PHP 8). Parenthesize the arithmetic first.
echo 'Execution time: ' . (microtime(true) - $timeStart) . PHP_EOL;
// Bugfix: release the semaphore so the next crontab run can acquire it
sem_release($semaphore);

61
library/mysql.php

@ -38,6 +38,15 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function getTotalHosts() {
// Return the total number of registered hosts
$statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`');
$statement->execute();
$row = $statement->fetch();
return $row->total;
}
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) { public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
@ -47,6 +56,15 @@ class MySQL {
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
// Persist new robots.txt content (or null) and bump the update timestamp
$statement = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
$statement->execute([$robots, $timeUpdated, $hostId]);
// Affected rows: 1 when the host row changed, 0 otherwise
return $statement->rowCount();
}
// Pages // Pages
public function getTotalHostPages(int $hostId) { public function getTotalHostPages(int $hostId) {
@ -92,6 +110,15 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function getHostPages(int $hostId) {
// Fetch every page row belonging to the given host
$statement = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');
$statement->execute([$hostId]);
return $statement->fetchAll();
}
public function getFoundHostPage(int $hostPageId) { public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
@ -159,6 +186,40 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function deleteHostPage(int $hostPageId) {
// Remove a single page row by its primary key
$statement = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
$statement->execute([$hostPageId]);
// 1 when a row was deleted, 0 when the id did not exist
return $statement->rowCount();
}
public function deleteHostPages(int $hostId, int $limit) {
// Delete up to $limit newest pages (highest hostPageId first) of a host;
// used to enforce the per-host crawl page limit.
// LIMIT is bound as a native integer (PDO::PARAM_INT) instead of being
// concatenated into the SQL string, matching the parameterized style of
// the other queries in this class; `hostPageId` gets backticks for
// consistency with the rest of the schema references.
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ?');
$query->bindValue(1, $hostId, PDO::PARAM_INT);
$query->bindValue(2, $limit, PDO::PARAM_INT);
$query->execute();
// Number of page rows actually removed
return $query->rowCount();
}
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {
// Select hosts due for cleaning: never updated (`timeUpdated` IS NULL) or
// last updated before $timeFrom, excluding disabled hosts (status = 0).
// LIMIT is bound as a native integer (PDO::PARAM_INT) instead of being
// concatenated into the SQL, consistent with the other prepared queries.
$query = $this->_db->prepare('SELECT * FROM `host`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `host`.`status` <> 0
ORDER BY `hostId`
LIMIT ?');
$query->bindValue(1, $timeFrom, PDO::PARAM_INT);
$query->bindValue(2, $limit, PDO::PARAM_INT);
$query->execute();
return $query->fetchAll();
}
// Crawl tools // Crawl tools
public function getCrawlQueue(int $limit, int $timeFrom) { public function getCrawlQueue(int $limit, int $timeFrom) {

Loading…
Cancel
Save