mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
implement database cleaner
This commit is contained in:
parent
3c9bc1adaa
commit
8e8d89db0e
@ -21,7 +21,7 @@ define('DB_PASSWORD', '');
|
||||
define('SPHINX_HOST', '127.0.0.1');
|
||||
define('SPHINX_PORT', 9306);
|
||||
|
||||
// Crawl settings
|
||||
// Crawler settings
|
||||
define('CRAWL_PAGE_LIMIT', 10);
|
||||
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
|
||||
|
||||
@ -49,4 +49,8 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
|
||||
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
|
||||
*
|
||||
*/
|
||||
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
||||
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
||||
|
||||
// Cleaner settings
|
||||
define('CLEAN_HOST_LIMIT', 20);
|
||||
define('CLEAN_HOST_SECONDS_OFFSET', 3600);
|
84
crontab/cleaner.php
Normal file
84
crontab/cleaner.php
Normal file
@ -0,0 +1,84 @@
|
||||
<?php

// crontab/cleaner.php
//
// Periodic database cleaner: for each host due for maintenance it re-fetches
// robots.txt, trims the page table down to the host's crawl page limit, and
// deletes pages no longer allowed by the effective robots rules.

// Lock multi-thread execution (non-blocking: a second cron invocation exits
// immediately instead of queueing behind the first)
$semaphore = sem_get(crc32('crontab.cleaner'), 1);

if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another thread.' . PHP_EOL;
  exit;
}

// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/mysql.php');

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Debug counters
$timeStart = microtime(true);

$hostsTotal        = $db->getTotalHosts();
$hostsUpdated      = 0;
$hostsPagesDeleted = 0;

// Get host queue: hosts not updated within CLEAN_HOST_SECONDS_OFFSET seconds
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {

  // Build host base URL; port segment is optional
  // (was `: false` — empty string is the explicit form of the same behavior)
  $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : '');

  // Get robots.txt if exists; accept only a 200 response that actually looks
  // like a robots file (contains a user-agent rule)
  $curl = new Curl($hostURL . '/robots.txt');

  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
    $hostRobots = $curl->getContent();
  } else {
    $hostRobots = null;
  }

  // Begin update
  $db->beginTransaction();

  try {

    // Update host data
    $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());

    // Apply host pages limit: drop newest pages beyond crawlPageLimit
    $totalHostPages = $db->getTotalHostPages($host->hostId);

    if ($totalHostPages > $host->crawlPageLimit) {

      $hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit);
    }

    // Apply new robots.txt rules
    // FIX: the original condition was inverted (`!$hostRobots ? $hostRobots : …`),
    // which threw away the freshly fetched robots.txt exactly when one existed.
    // Correct order: prefer the fetched rules, fall back to the configured defaults.
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    foreach ($db->getHostPages($host->hostId) as $hostPage) {

      if (!$robots->uriAllowed($hostPage->uri)) {

        $hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
      }
    }

    $db->commit();

  } catch (Exception $e) {

    // Keep the dump for cron log inspection; roll back the whole host update
    var_dump($e);

    $db->rollBack();
  }
}

// Debug output
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;

// FIX: parenthesized — without parens this is a PHP 7 type error, and only
// works on PHP 8 because the `.` operator's precedence was lowered there
echo 'Execution time: ' . (microtime(true) - $timeStart) . PHP_EOL;
|
@ -38,6 +38,15 @@ class MySQL {
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
/**
 * Count all rows in the `host` table.
 *
 * @return int total number of registered hosts
 */
public function getTotalHosts() {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`');

  $statement->execute();

  // Single aggregate row; expose the scalar only
  $row = $statement->fetch();

  return $row->total;
}
|
||||
|
||||
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
@ -47,6 +56,15 @@ class MySQL {
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
|
||||
/**
 * Store the latest robots.txt payload for a host and stamp the update time.
 *
 * @param int   $hostId      target host primary key
 * @param mixed $robots      robots.txt body, or null when none was fetched
 * @param int   $timeUpdated unix timestamp of this cleanup pass
 *
 * @return int affected row count (0 or 1 — query is LIMIT 1)
 */
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {

  $statement = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');

  $statement->execute([$robots, $timeUpdated, $hostId]);

  return $statement->rowCount();
}
|
||||
|
||||
// Pages
|
||||
public function getTotalHostPages(int $hostId) {
|
||||
|
||||
@ -92,6 +110,15 @@ class MySQL {
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
/**
 * Fetch every page row belonging to a host.
 *
 * @param int $hostId target host primary key
 *
 * @return array all `hostPage` rows for the host (may be empty)
 */
public function getHostPages(int $hostId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');

  $statement->execute([$hostId]);

  return $statement->fetchAll();
}
|
||||
|
||||
public function getFoundHostPage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
||||
@ -159,6 +186,40 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Delete a single page row by its primary key.
 *
 * @param int $hostPageId page primary key
 *
 * @return int affected row count (0 or 1 — query is LIMIT 1)
 */
public function deleteHostPage(int $hostPageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');

  $statement->execute([$hostPageId]);

  return $statement->rowCount();
}
|
||||
|
||||
/**
 * Delete up to $limit pages of a host, newest first (highest hostPageId).
 *
 * Used by the cleaner to trim a host back down to its crawlPageLimit.
 *
 * NOTE: LIMIT is concatenated (after an (int) cast, so injection-safe)
 * rather than bound, because MySQL rejects a placeholder in LIMIT under
 * non-emulated PDO prepares.
 *
 * @param int $hostId target host primary key
 * @param int $limit  maximum number of pages to delete
 *
 * @return int number of rows actually deleted
 */
public function deleteHostPages(int $hostId, int $limit) {

  // `hostPageId` backtick-quoted for consistency with every other
  // identifier in this class (was bare in the ORDER BY clause)
  $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);

  $query->execute([$hostId]);

  return $query->rowCount();
}
|
||||
|
||||
// Cleaner tools
|
||||
/**
 * Select hosts due for cleanup: active hosts (`status` <> 0) that were
 * never updated, or were last updated before $timeFrom.
 *
 * @param int $limit    maximum number of hosts to return
 * @param int $timeFrom unix timestamp cutoff for `timeUpdated`
 *
 * @return array matching `host` rows, ordered by `hostId`
 */
public function getCleanerQueue(int $limit, int $timeFrom) {

  // LIMIT is int-cast and concatenated; the cutoff is bound as usual
  $sql = 'SELECT * FROM `host`
          WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> 0
          ORDER BY `hostId`
          LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$timeFrom]);

  return $statement->fetchAll();
}
|
||||
|
||||
// Crawl tools
|
||||
public function getCrawlQueue(int $limit, int $timeFrom) {
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user