mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-09 13:24:37 +00:00
implement database cleaner
This commit is contained in:
parent
3c9bc1adaa
commit
8e8d89db0e
@ -21,7 +21,7 @@ define('DB_PASSWORD', '');
|
|||||||
define('SPHINX_HOST', '127.0.0.1');
|
define('SPHINX_HOST', '127.0.0.1');
|
||||||
define('SPHINX_PORT', 9306);
|
define('SPHINX_PORT', 9306);
|
||||||
|
|
||||||
// Crawl settings
|
// Crawler settings
|
||||||
define('CRAWL_PAGE_LIMIT', 10);
|
define('CRAWL_PAGE_LIMIT', 10);
|
||||||
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
|
define('CRAWL_PAGE_SECONDS_OFFSET', 3600);
|
||||||
|
|
||||||
@ -49,4 +49,8 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
|
|||||||
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
|
* yggdrasil: /database/yggdrasil/host.robotsPostfix.md
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
||||||
|
|
||||||
|
// Cleaner settings
|
||||||
|
// Maximum number of hosts requested from the cleaner queue per run
// (passed as the row limit to getCleanerQueue() by crontab/cleaner.php)
define('CLEAN_HOST_LIMIT', 20);
|
||||||
|
// Age threshold in seconds: the cleaner queries hosts with time() minus this value,
// selecting hosts whose timeUpdated is NULL or older than that moment
define('CLEAN_HOST_SECONDS_OFFSET', 3600);
|
84
crontab/cleaner.php
Normal file
84
crontab/cleaner.php
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
<?php

/*
 * Database cleaner (crontab task)
 *
 * For every host returned by the cleaner queue this script:
 *  - requests the host's robots.txt and stores the result with the current time;
 *  - deletes pages exceeding the host's crawlPageLimit;
 *  - deletes stored pages whose URI is not allowed by the effective robots rules.
 *
 * Each host is processed in its own database transaction.
 */

// Lock multi-thread execution
$semaphore = sem_get(crc32('crontab.cleaner'), 1);

if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another thread.' . PHP_EOL;
  exit;
}

// Load system dependencies
// Paths are anchored to this file's directory, so the task does not depend on
// the working directory it is started from (cron rarely starts it in crontab/)
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/mysql.php');

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Debug
$timeStart = microtime(true);

$hostsTotal        = $db->getTotalHosts();
$hostsUpdated      = 0;
$hostsPagesDeleted = 0;

// Get host queue
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {

  // Parse host info (the port part is omitted when the host has no port set)
  $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : '');

  // Get robots.txt if exists
  $curl = new Curl($hostURL . '/robots.txt');

  if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
    $hostRobots = $curl->getContent();
  } else {
    $hostRobots = null;
  }

  // Begin update
  $db->beginTransaction();

  try {

    // Per-host counters: merged into the report only after a successful commit,
    // so rolled back changes are never reported as done
    $updatedInTransaction = 0;
    $deletedInTransaction = 0;

    // Update host data
    $updatedInTransaction += $db->updateHostRobots($host->hostId, $hostRobots, time());

    // Apply host pages limits
    $totalHostPages = $db->getTotalHostPages($host->hostId);

    if ($totalHostPages > $host->crawlPageLimit) {

      $deletedInTransaction += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit);
    }

    // Apply new robots.txt rules:
    // the fetched robots.txt when available, the configured defaults otherwise,
    // followed by the host postfix rules (or the configured postfix defaults)
    $robots = new Robots(
      ($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) .
      PHP_EOL .
      ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)
    );

    foreach ($db->getHostPages($host->hostId) as $hostPage) {

      if (!$robots->uriAllowed($hostPage->uri)) {

        $deletedInTransaction += $db->deleteHostPage($hostPage->hostPageId);
      }
    }

    $db->commit();

    $hostsUpdated      += $updatedInTransaction;
    $hostsPagesDeleted += $deletedInTransaction;

  } catch(Exception $e){

    var_dump($e);

    $db->rollBack();
  }
}

// Debug
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;

// Parentheses make the subtraction explicit instead of relying on operator precedence
echo 'Execution time: ' . (microtime(true) - $timeStart) . PHP_EOL;
|
@ -38,6 +38,15 @@ class MySQL {
|
|||||||
return $query->fetch();
|
return $query->fetch();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Count all rows of the `host` table.
 *
 * @return mixed value of the `total` column of the COUNT(*) result row
 */
public function getTotalHosts() {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`');
  $statement->execute();

  $row = $statement->fetch();

  return $row->total;
}
|
||||||
|
|
||||||
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
|
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
@ -47,6 +56,15 @@ class MySQL {
|
|||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Store new robots rules and the update time for a single host.
 *
 * @param int   $hostId      value matched against `host`.`hostId`
 * @param mixed $robots      value written to the `robots` column (the cleaner passes a string or null)
 * @param int   $timeUpdated value written to the `timeUpdated` column
 *
 * @return int row count reported by the executed statement
 */
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {

  $statement = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');

  // Parameter order follows the placeholders: robots, timeUpdated, hostId
  $parameters = [$robots, $timeUpdated, $hostId];

  $statement->execute($parameters);

  return $statement->rowCount();
}
|
||||||
|
|
||||||
// Pages
|
// Pages
|
||||||
public function getTotalHostPages(int $hostId) {
|
public function getTotalHostPages(int $hostId) {
|
||||||
|
|
||||||
@ -92,6 +110,15 @@ class MySQL {
|
|||||||
return $query->fetch();
|
return $query->fetch();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Select every `hostPage` row that belongs to the given host.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 *
 * @return mixed result of fetchAll() on the executed statement
 */
public function getHostPages(int $hostId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');
  $statement->execute([$hostId]);

  $hostPages = $statement->fetchAll();

  return $hostPages;
}
|
||||||
|
|
||||||
public function getFoundHostPage(int $hostPageId) {
|
public function getFoundHostPage(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
||||||
@ -159,6 +186,40 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Delete a single page by its identifier.
 *
 * @param int $hostPageId value matched against `hostPage`.`hostPageId`
 *
 * @return int row count reported by the executed statement
 */
public function deleteHostPage(int $hostPageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');
  $statement->execute([$hostPageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||||
|
|
||||||
|
/**
 * Delete up to $limit pages of a host, starting from the highest hostPageId.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 * @param int $limit  maximum number of rows to delete
 *
 * @return int row count reported by the executed statement
 */
public function deleteHostPages(int $hostId, int $limit) {

  // The limit is appended as an integer literal instead of being bound as a placeholder
  $sql = 'DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);
  $statement->execute([$hostId]);

  return $statement->rowCount();
}
|
||||||
|
|
||||||
|
// Cleaner tools
|
||||||
|
/**
 * Select hosts that are due for cleaning.
 *
 * A host is selected when its `timeUpdated` is NULL or lower than $timeFrom
 * and its `status` is not 0. Rows are ordered by `hostId`.
 *
 * @param int $limit    maximum number of rows; appended to the query as an integer literal
 * @param int $timeFrom value bound to the `timeUpdated` < ? comparison
 *
 * @return mixed result of fetchAll() on the executed statement
 */
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||||
|
|
||||||
|
$query = $this->_db->prepare('SELECT * FROM `host`
|
||||||
|
|
||||||
|
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> 0
|
||||||
|
|
||||||
|
ORDER BY `hostId`
|
||||||
|
|
||||||
|
LIMIT ' . (int) $limit);
|
||||||
|
|
||||||
|
// $timeFrom fills the only placeholder of the query
$query->execute([$timeFrom]);
|
||||||
|
|
||||||
|
return $query->fetchAll();
|
||||||
|
}
|
||||||
|
|
||||||
// Crawl tools
|
// Crawl tools
|
||||||
public function getCrawlQueue(int $limit, int $timeFrom) {
|
public function getCrawlQueue(int $limit, int $timeFrom) {
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user