Mirror of https://github.com/YGGverse/YGGo.git, synced 2025-08-26 13:51:55 +00:00
implement hosts crawl queue, move robots, sitemaps, manifests to this task
commit ab6c0379c8
parent 6ee5e53ef4
@@ -270,7 +270,7 @@ define('CRAWL_HOST_DEFAULT_NSFW', false);
 /*
  * Collect sitemap index when available
  *
- * At this moment, works with CRAWL_ROBOTS_SECONDS_OFFSET/CRAWL_ROBOTS_LIMIT options enabled only
+ * At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
  *
  * When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
  *
@@ -290,18 +290,23 @@ define('CRAWL_SITEMAPS', true);
 define('CRAWL_PAGE_RANK_UPDATE', true);
 
 /*
- * Renew robots.txt index by timing offset provided
+ * Renew hosts index by timing offset provided
  *
  */
-define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
+define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
 
 /*
- * Hosts Robots.txt processing limit in the crawler.php queue
+ * Hosts hosts processing limit in the crawler.php queue
  *
  * Set 0 to disable
  *
  */
-define('CRAWL_ROBOTS_LIMIT', 1);
+define('CRAWL_HOST_LIMIT', 1);
 
+/*
+ * Crawl robots.txt
+ */
+define('CRAWL_ROBOTS', true); // true|false
+
 /*
  * Default robots.txt rules on remote file not exists
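The two renamed options above work as a pair: CRAWL_HOST_SECONDS_OFFSET defines how stale a host's index must be before it is renewed, and CRAWL_HOST_LIMIT caps how many hosts a single crawler.php run takes from the queue (0 disables the task, per the comment). The sketch below only illustrates that selection logic as used by the crawler loop shown further down; the in-memory host list, the URLs and the strict "older than the offset" comparison are assumptions for illustration, since the real filtering happens inside MySQL::getHostCrawlQueue(), whose SQL is not part of this diff.

<?php
// Illustrative sketch, not part of the commit: what the CRAWL_HOST_* options
// mean for queue selection. Host rows and the comparison are assumed for demo.

define('CRAWL_HOST_SECONDS_OFFSET', 60 * 60 * 24 * 7); // renew hosts older than one week
define('CRAWL_HOST_LIMIT', 1);                         // hosts per crawler.php run; 0 disables

// Hypothetical stand-in for rows that getHostCrawlQueue() would return
$hosts = [
    (object) ['hostId' => 1, 'url' => 'http://host-a.example', 'timeUpdated' => time() - 60 * 60 * 24 * 10],
    (object) ['hostId' => 2, 'url' => 'http://host-b.example', 'timeUpdated' => time() - 60 * 60],
];

if (CRAWL_HOST_LIMIT > 0) { // "Set 0 to disable"

    $timeFrom = time() - CRAWL_HOST_SECONDS_OFFSET; // oldest acceptable timeUpdated

    $queue = array_filter($hosts, function ($host) use ($timeFrom) {
        return $host->timeUpdated < $timeFrom; // due for renewal
    });

    foreach (array_slice($queue, 0, CRAWL_HOST_LIMIT) as $host) {
        echo 'queued: ' . $host->url . PHP_EOL; // robots, sitemaps, manifests handled per host
    }
}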
@@ -44,11 +44,12 @@ $httpRequestsSizeTotal = 0;
 $httpDownloadSizeTotal = 0;
 $httpRequestsTimeTotal = 0;
 
+$hostsProcessed = 0;
 $hostsAdded = 0;
-$hostPagesBanned = 0;
-$hostPagesSnapAdded = 0;
 
 $hostPagesProcessed = 0;
+$hostPagesBanned = 0;
+$hostPagesSnapAdded = 0;
 $hostPagesAdded = 0;
 
 $manifestsProcessed = 0;
@@ -67,8 +68,18 @@ try {
   exit;
 }
 
-// Process robots crawl queue
-foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
+// Process hosts crawl queue
+foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
+
+  $db->beginTransaction();
+
+  try {
+
+    // Update host crawl queue
+    $hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
+
+    // Crawl robots.txt
+    if (CRAWL_ROBOTS) {
 
   // Update robots
   $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
@@ -91,6 +102,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
 
   // Update host index
   $db->updateHostRobots($host->hostId, $hostRobots, time());
+    }
 
   // Process sitemaps when enabled
   if (CRAWL_SITEMAPS) {
@@ -138,7 +150,8 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
     }
   }
 
-  // Update manifest if available for this host
+  // Update manifests
   if (CRAWL_MANIFEST) {
+    if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
 
     $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
@@ -325,6 +338,21 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
     }
   }
+
+    $db->commit();
+
+  // Process update errors
+  } catch (Exception $e) {
+
+    // Debug std
+    var_dump($e);
+
+    // Skip item
+    $db->rollBack();
+
+    continue;
+  }
+}
 
 // Process pages crawl queue
 foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
 
@@ -1207,20 +1235,21 @@ $executionTimeTotal = microtime(true) - $timeStart;
 $httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
 
 // Debug output
-echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
+echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
+echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;
 
 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
-echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
+echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL . PHP_EOL;
 
-echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
+echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL . PHP_EOL;
 
-echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
+echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL . PHP_EOL;
 
 echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
 echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
 echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
-echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
+echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL . PHP_EOL;
 
 echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
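Seen together, the crawler.php hunks above introduce one database transaction per queued host: the host is stamped via updateHostCrawlQueue(), robots.txt, sitemaps and manifests are processed, then the transaction is committed; any exception triggers a rollback and a continue, so a single failing host does not stop the queue. The sketch below reproduces only that control flow against plain PDO: the DSN, credentials and database name are placeholders, the SELECT merely stands in for getHostCrawlQueue(), and the UPDATE statement is the one added to the MySQL class further down.

<?php
// Control-flow sketch only, not part of the commit: one transaction per host,
// roll back and continue on failure. Connection settings are placeholders.

$db = new PDO('mysql:host=localhost;dbname=yggo;charset=utf8', 'user', 'password');
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

$hostsProcessed = 0;

foreach ($db->query('SELECT `hostId`, `url` FROM `host` LIMIT 1') as $host) {

  $db->beginTransaction();

  try {

    // Stamp the host so it leaves the crawl queue until the offset elapses again
    $update = $db->prepare('UPDATE `host` SET `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
    $update->execute([time(), $host['hostId']]);

    $hostsProcessed += $update->rowCount();

    // ... robots.txt, sitemap and manifest crawling happens here in crawler.php ...

    $db->commit();

  } catch (Exception $e) {

    // Skip the failed host, keep the queue moving
    $db->rollBack();

    continue;
  }
}

echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;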
@@ -667,7 +667,7 @@ class MySQL {
     return $query->rowCount();
   }
 
-  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostCrawlQueue(int $limit, int $timeFrom) {
 
     $result = [];
 
@@ -693,9 +693,19 @@ class MySQL {
     return (object) $result;
   }
+
+  public function updateHostCrawlQueue(int $hostId, int $timeUpdated) {
+
+    $query = $this->_db->prepare('UPDATE `host` SET `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $hostId]);
+
+    return $query->rowCount();
+  }
+
   public function optimize() {
 
     $this->_db->query('OPTIMIZE TABLE `host`');
     $this->_db->query('OPTIMIZE TABLE `hostSetting`');
     $this->_db->query('OPTIMIZE TABLE `hostPage`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
     $this->_db->query('OPTIMIZE TABLE `hostPageDom`');
Binary file not shown (image changed: 162 KiB before, 144 KiB after).