mirror of https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
add image queue crawler
This commit is contained in:
parent d905e33b4f
commit 9ed8411d2f
@@ -94,19 +94,44 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
  */
 define('CRAWL_PAGE_LIMIT', 10);
 
+/*
+ * Images (URI) processing limit in the crawler.php queue
+ *
+ * This option relates to the CRAWL_IMAGE_SECONDS_OFFSET value
+ * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
+ *
+ * Usually up to 20 images per minute,
+ * to prevent overloading websites with GET crawling requests
+ *
+ */
+define('CRAWL_IMAGE_LIMIT', 20);
+
 /*
  * Renew page index by timing offset provided
  *
  * This option works with the CRAWL_PAGE_LIMIT queue step
  *
  * Pay attention that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
- * must have enought value to crawl all pages collected in the DB index
+ * must have enough value to crawl all pages collected in the DB index
  *
  * or the crawler can get stuck in the queue
  *
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Renew image index by timing offset provided
+ *
+ * This option works with the CRAWL_IMAGE_LIMIT queue step
+ *
+ * Pay attention that the CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
+ * must have enough value to crawl all images collected in the DB index
+ *
+ * or the crawler can get stuck in the queue
+ *
+ */
+define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
+
 /*
  * Only URL addresses matching this rule will be auto-crawled
  *
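To make the comments above concrete: the image queue handles at most CRAWL_IMAGE_LIMIT fetches per crawler run, and the offset window must be long enough for the queue to cycle the whole image index, or the queue starves. A minimal sketch of both checks, assuming config/app.php is loaded and using a hypothetical one-minute crontab period and index size (both values are illustrative, not part of this commit):

<?php

require_once 'config/app.php'; // path assumed; adjust to the install layout

$cronPeriodSeconds = 60;      // hypothetical: crawler.php runs once per minute
$totalImages       = 1000000; // hypothetical hostImage index size

// Throughput ceiling: 20 images/run * 1440 runs/day = 28800 images/day
echo 'Max images per day: ' . (CRAWL_IMAGE_LIMIT * 86400 / $cronPeriodSeconds) . PHP_EOL;

// Seconds needed to revisit every indexed image once
$secondsToCycle = ceil($totalImages / CRAWL_IMAGE_LIMIT) * $cronPeriodSeconds;

// Here: 50000 runs * 60 s = 3000000 s (~35 days), well inside the ~360-day default offset
if ($secondsToCycle > CRAWL_IMAGE_SECONDS_OFFSET) {
  echo 'Warning: the queue cannot renew the image index within the offset window' . PHP_EOL;
}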
@@ -216,7 +241,7 @@ define('CLEAN_HOST_LIMIT', 20);
  * This option works with the CLEAN_HOST_LIMIT queue step
  *
  * Pay attention that the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
- * must have enought value to process all pages in the DB index
+ * must have enough value to process all pages in the DB index
  *
  * or the cleaner can get stuck in the queue
  *
@@ -27,17 +27,60 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
 // Debug
 $timeStart = microtime(true);
 
-$hostPagesProcessed = 0;
-$hostPagesIndexed = 0;
-$hostPagesAdded = 0;
-$hostImagesAdded = 0;
-$hostsAdded = 0;
+$hostPagesProcessed = 0;
+$hostImagesProcessed = 0;
+$hostPagesIndexed = 0;
+$hostImagesIndexed = 0;
+$hostPagesAdded = 0;
+$hostImagesAdded = 0;
+$hostsAdded = 0;
 
 // Connect database
 $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
 
-// Process crawl queue
-foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
+// Process images crawl queue
+foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
+
+  // Build URL from the DB
+  $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
+
+  $curl = new Curl($queueHostImageURL);
+
+  // Update image index anyway, with the current time and HTTP code
+  $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
+
+  // Skip further image processing on a non-200 code
+  if (200 != $curl->getCode()) {
+
+    continue;
+  }
+
+  // Save image content when meta-only mode is disabled
+  if (!CRAWL_HOST_DEFAULT_META_ONLY) {
+
+    // Skip images that returned no data
+    if (!$content = $curl->getContent()) {
+
+      continue;
+    }
+
+    // Convert remote image data to a base64 string to prevent direct URL calls
+    if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
+
+      continue;
+    }
+
+    if (!$hostImageBase64 = @base64_encode($content)) {
+
+      continue;
+    }
+
+    $hostImagesIndexed += $db->updateHostImageData($queueHostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
+  }
+}
+
+// Process pages crawl queue
+foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
 
   // Build URL from the DB
   $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
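A note on the base64 branch above: storing the image as a data URI lets result pages render previews without contacting the origin host again. For a PNG fetched from a hypothetical https://example.com/logo.png, the value written by updateHostImageData would look like this (payload truncated for illustration):

data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... (truncated)

One caveat: deriving the type from the URL extension yields data:image/jpg for .jpg files, which is not a registered MIME subtype (image/jpeg is), so sniffing the Content-Type from the cURL response would be more robust.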
@@ -45,7 +88,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
   $curl = new Curl($queueHostPageURL);
 
   // Update page index anyway, with the current time and HTTP code
-  $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
+  $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
   // Skip further page processing on a non-200 code
   if (200 != $curl->getCode()) {
@@ -427,6 +470,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
+echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
+echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
+echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
 echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
 echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;
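With the three new counters, a debug run of crawler.php now prints a summary along these lines (the numbers are illustrative, not from an actual run):

Pages processed: 10
Pages indexed: 8
Pages added: 12
Images processed: 20
Images indexed: 17
Images added: 34
Hosts added: 1
Total time: 2.4136359691620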
@@ -502,7 +502,7 @@ class MySQL {
   }
 
   // Crawl tools
-  public function getCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
 
     $query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
                                          `hostPage`.`hostPageId`,
@@ -530,7 +530,7 @@ class MySQL {
     return $query->fetchAll();
   }
 
-  public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) {
+  public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {
 
     $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');
 
@@ -538,4 +538,36 @@ class MySQL {
 
     return $query->rowCount();
   }
+
+  public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
+                                         `hostImage`.`hostImageId`,
+                                         `hostImage`.`uri`,
+                                         `host`.`scheme`,
+                                         `host`.`name`,
+                                         `host`.`port`
+
+                                  FROM `hostImage`
+                                  JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
+
+                                  WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ?) AND `host`.`status` <> 0
+
+                                  ORDER BY `hostImage`.`hostImageId`
+
+                                  LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom]);
+
+    return $query->fetchAll();
+  }
+
+  public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
+
+    $query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $httpCode, $hostImageId]);
+
+    return $query->rowCount();
+  }
 }
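For reference, a minimal standalone use of the two new MySQL methods, assuming the same constants and file layout as crawler.php (the require paths, limit, offset, and HTTP code below are illustrative):

<?php

require_once 'config/app.php';    // paths assumed; adjust to the install layout
require_once 'library/mysql.php';

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Select up to 20 images never crawled, or last crawled over an hour ago
foreach ($db->getHostImageCrawlQueue(20, time() - 3600) as $hostImage) {

  // Mark the row visited, here with a placeholder HTTP code
  $db->updateHostImageCrawlQueue($hostImage->hostImageId, time(), 200);
}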
@@ -1,7 +1,7 @@
 <?php
 
 // Current version
-define('API_VERSION', 0.3);
+define('API_VERSION', 0.4);
 
 // Load system dependencies
 require_once('../config/app.php');
@@ -132,6 +132,7 @@ if (API_ENABLED) {
       'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
       'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
       'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
+      'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
       'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
       'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
       'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
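Assuming the manifest array above is serialized to JSON by the API endpoint, a consumer of API_VERSION 0.4 now sees the image offset next to the page offset, e.g. (other keys omitted; values are the config defaults, 60*60*24*30*12 = 31104000):

"crawlHostPageSecondsOffset": 31104000,
"crawlHostImageSecondsOffset": 31104000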