mirror of https://github.com/YGGverse/YGGo.git
synced 2025-01-08 22:07:56 +00:00

add image queue crawler

This commit is contained in:
parent d905e33b4f
commit 9ed8411d2f
config/app.php

@@ -94,19 +94,44 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
 */
 define('CRAWL_PAGE_LIMIT', 10);
 
+/*
+ * Images (URI) processing limit in the crawler.php queue
+ *
+ * This option relates to the CRAWL_IMAGE_SECONDS_OFFSET value
+ * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
+ *
+ * Usually up to 20 images per minute,
+ * to prevent overloading websites with GET crawling requests
+ *
+ */
+define('CRAWL_IMAGE_LIMIT', 20);
+
 /*
  * Renew page index by timing offset provided
  *
  * This option works with the CRAWL_PAGE_LIMIT step queue
  *
  * Pay attention that the CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
- * must have enought value to crawl all pages collected in the DB index
+ * must have enough value to crawl all pages collected in the DB index
  *
  * or the crawler can get stuck in the queue
  *
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Renew image index by timing offset provided
+ *
+ * This option works with the CRAWL_IMAGE_LIMIT step queue
+ *
+ * Pay attention that the CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
+ * must have enough value to crawl all images collected in the DB index
+ *
+ * or the crawler can get stuck in the queue
+ *
+ */
+define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
+
 /*
  * Only URL addresses matching this rule will be auto-crawled
  *
@@ -216,7 +241,7 @@ define('CLEAN_HOST_LIMIT', 20);
  * This option works with the CLEAN_HOST_LIMIT step queue
  *
  * Pay attention that the CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
- * must have enought value to process all pages in the DB index
+ * must have enough value to process all pages in the DB index
  *
  * or the cleaner can get stuck in the queue
  *
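To see why each limit/offset pair matters, here is a minimal sketch of the capacity arithmetic, assuming the crontab task runs crawler.php once per minute (the README link above documents the crontab setup; the one-minute frequency here is an assumption):

<?php
// Sketch: how many images the queue can revisit within one offset window,
// assuming crawler.php runs once per minute via crontab (assumption).
$crawlImageLimit         = 20;             // CRAWL_IMAGE_LIMIT
$crawlImageSecondsOffset = 60*60*24*30*12; // 31,104,000 s (~1 year)

$runsPerWindow = $crawlImageSecondsOffset / 60;     // 518,400 crontab runs
$queueCapacity = $runsPerWindow * $crawlImageLimit; // 10,368,000 images

// If the hostImage table grows beyond this, the crawler "can get stuck
// in the queue": rows become re-eligible faster than they can be visited.
echo $queueCapacity . PHP_EOL;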
crawler.php

@@ -28,7 +28,9 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
 $timeStart = microtime(true);
 
 $hostPagesProcessed = 0;
+$hostImagesProcessed = 0;
 $hostPagesIndexed = 0;
+$hostImagesIndexed = 0;
 $hostPagesAdded = 0;
 $hostImagesAdded = 0;
 $hostsAdded = 0;

@@ -36,8 +38,49 @@ $hostsAdded = 0;
 // Connect database
 $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
 
-// Process crawl queue
-foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
+// Process images crawl queue
+foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
+
+  // Build URL from the DB
+  $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
+
+  $curl = new Curl($queueHostImageURL);
+
+  // Update image index anyway, with the current time and HTTP code
+  $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
+
+  // Skip further image processing on non-200 code
+  if (200 != $curl->getCode()) {
+
+    continue;
+  }
+
+  // Save image content when data saving enabled
+  if (!CRAWL_HOST_DEFAULT_META_ONLY) {
+
+    // Skip further image processing when no data returned
+    if (!$content = $curl->getContent()) {
+
+      continue;
+    }
+
+    // Convert remote image data to a base64 string to prevent direct URL calls
+    if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
+
+      continue;
+    }
+
+    if (!$hostImageBase64 = @base64_encode($content)) {
+
+      continue;
+    }
+
+    $hostImagesIndexed += $db->updateHostImageData($queueHostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
+  }
+}
+
+// Process pages crawl queue
+foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
 
   // Build URL from the DB
   $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@@ -45,7 +88,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
   $curl = new Curl($queueHostPageURL);
 
   // Update page index anyway, with the current time and HTTP code
-  $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
+  $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
   // Skip further page processing on non-200 code
   if (200 != $curl->getCode()) {
@@ -427,6 +470,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
 echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
 echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
+echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
+echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
 echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
 echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
 echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;
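The base64 step above stores each crawled image as a data URI, so search results never need to call the origin URL directly. A minimal sketch of the stored value, using a hypothetical local file in place of the downloaded content:

<?php
// Sketch: build the same data URI the crawler stores, from a local file.
// 'example.png' is a hypothetical stand-in for the fetched image content.
$hostImageType   = pathinfo('example.png', PATHINFO_EXTENSION); // "png"
$hostImageBase64 = base64_encode(file_get_contents('example.png'));

// Stored as e.g. "data:image/png;base64,iVBORw0KGgoAAAANS..."
echo 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64 . PHP_EOL;

Note the crawler derives the image type from the URL extension via pathinfo(), so extensionless image URLs are skipped rather than content-sniffed.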
library/mysql.php

@@ -502,7 +502,7 @@ class MySQL {
   }
 
   // Crawl tools
-  public function getCrawlQueue(int $limit, int $timeFrom) {
+  public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
 
     $query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
                                          `hostPage`.`hostPageId`,

@@ -530,7 +530,7 @@ class MySQL {
     return $query->fetchAll();
   }
 
-  public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) {
+  public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {
 
     $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');
 

@@ -538,4 +538,36 @@ class MySQL {
 
     return $query->rowCount();
   }
 
+  public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
+                                         `hostImage`.`hostImageId`,
+                                         `hostImage`.`uri`,
+                                         `host`.`scheme`,
+                                         `host`.`name`,
+                                         `host`.`port`
+
+                                  FROM `hostImage`
+                                  JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
+
+                                  WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ?) AND `host`.`status` <> 0
+
+                                  ORDER BY `hostImage`.`hostImageId`
+
+                                  LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom]);
+
+    return $query->fetchAll();
+  }
+
+  public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
+
+    $query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $httpCode, $hostImageId]);
+
+    return $query->rowCount();
+  }
 }
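Note that LIMIT is concatenated with an (int) cast rather than bound as a placeholder, a common workaround for PDO's handling of parameters in LIMIT clauses. Taken together, a minimal sketch of how crawler.php consumes the two new queue methods (abbreviated; the full loop with content indexing is in the crawler.php hunk above):

<?php
// Sketch: drain one batch of the image crawl queue (abbreviated).
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $hostImage) {

  // Batch is ordered by hostImageId; NULL timeUpdated means never crawled
  $curl = new Curl($hostImage->scheme . '://' . $hostImage->name . ($hostImage->port ? ':' . $hostImage->port : false) . $hostImage->uri);

  // Stamp the row so it leaves the queue until the offset elapses again
  $db->updateHostImageCrawlQueue($hostImage->hostImageId, time(), $curl->getCode());
}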
public/api.php

@@ -1,7 +1,7 @@
 <?php
 
 // Current version
-define('API_VERSION', 0.3);
+define('API_VERSION', 0.4);
 
 // Load system dependencies
 require_once('../config/app.php');

@@ -132,6 +132,7 @@ if (API_ENABLED) {
 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
+'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
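Clients can pick up the new field from the API manifest; a hedged sketch (the endpoint URL and response shape below are assumptions for illustration, not confirmed by this diff):

<?php
// Sketch: read the new manifest field from a YGGo instance.
// The URL and response envelope are assumptions for illustration only.
$response = json_decode(file_get_contents('http://127.0.0.1/api.php?action=manifest'));

// Expected to expose the image offset alongside the page offset, e.g.
// crawlHostImageSecondsOffset => 31104000
var_dump($response);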