add image queue crawler

This commit is contained in:
ghost 2023-05-04 06:45:04 +03:00
parent d905e33b4f
commit 9ed8411d2f
4 changed files with 116 additions and 13 deletions

View File

@ -94,19 +94,44 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/ */
define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_LIMIT', 10);
/*
* Images (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_IMAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
*/
define('CRAWL_IMAGE_LIMIT', 20);
/* /*
* Renew page index by timing offset provided * Renew page index by timing offset provided
* *
* This option works with CRAWL_PAGE_LIMIT step queue * This option works with CRAWL_PAGE_LIMIT step queue
* *
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enought value to crawl all pages collected in the DB index * must have enough value to crawl all pages collected in the DB index
* *
* or the crawler can stuck in queue * or the crawler can stuck in queue
* *
*/ */
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Renew image index by timing offset provided
*
* This option works with CRAWL_IMAGE_LIMIT step queue
*
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
* must have enough value to crawl all images collected in the DB index
*
* or the crawler can stuck in queue
*
*/
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
/* /*
* Only URL addresses match this rule will be auto-crawled * Only URL addresses match this rule will be auto-crawled
* *
@ -216,7 +241,7 @@ define('CLEAN_HOST_LIMIT', 20);
* This option works with CLEAN_HOST_LIMIT step queue * This option works with CLEAN_HOST_LIMIT step queue
* *
* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair * Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
* must have enought value to process all pages in the DB index * must have enough value to process all pages in the DB index
* *
* or the cleaner can stuck in queue * or the cleaner can stuck in queue
* *

View File

@ -28,7 +28,9 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
$timeStart = microtime(true); $timeStart = microtime(true);
$hostPagesProcessed = 0; $hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$hostPagesIndexed = 0; $hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostImagesAdded = 0; $hostImagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
@ -36,8 +38,49 @@ $hostsAdded = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue // Process images crawl queue
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
$curl = new Curl($queueHostImageURL);
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip next image processing non 200 code
if (200 != $curl->getCode()) {
continue;
}
// Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip next image processing images without returned data
if (!$content = $curl->getContent()) {
continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
continue;
}
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
continue;
}
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
}
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Build URL from the DB // Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@ -45,7 +88,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$curl = new Curl($queueHostPageURL); $curl = new Curl($queueHostPageURL);
// Update page index anyway, with the current time and http code // Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip next page processing non 200 code // Skip next page processing non 200 code
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
@ -427,6 +470,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL; echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;

View File

@ -502,7 +502,7 @@ class MySQL {
} }
// Crawl tools // Crawl tools
public function getCrawlQueue(int $limit, int $timeFrom) { public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`, $query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`, `hostPage`.`hostPageId`,
@ -530,7 +530,7 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) { public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1'); $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');
@ -538,4 +538,36 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
`hostImage`.`hostImageId`,
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
ORDER BY `hostImage`.`hostImageId`
LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $hostImageId]);
return $query->rowCount();
}
} }

View File

@ -1,7 +1,7 @@
<?php <?php
// Current version // Current version
define('API_VERSION', 0.3); define('API_VERSION', 0.4);
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
@ -132,6 +132,7 @@ if (API_ENABLED) {
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,