
add image queue crawler

main
ghost committed 2 years ago
commit 9ed8411d2f
  1. config/app.php.txt (29)
  2. crontab/crawler.php (51)
  3. library/mysql.php (36)
  4. public/api.php (3)

config/app.php.txt (29)

@@ -94,19 +94,44 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/
define('CRAWL_PAGE_LIMIT', 10);
/*
* Images (URI) processing limit in the crawler.php queue
*
* This option is related to the CRAWL_IMAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 images per minute,
* to prevent website overload from GET crawling requests
*
*/
define('CRAWL_IMAGE_LIMIT', 20);
/*
* Renew page index by timing offset provided
*
* This option works with CRAWL_PAGE_LIMIT step queue
*
* Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair
* must have enought value to crawl all pages collected in the DB index
* must have enough value to crawl all pages collected in the DB index
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Renew image index by timing offset provided
*
* This option works with CRAWL_IMAGE_LIMIT step queue
*
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
* must have enough value to crawl all images collected in the DB index
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Only URL addresses that match this rule will be auto-crawled
*
@@ -216,7 +241,7 @@ define('CLEAN_HOST_LIMIT', 20);
* This option works with CLEAN_HOST_LIMIT step queue
*
* Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair
* must have enought value to process all pages in the DB index
* must have enough value to process all pages in the DB index
*
* or the cleaner can get stuck in the queue
*
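The sizing rule spelled out in the comments above can be checked mechanically. Below is a minimal PHP sketch of that check, not part of the commit: it assumes config/app.php has already been configured from this template, that crontab launches crawler.php once per minute, and that the total image count is supplied by hand (the value below is hypothetical).

<?php
// Sizing sketch for the new image queue options (illustration only).
// Assumptions: config/app.php is loaded, crawler.php runs once per minute,
// and $totalImages is a hypothetical hostImage row count.
require_once('config/app.php');

$runsPerWindow   = CRAWL_IMAGE_SECONDS_OFFSET / 60;    // crontab runs inside one renew window
$imagesPerWindow = CRAWL_IMAGE_LIMIT * $runsPerWindow; // image URIs the queue can revisit in that time
$totalImages     = 1000000;                            // hypothetical index size

if ($totalImages > $imagesPerWindow) {
  echo 'Image queue cannot keep up: raise CRAWL_IMAGE_LIMIT or CRAWL_IMAGE_SECONDS_OFFSET' . PHP_EOL;
}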

crontab/crawler.php (51)

@@ -28,7 +28,9 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
$timeStart = microtime(true);
$hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0;
@@ -36,8 +38,49 @@ $hostsAdded = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
$curl = new Curl($queueHostImageURL);
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip further processing of this image on non-200 code
if (200 != $curl->getCode()) {
continue;
}
// Save image content when meta-only indexing is disabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip images that returned no data
if (!$content = $curl->getContent()) {
continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
continue;
}
if (!$hostImageBase64 = @base64_encode($content)) {
continue;
}
$hostImagesIndexed += $db->updateHostImageData($queueHostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
}
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@@ -45,7 +88,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$curl = new Curl($queueHostPageURL);
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip further processing of this page on non-200 code
if (200 != $curl->getCode()) {
@@ -427,6 +470,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;
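The value written by updateHostImageData is a plain data URI, so search results can embed the image without a second request to the origin host. A short sketch of the string the loop above produces, with a hypothetical image URL and placeholder bytes standing in for Curl::getContent():

<?php
// Illustration of the stored format only; the URL and bytes are hypothetical.
$queueHostImageURL = 'http://example.ygg/images/logo.jpg';
$content           = "\xFF\xD8\xFF\xE0";                               // placeholder for Curl::getContent()
$hostImageType     = pathinfo($queueHostImageURL, PATHINFO_EXTENSION); // 'jpg'
$hostImageBase64   = base64_encode($content);
echo 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64 . PHP_EOL;
// Note: the subtype comes from the file extension, so 'jpg' is stored as-is
// rather than the canonical MIME subtype 'jpeg'.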

library/mysql.php (36)

@@ -502,7 +502,7 @@ class MySQL {
}
// Crawl tools
public function getCrawlQueue(int $limit, int $timeFrom) {
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`,
@@ -530,7 +530,7 @@ class MySQL {
return $query->fetchAll();
}
public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) {
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1');
@@ -538,4 +538,36 @@ class MySQL {
return $query->rowCount();
}
public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
`hostImage`.`hostImageId`,
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
ORDER BY `hostImage`.`hostImageId`
LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $hostImageId]);
return $query->rowCount();
}
}
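The two new methods mirror the existing page-queue pair: getHostImageCrawlQueue selects images that were never crawled (timeUpdated IS NULL) or are older than the given timestamp, skipping disabled hosts, and updateHostImageCrawlQueue stamps the visit time and HTTP code. A stripped-down usage sketch outside crawler.php, assuming the DB_* constants and include paths are configured as in the rest of the project:

<?php
// Usage sketch for the new image-queue methods (paths and constants assumed
// to be configured as elsewhere in the project).
require_once('config/app.php');
require_once('library/mysql.php');

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $hostImage) {

  // 200 is a placeholder; crawler.php passes the real HTTP code from Curl
  $db->updateHostImageCrawlQueue($hostImage->hostImageId, time(), 200);
}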

public/api.php (3)

@@ -1,7 +1,7 @@
<?php
// Current version
define('API_VERSION', 0.3);
define('API_VERSION', 0.4);
// Load system dependencies
require_once('../config/app.php');
@@ -132,6 +132,7 @@ if (API_ENABLED) {
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
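With API_VERSION bumped to 0.4, remote instances can discover the new image re-crawl window alongside the existing fields. A hypothetical consumer sketch; the node URL and the flat JSON shape of the manifest response are assumptions, not something this diff confirms:

<?php
// Hypothetical consumer of the manifest (URL and response shape assumed).
$manifest = json_decode(file_get_contents('http://example.ygg/api.php'), true);

if (isset($manifest['crawlHostImageSecondsOffset'])) {
  echo 'Remote image index renews every ' . $manifest['crawlHostImageSecondsOffset'] . ' seconds' . PHP_EOL;
}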
