diff --git a/config/app.php.txt b/config/app.php.txt index 7fff029..f4e4a1d 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -94,19 +94,44 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500); */ define('CRAWL_PAGE_LIMIT', 10); +/* + * Images (URI) processing limit in the crawler.php queue + * + * This option is related to CRAWL_IMAGE_SECONDS_OFFSET value + * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) + * + * Usually up to 20 images per minute, + * to prevent websites overload by sending GET crawling requests + * + */ +define('CRAWL_IMAGE_LIMIT', 20); + /* * Renew page index by timing offset provided * * This option works with CRAWL_PAGE_LIMIT step queue * * Pay attention, that CRAWL_PAGE_LIMIT + CRAWL_PAGE_SECONDS_OFFSET pair - * must have enought value to crawl all pages collected in the DB index + * must have enough value to crawl all pages collected in the DB index * * or the crawler can stuck in queue * */ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); +/* + * Renew image index by timing offset provided + * + * This option works with CRAWL_IMAGE_LIMIT step queue + * + * Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair + * must have enough value to crawl all images collected in the DB index + * + * or the crawler can stuck in queue + * + */ +define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12); + /* * Only URL addresses match this rule will be auto-crawled * @@ -216,7 +241,7 @@ define('CLEAN_HOST_LIMIT', 20); * This option works with CLEAN_HOST_LIMIT step queue * * Pay attention, that CLEAN_HOST_LIMIT + CLEAN_HOST_SECONDS_OFFSET pair - * must have enought value to process all pages in the DB index + * must have enough value to process all pages in the DB index * * or the cleaner can stuck in queue * diff --git a/crontab/crawler.php b/crontab/crawler.php index 34a54ad..16a8023 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -27,17 +27,60 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > 
disk_free_space('/') / 1000000) { // Debug $timeStart = microtime(true); -$hostPagesProcessed = 0; -$hostPagesIndexed = 0; -$hostPagesAdded = 0; -$hostImagesAdded = 0; -$hostsAdded = 0; +$hostPagesProcessed = 0; +$hostImagesProcessed = 0; +$hostPagesIndexed = 0; +$hostImagesIndexed = 0; +$hostPagesAdded = 0; +$hostImagesAdded = 0; +$hostsAdded = 0; // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); -// Process crawl queue -foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { +// Process images crawl queue +foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { + + // Build URL from the DB + $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; + + $curl = new Curl($queueHostImageURL); + + // Update image index anyway, with the current time and http code + $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode()); + + // Skip next image processing non 200 code + if (200 != $curl->getCode()) { + + continue; + } + + // Save image content on data settings enabled + if (!CRAWL_HOST_DEFAULT_META_ONLY) { + + // Skip next image processing images without returned data + if (!$content = $curl->getContent()) { + + continue; + } + + // Convert remote image data to base64 string to prevent direct URL call + if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { + + continue; + } + + if (!$hostImageBase64 = @base64_encode($curl->getContent())) { + + continue; + } + + $hostImagesIndexed += $db->updateHostImageData($queueHostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . 
$hostImageBase64, time()); + } +} + +// Process pages crawl queue +foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { // Build URL from the DB $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; @@ -45,7 +88,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET $curl = new Curl($queueHostPageURL); // Update page index anyway, with the current time and http code - $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); + $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); // Skip next page processing non 200 code if (200 != $curl->getCode()) { @@ -427,6 +470,8 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; +echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; +echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; echo 'Images added: ' . $hostImagesAdded . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . 
PHP_EOL; diff --git a/library/mysql.php b/library/mysql.php index b883045..8bf55de 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -502,7 +502,7 @@ class MySQL { } // Crawl tools - public function getCrawlQueue(int $limit, int $timeFrom) { + public function getHostPageCrawlQueue(int $limit, int $timeFrom) { $query = $this->_db->prepare('SELECT `hostPage`.`hostId`, `hostPage`.`hostPageId`, @@ -530,7 +530,7 @@ class MySQL { return $query->fetchAll(); } - public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) { + public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode) { $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1'); @@ -538,4 +538,36 @@ class MySQL { return $query->rowCount(); } + + public function getHostImageCrawlQueue(int $limit, int $timeFrom) { + + $query = $this->_db->prepare('SELECT `hostImage`.`hostId`, + `hostImage`.`hostImageId`, + `hostImage`.`uri`, + `host`.`scheme`, + `host`.`name`, + `host`.`port` + + FROM `hostImage` + JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`) + + WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0 + + ORDER BY `hostImage`.`hostImageId` + + LIMIT ' . (int) $limit); + + $query->execute([$timeFrom]); + + return $query->fetchAll(); + } + + public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) { + + $query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? 
LIMIT 1'); + + $query->execute([$timeUpdated, $httpCode, $hostImageId]); + + return $query->rowCount(); + } } diff --git a/public/api.php b/public/api.php index 164ef7f..4486c37 100644 --- a/public/api.php +++ b/public/api.php @@ -1,7 +1,7 @@ CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, + 'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,