Browse Source

implement a timed ban for unreachable resources to prevent extra HTTP requests

main
ghost 2 years ago
parent
commit
b6605b9132
  1. 2
      README.md
  2. 20
      config/app.php.txt
  3. 4
      config/sphinx.conf.txt
  4. 26
      crontab/cleaner.php
  5. 75
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 49
      library/mysql.php
  8. BIN
      media/db-prototype.png
  9. 58
      public/search.php

2
README.md

@ -176,7 +176,7 @@ GET m=SphinxQL @@ -176,7 +176,7 @@ GET m=SphinxQL
* [x] Transactions support to prevent data loss on queue failures
* [x] Distributed index crawling between YGGo nodes through manifest API
* [x] MIME Content-type crawler settings
* [ ] Ban non-condition links to prevent extra requests
* [x] Ban non-condition links to prevent extra requests
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter

20
config/app.php.txt

@ -184,7 +184,7 @@ define('CRAWL_PAGE_MIME_TYPE', 'text/html'); @@ -184,7 +184,7 @@ define('CRAWL_PAGE_MIME_TYPE', 'text/html');
* comma separated
*
*/
define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpeg,image/ico');
/*
* Renew image index by timing offset provided
@ -334,6 +334,24 @@ define('CLEAN_HOST_LIMIT', 20); @@ -334,6 +334,24 @@ define('CLEAN_HOST_LIMIT', 20);
*/
define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove a page ban once the following time (in seconds) has passed
*
* This option is used by the crawler and the search page
* to prevent extra HTTP requests to resources that are unavailable or do not meet the crawl conditions
*
*/
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove an image ban once the following time (in seconds) has passed
*
* This option is used by the crawler and the search page
* to prevent extra HTTP requests to resources that are unavailable or do not meet the crawl conditions
*
*/
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
// API settings
/*

4
config/sphinx.conf.txt

@ -15,7 +15,7 @@ source hostPage : common @@ -15,7 +15,7 @@ source hostPage : common
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \
FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \
WHERE host.status = '1' AND hostPage.httpCode = 200
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
sql_attr_uint = rank
}
@ -32,7 +32,7 @@ source hostImage : common @@ -32,7 +32,7 @@ source hostImage : common
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL
sql_attr_uint = rank
}

26
crontab/cleaner.php

@ -21,12 +21,14 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); @@ -21,12 +21,14 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug
$timeStart = microtime(true);
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostsPagesDeleted = 0;
$hostsImagesDeleted = 0;
$manifestsDeleted = 0;
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostsPagesDeleted = 0;
$hostsImagesDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
// Begin update
$db->beginTransaction();
@ -85,7 +87,7 @@ try { @@ -85,7 +87,7 @@ try {
// Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
foreach ($db->getHostImages($host->hostId) as $hostImage) {
if (!$robots->uriAllowed($hostImage->uri)) {
@ -98,7 +100,7 @@ try { @@ -98,7 +100,7 @@ try {
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) {
@ -173,6 +175,12 @@ try { @@ -173,6 +175,12 @@ try {
}
}
// Reset banned pages
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Reset banned images
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
$db->commit();
} catch(Exception $e){
@ -189,4 +197,6 @@ echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL; @@ -189,4 +197,6 @@ echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;

75
crontab/crawler.php

@ -222,9 +222,13 @@ try { @@ -222,9 +222,13 @@ try {
// Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Define image variables
$hostImageTimeBanned = null;
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
// Init image request
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
// Update image index anyway, with the current time and http code
@ -233,50 +237,76 @@ try { @@ -233,50 +237,76 @@ try {
// Skip image processing non 200 code
if (200 != $curl->getCode()) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing on MIME type not allowed in settings
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) {
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing without returned content
if (!$content = $curl->getContent()) {
// Convert remote image data to base64 string
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
continue;
}
// Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) {
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
$hostImageTimeBanned = time();
continue;
}
continue;
}
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
continue;
$hostImageTimeBanned = time();
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImageTimeBanned = time();
continue;
}
$hostImageData = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
} else {
$hostImageData = null;
}
$hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
Filter::mime($hostImageContentType),
(!CRAWL_HOST_DEFAULT_META_ONLY ? 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64 : null),
time());
$hostImageData,
time(),
$hostImageTimeBanned);
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Define page variables
$hostPageTimeBanned = null;
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
// Init page request
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update page index anyway, with the current time and http code
@ -285,17 +315,23 @@ try { @@ -285,17 +315,23 @@ try {
// Skip page processing non 200 code
if (200 != $curl->getCode()) {
$hostPageTimeBanned = time();
continue;
}
// Skip page processing on MIME type not provided
if (!$contentType = $curl->getContentType()) {
$hostPageTimeBanned = time();
continue;
}
// Skip page processing on MIME type not allowed in settings
if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) {
$hostPageTimeBanned = time();
continue;
}
@ -303,6 +339,8 @@ try { @@ -303,6 +339,8 @@ try {
// Skip page processing without returned data
if (!$content = $curl->getContent()) {
$hostPageTimeBanned = time();
continue;
}
@ -315,6 +353,9 @@ try { @@ -315,6 +353,9 @@ try {
$title = @$dom->getElementsByTagName('title');
if ($title->length == 0) {
$hostPageTimeBanned = time();
continue;
}
@ -346,6 +387,8 @@ try { @@ -346,6 +387,8 @@ try {
// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {
$hostPageTimeBanned = time();
continue;
}
@ -361,7 +404,9 @@ try { @@ -361,7 +404,9 @@ try {
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
Filter::mime($contentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
time(),
$hostPageTimeBanned);
// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

BIN
database/yggo.mwb

Binary file not shown.

49
library/mysql.php

@ -184,6 +184,7 @@ class MySQL { @@ -184,6 +184,7 @@ class MySQL {
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null,
@ -194,12 +195,13 @@ class MySQL { @@ -194,12 +195,13 @@ class MySQL {
`uri`,
`timeAdded`,
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $data]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $data]);
return $this->_db->lastInsertId();
}
@ -229,11 +231,12 @@ class MySQL { @@ -229,11 +231,12 @@ class MySQL {
public function updateHostImage(int $hostImageId,
string $mime,
mixed $data,
int $timeUpdated) {
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $data, $timeUpdated, $hostImageId]);
$query->execute([$mime, $data, $timeUpdated, $timeBanned, $hostImageId]);
return $query->rowCount();
}
@ -441,6 +444,7 @@ class MySQL { @@ -441,6 +444,7 @@ class MySQL {
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null,
@ -454,15 +458,16 @@ class MySQL { @@ -454,15 +458,16 @@ class MySQL {
`uri`,
`timeAdded`,
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
return $this->_db->lastInsertId();
}
@ -472,15 +477,19 @@ class MySQL { @@ -472,15 +477,19 @@ class MySQL {
mixed $metaDescription,
mixed $metaKeywords,
string $mime,
mixed $data) {
mixed $data,
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?,
`metaKeywords` = ?,
`mime` = ?,
`data` = ? WHERE `hostPageId` = ? LIMIT 1');
`data` = ?,
`timeUpdated` = ?,
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $hostPageId]);
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]);
return $query->rowCount();
}
@ -535,6 +544,24 @@ class MySQL { @@ -535,6 +544,24 @@ class MySQL {
return $query->fetchAll();
}
/**
 * Lift page bans that have outlived the ban period.
 *
 * @param int $timeOffset Unix timestamp threshold; every ban set strictly
 *                        before this moment is cleared. The cleaner passes
 *                        time() - CLEAN_PAGE_BAN_SECONDS_OFFSET.
 *
 * @return int Number of hostPage rows whose ban was cleared
 */
public function resetBannedHostPages(int $timeOffset) {

  // A ban has expired when its timestamp is OLDER than the threshold, hence "<".
  // Comparing with ">" would lift the freshest bans and keep stale ones forever.
  $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ?');

  // Bind the threshold as a parameter, like the other queries of this class
  $query->execute([$timeOffset]);

  return $query->rowCount();
}
/**
 * Lift image bans that have outlived the ban period.
 *
 * @param int $timeOffset Unix timestamp threshold; every ban set strictly
 *                        before this moment is cleared. The cleaner passes
 *                        time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET.
 *
 * @return int Number of hostImage rows whose ban was cleared
 */
public function resetBannedHostImages(int $timeOffset) {

  // A ban has expired when its timestamp is OLDER than the threshold, hence "<".
  // Comparing with ">" would lift the freshest bans and keep stale ones forever.
  $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ?');

  // Bind the threshold as a parameter, like the other queries of this class
  $query->execute([$timeOffset]);

  return $query->rowCount();
}
// Crawl tools
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
@ -554,6 +581,7 @@ class MySQL { @@ -554,6 +581,7 @@ class MySQL {
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostPage`.`timeBanned` IS NULL
ORDER BY `hostPage`.`rank` DESC, RAND()
@ -586,6 +614,7 @@ class MySQL { @@ -586,6 +614,7 @@ class MySQL {
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostImage`.`timeBanned` IS NULL
ORDER BY `hostImage`.`rank` DESC, RAND()

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 96 KiB

After

Width:  |  Height:  |  Size: 110 KiB

58
public/search.php

@ -345,6 +345,10 @@ if (!empty($q)) { @@ -345,6 +345,10 @@ if (!empty($q)) {
// Get remote image data
if (empty($hostImage->data)) {
// Define image variables
$hostImageTimeBanned = null;
// Init image request
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
// Skip item render on timeout
@ -352,18 +356,60 @@ if (!empty($q)) { @@ -352,18 +356,60 @@ if (!empty($q)) {
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());
if (200 != $hostImageHttpCode) continue;
if (!$hostImageContentType = $hostImageCurl->getContentType()) continue;
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) continue;
if (200 != $hostImageHttpCode) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $hostImageCurl->getContentType()) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing on MIME type not allowed in settings
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
$hostImageTimeBanned = time();
continue;
}
// Skip image processing without returned content
if (!$hostImageContent = $hostImageCurl->getContent()) {
$hostImageTimeBanned = time();
continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) {
$hostImageTimeBanned = time();
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImageTimeBanned = time();
continue;
}
$hostImageURLencoded = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
// Save image content on data settings enabled
$db->updateHostImage($hostImage->hostImageId, Filter::mime($hostImageContentType), (!CRAWL_HOST_DEFAULT_META_ONLY ? $hostImageURLencoded : null), time());
$db->updateHostImage($hostImage->hostImageId,
Filter::mime($hostImageContentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : $hostImageURLencoded,
time(),
$hostImageTimeBanned);
// Local image data exists
} else {

Loading…
Cancel
Save