mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
implement a ban feature with timeout for unreachable resources to prevent extra HTTP requests
This commit is contained in:
parent
cfa5d01db1
commit
b6605b9132
@ -176,7 +176,7 @@ GET m=SphinxQL
|
||||
* [x] Transactions support to prevent data loss on queue failures
|
||||
* [x] Distributed index crawling between YGGo nodes through manifest API
|
||||
* [x] MIME Content-type crawler settings
|
||||
* [ ] Ban non-condition links to prevent extra requests
|
||||
* [x] Ban non-condition links to prevent extra requests
|
||||
* [ ] Indexing new sites homepage in higher priority
|
||||
* [ ] Redirect codes extended processing
|
||||
* [ ] Palette image index / filter
|
||||
|
@ -184,7 +184,7 @@ define('CRAWL_PAGE_MIME_TYPE', 'text/html');
|
||||
* comma separated
|
||||
*
|
||||
*/
|
||||
define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
|
||||
define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpeg,image/ico');
|
||||
|
||||
/*
|
||||
* Renew image index by timing offset provided
|
||||
@ -334,6 +334,24 @@ define('CLEAN_HOST_LIMIT', 20);
|
||||
*/
|
||||
define('CLEAN_HOST_SECONDS_OFFSET', 60*60*24*30);
|
||||
|
||||
/*
|
||||
* Remove page ban after following time
|
||||
*
|
||||
* This option is used by the crawler and the search page
|
||||
* to prevent extra HTTP requests to resources that are unavailable or do not meet the crawl conditions
|
||||
*
|
||||
*/
|
||||
define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||
|
||||
/*
|
||||
* Remove image ban after following time
|
||||
*
|
||||
* This option is used by the crawler and the search page
|
||||
* to prevent extra HTTP requests to resources that are unavailable or do not meet the crawl conditions
|
||||
*
|
||||
*/
|
||||
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||
|
||||
// API settings
|
||||
|
||||
/*
|
||||
|
@ -15,7 +15,7 @@ source hostPage : common
|
||||
SELECT hostPage.hostPageId, hostPage.rank, hostPage.metaTitle, hostPage.metaDescription, hostPage.metaKeywords, hostPage.data, hostPage.uri, host.name \
|
||||
FROM hostPage \
|
||||
JOIN host ON (host.hostId = hostPage.hostId) \
|
||||
WHERE host.status = '1' AND hostPage.httpCode = 200
|
||||
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
|
||||
|
||||
sql_attr_uint = rank
|
||||
}
|
||||
@ -32,7 +32,7 @@ source hostImage : common
|
||||
WHERE hostPage.hostPageId IN (SELECT hostImageToHostPage.hostPageId FROM hostImageToHostPage WHERE hostImageToHostPage.hostImageId = hostImage.hostImageId)) AS pageDescription \
|
||||
FROM hostImage \
|
||||
JOIN host ON (host.hostId = hostImage.hostId) \
|
||||
WHERE host.status = '1' AND hostImage.httpCode = 200
|
||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL
|
||||
|
||||
sql_attr_uint = rank
|
||||
}
|
||||
|
@ -21,12 +21,14 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
// Debug
|
||||
$timeStart = microtime(true);
|
||||
|
||||
$hostsTotal = $db->getTotalHosts();
|
||||
$manifestsTotal = $db->getTotalManifests();
|
||||
$hostsUpdated = 0;
|
||||
$hostsPagesDeleted = 0;
|
||||
$hostsImagesDeleted = 0;
|
||||
$manifestsDeleted = 0;
|
||||
$hostsTotal = $db->getTotalHosts();
|
||||
$manifestsTotal = $db->getTotalManifests();
|
||||
$hostsUpdated = 0;
|
||||
$hostsPagesDeleted = 0;
|
||||
$hostsImagesDeleted = 0;
|
||||
$manifestsDeleted = 0;
|
||||
$hostPagesBansRemoved = 0;
|
||||
$hostImagesBansRemoved = 0;
|
||||
|
||||
// Begin update
|
||||
$db->beginTransaction();
|
||||
@ -85,7 +87,7 @@ try {
|
||||
// Apply new robots.txt rules
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
|
||||
foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
|
||||
foreach ($db->getHostImages($host->hostId) as $hostImage) {
|
||||
|
||||
if (!$robots->uriAllowed($hostImage->uri)) {
|
||||
|
||||
@ -98,7 +100,7 @@ try {
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
if (!$robots->uriAllowed($hostPage->uri)) {
|
||||
|
||||
@ -173,6 +175,12 @@ try {
|
||||
}
|
||||
}
|
||||
|
||||
// Reset banned pages
|
||||
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
||||
|
||||
// Reset banned images
|
||||
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
||||
|
||||
$db->commit();
|
||||
|
||||
} catch(Exception $e){
|
||||
@ -189,4 +197,6 @@ echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
|
||||
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
|
||||
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
||||
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
|
@ -222,9 +222,13 @@ try {
|
||||
// Process images crawl queue
|
||||
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
|
||||
|
||||
// Define image variables
|
||||
$hostImageTimeBanned = null;
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
|
||||
|
||||
// Init image request
|
||||
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update image index anyway, with the current time and http code
|
||||
@ -233,50 +237,76 @@ try {
|
||||
// Skip image processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not provided
|
||||
if (!$hostImageContentType = $curl->getContentType()) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not allowed in settings
|
||||
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) {
|
||||
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing without returned content
|
||||
if (!$content = $curl->getContent()) {
|
||||
// Convert remote image data to base64 string
|
||||
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
|
||||
|
||||
continue;
|
||||
}
|
||||
// Skip image processing without returned content
|
||||
if (!$hostImageContent = $curl->getContent()) {
|
||||
|
||||
// Convert remote image data to base64 string to prevent direct URL call
|
||||
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
|
||||
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
|
||||
|
||||
continue;
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$hostImageData = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
|
||||
|
||||
} else {
|
||||
|
||||
$hostImageData = null;
|
||||
}
|
||||
|
||||
$hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
|
||||
Filter::mime($hostImageContentType),
|
||||
(!CRAWL_HOST_DEFAULT_META_ONLY ? 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64 : null),
|
||||
time());
|
||||
$hostImageData,
|
||||
time(),
|
||||
$hostImageTimeBanned);
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
|
||||
// Define page variables
|
||||
$hostPageTimeBanned = null;
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
||||
|
||||
// Init page request
|
||||
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update page index anyway, with the current time and http code
|
||||
@ -285,17 +315,23 @@ try {
|
||||
// Skip page processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip page processing on MIME type not provided
|
||||
if (!$contentType = $curl->getContentType()) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip page processing on MIME type not allowed in settings
|
||||
if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
|
||||
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
@ -303,6 +339,8 @@ try {
|
||||
// Skip page processing without returned data
|
||||
if (!$content = $curl->getContent()) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -315,6 +353,9 @@ try {
|
||||
$title = @$dom->getElementsByTagName('title');
|
||||
|
||||
if ($title->length == 0) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -346,6 +387,8 @@ try {
|
||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
||||
if (false !== stripos($metaRobots, 'noindex')) {
|
||||
|
||||
$hostPageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -361,7 +404,9 @@ try {
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
Filter::mime($contentType),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
|
||||
time(),
|
||||
$hostPageTimeBanned);
|
||||
|
||||
// Update manifest registry
|
||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||
|
Binary file not shown.
@ -184,6 +184,7 @@ class MySQL {
|
||||
string $uri,
|
||||
int $timeAdded,
|
||||
mixed $timeUpdated = null,
|
||||
mixed $timeBanned = null,
|
||||
mixed $httpCode = null,
|
||||
mixed $mime = null,
|
||||
mixed $rank = null,
|
||||
@ -194,12 +195,13 @@ class MySQL {
|
||||
`uri`,
|
||||
`timeAdded`,
|
||||
`timeUpdated`,
|
||||
`timeBanned`,
|
||||
`httpCode`,
|
||||
`mime`,
|
||||
`rank`,
|
||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $data]);
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $data]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
@ -229,11 +231,12 @@ class MySQL {
|
||||
public function updateHostImage(int $hostImageId,
|
||||
string $mime,
|
||||
mixed $data,
|
||||
int $timeUpdated) {
|
||||
int $timeUpdated,
|
||||
mixed $timeBanned = null) {
|
||||
|
||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
|
||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$mime, $data, $timeUpdated, $hostImageId]);
|
||||
$query->execute([$mime, $data, $timeUpdated, $timeBanned, $hostImageId]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
@ -441,6 +444,7 @@ class MySQL {
|
||||
string $uri,
|
||||
int $timeAdded,
|
||||
mixed $timeUpdated = null,
|
||||
mixed $timeBanned = null,
|
||||
mixed $httpCode = null,
|
||||
mixed $mime = null,
|
||||
mixed $rank = null,
|
||||
@ -454,15 +458,16 @@ class MySQL {
|
||||
`uri`,
|
||||
`timeAdded`,
|
||||
`timeUpdated`,
|
||||
`timeBanned`,
|
||||
`httpCode`,
|
||||
`mime`,
|
||||
`rank`,
|
||||
`metaTitle`,
|
||||
`metaDescription`,
|
||||
`metaKeywords`,
|
||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
@ -472,15 +477,19 @@ class MySQL {
|
||||
mixed $metaDescription,
|
||||
mixed $metaKeywords,
|
||||
string $mime,
|
||||
mixed $data) {
|
||||
mixed $data,
|
||||
int $timeUpdated,
|
||||
mixed $timeBanned = null) {
|
||||
|
||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
|
||||
`metaDescription` = ?,
|
||||
`metaKeywords` = ?,
|
||||
`mime` = ?,
|
||||
`data` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||
`data` = ?,
|
||||
`timeUpdated` = ?,
|
||||
`timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $hostPageId]);
|
||||
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $timeUpdated, $timeBanned, $hostPageId]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
@ -535,6 +544,24 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
/**
 * Remove expired bans from host pages.
 *
 * Sets `timeBanned` back to NULL for every `hostPage` row whose ban was
 * recorded before the given moment. Rows banned more recently are left
 * untouched so the ban timeout is actually honored.
 *
 * @param int $timeOffset Unix timestamp; bans older than this value are removed
 *
 * @return int Number of rows affected by the update
 */
public function resetBannedHostPages(int $timeOffset) {

  // A ban has expired when it was set BEFORE the offset (now - timeout),
  // so the comparison must be "<"; the value is bound, not concatenated
  $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ?');

  $query->execute([$timeOffset]);

  return $query->rowCount();
}
|
||||
|
||||
/**
 * Remove expired bans from host images.
 *
 * Sets `timeBanned` back to NULL for every `hostImage` row whose ban was
 * recorded before the given moment. Rows banned more recently are left
 * untouched so the ban timeout is actually honored.
 *
 * @param int $timeOffset Unix timestamp; bans older than this value are removed
 *
 * @return int Number of rows affected by the update
 */
public function resetBannedHostImages(int $timeOffset) {

  // A ban has expired when it was set BEFORE the offset (now - timeout),
  // so the comparison must be "<"; the value is bound, not concatenated
  $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ?');

  $query->execute([$timeOffset]);

  return $query->rowCount();
}
|
||||
|
||||
// Crawl tools
|
||||
public function getHostPageCrawlQueue(int $limit, int $timeFrom) {
|
||||
|
||||
@ -554,6 +581,7 @@ class MySQL {
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
|
||||
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
|
||||
AND `hostPage`.`timeBanned` IS NULL
|
||||
|
||||
ORDER BY `hostPage`.`rank` DESC, RAND()
|
||||
|
||||
@ -586,6 +614,7 @@ class MySQL {
|
||||
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
|
||||
|
||||
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
|
||||
AND `hostImage`.`timeBanned` IS NULL
|
||||
|
||||
ORDER BY `hostImage`.`rank` DESC, RAND()
|
||||
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 96 KiB After Width: | Height: | Size: 110 KiB |
@ -345,6 +345,10 @@ if (!empty($q)) {
|
||||
// Get remote image data
|
||||
if (empty($hostImage->data)) {
|
||||
|
||||
// Define image variables
|
||||
$hostImageTimeBanned = null;
|
||||
|
||||
// Init image request
|
||||
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
|
||||
|
||||
// Skip item render on timeout
|
||||
@ -352,18 +356,60 @@ if (!empty($q)) {
|
||||
|
||||
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());
|
||||
|
||||
if (200 != $hostImageHttpCode) continue;
|
||||
if (!$hostImageContentType = $hostImageCurl->getContentType()) continue;
|
||||
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) continue;
|
||||
if (200 != $hostImageHttpCode) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not provided
|
||||
if (!$hostImageContentType = $hostImageCurl->getContentType()) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not allowed in settings
|
||||
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing without returned content
|
||||
if (!$hostImageContent = $hostImageCurl->getContent()) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Convert remote image data to base64 string to prevent direct URL call
|
||||
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
|
||||
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;
|
||||
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
|
||||
|
||||
$hostImageTimeBanned = time();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$hostImageURLencoded = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
|
||||
|
||||
// Save image content on data settings enabled
|
||||
$db->updateHostImage($hostImage->hostImageId, Filter::mime($hostImageContentType), (!CRAWL_HOST_DEFAULT_META_ONLY ? $hostImageURLencoded : null), time());
|
||||
$db->updateHostImage($hostImage->hostImageId,
|
||||
Filter::mime($hostImageContentType),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : $hostImageURLencoded,
|
||||
time(),
|
||||
$hostImageTimeBanned);
|
||||
|
||||
// Local image data exists
|
||||
} else {
|
||||
|
Loading…
x
Reference in New Issue
Block a user