Browse Source

Fix ban time update; count only affected rows

main
ghost 2 years ago
parent
commit
6c41dd5831
  1. 60
      crontab/crawler.php
  2. 18
      library/mysql.php

60
crontab/crawler.php

@ -224,9 +224,6 @@ try {
// Process images crawl queue // Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Define image variables
$hostImageTimeBanned = null;
// Build URL from the DB // Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
@ -239,9 +236,7 @@ try {
// Skip image processing non 200 code // Skip image processing non 200 code
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
@ -249,9 +244,7 @@ try {
// Skip image processing on MIME type not provided // Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) { if (!$hostImageContentType = $curl->getContentType()) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
@ -259,9 +252,7 @@ try {
// Skip image processing on MIME type not allowed in settings // Skip image processing on MIME type not allowed in settings
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) { if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
@ -272,27 +263,21 @@ try {
// Skip image processing without returned content // Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) { if (!$hostImageContent = $curl->getContent()) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
if (!$hostImageBase64 = @base64_encode($hostImageContent)) { if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImagesBanned++; $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
$hostImageTimeBanned = time();
continue; continue;
} }
@ -307,16 +292,12 @@ try {
$hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId, $hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
Filter::mime($hostImageContentType), Filter::mime($hostImageContentType),
$hostImageData, $hostImageData,
time(), time());
$hostImageTimeBanned);
} }
// Process pages crawl queue // Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Define page variables
$hostPageTimeBanned = null;
// Build URL from the DB // Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@ -329,9 +310,7 @@ try {
// Skip page processing non 200 code // Skip page processing non 200 code
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -339,9 +318,7 @@ try {
// Skip page processing on MIME type not provided // Skip page processing on MIME type not provided
if (!$contentType = $curl->getContentType()) { if (!$contentType = $curl->getContentType()) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -349,9 +326,7 @@ try {
// Skip page processing on MIME type not allowed in settings // Skip page processing on MIME type not allowed in settings
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) { if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -359,9 +334,7 @@ try {
// Skip page processing without returned data // Skip page processing without returned data
if (!$content = $curl->getContent()) { if (!$content = $curl->getContent()) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -376,9 +349,7 @@ try {
if ($title->length == 0) { if ($title->length == 0) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -411,9 +382,7 @@ try {
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) { if (false !== stripos($metaRobots, 'noindex')) {
$hostPagesBanned++; $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPageTimeBanned = time();
continue; continue;
} }
@ -431,8 +400,7 @@ try {
Filter::pageKeywords($metaKeywords), Filter::pageKeywords($metaKeywords),
Filter::mime($contentType), Filter::mime($contentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content), CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
time(), time());
$hostPageTimeBanned);
// Update manifest registry // Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

18
library/mysql.php

@ -217,6 +217,15 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
/**
 * Write a new `timeBanned` value to one `hostImage` row.
 *
 * @param int $hostImageId Value matched against the `hostImageId` column.
 * @param int $timeBanned  Value stored in the `timeBanned` column.
 *
 * @return mixed Whatever rowCount() reports for the executed statement
 *               (the affected-row count if `_db` is a PDO handle — assumed,
 *               the connection setup is not visible here).
 */
public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) {

    $statement = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');

    // Positional placeholders follow the SQL text: timeBanned first, then hostImageId
    $bindings = [$timeBanned, $hostImageId];

    $statement->execute($bindings);

    return $statement->rowCount();
}
public function updateHostImageHttpCode(int $hostImageId, public function updateHostImageHttpCode(int $hostImageId,
int $httpCode, int $httpCode,
int $timeUpdated) { int $timeUpdated) {
@ -510,6 +519,15 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
/**
 * Write a new `timeBanned` value to one `hostPage` row.
 *
 * @param int $hostPageId Value matched against the `hostPageId` column.
 * @param int $timeBanned Value stored in the `timeBanned` column.
 *
 * @return mixed Whatever rowCount() reports for the executed statement
 *               (the affected-row count if `_db` is a PDO handle — assumed,
 *               the connection setup is not visible here).
 */
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {

    $statement = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');

    // Positional placeholders follow the SQL text: timeBanned first, then hostPageId
    $bindings = [$timeBanned, $hostPageId];

    $statement->execute($bindings);

    return $statement->rowCount();
}
public function deleteHostPage(int $hostPageId) { public function deleteHostPage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1'); $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');

Loading…
Cancel
Save