Browse Source

add banned items counters

main
ghost 2 years ago
parent
commit
20514c455f
  1. 28
      crontab/crawler.php

28
crontab/crawler.php

@@ -36,6 +36,8 @@ $manifestsIndexed = 0;
// Per-run counters, all starting at zero; several of them are printed in the summary output at the end of the script
$hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0;
// Banned counters: incremented wherever a page/image is skipped and its "time banned" timestamp is set
$hostPagesBanned = 0;
$hostImagesBanned = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@@ -237,6 +239,8 @@ try {
// Skip image processing non 200 code
if (200 != $curl->getCode()) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -245,6 +249,8 @@ try {
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -253,6 +259,8 @@ try {
// Skip image processing on MIME type not allowed in settings
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -264,6 +272,8 @@ try {
// Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -271,6 +281,8 @@ try {
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -278,6 +290,8 @@ try {
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImagesBanned++;
$hostImageTimeBanned = time();
continue;
@@ -315,6 +329,8 @@ try {
// Skip page processing non 200 code
if (200 != $curl->getCode()) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -323,6 +339,8 @@ try {
// Skip page processing on MIME type not provided
if (!$contentType = $curl->getContentType()) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -331,6 +349,8 @@ try {
// Skip page processing on MIME type not allowed in settings
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -339,6 +359,8 @@ try {
// Skip page processing without returned data
if (!$content = $curl->getContent()) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -354,6 +376,8 @@ try {
if ($title->length == 0) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -387,6 +411,8 @@ try {
// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {
$hostPagesBanned++;
$hostPageTimeBanned = time();
continue;
@@ -713,4 +739,6 @@ echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
// Summary output: print the counters collected during this run, one per line
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Hosts pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Hosts images banned: ' . $hostImagesBanned . PHP_EOL;
// The subtraction is parenthesized on purpose: before PHP 8 "." and "-" shared the same
// left-associative precedence, so the unparenthesized expression concatenated the label with
// microtime() first and only then subtracted $timeStart, producing a wrong value.
// NOTE(review): presumably $timeStart holds an earlier microtime(true) reading — confirm at its definition.
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL;

Loading…
Cancel
Save