|
|
@ -36,6 +36,8 @@ $manifestsIndexed = 0; |
|
|
|
// Per-run counters, all starting at zero; $hostsAdded and $hostImagesAdded are echoed in the end-of-run summary |
$hostPagesAdded = 0; |
|
|
|
$hostPagesAdded = 0; |
|
|
|
$hostImagesAdded = 0; |
|
|
|
$hostImagesAdded = 0; |
|
|
|
$hostsAdded = 0; |
|
|
|
$hostsAdded = 0; |
|
|
|
|
|
|
|
// Incremented each time a fetched page is rejected: non-200 code, missing or disallowed MIME type, |
// empty content, zero-length title lookup, or meta robots noindex; echoed in the end-of-run summary |
$hostPagesBanned = 0; |
|
|
|
|
|
|
|
// Incremented each time a fetched image is rejected: non-200 code, missing or disallowed MIME type, |
// empty content, no file extension in the URL, or empty base64 result; echoed in the end-of-run summary |
$hostImagesBanned = 0; |
|
|
|
|
|
|
|
|
|
|
|
// Connect database |
|
|
|
// Connect database |
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
@ -237,6 +239,8 @@ try { |
|
|
|
// Skip image processing on a non-200 response code |
|
|
|
// Skip image processing on a non-200 response code |
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -245,6 +249,8 @@ try { |
|
|
|
// Skip image processing on MIME type not provided |
|
|
|
// Skip image processing on MIME type not provided |
|
|
|
if (!$hostImageContentType = $curl->getContentType()) { |
|
|
|
if (!$hostImageContentType = $curl->getContentType()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -253,6 +259,8 @@ try { |
|
|
|
// Skip image processing on MIME type not allowed in settings |
|
|
|
// Skip image processing on MIME type not allowed in settings |
|
|
|
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) { |
|
|
|
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $hostImageContentType)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -264,6 +272,8 @@ try { |
|
|
|
// Skip image processing without returned content |
|
|
|
// Skip image processing without returned content |
|
|
|
if (!$hostImageContent = $curl->getContent()) { |
|
|
|
if (!$hostImageContent = $curl->getContent()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -271,6 +281,8 @@ try { |
|
|
|
|
|
|
|
|
|
|
|
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { |
|
|
|
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -278,6 +290,8 @@ try { |
|
|
|
|
|
|
|
|
|
|
|
if (!$hostImageBase64 = @base64_encode($hostImageContent)) { |
|
|
|
if (!$hostImageBase64 = @base64_encode($hostImageContent)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostImagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
$hostImageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -315,6 +329,8 @@ try { |
|
|
|
// Skip page processing on a non-200 response code |
|
|
|
// Skip page processing on a non-200 response code |
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -323,6 +339,8 @@ try { |
|
|
|
// Skip page processing on MIME type not provided |
|
|
|
// Skip page processing on MIME type not provided |
|
|
|
if (!$contentType = $curl->getContentType()) { |
|
|
|
if (!$contentType = $curl->getContentType()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -331,6 +349,8 @@ try { |
|
|
|
// Skip page processing on MIME type not allowed in settings |
|
|
|
// Skip page processing on MIME type not allowed in settings |
|
|
|
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) { |
|
|
|
if (false === strpos(CRAWL_PAGE_MIME_TYPE, $contentType)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -339,6 +359,8 @@ try { |
|
|
|
// Skip page processing without returned data |
|
|
|
// Skip page processing without returned data |
|
|
|
if (!$content = $curl->getContent()) { |
|
|
|
if (!$content = $curl->getContent()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -354,6 +376,8 @@ try { |
|
|
|
|
|
|
|
|
|
|
|
if ($title->length == 0) { |
|
|
|
if ($title->length == 0) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -387,6 +411,8 @@ try { |
|
|
|
// Append page with meta robots:noindex value to the robotsPostfix disallow list |
|
|
|
// Append page with meta robots:noindex value to the robotsPostfix disallow list |
|
|
|
if (false !== stripos($metaRobots, 'noindex')) { |
|
|
|
if (false !== stripos($metaRobots, 'noindex')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned++; |
|
|
|
|
|
|
|
|
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
$hostPageTimeBanned = time(); |
|
|
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
continue; |
|
|
@ -713,4 +739,6 @@ echo 'Images added: ' . $hostImagesAdded . PHP_EOL; |
|
|
|
// End-of-run summary: print each counter collected during the crawl, one per line |
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; |
|
|
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; |
|
|
|
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL; |
|
|
|
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL; |
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; |
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; |
|
|
|
|
|
|
|
echo 'Hosts pages banned: ' . $hostPagesBanned . PHP_EOL; |
|
|
|
|
|
|
|
echo 'Hosts images banned: ' . $hostImagesBanned . PHP_EOL; |
|
|
|
// The subtraction is parenthesised on purpose: before PHP 8 "." and "-" had equal precedence and were |
// evaluated left to right, so the bare form subtracted $timeStart from the string 'Total time: <now>' |
// instead of printing the elapsed time. With parentheses the result is the same on every PHP version. |
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL; |
|
|
|
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL; |
|
|
|