
implement MIME content-type crawler filter

main
ghost · 2 years ago
commit f88d2ee9ff
  1. config/app.php.txt (16 lines changed)
  2. crontab/cleaner.php (4 lines changed)
  3. crontab/crawler.php (32 lines changed)
  4. library/curl.php (5 lines changed)

config/app.php.txt (16 lines changed)

@@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
+/*
+ * Index pages match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_PAGE_MIME_TYPE', 'text/html');
+/*
+ * Index images match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
 /*
  * Renew image index by timing offset provided
  *
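
Note (not part of the commit): both constants are plain comma separated strings, so matching a response Content-Type header against them needs a split step. A minimal sketch under that assumption, with mimeTypeAllowed() as a hypothetical helper name:

<?php

// Hypothetical helper: split a comma separated allowlist like the constants
// above and test the MIME part of a Content-Type header against it
function mimeTypeAllowed(string $contentType, string $allowList): bool {

  // "text/html; charset=UTF-8" -> "text/html"
  $mime = strtolower(trim(explode(';', $contentType)[0]));

  return in_array($mime, array_map('trim', explode(',', strtolower($allowList))), true);
}

var_dump(mimeTypeAllowed('image/png', 'image/webp,image/png,image/gif,image/jpg,image/ico')); // bool(true)
var_dump(mimeTypeAllowed('text/plain', 'text/html'));                                         // bool(false)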

crontab/cleaner.php (4 lines changed)

@@ -85,7 +85,7 @@ try {
 // Apply new robots.txt rules
 $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-foreach ($db->getHostImages($host->hostId) as $hostImage) {
+foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
   if (!$robots->uriAllowed($hostImage->uri)) {
@@ -98,7 +98,7 @@ try {
   }
 }
-foreach ($db->getHostPages($host->hostId) as $hostPage) {
+foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
   if (!$robots->uriAllowed($hostPage->uri)) {
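
The two @TODO markers above note that the cleaner does not yet re-check already indexed images and pages against the new MIME allowlists. A hedged sketch of that follow-up, reusing the mimeTypeAllowed() sketch from the config section; $hostImage->mime and $db->deleteHostImage() are assumed names for illustration and are not shown anywhere in this diff:

// Sketch only: drop indexed images whose stored MIME type is no longer allowed
foreach ($db->getHostImages($host->hostId) as $hostImage) {

  // $hostImage->mime and deleteHostImage() are hypothetical
  if (!mimeTypeAllowed($hostImage->mime, CRAWL_IMAGE_MIME_TYPE)) {

    $db->deleteHostImage($hostImage->hostImageId);

    continue;
  }

  // ... existing robots.txt checks would follow here
}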

crontab/crawler.php (32 lines changed)

@@ -230,7 +230,7 @@ try {
 // Update image index anyway, with the current time and http code
 $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
-// Skip next image processing non 200 code
+// Skip image processing non 200 code
 if (200 != $curl->getCode()) {
   continue;
@@ -239,7 +239,19 @@ try {
 // Save image content on data settings enabled
 if (!CRAWL_HOST_DEFAULT_META_ONLY) {
-  // Skip next image processing images without returned data
+  // Skip image processing on MIME type not provided
+  if (!$contentType = $curl->getContentType()) {
+    continue;
+  }
+  // Skip image processing on MIME type not allowed in settings
+  if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
+    continue;
+  }
+  // Skip image processing images without returned content
   if (!$content = $curl->getContent()) {
     continue;
@@ -271,13 +283,25 @@ try {
 // Update page index anyway, with the current time and http code
 $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
-// Skip next page processing non 200 code
+// Skip page processing non 200 code
 if (200 != $curl->getCode()) {
   continue;
 }
-// Skip next page processing pages without returned data
+// Skip page processing on MIME type not provided
+if (!$contentType = $curl->getContentType()) {
+  continue;
+}
+// Skip page processing on MIME type not allowed in settings
+if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
+  continue;
+}
+// Skip page processing pages without returned data
 if (!$content = $curl->getContent()) {
   continue;
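
Worth noting: strpos($contentType, CRAWL_IMAGE_MIME_TYPE) searches for the whole comma separated constant inside the header value, so a multi-entry list such as the new CRAWL_IMAGE_MIME_TYPE default cannot match a single header like image/png (the single-entry CRAWL_PAGE_MIME_TYPE case is unaffected). A hedged sketch of a per-entry check, assuming the $curl wrapper from the surrounding loop and the mimeTypeAllowed() helper sketched after the config hunk; this is not the committed code:

// Sketch only, inside the image branch of the crawl queue loop
$contentType = (string) $curl->getContentType();

// true when the header matches one entry of the comma separated allowlist
$allowed = '' !== $contentType && mimeTypeAllowed($contentType, CRAWL_IMAGE_MIME_TYPE);

// when $allowed is false, the committed hunks would `continue` to the next queue item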

library/curl.php (5 lines changed)

@@ -42,6 +42,11 @@ class Curl {
   }
+  public function getContentType() {
+    return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
+  }
   public function getContent() {
     return $this->_response;
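
The new getContentType() proxies curl_getinfo() with CURLINFO_CONTENT_TYPE, which returns the response Content-Type header value, or NULL when the server did not send one. A small usage sketch; the wrapper's constructor signature is not shown in this diff, so the URL argument is an assumption:

// Usage sketch for the extended Curl wrapper from library/curl.php
$curl = new Curl('https://example.org/'); // constructor argument assumed

var_dump($curl->getCode());        // e.g. int(200)
var_dump($curl->getContentType()); // e.g. string(24) "text/html; charset=UTF-8" or NULL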
