implement MIME content-type crawler filter

This commit is contained in:
ghost 2023-05-05 21:25:57 +03:00
parent d945fdfd91
commit f88d2ee9ff
4 changed files with 51 additions and 6 deletions

View File

@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
*/
define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
 * MIME types of pages accepted for indexing
 *
 * Comma separated list; the crawler checks the response Content-Type
 * against this value and skips pages that do not match
 *
 */
define('CRAWL_PAGE_MIME_TYPE', 'text/html');
/*
 * MIME types of images accepted for indexing
 *
 * Comma separated list, compared with the response Content-Type
 *
 * JPEG files are served as image/jpeg and icons as image/x-icon or
 * image/vnd.microsoft.icon; the image/jpg and image/ico aliases are kept
 * only for servers that send these non-standard values
 *
 */
define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpeg,image/jpg,image/x-icon,image/vnd.microsoft.icon,image/ico');
/*
* Renew image index by timing offset provided
*

View File

@ -85,7 +85,7 @@ try {
// Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
foreach ($db->getHostImages($host->hostId) as $hostImage) {
foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
if (!$robots->uriAllowed($hostImage->uri)) {
@ -98,7 +98,7 @@ try {
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) {
foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
if (!$robots->uriAllowed($hostPage->uri)) {

View File

@ -230,7 +230,7 @@ try {
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip next image processing non 200 code
// Skip image processing non 200 code
if (200 != $curl->getCode()) {
continue;
@ -239,7 +239,19 @@ try {
// Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip next image processing images without returned data
// Skip image processing on MIME type not provided
if (!$contentType = $curl->getContentType()) {
continue;
}
// Skip image processing on MIME type not allowed in settings
//
// CRAWL_IMAGE_MIME_TYPE holds a comma separated list, so the response
// Content-Type is tested against every entry on its own: searching for the
// whole list inside the header value can never match once the list holds
// more than one entry. The test stays a substring search, now
// case-insensitive, so headers carrying parameters
// (e.g. "image/png; charset=binary") still pass.
$imageMimeTypeAllowed = false;

foreach (explode(',', CRAWL_IMAGE_MIME_TYPE) as $imageMimeType) {

    $imageMimeType = trim($imageMimeType);

    // Ignore blank entries left by stray commas in the setting
    if ('' === $imageMimeType) {
        continue;
    }

    if (false !== stripos($contentType, $imageMimeType)) {
        $imageMimeTypeAllowed = true;
        break;
    }
}

if (!$imageMimeTypeAllowed) {
    continue;
}
// Skip image processing images without returned content
if (!$content = $curl->getContent()) {
continue;
@ -271,13 +283,25 @@ try {
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
// Skip next page processing non 200 code
// Skip page processing non 200 code
if (200 != $curl->getCode()) {
continue;
}
// Skip next page processing pages without returned data
// Skip page processing on MIME type not provided
if (!$contentType = $curl->getContentType()) {
continue;
}
// Skip page processing on MIME type not allowed in settings
//
// CRAWL_PAGE_MIME_TYPE holds a comma separated list, so the response
// Content-Type is tested against every entry on its own: searching for the
// whole list inside the header value stops matching as soon as the list
// holds more than one entry. The test stays a substring search, now
// case-insensitive, so headers carrying parameters
// (e.g. "text/html; charset=utf-8") still pass.
$pageMimeTypeAllowed = false;

foreach (explode(',', CRAWL_PAGE_MIME_TYPE) as $pageMimeType) {

    $pageMimeType = trim($pageMimeType);

    // Ignore blank entries left by stray commas in the setting
    if ('' === $pageMimeType) {
        continue;
    }

    if (false !== stripos($contentType, $pageMimeType)) {
        $pageMimeTypeAllowed = true;
        break;
    }
}

if (!$pageMimeTypeAllowed) {
    continue;
}
// Skip page processing pages without returned data
if (!$content = $curl->getContent()) {
continue;

View File

@ -42,6 +42,11 @@ class Curl {
}
/**
 * Content-Type reported by cURL for the transfer on this connection.
 *
 * The value is passed through from curl_getinfo() untouched, so it may
 * include parameters such as a charset. Per the cURL documentation it is
 * null when the server sent no valid Content-Type header, which callers
 * can treat as falsy.
 *
 * @return mixed whatever curl_getinfo() yields for CURLINFO_CONTENT_TYPE
 */
public function getContentType() {

    $contentType = curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);

    return $contentType;
}
public function getContent() {
return $this->_response;