mirror of https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00

implement MIME content-type crawler filter

parent d945fdfd91
commit f88d2ee9ff
@@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Index pages match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_PAGE_MIME_TYPE', 'text/html');
+
+/*
+ * Index images match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
+
 /*
  * Renew image index by timing offset provided
  *
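Both constants are documented as comma separated lists of MIME types, matched against the Content-Type header of the crawled response. Below is a minimal sketch of how such a list can be checked, allowing for headers that carry parameters (e.g. text/html; charset=UTF-8); the mimeAllowed() helper is illustrative and not part of this commit. Note also that the registered type for JPEG is image/jpeg, and ICO files are usually served as image/x-icon or image/vnd.microsoft.icon; image/jpg and image/ico are common but non-standard aliases.

<?php

// Illustrative helper, not part of this commit: split a comma separated
// MIME list and test whether a Content-Type header matches one entry
function mimeAllowed(string $contentType, string $allowedList) : bool {

  // Content-Type may carry parameters, e.g. "text/html; charset=UTF-8"
  $type = strtolower(trim(strtok($contentType, ';')));

  foreach (explode(',', strtolower($allowedList)) as $allowed) {

    if ($type === trim($allowed)) {

      return true;
    }
  }

  return false;
}

var_dump(mimeAllowed('text/html; charset=UTF-8', 'text/html'));                            // bool(true)
var_dump(mimeAllowed('image/png', 'image/webp,image/png,image/gif,image/jpg,image/ico'));  // bool(true)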
@@ -85,7 +85,7 @@ try {
   // Apply new robots.txt rules
   $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
 
-  foreach ($db->getHostImages($host->hostId) as $hostImage) {
+  foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
 
     if (!$robots->uriAllowed($hostImage->uri)) {
 
@@ -98,7 +98,7 @@ try {
       }
     }
 
-  foreach ($db->getHostPages($host->hostId) as $hostPage) {
+  foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
 
     if (!$robots->uriAllowed($hostPage->uri)) {
 
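The two @TODO markers flag that the new filter only guards freshly crawled content; items already in the index are not yet re-checked against the MIME settings. A rough sketch of what such an update pass could look like follows — a fragment meant for the cleanup loop above, not a standalone script. The $hostImage->mime field and the deleteHostImage() accessor are assumptions for illustration; this commit provides neither.

// Hypothetical sketch of the @TODO above: re-check already indexed images
// against CRAWL_IMAGE_MIME_TYPE. The ->mime field and deleteHostImage()
// accessor are assumed here, not part of this commit
foreach ($db->getHostImages($host->hostId) as $hostImage) {

  if ($hostImage->mime && false === strpos(CRAWL_IMAGE_MIME_TYPE, (string) $hostImage->mime)) {

    $db->deleteHostImage($hostImage->hostImageId);
  }
}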
@@ -230,7 +230,7 @@ try {
       // Update image index anyway, with the current time and http code
       $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
 
-      // Skip next image processing non 200 code
+      // Skip image processing non 200 code
       if (200 != $curl->getCode()) {
 
         continue;
@@ -239,7 +239,19 @@ try {
       // Save image content on data settings enabled
       if (!CRAWL_HOST_DEFAULT_META_ONLY) {
 
-        // Skip next image processing images without returned data
+        // Skip image processing on MIME type not provided
+        if (!$contentType = $curl->getContentType()) {
+
+          continue;
+        }
+
+        // Skip image processing on MIME type not allowed in settings
+        if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
+
+          continue;
+        }
+
+        // Skip image processing images without returned content
         if (!$content = $curl->getContent()) {
 
           continue;
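One caveat on the check above: strpos($contentType, CRAWL_IMAGE_MIME_TYPE) searches the header for the entire comma separated list as a single substring, so with the multi-entry default a header such as image/png can never match. A per-entry comparison along the lines of the sketch below would honor the comma separated contract; this is a sketch, not what the commit ships.

        // Sketch: compare the returned Content-Type against each configured
        // entry instead of against the whole comma separated list at once
        $imageMimeAllowed = false;

        foreach (explode(',', CRAWL_IMAGE_MIME_TYPE) as $mimeType) {

          if (false !== strpos($contentType, trim($mimeType))) {

            $imageMimeAllowed = true;
            break;
          }
        }

        // Skip image processing on MIME type not allowed in settings
        if (!$imageMimeAllowed) {

          continue;
        }

The page filter below has the same shape, though with the single-entry default of text/html the plain substring test works, including when servers append parameters such as text/html; charset=UTF-8.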
@@ -271,13 +283,25 @@ try {
       // Update page index anyway, with the current time and http code
       $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
-      // Skip next page processing non 200 code
+      // Skip page processing non 200 code
       if (200 != $curl->getCode()) {
 
         continue;
       }
 
-      // Skip next page processing pages without returned data
+      // Skip page processing on MIME type not provided
+      if (!$contentType = $curl->getContentType()) {
+
+        continue;
+      }
+
+      // Skip page processing on MIME type not allowed in settings
+      if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
+
+        continue;
+      }
+
+      // Skip page processing pages without returned data
       if (!$content = $curl->getContent()) {
 
         continue;
@@ -42,6 +42,11 @@ class Curl {
 
   }
 
+  public function getContentType() {
+
+    return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
+  }
+
   public function getContent() {
 
     return $this->_response;
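The new getContentType() method wraps curl_getinfo() with CURLINFO_CONTENT_TYPE, which returns the Content-Type header of the completed transfer, or null when the server sent no valid header — exactly the case the new "MIME type not provided" guards skip. A self-contained usage sketch (the URL is a placeholder):

<?php

// Standalone illustration of the call the new method wraps;
// the URL is a placeholder
$connection = curl_init('https://example.com/');

curl_setopt($connection, CURLOPT_RETURNTRANSFER, true);
curl_exec($connection);

// e.g. string "text/html; charset=UTF-8", or null when the server
// sent no valid Content-Type header
var_dump(curl_getinfo($connection, CURLINFO_CONTENT_TYPE));

curl_close($connection);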