mirror of https://github.com/YGGverse/YGGo.git (synced 2025-02-03 10:25:52 +00:00)

implement MIME content-type crawler filter

commit f88d2ee9ff (parent d945fdfd91)
@@ -170,6 +170,22 @@ define('CRAWL_MANIFEST_LIMIT', 10);
  */
 define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
 
+/*
+ * Index pages match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_PAGE_MIME_TYPE', 'text/html');
+
+/*
+ * Index images match MIME types
+ *
+ * comma separated
+ *
+ */
+define('CRAWL_IMAGE_MIME_TYPE', 'image/webp,image/png,image/gif,image/jpg,image/ico');
+
 /*
  * Renew image index by timing offset provided
  *
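Both new constants are plain comma-separated strings, and the crawler hunks below test them with a single strpos() call against the response's Content-Type header. As a minimal sketch of an alternative, per-entry way to consume such a list (the helper name and splitting logic are illustrative only, not part of this commit):

// Illustrative helper, not project code: check a Content-Type header against a
// comma-separated allowlist such as CRAWL_IMAGE_MIME_TYPE or CRAWL_PAGE_MIME_TYPE
function mimeTypeAllowed(string $contentType, string $allowedList): bool {

  foreach (explode(',', $allowedList) as $mime) {

    // Substring match keeps headers that carry parameters, e.g. "text/html; charset=UTF-8"
    if (false !== strpos($contentType, trim($mime))) {

      return true;
    }
  }

  return false;
}

// e.g. mimeTypeAllowed('image/png', 'image/webp,image/png,image/gif,image/jpg,image/ico') === true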
@@ -85,7 +85,7 @@ try {
 // Apply new robots.txt rules
 $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
 
-foreach ($db->getHostImages($host->hostId) as $hostImage) {
+foreach ($db->getHostImages($host->hostId) as $hostImage) { // @TODO implement CRAWL_IMAGE_MIME_TYPE updates
 
   if (!$robots->uriAllowed($hostImage->uri)) {
 
@@ -98,7 +98,7 @@ try {
   }
 }
 
-foreach ($db->getHostPages($host->hostId) as $hostPage) {
+foreach ($db->getHostPages($host->hostId) as $hostPage) { // @TODO implement CRAWL_PAGE_MIME_TYPE updates
 
   if (!$robots->uriAllowed($hostPage->uri)) {
 
@@ -230,7 +230,7 @@ try {
 // Update image index anyway, with the current time and http code
 $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
 
-// Skip next image processing non 200 code
+// Skip image processing non 200 code
 if (200 != $curl->getCode()) {
 
   continue;
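The non-200 guard relies on Curl::getCode(), which already exists in the repository and is not part of this diff; judging from the new getContentType() in the last hunk below, it presumably wraps curl_getinfo() on the same handle. A sketch under that assumption:

// Assumed shape of the existing method, shown for context only - not added by this commit
public function getCode() {

  return (int) curl_getinfo($this->_connection, CURLINFO_HTTP_CODE);
}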
@@ -239,7 +239,19 @@ try {
 // Save image content on data settings enabled
 if (!CRAWL_HOST_DEFAULT_META_ONLY) {
 
-  // Skip next image processing images without returned data
+  // Skip image processing on MIME type not provided
+  if (!$contentType = $curl->getContentType()) {
+
+    continue;
+  }
+
+  // Skip image processing on MIME type not allowed in settings
+  if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
+
+    continue;
+  }
+
+  // Skip image processing images without returned content
   if (!$content = $curl->getContent()) {
 
     continue;
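Note the argument order: strpos($contentType, CRAWL_IMAGE_MIME_TYPE) searches for the whole comma-separated allowlist inside the Content-Type header. A typical image header such as "image/png" cannot contain the full multi-entry default list, so this check would skip such images. If the intent is the reverse lookup, a sketch would swap the arguments (this variant is an assumption about intent, not code from the commit):

// Sketch, inside the same crawl loop: look the header up inside the allowlist instead
if (false === strpos(CRAWL_IMAGE_MIME_TYPE, $contentType)) {

  continue;
}

The per-entry helper sketched after the config hunk above sidesteps the ambiguity altogether.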
@@ -271,13 +283,25 @@ try {
 // Update page index anyway, with the current time and http code
 $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
 
-// Skip next page processing non 200 code
+// Skip page processing non 200 code
 if (200 != $curl->getCode()) {
 
   continue;
 }
 
-// Skip next page processing pages without returned data
+// Skip page processing on MIME type not provided
+if (!$contentType = $curl->getContentType()) {
+
+  continue;
+}
+
+// Skip page processing on MIME type not allowed in settings
+if (false === strpos($contentType, CRAWL_PAGE_MIME_TYPE)) {
+
+  continue;
+}
+
+// Skip page processing pages without returned data
 if (!$content = $curl->getContent()) {
 
   continue;
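For pages the constant is a single value, 'text/html', so the substring check also accepts headers that carry parameters. The strict false === comparison matters here because a match at offset 0 is falsy; an illustration with a made-up header value:

// Illustrative values only, not from the commit
$contentType = 'text/html; charset=UTF-8';                       // typical page header

var_dump(strpos($contentType, CRAWL_PAGE_MIME_TYPE));            // int(0)      - match at offset 0
var_dump(false === strpos($contentType, CRAWL_PAGE_MIME_TYPE));  // bool(false) - page is processed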
@@ -42,6 +42,11 @@ class Curl {
 
   }
 
+  public function getContentType() {
+
+    return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
+  }
+
   public function getContent() {
 
     return $this->_response;
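CURLINFO_CONTENT_TYPE asks libcurl for the Content-Type header of the last transfer; when the server sent no valid header, curl_getinfo() returns null, which is why the crawler guards with if (!$contentType = $curl->getContentType()). A stand-alone illustration of the same call, outside the project's Curl wrapper:

// Plain-curl illustration (not project code) of what getContentType() returns
$ch = curl_init('https://example.com/');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_exec($ch);

$contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE); // e.g. "text/html; charset=UTF-8", or null
curl_close($ch);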