|
|
|
@ -449,25 +449,21 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
@@ -449,25 +449,21 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Parse index MIME |
|
|
|
|
$hostPageIsDom = false; |
|
|
|
|
// Parse MIME |
|
|
|
|
$hostPageIsHtml = false; |
|
|
|
|
$hostPageInMime = false; |
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { |
|
|
|
|
|
|
|
|
|
$mime = Filter::mime($mime); |
|
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
// Check for DOM |
|
|
|
|
if (false !== stripos('text/html', $mime)) { |
|
|
|
|
// Check for HTML page |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) { |
|
|
|
|
|
|
|
|
|
$hostPageIsDom = true; |
|
|
|
|
$hostPageInMime = true; |
|
|
|
|
break; |
|
|
|
|
$hostPageIsHtml = true; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), $mime)) { |
|
|
|
|
|
|
|
|
|
$hostPageInMime = true; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
@ -503,7 +499,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
@@ -503,7 +499,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|
|
|
|
$yggoManifest = null; |
|
|
|
|
|
|
|
|
|
// Is DOM content |
|
|
|
|
if ($hostPageIsDom) { |
|
|
|
|
if ($hostPageIsHtml) { |
|
|
|
|
|
|
|
|
|
// Parse content |
|
|
|
|
$dom = new DomDocument(); |
|
|
|
@ -598,10 +594,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
@@ -598,10 +594,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { |
|
|
|
|
|
|
|
|
|
$mime = Filter::mime($mime); |
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), $mime)) { |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
$snapLocal = true; |
|
|
|
|
break; |
|
|
|
@ -614,10 +608,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
@@ -614,10 +608,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { |
|
|
|
|
|
|
|
|
|
$mime = Filter::mime($mime); |
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), $mime)) { |
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
$snapMega = true; |
|
|
|
|
break; |
|
|
|
|