Ban host page when its encoding cannot be detected

This commit is contained in:
ghost 2023-06-16 13:23:52 +03:00
parent d2469e9adc
commit d96abb8ea8

View File

@@ -494,7 +494,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Parse content
$dom = new DomDocument();
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', mb_detect_encoding($content)) . $content);
if ($encoding = mb_detect_encoding($content)) {
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
} else {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
// Skip index page links without titles
$title = @$dom->getElementsByTagName('title');