Browse Source

fix multimedia snaps

main
ghost 1 year ago
parent
commit
efbbf19601
  1. 208
      crontab/crawler.php

208
crontab/crawler.php

@@ -586,110 +586,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
continue;
}
// Is HTML document
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
// Define variables
$metaDescription = null;
$metaKeywords = null;
$metaYggoManifest = null;
// Parse page content
$dom = new DomDocument();
if ($encoding = mb_detect_encoding($content)) {
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
} else {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
// Skip index page links without title tag
$title = @$dom->getElementsByTagName('title');
if ($title->length == 0) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
} else {
$metaTitle = $title->item(0)->nodeValue;
}
// Get optional page meta data
foreach (@$dom->getElementsByTagName('meta') as $meta) {
if (@$meta->getAttribute('name') == 'description') {
$metaDescription = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'keywords') {
$metaKeywords = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'robots') {
$metaRobots = @$meta->getAttribute('content');
// Ban page with meta robots:noindex attribute
if (false !== stripos($metaRobots, 'noindex')) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
continue;
}
}
// Grab meta yggo:manifest link when available
if (@$meta->getAttribute('name') == 'yggo:manifest') {
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
}
}
// Add queued page description if not exists
$db->addHostPageDescription($queueHostPage->hostPageId,
$metaTitle,
$metaDescription ? Filter::pageDescription($metaDescription) : null,
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time());
// Collect page DOM elements data on enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
// Begin selectors extraction
$html = str_get_html($content);
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
foreach($html->find($selector) as $element) {
if (!empty($element->innertext)) {
$db->addHostPageDom($queueHostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
}
}
}
}
// Begin snaps
if (SNAP_STORAGE) {
@@ -820,6 +716,110 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
}
}
// Is HTML document
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
// Define variables
$metaDescription = null;
$metaKeywords = null;
$metaYggoManifest = null;
// Parse page content
$dom = new DomDocument();
if ($encoding = mb_detect_encoding($content)) {
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
} else {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
// Skip index page links without title tag
$title = @$dom->getElementsByTagName('title');
if ($title->length == 0) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
} else {
$metaTitle = $title->item(0)->nodeValue;
}
// Get optional page meta data
foreach (@$dom->getElementsByTagName('meta') as $meta) {
if (@$meta->getAttribute('name') == 'description') {
$metaDescription = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'keywords') {
$metaKeywords = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'robots') {
$metaRobots = @$meta->getAttribute('content');
// Ban page with meta robots:noindex attribute
if (false !== stripos($metaRobots, 'noindex')) {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
continue;
}
}
// Grab meta yggo:manifest link when available
if (@$meta->getAttribute('name') == 'yggo:manifest') {
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
}
}
// Add queued page description if not exists
$db->addHostPageDescription($queueHostPage->hostPageId,
$metaTitle,
$metaDescription ? Filter::pageDescription($metaDescription) : null,
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time());
// Collect page DOM elements data on enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
// Begin selectors extraction
$html = str_get_html($content);
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
foreach($html->find($selector) as $element) {
if (!empty($element->innertext)) {
$db->addHostPageDom($queueHostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
}
}
}
}
// Skip page links following with meta robots:nofollow attribute
foreach (@$dom->getElementsByTagName('meta') as $meta) {

Loading…
Cancel
Save