Browse Source

Crawl host page DOM selectors only when the meta robots index/follow condition is enabled

main
ghost 1 year ago
parent
commit
b7c415a8b0
  1. 52
      crontab/crawler.php

52
crontab/crawler.php

@@ -492,32 +492,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$metaKeywords = null;
$metaYggoManifest = null;
// Collect page DOM elements data
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

    // Begin selectors extraction.
    // The parser does not always hand back a DOM object (presumably simple_html_dom,
    // whose str_get_html() returns false for empty or oversized input - verify against
    // the bundled library). Guard the result so find() is never called on a non-object,
    // which would abort the whole crawl queue with a fatal error.
    $html = $content ? str_get_html($content) : false;

    if ($html) {

        // CRAWL_HOST_PAGE_DOM_SELECTORS is a comma-separated list of selectors
        foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

            foreach ($html->find($selector) as $element) {

                // Read the element markup once; elements without content are not stored
                $domElementValue = $element->innertext;

                if (empty($domElementValue)) {
                    continue;
                }

                if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

                    // Put a space in place of line breaks and before closing tags so
                    // adjacent text nodes do not get glued together once tags are
                    // stripped, then collapse whitespace runs into a single space.
                    // preg_replace() yields null on a PCRE error, hence the string cast.
                    $domElementValue = strip_tags(
                        (string) preg_replace(
                            '/[\s]+/',
                            ' ',
                            str_replace(
                                ['<br />', '<br/>', '<br>', '</'],
                                [' ', ' ', ' ', ' </'],
                                $domElementValue
                            )
                        )
                    );
                }

                $db->addHostPageDom(
                    $queueHostPage->hostPageId,
                    time(),
                    $selector,
                    trim($domElementValue)
                );
            }
        }
    }
}
// Parse page content
$dom = new DomDocument();
@@ -594,6 +568,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time());
// Collect page DOM elements data when enabled
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

    // Begin selectors extraction.
    // The parser does not always hand back a DOM object (presumably simple_html_dom,
    // whose str_get_html() returns false for empty or oversized input - verify against
    // the bundled library). Guard the result so find() is never called on a non-object,
    // which would abort the whole crawl queue with a fatal error.
    $html = $content ? str_get_html($content) : false;

    if ($html) {

        // CRAWL_HOST_PAGE_DOM_SELECTORS is a comma-separated list of selectors
        foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

            foreach ($html->find($selector) as $element) {

                // Read the element markup once; elements without content are not stored
                $domElementValue = $element->innertext;

                if (empty($domElementValue)) {
                    continue;
                }

                if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

                    // Put a space in place of line breaks and before closing tags so
                    // adjacent text nodes do not get glued together once tags are
                    // stripped, then collapse whitespace runs into a single space.
                    // preg_replace() yields null on a PCRE error, hence the string cast.
                    $domElementValue = strip_tags(
                        (string) preg_replace(
                            '/[\s]+/',
                            ' ',
                            str_replace(
                                ['<br />', '<br/>', '<br>', '</'],
                                [' ', ' ', ' ', ' </'],
                                $domElementValue
                            )
                        )
                    );
                }

                $db->addHostPageDom(
                    $queueHostPage->hostPageId,
                    time(),
                    $selector,
                    trim($domElementValue)
                );
            }
        }
    }
}
// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

Loading…
Cancel
Save