|
|
@ -492,32 +492,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
$metaKeywords = null; |
|
|
|
$metaKeywords = null; |
|
|
|
$metaYggoManifest = null; |
|
|
|
$metaYggoManifest = null; |
|
|
|
|
|
|
|
|
|
|
|
// Collect page DOM elements data when selectors are configured
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  // NOTE(review): str_get_html() looks like the simple_html_dom helper, which returns
  // false for empty or oversized input - confirm. Whatever the source, a non-object
  // result must never reach ->find(), otherwise the whole crawl loop dies on a fatal error.
  $html = str_get_html($content);

  if ($html) {

    // CRAWL_HOST_PAGE_DOM_SELECTORS is a comma-separated list; every entry is queried on its own
    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      foreach ($html->find($selector) as $element) {

        // Elements without inner content are not stored
        if (!empty($element->innertext)) {

          $value = $element->innertext;

          if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

            // Put a space in place of line breaks and before closing tags so that
            // words from adjacent elements do not glue together once tags are removed,
            // then collapse whitespace runs and strip the markup
            $value = strip_tags(
              preg_replace('/[\s]+/',
                           ' ',
                           str_replace(['<br />', '<br/>', '<br>', '</'],
                                       [' ', ' ', ' ', ' </'],
                                       $value)));
          }

          $db->addHostPageDom($queueHostPage->hostPageId,
                              time(),
                              $selector,
                              trim($value));
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Parse page content |
|
|
|
// Parse page content |
|
|
|
$dom = new DomDocument(); |
|
|
|
$dom = new DomDocument(); |
|
|
|
|
|
|
|
|
|
|
@ -594,6 +568,32 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
time()); |
|
|
|
time()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Collect page DOM elements data when selectors are configured
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  // NOTE(review): str_get_html() looks like the simple_html_dom helper, which returns
  // false for empty or oversized input - confirm. $content may be empty at this point
  // (the page update call just above handles a falsy $content), so the parse result is
  // checked before use: a non-object must never reach ->find(), otherwise the whole
  // crawl loop dies on a fatal error.
  $html = str_get_html($content);

  if ($html) {

    // CRAWL_HOST_PAGE_DOM_SELECTORS is a comma-separated list; every entry is queried on its own
    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      foreach ($html->find($selector) as $element) {

        // Elements without inner content are not stored
        if (!empty($element->innertext)) {

          $value = $element->innertext;

          if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

            // Put a space in place of line breaks and before closing tags so that
            // words from adjacent elements do not glue together once tags are removed,
            // then collapse whitespace runs and strip the markup
            $value = strip_tags(
              preg_replace('/[\s]+/',
                           ' ',
                           str_replace(['<br />', '<br/>', '<br>', '</'],
                                       [' ', ' ', ' ', ' </'],
                                       $value)));
          }

          $db->addHostPageDom($queueHostPage->hostPageId,
                              time(),
                              $selector,
                              trim($value));
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
// Update manifest registry |
|
|
|
// Update manifest registry |
|
|
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { |
|
|
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { |
|
|
|
|
|
|
|
|
|
|
|