mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 21:44:59 +00:00
crawl host page DOM selectors only when the meta robots index/follow condition is enabled
This commit is contained in:
parent
5c0cee7561
commit
b7c415a8b0
@ -492,32 +492,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$metaKeywords = null;
|
$metaKeywords = null;
|
||||||
$metaYggoManifest = null;
|
$metaYggoManifest = null;
|
||||||
|
|
||||||
// Collect page DOM elements data
|
|
||||||
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
|
|
||||||
|
|
||||||
// Begin selectors extraction
|
|
||||||
$html = str_get_html($content);
|
|
||||||
|
|
||||||
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
|
||||||
|
|
||||||
foreach($html->find($selector) as $element) {
|
|
||||||
|
|
||||||
if (!empty($element->innertext)) {
|
|
||||||
|
|
||||||
$db->addHostPageDom($queueHostPage->hostPageId,
|
|
||||||
time(),
|
|
||||||
$selector,
|
|
||||||
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
|
||||||
preg_replace('/[\s]+/',
|
|
||||||
' ',
|
|
||||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
|
||||||
[' ', ' ', ' ', ' </'],
|
|
||||||
$element->innertext))) : $element->innertext));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse page content
|
// Parse page content
|
||||||
$dom = new DomDocument();
|
$dom = new DomDocument();
|
||||||
|
|
||||||
@ -594,13 +568,39 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
||||||
time());
|
time());
|
||||||
|
|
||||||
|
// Collect page DOM elements data when enabled
|
||||||
|
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
|
||||||
|
|
||||||
|
// Begin selectors extraction
|
||||||
|
$html = str_get_html($content);
|
||||||
|
|
||||||
|
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
||||||
|
|
||||||
|
foreach($html->find($selector) as $element) {
|
||||||
|
|
||||||
|
if (!empty($element->innertext)) {
|
||||||
|
|
||||||
|
$db->addHostPageDom($queueHostPage->hostPageId,
|
||||||
|
time(),
|
||||||
|
$selector,
|
||||||
|
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
||||||
|
preg_replace('/[\s]+/',
|
||||||
|
' ',
|
||||||
|
str_replace(['<br />', '<br/>', '<br>', '</'],
|
||||||
|
[' ', ' ', ' ', ' </'],
|
||||||
|
$element->innertext))) : $element->innertext));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Update manifest registry
|
// Update manifest registry
|
||||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||||
|
|
||||||
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
||||||
|
|
||||||
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
||||||
$db->addManifest($metaYggoManifestCRC32,
|
$db->addManifest($metaYggoManifestCRC32,
|
||||||
$metaYggoManifest,
|
$metaYggoManifest,
|
||||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
||||||
time());
|
time());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user