Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-02-03 18:35:04 +00:00)
upgrade hostPageDom crawler to Symfony\Component\DomCrawler

commit 0f127ddb91
parent 055b15333e
@@ -31,10 +31,6 @@ require_once(__DIR__ . '/../library/filter.php');
 require_once(__DIR__ . '/../library/mysql.php');
 require_once(__DIR__ . '/../library/helper.php');
 require_once(__DIR__ . '/../library/yggstate.php');
-
-// @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
-// require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
-
 require_once __DIR__ . '/../../vendor/autoload.php';

 // Check disk quota
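Note: with this hunk the crawler's HTML parsing rests on PHP's built-in DOMDocument plus the Composer-managed Symfony component; the bundled simple_html_dom library is gone. Assuming a standard Composer setup (not shown in this commit), the component used below would typically be installed with `composer require symfony/dom-crawler` together with `composer require symfony/css-selector`, since Crawler::filter() relies on the CssSelector component to translate CSS selectors into XPath.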
@@ -666,6 +662,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
 $metaYggoManifestURL = null;

 // Parse page content
+// @TODO refactor to Symfony\Component\DomCrawler\Crawler
 $dom = new DomDocument();

 if ($encoding = mb_detect_encoding($content)) {
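The legacy DomDocument pass is left in place for now; this hunk only adds a @TODO marker. For context, a minimal sketch of the detect-encoding-then-parse pattern the surrounding code follows (an illustration of the pattern, not the project's exact code):

    // Detect the source encoding so multibyte text survives parsing
    $dom = new DOMDocument();

    if ($encoding = mb_detect_encoding($content)) {
        // Normalize to UTF-8 before handing the markup to DOMDocument
        $content = mb_convert_encoding($content, 'UTF-8', $encoding);
    }

    // Real-world HTML is rarely well-formed; keep libxml warnings quiet
    libxml_use_internal_errors(true);
    $dom->loadHTML($content);
    libxml_clear_errors();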
@@ -738,33 +735,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
   $metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
   time());
-
-// Collect page DOM elements data on enabled
-/* @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
-if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
-
-  // Begin selectors extraction
-  $html = str_get_html($content);
-
-  foreach ((array) explode(';', $hostPageDomSelectors) as $selector) {
-
-    foreach ($html->find($selector) as $element) {
-
-      if (!empty($element->innertext)) {
-
-        $db->addHostPageDom($queueHostPage->hostPageId,
-          time(),
-          $selector,
-          trim(Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGE_DOM_STRIP_TAGS', DEFAULT_HOST_PAGE_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
-            ' ',
-            str_replace(['<br />', '<br/>', '<br>', '</'],
-              [' ', ' ', ' ', ' </'],
-              $element->innertext))) : $element->innertext));
-      }
-    }
-  }
-}
-*/

 // Skip page links following with meta robots:nofollow attribute
 foreach (@$dom->getElementsByTagName('meta') as $meta) {
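The context lines after the removed block show that link following still honours meta robots:nofollow via the legacy DOMDocument pass. A minimal sketch of that pattern ($metaRobotsNofollow is a hypothetical name for illustration, not taken from the diff):

    // Inspect <meta name="robots"> and flag nofollow before collecting links
    $metaRobotsNofollow = false;

    foreach ($dom->getElementsByTagName('meta') as $meta) {
        if ('robots' === strtolower($meta->getAttribute('name')) &&
            false !== strpos(strtolower($meta->getAttribute('content')), 'nofollow')) {
            $metaRobotsNofollow = true;
        }
    }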
@@ -1020,6 +990,70 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
     ];
   }

+  // Init DOM crawler
+  $crawler = new Symfony\Component\DomCrawler\Crawler();
+  $crawler->addHtmlContent($content);
+
+  // Process selectors configuration
+  if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS))) {
+
+    foreach ($hostPageDomSelectors as $selector => $settings) {
+
+      // Extract target selector data
+      foreach ($crawler->filter($selector) as $data) {
+
+        foreach ($data->childNodes as $node) {
+
+          $value = trim($node->ownerDocument->saveHtml());
+
+          // Apply selector settings
+          foreach ($settings as $key => $value) {
+
+            switch ($key) {
+
+              case 'strip_tags':
+
+                if (!isset($value->enabled)) {
+
+                  continue;
+                }
+
+                if (false === $value->enabled) {
+
+                  continue;
+                }
+
+                if (!isset($value->allowed_tags)) {
+
+                  continue;
+                }
+
+                $value = strip_tags($value, $value->allowed_tags);
+
+                break;
+            }
+          }
+
+          // Skip empty selector values save
+          if (empty($value)) {
+
+            continue;
+          }
+
+          // Save selector value
+          $db->addHostPageDom(
+            $queueHostPage->hostPageId,
+            $selector,
+            $value,
+            time()
+          );
+
+          $hostPageDomAddedTotal++;
+        }
+      }
+    }
+  }

 // Process links collected
 foreach ($links as $link) {
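Two notes on the new block. First, the PAGES_DOM_SELECTORS setting changes shape: the old value was a semicolon-delimited selector string, while the new default is JSON (note the json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS) fallback), mapping each CSS selector to per-selector settings such as strip_tags. Second, two details of the committed loop are worth flagging: foreach ($settings as $key => $value) reuses $value and so overwrites the extracted HTML, and continue inside a PHP switch behaves like break (and emits a warning since PHP 7.3; continue 2 is the usual intent). Also, saveHtml() called with no argument, as above, serializes the entire document rather than the current node. A standalone sketch of the intended flow with those issues avoided; the sample markup and selector config are illustrative, not project defaults:

    <?php

    require_once __DIR__ . '/vendor/autoload.php';

    use Symfony\Component\DomCrawler\Crawler;

    // Illustrative page body and selector config (not project defaults):
    // each CSS selector maps to its own settings object, as in the new format
    $content   = '<html><body><article>Hello <b>world</b></article></body></html>';
    $selectors = json_decode('{"article": {"strip_tags": {"enabled": true, "allowed_tags": "<b>"}}}');

    $crawler = new Crawler();
    $crawler->addHtmlContent($content);

    foreach ($selectors as $selector => $settings) {

        foreach ($crawler->filter($selector) as $node) {

            // saveHTML($node) serializes only this node; with no argument it
            // would return the whole document
            $value = trim($node->ownerDocument->saveHTML($node));

            // Use a name other than $value for the settings entry so the
            // extracted HTML is not clobbered
            foreach ($settings as $key => $setting) {

                if ('strip_tags' === $key && !empty($setting->enabled)) {

                    $value = strip_tags($value, $setting->allowed_tags ?? '');
                }
            }

            // Skip empty extractions, mirroring the committed guard
            if (empty($value)) {

                continue;
            }

            echo $value . PHP_EOL; // the crawler persists this via $db->addHostPageDom()
        }
    }

Run against the sample markup above, this prints "Hello <b>world</b>": the article element is matched, serialized, and stripped down to the allowed tags.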