From 8fd422b5c2ca43dd0cef3ea0628e5b589674c859 Mon Sep 17 00:00:00 2001
From: ghost
Date: Thu, 17 Aug 2023 12:58:38 +0300
Subject: [PATCH] generate hostPageDom target value based on source selector

---
 src/cli/yggo.php | 122 +++++++++++++++++++++++------------------------
 1 file changed, 60 insertions(+), 62 deletions(-)

diff --git a/src/cli/yggo.php b/src/cli/yggo.php
index dfe2b23..0467d72 100644
--- a/src/cli/yggo.php
+++ b/src/cli/yggo.php
@@ -473,7 +473,7 @@ if (!empty($argv[1])) {
 
       switch ($argv[2]) {
 
-        case 'generate':
+        case 'parse': // build [selector target] hostPageDom rows from stored [selector source] rows
 
           // Validate hostId
           if (empty($argv[3])) {
@@ -488,10 +488,17 @@ if (!empty($argv[1])) {
             exit;
           }
 
-          // Validate selector
+          // Validate selector source
           if (empty($argv[4])) {
 
-            CLI::danger(_('CSS selector required'));
+            CLI::danger(_('CSS selector source required'));
+            exit;
+          }
+
+          // Validate selector target
+          if (empty($argv[5])) {
+
+            CLI::danger(_('CSS selector target required'));
             exit;
           }
 
@@ -500,92 +507,83 @@ if (!empty($argv[1])) {
           $hostPagesSkippedTotal = 0;
           $hostPageDomAddedTotal = 0;
 
-          // Begin selectors extraction
-          foreach ($db->getHostPages($argv[3]) as $hostPage) {
+          try {
 
-            $hostPagesProcessedTotal++;
+            $db->beginTransaction();
 
-            if (false === stripos(Filter::mime($hostPage->mime), 'text/html')) {
+            // Begin selectors values processing by hostId
+            foreach ($db->getHostPages($argv[3]) as $hostPage) {
 
-              CLI::warning(sprintf(_('not supported MIME type for hostPageId "%s", skipped'), $hostPage->hostPageId));
+              $hostPagesProcessedTotal++;
 
-              $hostPagesSkippedTotal++;
+              if (!$hostPageDomSelectorSource = $db->findLastHostPageDomBySelector($hostPage->hostPageId, $argv[4])) {
 
-              continue;
-            }
+                CLI::warning(sprintf(_('[selector source "%s"] not found for hostPageId "%s", skipped'), $argv[4], $hostPage->hostPageId));
 
-            if (!$hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
+                $hostPagesSkippedTotal++;
 
-              CLI::warning(sprintf(_('last hostPageId "%s" description empty, skipped'), $hostPage->hostPageId));
+                continue;
+              }
 
-              $hostPagesSkippedTotal++;
+              if (empty($hostPageDomSelectorSource->value)) {
 
-              continue;
-            }
+                CLI::warning(sprintf(_('[selector source "%s"] value is empty for hostPageId "%s", skipped'), $argv[4], $hostPage->hostPageId));
 
-            if (empty($hostPageDescription->data)) {
+                $hostPagesSkippedTotal++;
 
-              CLI::warning(sprintf(_('empty hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
+                continue;
+              }
 
-              $hostPagesSkippedTotal++;
+              // Init crawler on the HTML stored for the source selector
+              $crawler = new Symfony\Component\DomCrawler\Crawler();
+              $crawler->addHtmlContent($hostPageDomSelectorSource->value);
 
-              continue;
-            }
+              // Extract target selector data, one hostPageDom row per child node of every match
+              foreach ($crawler->filter($argv[5]) as $selectorTarget) {
 
-            if (!$html = base64_decode($hostPageDescription->data)) {
+                foreach ($selectorTarget->childNodes as $node) {
 
-              CLI::danger(sprintf(_('could not decode base64 for hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
+                  // Serialize this child node only: saveHTML() without an argument dumps the whole document
+                  $value = trim($node->ownerDocument->saveHTML($node));
 
-              $hostPagesSkippedTotal++;
+                  if (empty($value)) {
 
-              continue;
-            }
+                    CLI::warning(sprintf(_('[selector target "%s"] value is empty for hostPageId "%s", skipped'), $argv[5], $hostPage->hostPageId));
 
-            if (empty($html)) {
+                    $hostPagesSkippedTotal++;
 
-              CLI::warning(sprintf(_('empty decoded hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
+                    continue;
+                  }
 
-              $hostPagesSkippedTotal++;
+                  // Save selector value
+                  $db->addHostPageDom(
+                    $hostPage->hostPageId,
+                    $argv[5],
+                    $value,
+                    time()
+                  );
 
-              continue;
+                  $hostPageDomAddedTotal++;
+                }
+              }
             }
 
-            // Init crawler
-            $crawler = new Symfony\Component\DomCrawler\Crawler();
-            $crawler->addHtmlContent($html);
-
-            $selector = trim($argv[4]);
+            $db->commit();
 
-            if ($elements = $crawler->filter($selector)) {
-
-              foreach ($elements as $element) {
-
-                $value = trim($element->nodeValue);
-                $value = strip_tags($value, empty($argv[5]) ? null : $argv[5]);
+            CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
+            CLI::success(sprintf(_('Host pages skipped: %s'), $hostPagesSkippedTotal));
+            CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDomAddedTotal));
+            exit;
 
-                if (empty($value)) {
+          } catch (Throwable $e) { // Throwable also covers Error, so any failure rolls the transaction back
 
-                  continue;
-                }
+            $db->rollBack();
 
-                // Save selector value
-                $db->addHostPageDom(
-                  $hostPage->hostPageId,
-                  $selector,
-                  $value,
-                  time()
-                );
+            var_dump($e);
 
-                $hostPageDomAddedTotal++;
-              }
-            }
+            exit;
           }
 
-          CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
-          CLI::success(sprintf(_('Host pages skipped: %s'), $hostPagesSkippedTotal));
-          CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDomAddedTotal));
-          exit;
-
           break;
 
         case 'truncate':
@@ -637,9 +635,9 @@ CLI::default(' db - scan database registry for new
 CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
 CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
 CLI::break();
-CLI::default(' hostPageDom ');
-CLI::default(' generate [hostId] [selector] [allowed tags] - generate hostPageDom values based on indexed hostPage.data field');
-CLI::default(' truncate - flush hostPageDom table');
+CLI::default(' hostPageDom ');
+CLI::default(' parse [hostId] [selector source] [selector target] - parse new hostPageDom.selector target based on hostPageDom.selector source');
+CLI::default(' truncate - flush hostPageDom table');
 CLI::break();
 
 