diff --git a/cli/yggo.php b/cli/yggo.php new file mode 100644 index 0000000..0e304ce --- /dev/null +++ b/cli/yggo.php @@ -0,0 +1,124 @@ +getHostPagesByIndexed() as $hostPage) { + + if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) { + + if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) { + + $hostPagesProcessedTotal++; + + if (!empty($hostPageDescription->data)) { + + $html = str_get_html(base64_decode($hostPageDescription->data)); + + foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { + + foreach($html->find($selector) as $element) { + + if (!empty($element->innertext)) { + + $hostPageDOMAddedTotal++; + + $db->addHostPageDom($hostPage->hostPageId, + time(), + $selector, + trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( + preg_replace('/[\s]+/', + ' ', + str_replace(['
', '
', '
', 'innertext))) : $element->innertext)); + } + } + } + } + } + } + } + + echo sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal) . PHP_EOL; + echo sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal) . PHP_EOL; + exit; + } + + echo PHP_EOL . _('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file') . PHP_EOL; + exit; + + break; + case 'truncate': + + $db->truncateHostPageDom(); + + echo _('hostPageDom table successfully truncated') . PHP_EOL; + exit; + + break; + default: + + echo PHP_EOL . _('undefined action argument') . PHP_EOL; + } + + break; +} + +// Default message +echo '__ ______________ __' . PHP_EOL; +echo '\ \/ / ____/ ____/___ / /' . PHP_EOL; +echo ' \ / / __/ / __/ __ \/ /' . PHP_EOL; +echo ' / / /_/ / /_/ / /_/ /_/' . PHP_EOL; +echo '/_/\____/\____/\____(_)' . PHP_EOL; + +echo PHP_EOL . _('available options:') . PHP_EOL . PHP_EOL; + +echo _(' help - this message') . PHP_EOL; +echo _(' hostPageDom generate - make hostPageDom index based on related hostPage.data field') . PHP_EOL; +echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL . PHP_EOL; + +echo _('get support: https://github.com/YGGverse/YGGo/issues') . PHP_EOL . PHP_EOL;