replace simple_html_dom library with Symfony\Component\DomCrawler

This commit is contained in:
ghost 2023-08-16 22:01:10 +03:00
parent caa0df67ee
commit 70db9620ec
6 changed files with 134 additions and 2443 deletions

View File

@ -278,7 +278,6 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### License
* Engine sources [MIT License](https://github.com/YGGverse/YGGo/blob/main/LICENSE)
* HTML parser [simple_html_dom](http://sourceforge.net/projects/simplehtmldom/)
* Home page animation by [alvarotrigo](https://codepen.io/alvarotrigo/pen/GRvYNax)
* CLI logo by [patorjk.com](https://patorjk.com/software/taag/#p=display&f=Slant&t=YGGo!)

View File

@ -5,7 +5,9 @@
"require": {
"php": ">=8.1",
"yggverse/parser": ">=0.1.0",
"yggverse/cache": ">=0.3.0"
"yggverse/cache": ">=0.3.0",
"symfony/dom-crawler": "^6.3",
"symfony/css-selector": "^6.3"
},
"license": "MIT",
"autoload": {

View File

@ -6,13 +6,12 @@ require_once(__DIR__ . '/../library/cli.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
require_once __DIR__ . '/../../vendor/autoload.php';
// CLI only to prevent https server connection timeout
if (php_sapi_name() != 'cli') {
CLI::danger(_('supported command line interface only'));
CLI::break();
exit;
}
@ -22,7 +21,6 @@ $semaphore = sem_get(crc32('crontab.cleaner'), 1);
if (false === sem_acquire($semaphore, true)) {
CLI::danger(_('stop crontab.cleaner is running in another thread.'));
CLI::break();
exit;
}
@ -32,7 +30,6 @@ $semaphore = sem_get(crc32('crontab.crawler'), 1);
if (false === sem_acquire($semaphore, true)) {
CLI::danger(_('stop crontab.crawler is running in another thread.'));
CLI::break();
exit;
}
@ -42,7 +39,6 @@ $semaphore = sem_get(crc32('cli.yggo'), 1);
if (false === sem_acquire($semaphore, true)) {
CLI::danger(_('process locked by another thread.'));
CLI::break();
exit;
}
@ -202,7 +198,6 @@ if (!empty($argv[1])) {
// @TODO
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
CLI::danger(_('make sure you have backups then remove this alert.'));
CLI::break();
exit;
switch ($argv[3]) {
@ -285,7 +280,6 @@ if (!empty($argv[1])) {
} else {
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
CLI::break();
exit;
}
@ -458,7 +452,6 @@ if (!empty($argv[1])) {
}
CLI::notice(_('hostPage rank fields successfully updated!'));
CLI::break();
exit;
break;
@ -474,83 +467,138 @@ if (!empty($argv[1])) {
if (empty($argv[2])) {
switch ($argv[2]) {
CLI::danger(_('action required'));
exit;
}
case 'generate':
switch ($argv[2]) {
$selectors = [];
case 'generate':
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) {
// Validate hostId
if (empty($argv[3])) {
if (!empty($selector)) {
CLI::danger(_('hostId required'));
exit;
}
$selectors[] = trim($selector);
}
if (!$db->getHost($argv[3])) {
CLI::danger(_('hostId not found'));
exit;
}
// Validate selector
if (empty($argv[4])) {
CLI::danger(_('CSS selector required'));
exit;
}
// Init variables
$hostPagesProcessedTotal = 0;
$hostPagesSkippedTotal = 0;
$hostPageDomAddedTotal = 0;
// Begin selectors extraction
foreach ($db->getHostPages($argv[3]) as $hostPage) {
$hostPagesProcessedTotal++;
if (false === stripos(Filter::mime($hostPage->mime), 'text/html')) {
CLI::warning(sprintf(_('not supported MIME type for hostPageId "%s", skipped'), $hostPage->hostPageId));
$hostPagesSkippedTotal++;
continue;
}
if ($selectors) {
if (!$hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
// Init variables
$hostPagesProcessedTotal = 0;
$hostPageDOMAddedTotal = 0;
CLI::warning(sprintf(_('last hostPageId "%s" description empty, skipped'), $hostPage->hostPageId));
// Begin selectors extraction
foreach ($db->getHostPagesByIndexed() as $hostPage) {
$hostPagesSkippedTotal++;
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
continue;
}
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
if (empty($hostPageDescription->data)) {
$hostPagesProcessedTotal++;
CLI::warning(sprintf(_('empty hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
if (!empty($hostPageDescription->data)) {
$hostPagesSkippedTotal++;
$html = str_get_html(base64_decode($hostPageDescription->data));
continue;
}
foreach ($selectors as $selector) {
if (!$html = base64_decode($hostPageDescription->data)) {
foreach($html->find($selector) as $element) {
CLI::danger(sprintf(_('could not decode base64 for hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
if (!empty($element->innertext)) {
$hostPagesSkippedTotal++;
$hostPageDOMAddedTotal++;
continue;
}
$db->addHostPageDom($hostPage->hostPageId,
time(),
$selector,
trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
}
}
}
}
}
if (empty($html)) {
CLI::warning(sprintf(_('empty decoded hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
$hostPagesSkippedTotal++;
continue;
}
// Init crawler
$crawler = new Symfony\Component\DomCrawler\Crawler();
$crawler->addHtmlContent($html);
$selector = trim($argv[4]);
if ($elements = $crawler->filter($selector)) {
foreach ($elements as $element) {
$value = trim($element->nodeValue);
$value = strip_tags($value, empty($argv[5]) ? null : $argv[5]);
if (empty($value)) {
continue;
}
// Save selector value
$db->addHostPageDom(
$hostPage->hostPageId,
$selector,
$value,
time()
);
$hostPageDomAddedTotal++;
}
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
exit;
}
}
CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file'));
CLI::break();
exit;
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
CLI::success(sprintf(_('Host pages skipped: %s'), $hostPagesSkippedTotal));
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDomAddedTotal));
exit;
break;
case 'truncate':
break;
case 'truncate':
$db->truncateHostPageDom();
$db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated'));
CLI::break();
exit;
CLI::success(_('hostPageDom table successfully truncated'));
exit;
break;
}
break;
default:
CLI::danger(_('unknown action'));
exit;
}
break;
}
@ -566,32 +614,32 @@ CLI::default('/_/\____/\____/\____(_)' );
CLI::break();
CLI::default('available options:');
CLI::break();
CLI::default(' help - this message');
CLI::default(' help - this message');
CLI::break();
CLI::default(' db ');
CLI::default(' optimize - optimize all tables');
CLI::default(' db ');
CLI::default(' optimize - optimize all tables');
CLI::break();
CLI::default(' crontab ');
CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue');
CLI::default(' crontab ');
CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue');
CLI::break();
CLI::default(' hostPage ');
CLI::default(' rank ');
CLI::default(' reindex - reindex hostPage.rank fields');
CLI::default(' hostPage ');
CLI::default(' rank ');
CLI::default(' reindex - reindex hostPage.rank fields');
CLI::break();
CLI::default(' hostSetting ');
CLI::default(' set [hostId] [key] [value] - set formatted value by hostId and key');
CLI::default(' get [hostId] [key] - get formatted value by hostId and key');
CLI::default(' set [hostId] [key] [value] - set formatted value by hostId and key');
CLI::default(' get [hostId] [key] - get formatted value by hostId and key');
CLI::break();
CLI::default(' hostPageSnap ');
CLI::default(' repair ');
CLI::default(' db - scan database registry for new or deprecated snap files');
CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
CLI::default(' hostPageSnap ');
CLI::default(' repair ');
CLI::default(' db - scan database registry for new or deprecated snap files');
CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
CLI::break();
CLI::default(' hostPageDom ');
CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field');
CLI::default(' truncate - flush hostPageDom table');
CLI::default(' hostPageDom ');
CLI::default(' generate [hostId] [selector] [allowed tags] - generate hostPageDom values based on indexed hostPage.data field');
CLI::default(' truncate - flush hostPageDom table');
CLI::break();

View File

@ -31,7 +31,9 @@ require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/yggstate.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
// require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
require_once __DIR__ . '/../../vendor/autoload.php';
@ -738,6 +740,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
time());
// Collect page DOM elements data on enabled
/* @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
// Begin selectors extraction
@ -761,6 +764,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
}
}
}
*/
// Skip page links following with meta robots:nofollow attribute
foreach (@$dom->getElementsByTagName('meta') as $meta) {

View File

@ -314,15 +314,6 @@ class MySQL {
return $result;
}
public function getHostPagesByIndexed() {
$this->_debug->query->select->total++;
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
return $query->fetchAll();
}
public function getHostPagesByLimit(int $hostId, int $limit) {
$this->_debug->query->select->total++;
@ -632,7 +623,7 @@ class MySQL {
return $query->rowCount();
}
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {
public function addHostPageDom(int $hostPageId, string $selector, string $value, int $timeAdded) {
$this->_debug->query->insert->total++;

File diff suppressed because it is too large Load Diff