mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-09-04 10:11:57 +00:00
replace simple_html_dom library with Symfony\Component\DomCrawler
This commit is contained in:
parent
caa0df67ee
commit
70db9620ec
@ -278,7 +278,6 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
|
|||||||
#### License
|
#### License
|
||||||
|
|
||||||
* Engine sources [MIT License](https://github.com/YGGverse/YGGo/blob/main/LICENSE)
|
* Engine sources [MIT License](https://github.com/YGGverse/YGGo/blob/main/LICENSE)
|
||||||
* HTML parser [simple_html_dom](http://sourceforge.net/projects/simplehtmldom/)
|
|
||||||
* Home page animation by [alvarotrigo](https://codepen.io/alvarotrigo/pen/GRvYNax)
|
* Home page animation by [alvarotrigo](https://codepen.io/alvarotrigo/pen/GRvYNax)
|
||||||
* CLI logo by [patorjk.com](https://patorjk.com/software/taag/#p=display&f=Slant&t=YGGo!)
|
* CLI logo by [patorjk.com](https://patorjk.com/software/taag/#p=display&f=Slant&t=YGGo!)
|
||||||
|
|
||||||
|
@ -5,7 +5,9 @@
|
|||||||
"require": {
|
"require": {
|
||||||
"php": ">=8.1",
|
"php": ">=8.1",
|
||||||
"yggverse/parser": ">=0.1.0",
|
"yggverse/parser": ">=0.1.0",
|
||||||
"yggverse/cache": ">=0.3.0"
|
"yggverse/cache": ">=0.3.0",
|
||||||
|
"symfony/dom-crawler": "^6.3",
|
||||||
|
"symfony/css-selector": "^6.3"
|
||||||
},
|
},
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"autoload": {
|
"autoload": {
|
||||||
|
144
src/cli/yggo.php
144
src/cli/yggo.php
@ -6,13 +6,12 @@ require_once(__DIR__ . '/../library/cli.php');
|
|||||||
require_once(__DIR__ . '/../library/mysql.php');
|
require_once(__DIR__ . '/../library/mysql.php');
|
||||||
require_once(__DIR__ . '/../library/filter.php');
|
require_once(__DIR__ . '/../library/filter.php');
|
||||||
require_once(__DIR__ . '/../library/ftp.php');
|
require_once(__DIR__ . '/../library/ftp.php');
|
||||||
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
require_once __DIR__ . '/../../vendor/autoload.php';
|
||||||
|
|
||||||
// CLI only to prevent https server connection timeout
|
// CLI only to prevent https server connection timeout
|
||||||
if (php_sapi_name() != 'cli') {
|
if (php_sapi_name() != 'cli') {
|
||||||
|
|
||||||
CLI::danger(_('supported command line interface only'));
|
CLI::danger(_('supported command line interface only'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -22,7 +21,6 @@ $semaphore = sem_get(crc32('crontab.cleaner'), 1);
|
|||||||
if (false === sem_acquire($semaphore, true)) {
|
if (false === sem_acquire($semaphore, true)) {
|
||||||
|
|
||||||
CLI::danger(_('stop crontab.cleaner is running in another thread.'));
|
CLI::danger(_('stop crontab.cleaner is running in another thread.'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -32,7 +30,6 @@ $semaphore = sem_get(crc32('crontab.crawler'), 1);
|
|||||||
if (false === sem_acquire($semaphore, true)) {
|
if (false === sem_acquire($semaphore, true)) {
|
||||||
|
|
||||||
CLI::danger(_('stop crontab.crawler is running in another thread.'));
|
CLI::danger(_('stop crontab.crawler is running in another thread.'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,7 +39,6 @@ $semaphore = sem_get(crc32('cli.yggo'), 1);
|
|||||||
if (false === sem_acquire($semaphore, true)) {
|
if (false === sem_acquire($semaphore, true)) {
|
||||||
|
|
||||||
CLI::danger(_('process locked by another thread.'));
|
CLI::danger(_('process locked by another thread.'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -202,7 +198,6 @@ if (!empty($argv[1])) {
|
|||||||
// @TODO
|
// @TODO
|
||||||
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
|
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
|
||||||
CLI::danger(_('make sure you have backups then remove this alert.'));
|
CLI::danger(_('make sure you have backups then remove this alert.'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
switch ($argv[3]) {
|
switch ($argv[3]) {
|
||||||
@ -285,7 +280,6 @@ if (!empty($argv[1])) {
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
|
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -458,7 +452,6 @@ if (!empty($argv[1])) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
CLI::notice(_('hostPage rank fields successfully updated!'));
|
CLI::notice(_('hostPage rank fields successfully updated!'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -474,70 +467,123 @@ if (!empty($argv[1])) {
|
|||||||
|
|
||||||
if (empty($argv[2])) {
|
if (empty($argv[2])) {
|
||||||
|
|
||||||
|
CLI::danger(_('action required'));
|
||||||
|
exit;
|
||||||
|
}
|
||||||
|
|
||||||
switch ($argv[2]) {
|
switch ($argv[2]) {
|
||||||
|
|
||||||
case 'generate':
|
case 'generate':
|
||||||
|
|
||||||
$selectors = [];
|
// Validate hostId
|
||||||
|
if (empty($argv[3])) {
|
||||||
|
|
||||||
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) as $selector) {
|
CLI::danger(_('hostId required'));
|
||||||
|
exit;
|
||||||
if (!empty($selector)) {
|
|
||||||
|
|
||||||
$selectors[] = trim($selector);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($selectors) {
|
if (!$db->getHost($argv[3])) {
|
||||||
|
|
||||||
|
CLI::danger(_('hostId not found'));
|
||||||
|
exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate selector
|
||||||
|
if (empty($argv[4])) {
|
||||||
|
|
||||||
|
CLI::danger(_('CSS selector required'));
|
||||||
|
exit;
|
||||||
|
}
|
||||||
|
|
||||||
// Init variables
|
// Init variables
|
||||||
$hostPagesProcessedTotal = 0;
|
$hostPagesProcessedTotal = 0;
|
||||||
$hostPageDOMAddedTotal = 0;
|
$hostPagesSkippedTotal = 0;
|
||||||
|
$hostPageDomAddedTotal = 0;
|
||||||
|
|
||||||
// Begin selectors extraction
|
// Begin selectors extraction
|
||||||
foreach ($db->getHostPagesByIndexed() as $hostPage) {
|
foreach ($db->getHostPages($argv[3]) as $hostPage) {
|
||||||
|
|
||||||
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
|
|
||||||
|
|
||||||
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
|
|
||||||
|
|
||||||
$hostPagesProcessedTotal++;
|
$hostPagesProcessedTotal++;
|
||||||
|
|
||||||
if (!empty($hostPageDescription->data)) {
|
if (false === stripos(Filter::mime($hostPage->mime), 'text/html')) {
|
||||||
|
|
||||||
$html = str_get_html(base64_decode($hostPageDescription->data));
|
CLI::warning(sprintf(_('not supported MIME type for hostPageId "%s", skipped'), $hostPage->hostPageId));
|
||||||
|
|
||||||
foreach ($selectors as $selector) {
|
$hostPagesSkippedTotal++;
|
||||||
|
|
||||||
foreach($html->find($selector) as $element) {
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (!empty($element->innertext)) {
|
if (!$hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
|
||||||
|
|
||||||
$hostPageDOMAddedTotal++;
|
CLI::warning(sprintf(_('last hostPageId "%s" description empty, skipped'), $hostPage->hostPageId));
|
||||||
|
|
||||||
$db->addHostPageDom($hostPage->hostPageId,
|
$hostPagesSkippedTotal++;
|
||||||
time(),
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (empty($hostPageDescription->data)) {
|
||||||
|
|
||||||
|
CLI::warning(sprintf(_('empty hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
|
||||||
|
|
||||||
|
$hostPagesSkippedTotal++;
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!$html = base64_decode($hostPageDescription->data)) {
|
||||||
|
|
||||||
|
CLI::danger(sprintf(_('could not decode base64 for hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
|
||||||
|
|
||||||
|
$hostPagesSkippedTotal++;
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (empty($html)) {
|
||||||
|
|
||||||
|
CLI::warning(sprintf(_('empty decoded hostPageDescription.data value for hostPageId "%s", skipped'), $hostPage->hostPageId));
|
||||||
|
|
||||||
|
$hostPagesSkippedTotal++;
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init crawler
|
||||||
|
$crawler = new Symfony\Component\DomCrawler\Crawler();
|
||||||
|
$crawler->addHtmlContent($html);
|
||||||
|
|
||||||
|
$selector = trim($argv[4]);
|
||||||
|
|
||||||
|
if ($elements = $crawler->filter($selector)) {
|
||||||
|
|
||||||
|
foreach ($elements as $element) {
|
||||||
|
|
||||||
|
$value = trim($element->nodeValue);
|
||||||
|
$value = strip_tags($value, empty($argv[5]) ? null : $argv[5]);
|
||||||
|
|
||||||
|
if (empty($value)) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save selector value
|
||||||
|
$db->addHostPageDom(
|
||||||
|
$hostPage->hostPageId,
|
||||||
$selector,
|
$selector,
|
||||||
trim((bool) $db->getHostSetting($hostPage->hostId, 'PAGES_DOM_STRIP_TAGS', DEFAULT_HOST_PAGES_DOM_STRIP_TAGS) ? strip_tags(preg_replace('/[\s]+/',
|
$value,
|
||||||
' ',
|
time()
|
||||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
);
|
||||||
[' ', ' ', ' ', ' </'],
|
|
||||||
$element->innertext))) : $element->innertext));
|
$hostPageDomAddedTotal++;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
|
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
|
||||||
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
|
CLI::success(sprintf(_('Host pages skipped: %s'), $hostPagesSkippedTotal));
|
||||||
exit;
|
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDomAddedTotal));
|
||||||
}
|
|
||||||
|
|
||||||
CLI::danger(_('DEFAULT_HOST_PAGES_DOM_SELECTORS not provided in the configuration file'));
|
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -546,11 +592,13 @@ if (!empty($argv[1])) {
|
|||||||
$db->truncateHostPageDom();
|
$db->truncateHostPageDom();
|
||||||
|
|
||||||
CLI::success(_('hostPageDom table successfully truncated'));
|
CLI::success(_('hostPageDom table successfully truncated'));
|
||||||
CLI::break();
|
|
||||||
exit;
|
exit;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
default:
|
||||||
|
|
||||||
|
CLI::danger(_('unknown action'));
|
||||||
|
exit;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -590,7 +638,7 @@ CLI::default(' fs - check all storages for snap files not r
|
|||||||
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
|
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
|
||||||
CLI::break();
|
CLI::break();
|
||||||
CLI::default(' hostPageDom ');
|
CLI::default(' hostPageDom ');
|
||||||
CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field');
|
CLI::default(' generate [hostId] [selector] [allowed tags] - generate hostPageDom values based on indexed hostPage.data field');
|
||||||
CLI::default(' truncate - flush hostPageDom table');
|
CLI::default(' truncate - flush hostPageDom table');
|
||||||
|
|
||||||
CLI::break();
|
CLI::break();
|
||||||
|
@ -31,7 +31,9 @@ require_once(__DIR__ . '/../library/filter.php');
|
|||||||
require_once(__DIR__ . '/../library/mysql.php');
|
require_once(__DIR__ . '/../library/mysql.php');
|
||||||
require_once(__DIR__ . '/../library/helper.php');
|
require_once(__DIR__ . '/../library/helper.php');
|
||||||
require_once(__DIR__ . '/../library/yggstate.php');
|
require_once(__DIR__ . '/../library/yggstate.php');
|
||||||
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
|
||||||
|
// @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
|
||||||
|
// require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
||||||
|
|
||||||
require_once __DIR__ . '/../../vendor/autoload.php';
|
require_once __DIR__ . '/../../vendor/autoload.php';
|
||||||
|
|
||||||
@ -738,6 +740,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
|
|||||||
time());
|
time());
|
||||||
|
|
||||||
// Collect page DOM elements data on enabled
|
// Collect page DOM elements data on enabled
|
||||||
|
/* @TODO deprecated, use Symfony\Component\DomCrawler\Crawler instead
|
||||||
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
|
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', DEFAULT_HOST_PAGES_DOM_SELECTORS)) {
|
||||||
|
|
||||||
// Begin selectors extraction
|
// Begin selectors extraction
|
||||||
@ -761,6 +764,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// Skip page links following with meta robots:nofollow attribute
|
// Skip page links following with meta robots:nofollow attribute
|
||||||
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
||||||
|
@ -314,15 +314,6 @@ class MySQL {
|
|||||||
return $result;
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getHostPagesByIndexed() {
|
|
||||||
|
|
||||||
$this->_debug->query->select->total++;
|
|
||||||
|
|
||||||
$query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');
|
|
||||||
|
|
||||||
return $query->fetchAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getHostPagesByLimit(int $hostId, int $limit) {
|
public function getHostPagesByLimit(int $hostId, int $limit) {
|
||||||
|
|
||||||
$this->_debug->query->select->total++;
|
$this->_debug->query->select->total++;
|
||||||
@ -632,7 +623,7 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {
|
public function addHostPageDom(int $hostPageId, string $selector, string $value, int $timeAdded) {
|
||||||
|
|
||||||
$this->_debug->query->insert->total++;
|
$this->_debug->query->insert->total++;
|
||||||
|
|
||||||
|
2353
src/library/vendor/simple_html_dom.php
vendored
2353
src/library/vendor/simple_html_dom.php
vendored
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user