mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-03-12 05:11:19 +00:00
update cli
This commit is contained in:
parent
43776b5ff4
commit
ef170f62f3
683
cli/yggo.php
683
cli/yggo.php
@ -50,391 +50,384 @@ if (false === sem_acquire($semaphore, true)) {
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
|
||||
// CLI begin
|
||||
if (empty($argv[1])) $argv[1] = 'help';
|
||||
if (!empty($argv[1])) {
|
||||
|
||||
switch ($argv[1]) {
|
||||
switch ($argv[1]) {
|
||||
|
||||
case 'crontab':
|
||||
case 'db':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
if (empty($argv[2])) {
|
||||
|
||||
CLI::danger(_('crontab method requires action argument'));
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'optimize':
|
||||
|
||||
CLI::notice(_('optimize database tables...'));
|
||||
|
||||
$db->optimize();
|
||||
|
||||
CLI::success(_('tables successfully optimized!'));
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 'crontab':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'crawl':
|
||||
|
||||
CLI::notice(_('crawler queue step begin...'));
|
||||
|
||||
include_once(__DIR__ . '/../crontab/crawler.php');
|
||||
|
||||
CLI::notice(_('crawler queue step begin...'));
|
||||
break;
|
||||
|
||||
case 'clean':
|
||||
|
||||
CLI::notice(_('cleaner queue step begin...'));
|
||||
|
||||
include_once(__DIR__ . '/../crontab/cleaner.php');
|
||||
|
||||
CLI::notice(_('cleaner queue step completed.'));
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 'hostPageSnap':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'repair':
|
||||
|
||||
// @TODO
|
||||
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
|
||||
CLI::danger(_('make sure you have backups then remove this alert.'));
|
||||
CLI::break();
|
||||
exit;
|
||||
|
||||
switch ($argv[3]) {
|
||||
|
||||
case 'db':
|
||||
|
||||
// Normalize & cleanup DB
|
||||
CLI::notice(_('scan database registry for missed snap files...'));
|
||||
|
||||
foreach ($db->getHosts() as $host) {
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
|
||||
|
||||
// Prepare filenames
|
||||
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
|
||||
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
|
||||
|
||||
// Define variables
|
||||
$hostPageSnapStorageFilesExists = false;
|
||||
|
||||
// Check file exists
|
||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
||||
|
||||
foreach ($storages as $location => $storage) {
|
||||
|
||||
// Generate storage id
|
||||
$crc32name = crc32(sprintf('%s.%s', $node, $location));
|
||||
|
||||
switch ($node) {
|
||||
|
||||
case 'localhost':
|
||||
|
||||
// @TODO implemented, not tested
|
||||
$hostPageSnapFile = $storage->directory . $hostPageSnapFile;
|
||||
|
||||
if (file_exists($hostPageSnapFile)) {
|
||||
|
||||
$hostPageSnapStorageFilesExists = true;
|
||||
|
||||
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
|
||||
|
||||
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
|
||||
|
||||
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case 'ftp':
|
||||
|
||||
$ftp = new Ftp();
|
||||
|
||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
||||
|
||||
if ($ftp->size($hostPageSnapFile)) {
|
||||
|
||||
$hostPageSnapStorageFilesExists = true;
|
||||
|
||||
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
|
||||
|
||||
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
|
||||
|
||||
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
}
|
||||
|
||||
// Prevent snap deletion from registry on FTP connection lost
|
||||
} else {
|
||||
|
||||
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
|
||||
CLI::break();
|
||||
exit;
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Files not exists
|
||||
if (!$hostPageSnapStorageFilesExists) {
|
||||
|
||||
// Delete snap from registry
|
||||
try {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
|
||||
|
||||
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
|
||||
}
|
||||
|
||||
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
|
||||
$db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
|
||||
|
||||
CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));
|
||||
|
||||
$db->commit();
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
$db->rollBack();
|
||||
|
||||
var_dump($e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 'fs':
|
||||
|
||||
// Cleanup FS
|
||||
CLI::notice(_('scan storage for snap files missed in the DB...'));
|
||||
|
||||
// Copy files to each storage
|
||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
||||
|
||||
foreach ($storages as $location => $storage) {
|
||||
|
||||
// Generate storage id
|
||||
$crc32name = crc32(sprintf('%s.%s', $node, $location));
|
||||
|
||||
switch ($node) {
|
||||
|
||||
case 'localhost':
|
||||
|
||||
// @TODO
|
||||
|
||||
break;
|
||||
|
||||
case 'ftp':
|
||||
|
||||
$ftp = new Ftp();
|
||||
|
||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
||||
|
||||
foreach ($ftp->nlistr($storage->directory) as $filename) {
|
||||
|
||||
if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
|
||||
|
||||
if (!empty($matches[1])) { // hostPageSnapId
|
||||
|
||||
if (!$db->getHostPageSnap($matches[1])) {
|
||||
|
||||
if ($ftp->delete($filename)) {
|
||||
|
||||
CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
|
||||
|
||||
} else {
|
||||
|
||||
CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLI::success(_('missed snap files successfully deleted!'));
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 'reindex':
|
||||
|
||||
//@TODO
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 'hostPage':
|
||||
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'crawl':
|
||||
case 'rank':
|
||||
|
||||
CLI::notice(_('crawler queue step begin...'));
|
||||
if (empty($argv[3])) {
|
||||
|
||||
include_once(__DIR__ . '/../crontab/crawler.php');
|
||||
switch ($argv[3]) {
|
||||
|
||||
CLI::notice(_('crawler queue step begin...'));
|
||||
break;
|
||||
case 'reindex':
|
||||
|
||||
case 'clean':
|
||||
foreach ($db->getHosts() as $host) {
|
||||
|
||||
CLI::notice(_('cleaner queue step begin...'));
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
include_once(__DIR__ . '/../crontab/cleaner.php');
|
||||
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
|
||||
}
|
||||
}
|
||||
|
||||
CLI::notice(_('cleaner queue step completed.'));
|
||||
CLI::success(_('hostPage rank successfully updated'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
CLI::danger(_('undefined action argument'));
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 'hostPageSnap':
|
||||
break;
|
||||
case 'hostPageDom':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
if (empty($argv[2])) {
|
||||
|
||||
CLI::danger(_('hostPageSnap method requires action argument'));
|
||||
CLI::break();
|
||||
exit;
|
||||
}
|
||||
switch ($argv[2]) {
|
||||
|
||||
switch ($argv[2]) {
|
||||
case 'generate':
|
||||
|
||||
case 'repair':
|
||||
$selectors = [];
|
||||
|
||||
// @TODO
|
||||
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
|
||||
CLI::danger(_('make sure you have backups then remove this alert.'));
|
||||
CLI::break();
|
||||
exit;
|
||||
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
||||
|
||||
// Normalize & cleanup DB
|
||||
CLI::notice(_('scan database registry for missed snap files...'));
|
||||
if (!empty($selector)) {
|
||||
|
||||
foreach ($db->getHosts() as $host) {
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
|
||||
|
||||
// Prepare filenames
|
||||
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
|
||||
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
|
||||
|
||||
// Define variables
|
||||
$hostPageSnapStorageFilesExists = false;
|
||||
|
||||
// Check file exists
|
||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
||||
|
||||
foreach ($storages as $location => $storage) {
|
||||
|
||||
// Generate storage id
|
||||
$crc32name = crc32(sprintf('%s.%s', $node, $location));
|
||||
|
||||
switch ($node) {
|
||||
|
||||
case 'localhost':
|
||||
|
||||
// @TODO implemented, not tested
|
||||
$hostPageSnapFile = $storage->directory . $hostPageSnapFile;
|
||||
|
||||
if (file_exists($hostPageSnapFile)) {
|
||||
|
||||
$hostPageSnapStorageFilesExists = true;
|
||||
|
||||
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
|
||||
|
||||
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
|
||||
|
||||
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case 'ftp':
|
||||
|
||||
$ftp = new Ftp();
|
||||
|
||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
||||
|
||||
if ($ftp->size($hostPageSnapFile)) {
|
||||
|
||||
$hostPageSnapStorageFilesExists = true;
|
||||
|
||||
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
|
||||
|
||||
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
|
||||
|
||||
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
|
||||
}
|
||||
}
|
||||
|
||||
// Prevent snap deletion from registry on FTP connection lost
|
||||
} else {
|
||||
|
||||
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
|
||||
CLI::break();
|
||||
exit;
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Files not exists
|
||||
if (!$hostPageSnapStorageFilesExists) {
|
||||
|
||||
// Delete snap from registry
|
||||
try {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
|
||||
|
||||
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
|
||||
}
|
||||
|
||||
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
|
||||
$db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
|
||||
|
||||
CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));
|
||||
|
||||
$db->commit();
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
$db->rollBack();
|
||||
|
||||
var_dump($e);
|
||||
}
|
||||
$selectors[] = trim($selector);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup FS
|
||||
CLI::notice(_('scan storage for snap files missed in the DB...'));
|
||||
if ($selectors) {
|
||||
|
||||
// Copy files to each storage
|
||||
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
|
||||
// Init variables
|
||||
$hostPagesProcessedTotal = 0;
|
||||
$hostPageDOMAddedTotal = 0;
|
||||
|
||||
foreach ($storages as $location => $storage) {
|
||||
// Begin selectors extraction
|
||||
foreach ($db->getHostPagesByIndexed() as $hostPage) {
|
||||
|
||||
// Generate storage id
|
||||
$crc32name = crc32(sprintf('%s.%s', $node, $location));
|
||||
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
|
||||
|
||||
switch ($node) {
|
||||
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
|
||||
|
||||
case 'localhost':
|
||||
$hostPagesProcessedTotal++;
|
||||
|
||||
// @TODO
|
||||
if (!empty($hostPageDescription->data)) {
|
||||
|
||||
break;
|
||||
$html = str_get_html(base64_decode($hostPageDescription->data));
|
||||
|
||||
case 'ftp':
|
||||
foreach ($selectors as $selector) {
|
||||
|
||||
$ftp = new Ftp();
|
||||
foreach($html->find($selector) as $element) {
|
||||
|
||||
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
|
||||
if (!empty($element->innertext)) {
|
||||
|
||||
foreach ($ftp->nlistr($storage->directory) as $filename) {
|
||||
$hostPageDOMAddedTotal++;
|
||||
|
||||
if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
|
||||
|
||||
if (!empty($matches[1])) { // hostPageSnapId
|
||||
|
||||
if (!$db->getHostPageSnap($matches[1])) {
|
||||
|
||||
if ($ftp->delete($filename)) {
|
||||
|
||||
CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
|
||||
|
||||
} else {
|
||||
|
||||
CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
|
||||
$db->addHostPageDom($hostPage->hostPageId,
|
||||
time(),
|
||||
$selector,
|
||||
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
||||
preg_replace('/[\s]+/',
|
||||
' ',
|
||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
||||
[' ', ' ', ' ', ' </'],
|
||||
$element->innertext))) : $element->innertext));
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLI::success(_('missed snap files successfully deleted!'));
|
||||
|
||||
// Optimize DB tables
|
||||
CLI::notice(_('optimize database tables...'));
|
||||
|
||||
$db->optimize();
|
||||
|
||||
CLI::success(_('tables successfully optimized!'));
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
CLI::danger(_('undefined action argument!'));
|
||||
}
|
||||
|
||||
break;
|
||||
case 'hostPage':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
|
||||
CLI::danger(_('hostPage method requires action argument'));
|
||||
}
|
||||
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'rank':
|
||||
|
||||
if (empty($argv[3])) {
|
||||
|
||||
CLI::danger(_('hostPage rank requires action argument'));
|
||||
}
|
||||
|
||||
switch ($argv[3]) {
|
||||
|
||||
case 'reindex':
|
||||
|
||||
foreach ($db->getHosts() as $host) {
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
|
||||
}
|
||||
|
||||
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
|
||||
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
|
||||
exit;
|
||||
}
|
||||
|
||||
CLI::success(_('hostPage rank successfully updated'));
|
||||
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
default:
|
||||
case 'truncate':
|
||||
|
||||
CLI::danger(_('undefined action argument'));
|
||||
$db->truncateHostPageDom();
|
||||
|
||||
CLI::success(_('hostPageDom table successfully truncated'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
case 'truncate':
|
||||
|
||||
$db->truncateHostPageDom();
|
||||
|
||||
CLI::success(_('hostPageDom table successfully truncated'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
CLI::danger(_('undefined action argument'));
|
||||
}
|
||||
|
||||
break;
|
||||
case 'hostPageDom':
|
||||
|
||||
if (empty($argv[2])) {
|
||||
|
||||
CLI::danger(_('hostPageDom method requires action argument'));
|
||||
}
|
||||
|
||||
switch ($argv[2]) {
|
||||
|
||||
case 'generate':
|
||||
|
||||
$selectors = [];
|
||||
|
||||
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
||||
|
||||
if (!empty($selector)) {
|
||||
|
||||
$selectors[] = trim($selector);
|
||||
}
|
||||
}
|
||||
|
||||
if ($selectors) {
|
||||
|
||||
// Init variables
|
||||
$hostPagesProcessedTotal = 0;
|
||||
$hostPageDOMAddedTotal = 0;
|
||||
|
||||
// Begin selectors extraction
|
||||
foreach ($db->getHostPagesByIndexed() as $hostPage) {
|
||||
|
||||
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
|
||||
|
||||
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
|
||||
|
||||
$hostPagesProcessedTotal++;
|
||||
|
||||
if (!empty($hostPageDescription->data)) {
|
||||
|
||||
$html = str_get_html(base64_decode($hostPageDescription->data));
|
||||
|
||||
foreach ($selectors as $selector) {
|
||||
|
||||
foreach($html->find($selector) as $element) {
|
||||
|
||||
if (!empty($element->innertext)) {
|
||||
|
||||
$hostPageDOMAddedTotal++;
|
||||
|
||||
$db->addHostPageDom($hostPage->hostPageId,
|
||||
time(),
|
||||
$selector,
|
||||
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
||||
preg_replace('/[\s]+/',
|
||||
' ',
|
||||
str_replace(['<br />', '<br/>', '<br>', '</'],
|
||||
[' ', ' ', ' ', ' </'],
|
||||
$element->innertext))) : $element->innertext));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
|
||||
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
|
||||
exit;
|
||||
}
|
||||
|
||||
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
case 'truncate':
|
||||
|
||||
$db->truncateHostPageDom();
|
||||
|
||||
CLI::success(_('hostPageDom table successfully truncated'));
|
||||
exit;
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
CLI::danger(_('undefined action argument'));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Default message
|
||||
@ -446,13 +439,29 @@ CLI::default('/_/\____/\____/\____(_)' );
|
||||
|
||||
CLI::break();
|
||||
CLI::default('available options:');
|
||||
CLI::break();
|
||||
CLI::default(' help - this message');
|
||||
CLI::break();
|
||||
CLI::default(' db ');
|
||||
CLI::default(' optimize - optimize all tables');
|
||||
CLI::break();
|
||||
CLI::default(' crontab ');
|
||||
CLI::default(' crawl - execute step in crawler queue');
|
||||
CLI::default(' clean - execute step in cleaner queue');
|
||||
CLI::break();
|
||||
CLI::default(' hostPage ');
|
||||
CLI::default(' rank - generate hostPage.rank fields');
|
||||
CLI::break();
|
||||
CLI::default(' hostPageSnap ');
|
||||
CLI::default(' repair ');
|
||||
CLI::default(' db - scan database registry for new or deprecated snap files');
|
||||
CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
|
||||
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
|
||||
CLI::break();
|
||||
CLI::default(' hostPageDom ');
|
||||
CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field');
|
||||
CLI::default(' truncate - flush hostPageDom table');
|
||||
|
||||
CLI::default(' help - this message');
|
||||
CLI::default(' crontab [crawl|clean] - execute crontab script queue');
|
||||
CLI::default(' hostPage rank reindex - generate rank indexes in hostPage table');
|
||||
CLI::default(' hostPageSnap repair - sync DB/FS relations');
|
||||
CLI::default(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field');
|
||||
CLI::default(' hostPageDom truncate - flush hostPageDom table');
|
||||
CLI::break();
|
||||
|
||||
CLI::default('get support: https://github.com/YGGverse/YGGo/issues');
|
||||
|
Loading…
x
Reference in New Issue
Block a user