Browse Source

update cli

main
ghost 1 year ago
parent
commit
ef170f62f3
  1. 501
      cli/yggo.php

501
cli/yggo.php

@ -50,391 +50,384 @@ if (false === sem_acquire($semaphore, true)) {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// CLI begin // CLI begin
if (empty($argv[1])) $argv[1] = 'help'; if (!empty($argv[1])) {
switch ($argv[1]) { switch ($argv[1]) {
case 'crontab': case 'db':
if (empty($argv[2])) { if (empty($argv[2])) {
CLI::danger(_('crontab method requires action argument')); switch ($argv[2]) {
switch ($argv[2]) { case 'optimize':
case 'crawl': CLI::notice(_('optimize database tables...'));
CLI::notice(_('crawler queue step begin...')); $db->optimize();
include_once(__DIR__ . '/../crontab/crawler.php'); CLI::success(_('tables successfully optimized!'));
CLI::notice(_('crawler queue step begin...')); break;
break; }
}
case 'clean': break;
case 'crontab':
CLI::notice(_('cleaner queue step begin...')); if (empty($argv[2])) {
include_once(__DIR__ . '/../crontab/cleaner.php'); switch ($argv[2]) {
CLI::notice(_('cleaner queue step completed.')); case 'crawl':
break; CLI::notice(_('crawler queue step begin...'));
}
}
break; include_once(__DIR__ . '/../crontab/crawler.php');
case 'hostPageSnap':
if (empty($argv[2])) { CLI::notice(_('crawler queue step begin...'));
break;
CLI::danger(_('hostPageSnap method requires action argument'));
CLI::break();
exit;
}
switch ($argv[2]) { case 'clean':
case 'repair': CLI::notice(_('cleaner queue step begin...'));
// @TODO include_once(__DIR__ . '/../crontab/cleaner.php');
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
CLI::danger(_('make sure you have backups then remove this alert.'));
CLI::break();
exit;
// Normalize & cleanup DB CLI::notice(_('cleaner queue step completed.'));
CLI::notice(_('scan database registry for missed snap files...'));
foreach ($db->getHosts() as $host) { break;
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) { break;
case 'hostPageSnap':
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { if (empty($argv[2])) {
// Prepare filenames switch ($argv[2]) {
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Define variables case 'repair':
$hostPageSnapStorageFilesExists = false;
// Check file exists // @TODO
foreach (json_decode(SNAP_STORAGE) as $node => $storages) { CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
CLI::danger(_('make sure you have backups then remove this alert.'));
CLI::break();
exit;
foreach ($storages as $location => $storage) { switch ($argv[3]) {
// Generate storage id case 'db':
$crc32name = crc32(sprintf('%s.%s', $node, $location));
switch ($node) { // Normalize & cleanup DB
CLI::notice(_('scan database registry for missed snap files...'));
case 'localhost': foreach ($db->getHosts() as $host) {
// @TODO implemented, not tested foreach ($db->getHostPages($host->hostId) as $hostPage) {
$hostPageSnapFile = $storage->directory . $hostPageSnapFile;
if (file_exists($hostPageSnapFile)) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
$hostPageSnapStorageFilesExists = true; // Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { // Define variables
$hostPageSnapStorageFilesExists = false;
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { // Check file exists
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); foreach ($storages as $location => $storage) {
}
} else { // Generate storage id
$crc32name = crc32(sprintf('%s.%s', $node, $location));
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); switch ($node) {
}
}
break; case 'localhost':
case 'ftp': // @TODO implemented, not tested
$hostPageSnapFile = $storage->directory . $hostPageSnapFile;
$ftp = new Ftp(); if (file_exists($hostPageSnapFile)) {
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { $hostPageSnapStorageFilesExists = true;
if ($ftp->size($hostPageSnapFile)) { if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
$hostPageSnapStorageFilesExists = true; if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
}
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { } else {
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
} }
} else { }
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); break;
}
}
// Prevent snap deletion from registry on FTP connection lost case 'ftp':
} else {
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location)); $ftp = new Ftp();
CLI::break();
exit;
}
$ftp->close(); if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
break; if ($ftp->size($hostPageSnapFile)) {
}
}
}
// Files not exists $hostPageSnapStorageFilesExists = true;
if (!$hostPageSnapStorageFilesExists) {
// Delete snap from registry if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
try {
$db->beginTransaction(); if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) { CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
}
} else {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId); CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
} }
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId); // Prevent snap deletion from registry on FTP connection lost
$db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); } else {
CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded)); CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
CLI::break();
exit;
}
$db->commit(); $ftp->close();
} catch(Exception $e) { break;
}
}
}
$db->rollBack(); // Files not exists
if (!$hostPageSnapStorageFilesExists) {
var_dump($e); // Delete snap from registry
} try {
}
}
}
}
// Cleanup FS $db->beginTransaction();
CLI::notice(_('scan storage for snap files missed in the DB...'));
// Copy files to each storage foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $location => $storage) { $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
// Generate storage id $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$crc32name = crc32(sprintf('%s.%s', $node, $location)); $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
switch ($node) { CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));
case 'localhost': $db->commit();
// @TODO } catch(Exception $e) {
break; $db->rollBack();
case 'ftp': var_dump($e);
}
}
}
}
}
$ftp = new Ftp(); break;
case 'fs':
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { // Cleanup FS
CLI::notice(_('scan storage for snap files missed in the DB...'));
foreach ($ftp->nlistr($storage->directory) as $filename) { // Copy files to each storage
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) { foreach ($storages as $location => $storage) {
if (!empty($matches[1])) { // hostPageSnapId // Generate storage id
$crc32name = crc32(sprintf('%s.%s', $node, $location));
if (!$db->getHostPageSnap($matches[1])) { switch ($node) {
if ($ftp->delete($filename)) { case 'localhost':
CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); // @TODO
} else { break;
CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); case 'ftp':
}
} else { $ftp = new Ftp();
CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location)); if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
}
}
}
}
}
$ftp->close(); foreach ($ftp->nlistr($storage->directory) as $filename) {
break; if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
}
}
}
CLI::success(_('missed snap files successfully deleted!')); if (!empty($matches[1])) { // hostPageSnapId
// Optimize DB tables if (!$db->getHostPageSnap($matches[1])) {
CLI::notice(_('optimize database tables...'));
$db->optimize(); if ($ftp->delete($filename)) {
CLI::success(_('tables successfully optimized!')); CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
break; } else {
default:
CLI::danger(_('undefined action argument!')); CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
} }
break; } else {
case 'hostPage':
if (empty($argv[2])) { CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
}
}
}
}
}
CLI::danger(_('hostPage method requires action argument')); $ftp->close();
}
switch ($argv[2]) { break;
}
}
}
case 'rank': CLI::success(_('missed snap files successfully deleted!'));
break;
}
break;
case 'reindex':
if (empty($argv[3])) { //@TODO
CLI::danger(_('hostPage rank requires action argument')); break;
} }
}
switch ($argv[3]) { break;
case 'hostPage':
case 'reindex':
foreach ($db->getHosts() as $host) { switch ($argv[2]) {
foreach ($db->getHostPages($host->hostId) as $hostPage) { case 'rank':
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover if (empty($argv[3])) {
}
}
CLI::success(_('hostPage rank successfully updated')); switch ($argv[3]) {
exit;
break; case 'reindex':
default:
CLI::danger(_('undefined action argument')); foreach ($db->getHosts() as $host) {
}
break; foreach ($db->getHostPages($host->hostId) as $hostPage) {
case 'truncate':
$db->truncateHostPageDom(); $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
}
}
CLI::success(_('hostPageDom table successfully truncated')); CLI::success(_('hostPage rank successfully updated'));
exit; exit;
break; break;
default: default:
CLI::danger(_('undefined action argument')); CLI::danger(_('undefined action argument'));
} }
}
break; break;
case 'hostPageDom': }
if (empty($argv[2])) { break;
case 'hostPageDom':
CLI::danger(_('hostPageDom method requires action argument')); if (empty($argv[2])) {
}
switch ($argv[2]) { switch ($argv[2]) {
case 'generate': case 'generate':
$selectors = []; $selectors = [];
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
if (!empty($selector)) { if (!empty($selector)) {
$selectors[] = trim($selector); $selectors[] = trim($selector);
} }
} }
if ($selectors) { if ($selectors) {
// Init variables // Init variables
$hostPagesProcessedTotal = 0; $hostPagesProcessedTotal = 0;
$hostPageDOMAddedTotal = 0; $hostPageDOMAddedTotal = 0;
// Begin selectors extraction // Begin selectors extraction
foreach ($db->getHostPagesByIndexed() as $hostPage) { foreach ($db->getHostPagesByIndexed() as $hostPage) {
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) { if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) { if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
$hostPagesProcessedTotal++; $hostPagesProcessedTotal++;
if (!empty($hostPageDescription->data)) { if (!empty($hostPageDescription->data)) {
$html = str_get_html(base64_decode($hostPageDescription->data)); $html = str_get_html(base64_decode($hostPageDescription->data));
foreach ($selectors as $selector) { foreach ($selectors as $selector) {
foreach($html->find($selector) as $element) { foreach($html->find($selector) as $element) {
if (!empty($element->innertext)) { if (!empty($element->innertext)) {
$hostPageDOMAddedTotal++; $hostPageDOMAddedTotal++;
$db->addHostPageDom($hostPage->hostPageId, $db->addHostPageDom($hostPage->hostPageId,
time(), time(),
$selector, $selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/', preg_replace('/[\s]+/',
' ', ' ',
str_replace(['<br />', '<br/>', '<br>', '</'], str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'], [' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext)); $element->innertext))) : $element->innertext));
}
}
} }
} }
} }
} }
} }
}
}
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
exit;
}
CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
exit;
break; CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
case 'truncate': CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
exit;
}
$db->truncateHostPageDom(); CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
exit;
CLI::success(_('hostPageDom table successfully truncated')); break;
exit; case 'truncate':
break; $db->truncateHostPageDom();
default:
CLI::danger(_('undefined action argument')); CLI::success(_('hostPageDom table successfully truncated'));
} exit;
break; break;
}
}
break;
}
} }
// Default message // Default message
@ -446,13 +439,29 @@ CLI::default('/_/\____/\____/\____(_)' );
CLI::break(); CLI::break();
CLI::default('available options:'); CLI::default('available options:');
CLI::break();
CLI::default(' help - this message');
CLI::break();
CLI::default(' db ');
CLI::default(' optimize - optimize all tables');
CLI::break();
CLI::default(' crontab ');
CLI::default(' crawl - execute step in crawler queue');
CLI::default(' clean - execute step in cleaner queue');
CLI::break();
CLI::default(' hostPage ');
CLI::default(' rank - generate hostPage.rank fields');
CLI::break();
CLI::default(' hostPageSnap ');
CLI::default(' repair ');
CLI::default(' db - scan database registry for new or deprecated snap files');
CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
CLI::break();
CLI::default(' hostPageDom ');
CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field');
CLI::default(' truncate - flush hostPageDom table');
CLI::default(' help - this message');
CLI::default(' crontab [crawl|clean] - execute crontab script queue');
CLI::default(' hostPage rank reindex - generate rank indexes in hostPage table');
CLI::default(' hostPageSnap repair - sync DB/FS relations');
CLI::default(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field');
CLI::default(' hostPageDom truncate - flush hostPageDom table');
CLI::break(); CLI::break();
CLI::default('get support: https://github.com/YGGverse/YGGo/issues'); CLI::default('get support: https://github.com/YGGverse/YGGo/issues');

Loading…
Cancel
Save