diff --git a/cli/yggo.php b/cli/yggo.php
index 457a983..00cdd3f 100644
--- a/cli/yggo.php
+++ b/cli/yggo.php
@@ -50,391 +50,384 @@ if (false === sem_acquire($semaphore, true)) {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// CLI begin
-if (empty($argv[1])) $argv[1] = 'help';
+if (!empty($argv[1])) {
-switch ($argv[1]) {
+ switch ($argv[1]) {
- case 'crontab':
+ case 'db':
- if (empty($argv[2])) {
+ if (!empty($argv[2])) { // dispatch only when an action argument was supplied; otherwise fall through to the help text
- CLI::danger(_('crontab method requires action argument'));
+ switch ($argv[2]) {
- switch ($argv[2]) {
+ case 'optimize':
- case 'crawl':
+ CLI::notice(_('optimize database tables...'));
- CLI::notice(_('crawler queue step begin...'));
+ $db->optimize();
- include_once(__DIR__ . '/../crontab/crawler.php');
+ CLI::success(_('tables successfully optimized!'));
- CLI::notice(_('crawler queue step begin...'));
- break;
+ break;
+ }
+ }
- case 'clean':
+ break;
+ case 'crontab':
- CLI::notice(_('cleaner queue step begin...'));
+ if (!empty($argv[2])) { // dispatch only when an action argument was supplied; otherwise fall through to the help text
- include_once(__DIR__ . '/../crontab/cleaner.php');
+ switch ($argv[2]) {
- CLI::notice(_('cleaner queue step completed.'));
+ case 'crawl':
- break;
- }
- }
+ CLI::notice(_('crawler queue step begin...'));
- break;
- case 'hostPageSnap':
+ include_once(__DIR__ . '/../crontab/crawler.php');
- if (empty($argv[2])) {
-
- CLI::danger(_('hostPageSnap method requires action argument'));
- CLI::break();
- exit;
- }
+ CLI::notice(_('crawler queue step completed.')); // closing notice after the crawler include, same wording pattern as the cleaner branch
+ break;
- switch ($argv[2]) {
+ case 'clean':
- case 'repair':
+ CLI::notice(_('cleaner queue step begin...'));
- // @TODO
- CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
- CLI::danger(_('make sure you have backups then remove this alert.'));
- CLI::break();
- exit;
+ include_once(__DIR__ . '/../crontab/cleaner.php');
- // Normalize & cleanup DB
- CLI::notice(_('scan database registry for missed snap files...'));
+ CLI::notice(_('cleaner queue step completed.'));
- foreach ($db->getHosts() as $host) {
+ break;
+ }
+ }
- foreach ($db->getHostPages($host->hostId) as $hostPage) {
+ break;
+ case 'hostPageSnap':
- foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
+ if (!empty($argv[2])) { // dispatch only when an action argument was supplied; otherwise fall through to the help text
- // Prepare filenames
- $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
- $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
+ switch ($argv[2]) {
- // Define variables
- $hostPageSnapStorageFilesExists = false;
+ case 'repair':
- // Check file exists
- foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
+ // @TODO
+ CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
+ CLI::danger(_('make sure you have backups then remove this alert.'));
+ CLI::break();
+ exit;
- foreach ($storages as $location => $storage) {
+ switch ($argv[3]) {
- // Generate storage id
- $crc32name = crc32(sprintf('%s.%s', $node, $location));
+ case 'db':
- switch ($node) {
+ // Normalize & cleanup DB
+ CLI::notice(_('scan database registry for missed snap files...'));
- case 'localhost':
+ foreach ($db->getHosts() as $host) {
- // @TODO implemented, not tested
- $hostPageSnapFile = $storage->directory . $hostPageSnapFile;
+ foreach ($db->getHostPages($host->hostId) as $hostPage) {
- if (file_exists($hostPageSnapFile)) {
+ foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
- $hostPageSnapStorageFilesExists = true;
+ // Prepare filenames
+ $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
+ $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
- if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
+ // Define variables
+ $hostPageSnapStorageFilesExists = false;
- if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
+ // Check file exists
+ foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
- CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
- }
+ foreach ($storages as $location => $storage) {
- } else {
+ // Generate storage id
+ $crc32name = crc32(sprintf('%s.%s', $node, $location));
- CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
- }
- }
+ switch ($node) {
- break;
+ case 'localhost':
- case 'ftp':
+ // @TODO implemented, not tested
+ $hostPageSnapFile = $storage->directory . $hostPageSnapFile;
- $ftp = new Ftp();
+ if (file_exists($hostPageSnapFile)) {
- if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
+ $hostPageSnapStorageFilesExists = true;
- if ($ftp->size($hostPageSnapFile)) {
+ if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
- $hostPageSnapStorageFilesExists = true;
+ if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
- if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
+ CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
+ }
- if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
+ } else {
- CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
- }
- } else {
+ CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
+ }
+ }
- CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
- }
- }
+ break;
- // Prevent snap deletion from registry on FTP connection lost
- } else {
+ case 'ftp':
- CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location));
- CLI::break();
- exit;
- }
+ $ftp = new Ftp();
- $ftp->close();
+ if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
- break;
- }
- }
- }
+ if ($ftp->size($hostPageSnapFile)) {
- // Files not exists
- if (!$hostPageSnapStorageFilesExists) {
+ $hostPageSnapStorageFilesExists = true;
- // Delete snap from registry
- try {
+ if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
- $db->beginTransaction();
+ if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
- foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
+ CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
+ }
+ } else {
- $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
- }
+ CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
+ }
+ }
- $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
- $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
+ // Prevent snap deletion from registry on FTP connection lost
+ } else {
- CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));
+ CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $node, $location)); // $node is the storage key of the enclosing SNAP_STORAGE loop
+ CLI::break();
+ exit;
+ }
- $db->commit();
+ $ftp->close();
- } catch(Exception $e) {
+ break;
+ }
+ }
+ }
- $db->rollBack();
+ // Files not exists
+ if (!$hostPageSnapStorageFilesExists) {
- var_dump($e);
- }
- }
- }
- }
- }
+ // Delete snap from registry
+ try {
- // Cleanup FS
- CLI::notice(_('scan storage for snap files missed in the DB...'));
+ $db->beginTransaction();
- // Copy files to each storage
- foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
+ foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
- foreach ($storages as $location => $storage) {
+ $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
+ }
- // Generate storage id
- $crc32name = crc32(sprintf('%s.%s', $node, $location));
+ $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
+ $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
- switch ($node) {
+ CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));
- case 'localhost':
+ $db->commit();
- // @TODO
+ } catch(Exception $e) {
- break;
+ $db->rollBack();
- case 'ftp':
+ var_dump($e);
+ }
+ }
+ }
+ }
+ }
- $ftp = new Ftp();
+ break;
+ case 'fs':
- if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
+ // Cleanup FS
+ CLI::notice(_('scan storage for snap files missed in the DB...'));
- foreach ($ftp->nlistr($storage->directory) as $filename) {
+ // Copy files to each storage
+ foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
- if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
+ foreach ($storages as $location => $storage) {
- if (!empty($matches[1])) { // hostPageSnapId
+ // Generate storage id
+ $crc32name = crc32(sprintf('%s.%s', $node, $location));
- if (!$db->getHostPageSnap($matches[1])) {
+ switch ($node) {
- if ($ftp->delete($filename)) {
+ case 'localhost':
- CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
+ // @TODO
- } else {
+ break;
- CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
- }
+ case 'ftp':
- } else {
+ $ftp = new Ftp();
- CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
- }
- }
- }
- }
- }
+ if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
- $ftp->close();
+ foreach ($ftp->nlistr($storage->directory) as $filename) {
- break;
- }
- }
- }
+ if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
- CLI::success(_('missed snap files successfully deleted!'));
+ if (!empty($matches[1])) { // hostPageSnapId
- // Optimize DB tables
- CLI::notice(_('optimize database tables...'));
+ if (!$db->getHostPageSnap($matches[1])) {
- $db->optimize();
+ if ($ftp->delete($filename)) {
- CLI::success(_('tables successfully optimized!'));
+ CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
- break;
- default:
+ } else {
- CLI::danger(_('undefined action argument!'));
- }
+ CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
+ }
- break;
- case 'hostPage':
+ } else {
- if (empty($argv[2])) {
+ CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
+ }
+ }
+ }
+ }
+ }
- CLI::danger(_('hostPage method requires action argument'));
- }
+ $ftp->close();
- switch ($argv[2]) {
+ break;
+ }
+ }
+ }
- case 'rank':
+ CLI::success(_('missed snap files successfully deleted!'));
+ break;
+ }
+ break;
+ case 'reindex':
- if (empty($argv[3])) {
+ //@TODO
- CLI::danger(_('hostPage rank requires action argument'));
+ break;
}
+ }
- switch ($argv[3]) {
-
- case 'reindex':
+ break;
+ case 'hostPage':
- foreach ($db->getHosts() as $host) {
+ switch ($argv[2]) {
- foreach ($db->getHostPages($host->hostId) as $hostPage) {
+ case 'rank':
- $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
- }
- }
+ if (!empty($argv[3])) { // dispatch only when a rank action argument was supplied; otherwise fall through to the help text
- CLI::success(_('hostPage rank successfully updated'));
- exit;
+ switch ($argv[3]) {
- break;
- default:
+ case 'reindex':
- CLI::danger(_('undefined action argument'));
- }
+ foreach ($db->getHosts() as $host) {
- break;
- case 'truncate':
+ foreach ($db->getHostPages($host->hostId) as $hostPage) {
- $db->truncateHostPageDom();
+ $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
+ }
+ }
- CLI::success(_('hostPageDom table successfully truncated'));
- exit;
+ CLI::success(_('hostPage rank successfully updated'));
+ exit;
- break;
- default:
+ break;
+ default:
- CLI::danger(_('undefined action argument'));
- }
+ CLI::danger(_('undefined action argument'));
+ }
+ }
- break;
- case 'hostPageDom':
+ break;
+ }
- if (empty($argv[2])) {
+ break;
+ case 'hostPageDom':
- CLI::danger(_('hostPageDom method requires action argument'));
- }
+ if (!empty($argv[2])) { // dispatch only when an action argument was supplied; otherwise fall through to the help text
- switch ($argv[2]) {
+ switch ($argv[2]) {
- case 'generate':
+ case 'generate':
- $selectors = [];
+ $selectors = [];
- foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
+ foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
- if (!empty($selector)) {
+ if (!empty($selector)) {
- $selectors[] = trim($selector);
- }
- }
+ $selectors[] = trim($selector);
+ }
+ }
- if ($selectors) {
+ if ($selectors) {
- // Init variables
- $hostPagesProcessedTotal = 0;
- $hostPageDOMAddedTotal = 0;
+ // Init variables
+ $hostPagesProcessedTotal = 0;
+ $hostPageDOMAddedTotal = 0;
- // Begin selectors extraction
- foreach ($db->getHostPagesByIndexed() as $hostPage) {
+ // Begin selectors extraction
+ foreach ($db->getHostPagesByIndexed() as $hostPage) {
- if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
+ if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
- if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
+ if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
- $hostPagesProcessedTotal++;
+ $hostPagesProcessedTotal++;
- if (!empty($hostPageDescription->data)) {
+ if (!empty($hostPageDescription->data)) {
- $html = str_get_html(base64_decode($hostPageDescription->data));
+ $html = str_get_html(base64_decode($hostPageDescription->data));
- foreach ($selectors as $selector) {
+ foreach ($selectors as $selector) {
- foreach($html->find($selector) as $element) {
+ foreach($html->find($selector) as $element) {
- if (!empty($element->innertext)) {
+ if (!empty($element->innertext)) {
- $hostPageDOMAddedTotal++;
+ $hostPageDOMAddedTotal++;
- $db->addHostPageDom($hostPage->hostPageId,
- time(),
- $selector,
- trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
- preg_replace('/[\s]+/',
- ' ',
- str_replace(['
', '
', '
', ''],
- [' ', ' ', ' ', ' '],
- $element->innertext))) : $element->innertext));
+ $db->addHostPageDom($hostPage->hostPageId,
+ time(),
+ $selector,
+ trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
+ preg_replace('/[\s]+/',
+ ' ',
+ str_replace(['
', '
', '
', ''],
+ [' ', ' ', ' ', ' '],
+ $element->innertext))) : $element->innertext));
+ }
+ }
}
}
}
}
}
- }
- }
-
- CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
- CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
- exit;
- }
-
- CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
- exit;
- break;
- case 'truncate':
+ CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
+ CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
+ exit;
+ }
- $db->truncateHostPageDom();
+ CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
+ exit;
- CLI::success(_('hostPageDom table successfully truncated'));
- exit;
+ break;
+ case 'truncate':
- break;
- default:
+ $db->truncateHostPageDom();
- CLI::danger(_('undefined action argument'));
- }
+ CLI::success(_('hostPageDom table successfully truncated'));
+ exit;
- break;
+ break;
+ }
+ }
+ break;
+ }
}
// Default message
@@ -446,13 +439,29 @@ CLI::default('/_/\____/\____/\____(_)' );
CLI::break();
CLI::default('available options:');
+CLI::break();
+CLI::default(' help - this message');
+CLI::break();
+CLI::default(' db ');
+CLI::default(' optimize - optimize all tables');
+CLI::break();
+CLI::default(' crontab ');
+CLI::default(' crawl - execute step in crawler queue');
+CLI::default(' clean - execute step in cleaner queue');
+CLI::break();
+CLI::default(' hostPage ');
+CLI::default(' rank - generate hostPage.rank fields');
+CLI::break();
+CLI::default(' hostPageSnap ');
+CLI::default(' repair ');
+CLI::default(' db - scan database registry for new or deprecated snap files');
+CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem');
+CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue');
+CLI::break();
+CLI::default(' hostPageDom ');
+CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field');
+CLI::default(' truncate - flush hostPageDom table');
-CLI::default(' help - this message');
-CLI::default(' crontab [crawl|clean] - execute crontab script queue');
-CLI::default(' hostPage rank reindex - generate rank indexes in hostPage table');
-CLI::default(' hostPageSnap repair - sync DB/FS relations');
-CLI::default(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field');
-CLI::default(' hostPageDom truncate - flush hostPageDom table');
CLI::break();
CLI::default('get support: https://github.com/YGGverse/YGGo/issues');