diff --git a/cli/yggo.php b/cli/yggo.php index 457a983..00cdd3f 100644 --- a/cli/yggo.php +++ b/cli/yggo.php @@ -50,391 +50,384 @@ if (false === sem_acquire($semaphore, true)) { $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); // CLI begin -if (empty($argv[1])) $argv[1] = 'help'; +if (!empty($argv[1])) { -switch ($argv[1]) { + switch ($argv[1]) { - case 'crontab': + case 'db': - if (empty($argv[2])) { + if (empty($argv[2])) { - CLI::danger(_('crontab method requires action argument')); + switch ($argv[2]) { - switch ($argv[2]) { + case 'optimize': - case 'crawl': + CLI::notice(_('optimize database tables...')); - CLI::notice(_('crawler queue step begin...')); + $db->optimize(); - include_once(__DIR__ . '/../crontab/crawler.php'); + CLI::success(_('tables successfully optimized!')); - CLI::notice(_('crawler queue step begin...')); - break; + break; + } + } - case 'clean': + break; + case 'crontab': - CLI::notice(_('cleaner queue step begin...')); + if (empty($argv[2])) { - include_once(__DIR__ . '/../crontab/cleaner.php'); + switch ($argv[2]) { - CLI::notice(_('cleaner queue step completed.')); + case 'crawl': - break; - } - } + CLI::notice(_('crawler queue step begin...')); - break; - case 'hostPageSnap': + include_once(__DIR__ . '/../crontab/crawler.php'); - if (empty($argv[2])) { - - CLI::danger(_('hostPageSnap method requires action argument')); - CLI::break(); - exit; - } + CLI::notice(_('crawler queue step begin...')); + break; - switch ($argv[2]) { + case 'clean': - case 'repair': + CLI::notice(_('cleaner queue step begin...')); - // @TODO - CLI::danger(_('this function upgraded but not tested after snaps refactor.')); - CLI::danger(_('make sure you have backups then remove this alert.')); - CLI::break(); - exit; + include_once(__DIR__ . '/../crontab/cleaner.php'); - // Normalize & cleanup DB - CLI::notice(_('scan database registry for missed snap files...')); + CLI::notice(_('cleaner queue step completed.')); - foreach ($db->getHosts() as $host) { + break; + } + } - foreach ($db->getHostPages($host->hostId) as $hostPage) { + break; + case 'hostPageSnap': - foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { + if (empty($argv[2])) { - // Prepare filenames - $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1); - $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip'; + switch ($argv[2]) { - // Define variables - $hostPageSnapStorageFilesExists = false; + case 'repair': - // Check file exists - foreach (json_decode(SNAP_STORAGE) as $node => $storages) { + // @TODO + CLI::danger(_('this function upgraded but not tested after snaps refactor.')); + CLI::danger(_('make sure you have backups then remove this alert.')); + CLI::break(); + exit; - foreach ($storages as $location => $storage) { + switch ($argv[3]) { - // Generate storage id - $crc32name = crc32(sprintf('%s.%s', $node, $location)); + case 'db': - switch ($node) { + // Normalize & cleanup DB + CLI::notice(_('scan database registry for missed snap files...')); - case 'localhost': + foreach ($db->getHosts() as $host) { - // @TODO implemented, not tested - $hostPageSnapFile = $storage->directory . $hostPageSnapFile; + foreach ($db->getHostPages($host->hostId) as $hostPage) { - if (file_exists($hostPageSnapFile)) { + foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { - $hostPageSnapStorageFilesExists = true; + // Prepare filenames + $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1); + $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip'; - if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { + // Define variables + $hostPageSnapStorageFilesExists = false; - if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { + // Check file exists + foreach (json_decode(SNAP_STORAGE) as $node => $storages) { - CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); - } + foreach ($storages as $location => $storage) { - } else { + // Generate storage id + $crc32name = crc32(sprintf('%s.%s', $node, $location)); - CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); - } - } + switch ($node) { - break; + case 'localhost': - case 'ftp': + // @TODO implemented, not tested + $hostPageSnapFile = $storage->directory . $hostPageSnapFile; - $ftp = new Ftp(); + if (file_exists($hostPageSnapFile)) { - if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { + $hostPageSnapStorageFilesExists = true; - if ($ftp->size($hostPageSnapFile)) { + if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { - $hostPageSnapStorageFilesExists = true; + if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { - if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { + CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); + } - if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { + } else { - CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); - } - } else { + CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); + } + } - CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); - } - } + break; - // Prevent snap deletion from registry on FTP connection lost - } else { + case 'ftp': - CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location)); - CLI::break(); - exit; - } + $ftp = new Ftp(); - $ftp->close(); + if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { - break; - } - } - } + if ($ftp->size($hostPageSnapFile)) { - // Files not exists - if (!$hostPageSnapStorageFilesExists) { + $hostPageSnapStorageFilesExists = true; - // Delete snap from registry - try { + if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) { - $db->beginTransaction(); + if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) { - foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) { + CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); + } + } else { - $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId); - } + CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location)); + } + } - $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId); - $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); + // Prevent snap deletion from registry on FTP connection lost + } else { - CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded)); + CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $location)); + CLI::break(); + exit; + } - $db->commit(); + $ftp->close(); - } catch(Exception $e) { + break; + } + } + } - $db->rollBack(); + // Files not exists + if (!$hostPageSnapStorageFilesExists) { - var_dump($e); - } - } - } - } - } + // Delete snap from registry + try { - // Cleanup FS - CLI::notice(_('scan storage for snap files missed in the DB...')); + $db->beginTransaction(); - // Copy files to each storage - foreach (json_decode(SNAP_STORAGE) as $node => $storages) { + foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) { - foreach ($storages as $location => $storage) { + $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId); + } - // Generate storage id - $crc32name = crc32(sprintf('%s.%s', $node, $location)); + $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId); + $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); - switch ($node) { + CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded)); - case 'localhost': + $db->commit(); - // @TODO + } catch(Exception $e) { - break; + $db->rollBack(); - case 'ftp': + var_dump($e); + } + } + } + } + } - $ftp = new Ftp(); + break; + case 'fs': - if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { + // Cleanup FS + CLI::notice(_('scan storage for snap files missed in the DB...')); - foreach ($ftp->nlistr($storage->directory) as $filename) { + // Copy files to each storage + foreach (json_decode(SNAP_STORAGE) as $node => $storages) { - if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) { + foreach ($storages as $location => $storage) { - if (!empty($matches[1])) { // hostPageSnapId + // Generate storage id + $crc32name = crc32(sprintf('%s.%s', $node, $location)); - if (!$db->getHostPageSnap($matches[1])) { + switch ($node) { - if ($ftp->delete($filename)) { + case 'localhost': - CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); + // @TODO - } else { + break; - CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); - } + case 'ftp': - } else { + $ftp = new Ftp(); - CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location)); - } - } - } - } - } + if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { - $ftp->close(); + foreach ($ftp->nlistr($storage->directory) as $filename) { - break; - } - } - } + if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) { - CLI::success(_('missed snap files successfully deleted!')); + if (!empty($matches[1])) { // hostPageSnapId - // Optimize DB tables - CLI::notice(_('optimize database tables...')); + if (!$db->getHostPageSnap($matches[1])) { - $db->optimize(); + if ($ftp->delete($filename)) { - CLI::success(_('tables successfully optimized!')); + CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); - break; - default: + } else { - CLI::danger(_('undefined action argument!')); - } + CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location)); + } - break; - case 'hostPage': + } else { - if (empty($argv[2])) { + CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location)); + } + } + } + } + } - CLI::danger(_('hostPage method requires action argument')); - } + $ftp->close(); - switch ($argv[2]) { + break; + } + } + } - case 'rank': + CLI::success(_('missed snap files successfully deleted!')); + break; + } + break; + case 'reindex': - if (empty($argv[3])) { + //@TODO - CLI::danger(_('hostPage rank requires action argument')); + break; } + } - switch ($argv[3]) { - - case 'reindex': + break; + case 'hostPage': - foreach ($db->getHosts() as $host) { + switch ($argv[2]) { - foreach ($db->getHostPages($host->hostId) as $hostPage) { + case 'rank': - $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover - } - } + if (empty($argv[3])) { - CLI::success(_('hostPage rank successfully updated')); - exit; + switch ($argv[3]) { - break; - default: + case 'reindex': - CLI::danger(_('undefined action argument')); - } + foreach ($db->getHosts() as $host) { - break; - case 'truncate': + foreach ($db->getHostPages($host->hostId) as $hostPage) { - $db->truncateHostPageDom(); + $db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover + } + } - CLI::success(_('hostPageDom table successfully truncated')); - exit; + CLI::success(_('hostPage rank successfully updated')); + exit; - break; - default: + break; + default: - CLI::danger(_('undefined action argument')); - } + CLI::danger(_('undefined action argument')); + } + } - break; - case 'hostPageDom': + break; + } - if (empty($argv[2])) { + break; + case 'hostPageDom': - CLI::danger(_('hostPageDom method requires action argument')); - } + if (empty($argv[2])) { - switch ($argv[2]) { + switch ($argv[2]) { - case 'generate': + case 'generate': - $selectors = []; + $selectors = []; - foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { + foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) { - if (!empty($selector)) { + if (!empty($selector)) { - $selectors[] = trim($selector); - } - } + $selectors[] = trim($selector); + } + } - if ($selectors) { + if ($selectors) { - // Init variables - $hostPagesProcessedTotal = 0; - $hostPageDOMAddedTotal = 0; + // Init variables + $hostPagesProcessedTotal = 0; + $hostPageDOMAddedTotal = 0; - // Begin selectors extraction - foreach ($db->getHostPagesByIndexed() as $hostPage) { + // Begin selectors extraction + foreach ($db->getHostPagesByIndexed() as $hostPage) { - if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) { + if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) { - if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) { + if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) { - $hostPagesProcessedTotal++; + $hostPagesProcessedTotal++; - if (!empty($hostPageDescription->data)) { + if (!empty($hostPageDescription->data)) { - $html = str_get_html(base64_decode($hostPageDescription->data)); + $html = str_get_html(base64_decode($hostPageDescription->data)); - foreach ($selectors as $selector) { + foreach ($selectors as $selector) { - foreach($html->find($selector) as $element) { + foreach($html->find($selector) as $element) { - if (!empty($element->innertext)) { + if (!empty($element->innertext)) { - $hostPageDOMAddedTotal++; + $hostPageDOMAddedTotal++; - $db->addHostPageDom($hostPage->hostPageId, - time(), - $selector, - trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( - preg_replace('/[\s]+/', - ' ', - str_replace(['
', '
', '
', 'innertext))) : $element->innertext)); + $db->addHostPageDom($hostPage->hostPageId, + time(), + $selector, + trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags( + preg_replace('/[\s]+/', + ' ', + str_replace(['
', '
', '
', 'innertext))) : $element->innertext)); + } + } } } } } } - } - } - - CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal)); - CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal)); - exit; - } - - CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file')); - exit; - break; - case 'truncate': + CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal)); + CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal)); + exit; + } - $db->truncateHostPageDom(); + CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file')); + exit; - CLI::success(_('hostPageDom table successfully truncated')); - exit; + break; + case 'truncate': - break; - default: + $db->truncateHostPageDom(); - CLI::danger(_('undefined action argument')); - } + CLI::success(_('hostPageDom table successfully truncated')); + exit; - break; + break; + } + } + break; + } } // Default message @@ -446,13 +439,29 @@ CLI::default('/_/\____/\____/\____(_)' ); CLI::break(); CLI::default('available options:'); +CLI::break(); +CLI::default(' help - this message'); +CLI::break(); +CLI::default(' db '); +CLI::default(' optimize - optimize all tables'); +CLI::break(); +CLI::default(' crontab '); +CLI::default(' crawl - execute step in crawler queue'); +CLI::default(' clean - execute step in cleaner queue'); +CLI::break(); +CLI::default(' hostPage '); +CLI::default(' rank - generate hostPage.rank fields'); +CLI::break(); +CLI::default(' hostPageSnap '); +CLI::default(' repair '); +CLI::default(' db - scan database registry for new or deprecated snap files'); +CLI::default(' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem'); +CLI::default(' reindex - search for host pages without snap records, add found pages to the crawl queue'); +CLI::break(); +CLI::default(' hostPageDom '); +CLI::default(' generate [selectors] - make hostPageDom index based on related hostPage.data field'); +CLI::default(' truncate - flush hostPageDom table'); -CLI::default(' help - this message'); -CLI::default(' crontab [crawl|clean] - execute crontab script queue'); -CLI::default(' hostPage rank reindex - generate rank indexes in hostPage table'); -CLI::default(' hostPageSnap repair - sync DB/FS relations'); -CLI::default(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field'); -CLI::default(' hostPageDom truncate - flush hostPageDom table'); CLI::break(); CLI::default('get support: https://github.com/YGGverse/YGGo/issues');