YGGo/cli/yggo.php

352 lines
10 KiB
PHP
Raw Normal View History

2023-06-25 22:11:49 +03:00
<?php
// CLI only to prevent https server connection timeout
if (php_sapi_name() != 'cli') {
CLI::error(_('supported command line interface only'));
2023-06-25 22:11:49 +03:00
exit;
}
// Lock multi-thread execution
2023-06-25 22:16:05 +03:00
$semaphore = sem_get(crc32('cli.yggo'), 1);
2023-06-25 22:11:49 +03:00
if (false === sem_acquire($semaphore, true)) {
CLI::error(_('Process locked by another thread.'));
2023-06-25 22:11:49 +03:00
exit;
}
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/cli.php');
2023-06-25 22:11:49 +03:00
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/ftp.php');
2023-06-25 22:11:49 +03:00
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// CLI begin
if (empty($argv[1])) $argv[1] = 'help';
switch ($argv[1]) {
case 'crawl':
CLI::notice(_('crawler queue step begin...'));
include_once(__DIR__ . '/../crontab/crawler.php');
CLI::notice(_('crawler queue step begin...'));
break;
case 'clean':
CLI::notice(_('cleaner queue step begin...'));
include_once(__DIR__ . '/../crontab/cleaner.php');
CLI::notice(_('cleaner queue step completed.'));
break;
case 'snap':
if (empty($argv[2])) {
CLI::error(_('snap method requires action argument'));
}
switch ($argv[2]) {
case 'reindex':
// Scan for new files/storages
CLI::notice(_('scan database registry for missed snap files...'));
$hostPageSnapsTrash = [];
foreach ($db->getHosts() as $host) {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
// Define variables
$snapFileExists = false;
$snapPath = chunk_split($hostPage->hostPageId, 1, '/');
// Check file exists
$snapStorageIndex = 0;
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach ($storages as $storage) {
$snapStorageIndex++;
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
switch ($name) {
case 'localhost':
$filename = $storage->directory . $snapPath . $hostPageSnap->timeAdded . '.zip';
if (file_exists($filename)) {
$snapFileExists = true;
if (!$db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
CLI::success(sprintf(_('register new file location: %s storage: %s index: %s;'), $filename, $name, $snapStorageIndex));
}
}
}
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$filename = 'hp/' . $snapPath . $hostPageSnap->timeAdded . '.zip';
if ($ftp->size($filename)) {
$snapFileExists = true;
if (!$db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
CLI::success(sprintf(_('register new file location: %s storage: %s index: %s;'), $filename, $name, $snapStorageIndex));
}
}
}
}
$ftp->close();
break;
}
}
}
if (!$snapFileExists) {
$hostPageSnapsTrash[] = $hostPageSnap->hostPageSnapId;
CLI::warning(sprintf(_('trash snap index: #%s file: %s not found in the any of storage;'), $hostPageSnap->hostPageSnapId, $filename));
}
}
}
}
if ($hostPageSnapsTrash) {
CLI::notice(_('snap index deletion queue begin...'));
foreach ($hostPageSnapsTrash as $hostPageSnapId) {
CLI::warning(sprintf(_('delete snap index: #%s;'), $hostPageSnapId));
// Clear queued snap registry
foreach ($db->getHostPageSnapStorages($hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnapId);
$db->deleteHostPageSnap($hostPageSnapId);
}
CLI::success(_('snap index trash deletion completed!'));
}
CLI::notice(_('optimize database tables...'));
$db->optimize();
CLI::success(_('tables successfully optimized!'));
CLI::notice(_('scan storage locations for snap files not registered in the DB...'));
CLI::success(_('snap index successfully updated!'));
// Cleanup FS items on missed DB registry
// @TODO
break;
default:
CLI::error(_('undefined action argument'));
}
break;
case 'hostPage':
if (empty($argv[2])) {
CLI::error(_('hostPage method requires action argument'));
}
switch ($argv[2]) {
case 'rank':
if (empty($argv[3])) {
CLI::error(_('hostPage rank requires action argument'));
}
switch ($argv[3]) {
case 'reindex':
foreach ($db->getHosts() as $host) {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
$db->updateHostPageRank($hostPage->hostPageId, $db->getTotalExternalHostPageIdSourcesByHostPageIdTarget($hostPage->hostPageId)); // @TODO add library cover
}
}
CLI::success(_('hostPage rank successfully updated'));
exit;
break;
default:
CLI::error(_('undefined action argument'));
}
break;
case 'truncate':
$db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated'));
exit;
break;
default:
CLI::error(_('undefined action argument'));
}
break;
2023-06-25 22:11:49 +03:00
case 'hostPageDom':
if (empty($argv[2])) {
CLI::error(_('hostPageDom method requires action argument'));
2023-06-25 22:11:49 +03:00
}
switch ($argv[2]) {
case 'generate':
$selectors = [];
foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
if (!empty($selector)) {
$selectors[] = trim($selector);
}
}
if ($selectors) {
2023-06-25 22:11:49 +03:00
// Init variables
$hostPagesProcessedTotal = 0;
$hostPageDOMAddedTotal = 0;
// Begin selectors extraction
foreach ($db->getHostPagesByIndexed() as $hostPage) {
if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {
if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {
$hostPagesProcessedTotal++;
if (!empty($hostPageDescription->data)) {
$html = str_get_html(base64_decode($hostPageDescription->data));
foreach ($selectors as $selector) {
2023-06-25 22:11:49 +03:00
foreach($html->find($selector) as $element) {
if (!empty($element->innertext)) {
$hostPageDOMAddedTotal++;
$db->addHostPageDom($hostPage->hostPageId,
time(),
$selector,
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
preg_replace('/[\s]+/',
' ',
str_replace(['<br />', '<br/>', '<br>', '</'],
[' ', ' ', ' ', ' </'],
$element->innertext))) : $element->innertext));
}
}
}
}
}
}
}
CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));
2023-06-25 22:11:49 +03:00
exit;
}
CLI::error(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
2023-06-25 22:11:49 +03:00
exit;
break;
case 'truncate':
$db->truncateHostPageDom();
CLI::success(_('hostPageDom table successfully truncated'));
2023-06-25 22:11:49 +03:00
exit;
break;
default:
CLI::error(_('undefined action argument'));
2023-06-25 22:11:49 +03:00
}
break;
}
// Default message
CLI::default('__ ______________ __');
CLI::default('\ \/ / ____/ ____/___ / /');
CLI::default(' \ / / __/ / __/ __ \/ /' );
CLI::default(' / / /_/ / /_/ / /_/ /_/' );
CLI::default('/_/\____/\____/\____(_)' );
CLI::brake();
CLI::default('available options:');
CLI::default(' help - this message');
CLI::default(' crawl - execute crawler step in the crontab queue');
CLI::default(' clean - execute cleaner step in the crontab queue');
CLI::default(' snap reindex - sync DB/FS relations');
CLI::default(' hostPage rank reindex - generate rank indexes in hostPage table');
CLI::default(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field');
CLI::default(' hostPageDom truncate - flush hostPageDom table');
CLI::brake();
CLI::default('get support: https://github.com/YGGverse/YGGo/issues');