YGGo/cli/yggo.php

509 lines
17 KiB
PHP
Raw Normal View History

2023-06-25 22:11:49 +03:00
<?php
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/cli.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/ftp.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
2023-06-25 22:11:49 +03:00
// Command line only: running under a web SAPI would hit the server connection timeout
if (PHP_SAPI != 'cli') {

    CLI::danger(_('supported command line interface only'));
    CLI::break();

    exit;
}
2023-08-01 17:53:14 +03:00
// Process guards, checked in this order. Each entry maps a semaphore name to the
// message printed when that semaphore cannot be acquired without blocking:
//  - crontab.cleaner / crontab.crawler: stop while a crontab job is running
//  - cli.yggo: lock multi-thread execution of this tool
$locks = [
    'crontab.cleaner' => _('stop crontab.cleaner is running in another thread.'),
    'crontab.crawler' => _('stop crontab.crawler is running in another thread.'),
    'cli.yggo'        => _('process locked by another thread.'),
];

foreach ($locks as $lockName => $lockMessage) {

    // $semaphore is reassigned on every pass, so after the loop it refers to the cli.yggo semaphore
    $semaphore = sem_get(crc32($lockName), 1);

    if (false === sem_acquire($semaphore, true)) {

        CLI::danger($lockMessage);
        CLI::break();

        exit;
    }
}

// Do not leak the helper variables into the scripts included later
unset($locks, $lockName, $lockMessage);
2023-06-25 22:11:49 +03:00
// Connect database
// The connection parameters are the DB_* constants (presumably defined in config/app.php — confirm);
// the resulting $db handle is used by the commands dispatched below
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// CLI begin: the first command line argument selects the command group
2023-08-01 21:55:18 +03:00
if (!empty($argv[1])) {
2023-06-25 22:11:49 +03:00
2023-08-01 21:55:18 +03:00
switch ($argv[1]) {
2023-06-25 22:11:49 +03:00
2023-08-01 21:55:18 +03:00
case 'db':

    // Database maintenance commands: "db optimize".
    // A sub-command is required; when it is missing or unknown, nothing runs here
    // and control continues to the help message printed after the dispatcher.
    // (The check was inverted before, which made "optimize" unreachable.)
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'optimize':

                CLI::notice(_('optimize database tables...'));

                $db->optimize();

                CLI::success(_('tables successfully optimized!'));

            break;
        }
    }

break;
case 'crontab':

    // Manual execution of one crontab step: "crontab crawl" or "crontab clean".
    // A sub-command is required; when it is missing or unknown, nothing runs here
    // and control continues to the help message printed after the dispatcher.
    // (The check was inverted before, which made both sub-commands unreachable.)
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'crawl':

                CLI::notice(_('crawler queue step begin...'));

                // The crawler script runs inline, in the scope of this file
                include_once(__DIR__ . '/../crontab/crawler.php');

                // Report completion (this message previously repeated the "begin" text)
                CLI::notice(_('crawler queue step completed.'));

            break;

            case 'clean':

                CLI::notice(_('cleaner queue step begin...'));

                // The cleaner script runs inline, in the scope of this file
                include_once(__DIR__ . '/../crontab/cleaner.php');

                CLI::notice(_('cleaner queue step completed.'));

            break;
        }
    }

break;
case 'hostPageSnap':

    // Snap maintenance commands: "hostPageSnap repair db|fs" and "hostPageSnap reindex".
    // A sub-command is required; when it is missing or unknown, nothing runs here
    // and control continues to the help message printed after the dispatcher.
    // (The check was inverted before, which made every sub-command unreachable.)
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'repair':

                // @TODO
                // Safety gate: everything after this exit is deliberately unreachable
                // until the repair routines are re-tested after the snaps refactor
                CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
                CLI::danger(_('make sure you have backups then remove this alert.'));
                CLI::break();
                exit;

                // The repair target is optional on the command line, so do not read a missing index
                switch ($argv[3] ?? null) {

                    case 'db':

                        // Normalize & cleanup DB
                        CLI::notice(_('scan database registry for missed snap files...'));

                        foreach ($db->getHosts() as $host) {

                            foreach ($db->getHostPages($host->hostId) as $hostPage) {

                                foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

                                    // Prepare filenames: every digit of the snap id except the last
                                    // becomes a directory level, the last digit is the file name
                                    $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
                                    $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';

                                    // Becomes true as soon as the file is found in any configured storage
                                    $hostPageSnapStorageFilesExists = false;

                                    // Check file exists
                                    foreach (json_decode(SNAP_STORAGE) as $node => $storages) {

                                        foreach ($storages as $location => $storage) {

                                            // Generate storage id
                                            $crc32name = crc32(sprintf('%s.%s', $node, $location));

                                            switch ($node) {

                                                case 'localhost':

                                                    // @TODO implemented, not tested
                                                    // Keep the absolute path in its own variable: overwriting
                                                    // $hostPageSnapFile here would corrupt the relative path
                                                    // used by the storages checked after this one
                                                    $hostPageSnapFileLocal = $storage->directory . $hostPageSnapFile;

                                                    if (file_exists($hostPageSnapFileLocal)) {

                                                        $hostPageSnapStorageFilesExists = true;

                                                        // Register the storage in the index when the relation is missing
                                                        if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {

                                                            if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {

                                                                CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFileLocal, $node, $location));
                                                            }

                                                        } else {

                                                            CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFileLocal, $node, $location));
                                                        }
                                                    }

                                                break;

                                                case 'ftp':

                                                    $ftp = new Ftp();

                                                    if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {

                                                        if ($ftp->size($hostPageSnapFile)) {

                                                            $hostPageSnapStorageFilesExists = true;

                                                            // Register the storage in the index when the relation is missing
                                                            if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {

                                                                if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {

                                                                    CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
                                                                }

                                                            } else {

                                                                CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
                                                            }
                                                        }

                                                    // Prevent snap deletion from registry on FTP connection lost
                                                    } else {

                                                        // Report the storage node (an undefined variable was printed here before)
                                                        CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $node, $location));
                                                        CLI::break();
                                                        exit;
                                                    }

                                                    $ftp->close();

                                                break;
                                            }
                                        }
                                    }

                                    // Files not exists
                                    if (!$hostPageSnapStorageFilesExists) {

                                        // Delete snap from registry, together with its storage and download records
                                        try {

                                            $db->beginTransaction();

                                            foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {

                                                $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
                                            }

                                            $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
                                            $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);

                                            CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));

                                            $db->commit();

                                        } catch(Exception $e) {

                                            $db->rollBack();

                                            var_dump($e);
                                        }
                                    }
                                }
                            }
                        }

                    break;

                    case 'fs':

                        // Cleanup FS
                        CLI::notice(_('scan storage for snap files missed in the DB...'));

                        // Walk every configured storage
                        foreach (json_decode(SNAP_STORAGE) as $node => $storages) {

                            foreach ($storages as $location => $storage) {

                                // Generate storage id
                                $crc32name = crc32(sprintf('%s.%s', $node, $location));

                                switch ($node) {

                                    case 'localhost':

                                        // @TODO

                                    break;

                                    case 'ftp':

                                        $ftp = new Ftp();

                                        if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {

                                            foreach ($ftp->nlistr($storage->directory) as $filename) {

                                                // NOTE(review): this pattern expects the digits directly after "/hps/",
                                                // while the "db" repair above builds one directory level per digit —
                                                // confirm the pattern against the real storage layout before enabling
                                                if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {

                                                    if (!empty($matches[1])) { // hostPageSnapId

                                                        if (!$db->getHostPageSnap($matches[1])) {

                                                            if ($ftp->delete($filename)) {

                                                                CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));

                                                            } else {

                                                                // @TODO this failure branch prints the same text as the success branch
                                                                CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
                                                            }

                                                        } else {

                                                            CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
                                                        }
                                                    }
                                                }
                                            }
                                        }

                                        $ftp->close();

                                    break;
                                }
                            }
                        }

                        CLI::success(_('missed snap files successfully deleted!'));

                    break;
                }

            break;

            case 'reindex':

                //@TODO

            break;
        }
    }

break;
case 'hostPage':

    // The only supported command is "hostPage rank reindex"; any other argument
    // combination does nothing here and continues to the help message
    if (!empty($argv[2]) && 'rank' == $argv[2] && !empty($argv[3]) && 'reindex' == $argv[3]) {

        CLI::notice(_('hostPage rank fields reindex begin...'));

        foreach ($db->getHosts() as $host) {

            foreach ($db->getHostPages($host->hostId) as $hostPage) {

                // @TODO add common method
                $hostPageRank = 0;

                // Walk all pages that link to the current page
                foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $referrer) {

                    // Skip relations whose source page can not be loaded
                    $hostPageSource = $db->getHostPage($referrer->hostPageIdSource);

                    if (!$hostPageSource) {

                        continue;
                    }

                    // Only referrers from another host increase the rank
                    if ($hostPageSource->hostId != $hostPage->hostId) {

                        $hostPageRank++;
                    }

                    // A source page with "30" in its HTTP code delegates its own rank to the target
                    if (false !== strpos($hostPageSource->httpCode, '30')) {

                        $hostPageRank += $hostPageSource->rank;
                    }
                }

                // Update registry; a falsy result is not reported
                if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {

                    CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
                }
            }
        }

        CLI::notice(_('hostPage rank fields successfully updated!'));
        CLI::break();

        exit;
    }

break;
case 'hostPageDom':

    // DOM index commands: "hostPageDom generate [selectors]" and "hostPageDom truncate".
    // A sub-command is required; when it is missing or unknown, nothing runs here
    // and control continues to the help message printed after the dispatcher.
    // (The check was inverted before, which made both sub-commands unreachable.)
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'generate':

                // Selectors come from the optional third argument, otherwise from
                // CRAWL_HOST_PAGE_DOM_SELECTORS; both are ";" separated lists
                $selectors = [];

                foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

                    // Trim before the emptiness check so whitespace-only entries are dropped too
                    $selector = trim($selector);

                    if (!empty($selector)) {

                        $selectors[] = $selector;
                    }
                }

                if ($selectors) {

                    // Init variables
                    $hostPagesProcessedTotal = 0;
                    $hostPageDOMAddedTotal   = 0;

                    // Begin selectors extraction
                    foreach ($db->getHostPagesByIndexed() as $hostPage) {

                        // Only HTML documents are parsed
                        if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {

                            if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {

                                $hostPagesProcessedTotal++;

                                if (!empty($hostPageDescription->data)) {

                                    // The description data is stored base64 encoded
                                    $html = str_get_html(base64_decode($hostPageDescription->data));

                                    // str_get_html() returns false when it can not build a DOM
                                    // (for example on empty input), so skip such pages instead of
                                    // calling find() on a non-object
                                    if ($html) {

                                        foreach ($selectors as $selector) {

                                            foreach ($html->find($selector) as $element) {

                                                if (!empty($element->innertext)) {

                                                    $hostPageDOMAddedTotal++;

                                                    // When CRAWL_HOST_PAGE_DOM_STRIP_TAGS is enabled, store plain text:
                                                    // separate adjacent tags by a space, collapse whitespace, strip tags
                                                    $db->addHostPageDom($hostPage->hostPageId,
                                                                        time(),
                                                                        $selector,
                                                                        trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
                                                                                                              preg_replace('/[\s]+/',
                                                                                                                           ' ',
                                                                                                                           str_replace(['<br />', '<br/>', '<br>', '</'],
                                                                                                                                       [' ', ' ', ' ', ' </'],
                                                                                                                                       $element->innertext))) : $element->innertext));
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }

                    CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
                    CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));

                    exit;
                }

                // Reached only when no selector was given by argument or configuration
                CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
                CLI::break();

                exit;

            break;

            case 'truncate':

                $db->truncateHostPageDom();

                CLI::success(_('hostPageDom table successfully truncated'));
                CLI::break();

                exit;

            break;
        }
    }

break;
}
2023-06-25 22:11:49 +03:00
}
// Default message: logo and command reference, printed whenever the dispatcher
// above did not terminate the script. A null entry stands for a CLI::break() call.
$helpLines = [
    '__ ______________ __',
    '\ \/ / ____/ ____/___ / /',
    ' \ / / __/ / __/ __ \/ /',
    ' / / /_/ / /_/ / /_/ /_/',
    '/_/\____/\____/\____(_)',
    null,
    'available options:',
    null,
    ' help - this message',
    null,
    ' db ',
    ' optimize - optimize all tables',
    null,
    ' crontab ',
    ' crawl - execute step in crawler queue',
    ' clean - execute step in cleaner queue',
    null,
    ' hostPage ',
    ' rank ',
    ' reindex - reindex hostPage.rank fields',
    null,
    ' hostPageSnap ',
    ' repair ',
    ' db - scan database registry for new or deprecated snap files',
    ' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem',
    ' reindex - search for host pages without snap records, add found pages to the crawl queue',
    null,
    ' hostPageDom ',
    ' generate [selectors] - make hostPageDom index based on related hostPage.data field',
    ' truncate - flush hostPageDom table',
    null,
    'get support: https://github.com/YGGverse/YGGo/issues',
    null,
    null,
];

foreach ($helpLines as $helpLine) {

    if (null === $helpLine) {

        CLI::break();

    } else {

        CLI::default($helpLine);
    }
}