2023-06-25 22:11:49 +03:00
< ? php
2023-07-30 21:53:30 +03:00
// Load system dependencies
require_once ( __DIR__ . '/../config/app.php' );
require_once ( __DIR__ . '/../library/cli.php' );
require_once ( __DIR__ . '/../library/mysql.php' );
require_once ( __DIR__ . '/../library/filter.php' );
require_once ( __DIR__ . '/../library/ftp.php' );
require_once ( __DIR__ . '/../library/vendor/simple_html_dom.php' );
2023-06-25 22:11:49 +03:00
// Refuse to run under anything but the command line SAPI
// (the script is CLI only to avoid HTTP server connection timeouts)
if ('cli' != php_sapi_name()) {

    CLI::danger(_('supported command line interface only'));
    CLI::break();

    exit;
}
2023-08-01 17:53:14 +03:00
// Each check below takes a non-blocking semaphore; when it cannot be acquired
// the script reports the reason and terminates.
// NOTE(review): $semaphore is reassigned on purpose, keep the variable name and
// the statement order as they are - confirm how the crontab scripts rely on it.

// 1. Cleaner crontab task must not be running
$semaphore = sem_get(crc32('crontab.cleaner'), 1);

if (!sem_acquire($semaphore, true)) {

    CLI::danger(_('stop crontab.cleaner is running in another thread.'));
    CLI::break();

    exit;
}

// 2. Crawler crontab task must not be running
$semaphore = sem_get(crc32('crontab.crawler'), 1);

if (!sem_acquire($semaphore, true)) {

    CLI::danger(_('stop crontab.crawler is running in another thread.'));
    CLI::break();

    exit;
}

// 3. Only one instance of this CLI tool at a time
$semaphore = sem_get(crc32('cli.yggo'), 1);

if (!sem_acquire($semaphore, true)) {

    CLI::danger(_('process locked by another thread.'));
    CLI::break();

    exit;
}

// Open the database connection (DB_* constants are presumably defined in config/app.php)
$db = new MySQL(
    DB_HOST,
    DB_PORT,
    DB_NAME,
    DB_USERNAME,
    DB_PASSWORD
);
// CLI begin
2023-08-01 21:55:18 +03:00
if ( ! empty ( $argv [ 1 ])) {
2023-06-25 22:11:49 +03:00
2023-08-01 21:55:18 +03:00
switch ( $argv [ 1 ]) {
2023-06-25 22:11:49 +03:00
2023-08-01 21:55:18 +03:00
case 'db':

    // Usage: db optimize
    // The sub-command is required. The check used to be inverted (empty instead of
    // !empty), which made "optimize" unreachable and read $argv[2] when it was not set.
    // Without a known sub-command the case ends and the help banner is printed.
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'optimize':

                CLI::notice(_('optimize database tables...'));

                $db->optimize();

                CLI::success(_('tables successfully optimized!'));

            break;
        }
    }

break;
case 'crontab':

    // Usage: crontab crawl | crontab clean
    // Runs one step of the selected crontab script by including it.
    // The sub-command check used to be inverted (empty instead of !empty),
    // so neither task could ever be started from the CLI.
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'crawl':

                CLI::notice(_('crawler queue step begin...'));

                include_once(__DIR__ . '/../crontab/crawler.php');

                // Was printing the "begin" text a second time; aligned with the cleaner branch
                CLI::notice(_('crawler queue step completed.'));

            break;

            case 'clean':

                CLI::notice(_('cleaner queue step begin...'));

                include_once(__DIR__ . '/../crontab/cleaner.php');

                CLI::notice(_('cleaner queue step completed.'));

            break;
        }
    }

break;
case 'hostPageSnap':

    // Usage: hostPageSnap repair db | hostPageSnap repair fs | hostPageSnap reindex
    //
    //   repair db - walk every registered snap, index storages where the archive is
    //               present and drop registry records whose archive exists nowhere
    //   repair fs - walk FTP storages and delete archives unknown to the registry
    //   reindex   - not implemented yet
    //
    // The sub-command check used to be inverted (empty instead of !empty).
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'repair':

                // @TODO
                // Deliberate safety stop: everything below in this case is unreachable
                // until the following alert is removed by the operator.
                CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
                CLI::danger(_('make sure you have backups then remove this alert.'));
                CLI::break();
                exit;

                // Repair target is required, skip silently when it is not provided
                if (!empty($argv[3])) {

                    switch ($argv[3]) {

                        case 'db':

                            // Normalize & cleanup DB
                            CLI::notice(_('scan database registry for missed snap files...'));

                            foreach ($db->getHosts() as $host) {

                                foreach ($db->getHostPages($host->hostId) as $hostPage) {

                                    foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

                                        // Prepare filenames: every digit of the id except the last
                                        // becomes a directory level, e.g. id 123 => hps/1/2/3.zip
                                        $hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
                                        $hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';

                                        // Becomes true as soon as one storage holds the archive
                                        $hostPageSnapStorageFilesExists = false;

                                        // Check file exists
                                        foreach (json_decode(SNAP_STORAGE) as $node => $storages) {

                                            foreach ($storages as $location => $storage) {

                                                // Generate storage id
                                                $crc32name = crc32(sprintf('%s.%s', $node, $location));

                                                switch ($node) {

                                                    case 'localhost':

                                                        // @TODO implemented, not tested
                                                        // Absolute path is kept in its own variable: overwriting
                                                        // $hostPageSnapFile here compounded the directory prefix
                                                        // for every storage processed afterwards.
                                                        $hostPageSnapLocalFile = $storage->directory . $hostPageSnapFile;

                                                        if (file_exists($hostPageSnapLocalFile)) {

                                                            $hostPageSnapStorageFilesExists = true;

                                                            if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {

                                                                if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {

                                                                    CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapLocalFile, $node, $location));
                                                                }

                                                            } else {

                                                                CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapLocalFile, $node, $location));
                                                            }
                                                        }

                                                    break;

                                                    case 'ftp':

                                                        $ftp = new Ftp();

                                                        if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {

                                                            if ($ftp->size($hostPageSnapFile)) {

                                                                $hostPageSnapStorageFilesExists = true;

                                                                if (!$db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {

                                                                    if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {

                                                                        CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
                                                                    }

                                                                } else {

                                                                    CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
                                                                }
                                                            }

                                                        // Prevent snap deletion from registry on FTP connection lost
                                                        } else {

                                                            // $node replaces the undefined $hostPageSnapStorageName used before
                                                            CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $node, $location));
                                                            CLI::break();
                                                            exit;
                                                        }

                                                        $ftp->close();

                                                    break;
                                                }
                                            }
                                        }

                                        // Files not exists
                                        if (!$hostPageSnapStorageFilesExists) {

                                            // Delete snap from registry, children first, inside one transaction
                                            try {

                                                $db->beginTransaction();

                                                foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {

                                                    $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
                                                }

                                                $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
                                                $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);

                                                CLI::warning(sprintf(_('delete hostPageSnapId: #%s timeAdded: %s as not found in file storages;'), $hostPageSnap->hostPageSnapId, $hostPageSnap->timeAdded));

                                                $db->commit();

                                            } catch (Exception $e) {

                                                $db->rollBack();

                                                var_dump($e);
                                            }
                                        }
                                    }
                                }
                            }

                        break;

                        case 'fs':

                            // Cleanup FS
                            CLI::notice(_('scan storage for snap files missed in the DB...'));

                            foreach (json_decode(SNAP_STORAGE) as $node => $storages) {

                                foreach ($storages as $location => $storage) {

                                    switch ($node) {

                                        case 'localhost':

                                            // @TODO

                                        break;

                                        case 'ftp':

                                            $ftp = new Ftp();

                                            if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {

                                                foreach ($ftp->nlistr($storage->directory) as $filename) {

                                                    // NOTE(review): this pattern only matches flat "hps/<digits>.zip" names,
                                                    // while the "db" branch builds nested per-digit paths - confirm the
                                                    // expected layout before removing the safety stop above.
                                                    if (false !== preg_match('!/hps/([\d]+)\.zip$!ui', $filename, $matches)) {

                                                        if (!empty($matches[1])) { // hostPageSnapId

                                                            if (!$db->getHostPageSnap($matches[1])) {

                                                                if ($ftp->delete($filename)) {

                                                                    CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));

                                                                } else {

                                                                    // NOTE(review): failed deletion reuses the success wording, only the level differs
                                                                    CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
                                                                }

                                                            } else {

                                                                CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
                                                            }
                                                        }
                                                    }
                                                }
                                            }

                                            $ftp->close();

                                        break;
                                    }
                                }
                            }

                            CLI::success(_('missed snap files successfully deleted!'));

                        break;
                    }
                }

            break;

            case 'reindex':

                //@TODO

            break;
        }
    }

break;
case 'hostPage':

    // Usage: hostPage rank reindex
    // Recalculates the rank of every page of every host, then terminates the script.
    // Any other argument combination leaves the case untouched.
    if (!empty($argv[2]) && $argv[2] == 'rank' && !empty($argv[3]) && $argv[3] == 'reindex') {

        CLI::notice(_('hostPage rank fields reindex begin...'));

        foreach ($db->getHosts() as $host) {

            foreach ($db->getHostPages($host->hostId) as $hostPage) {

                // @TODO add common method
                $hostPageRank = 0;

                // Walk all pages that link to the current one
                foreach ($db->getHostPagesToHostPageByHostPageIdTarget($hostPage->hostPageId) as $hostPageLink) {

                    $hostPageSource = $db->getHostPage($hostPageLink->hostPageIdSource);

                    // Source page is gone from the registry, nothing to count
                    if (!$hostPageSource) {

                        continue;
                    }

                    // One point per referrer that belongs to another host
                    if ($hostPageSource->hostId != $hostPage->hostId) {

                        $hostPageRank++;
                    }

                    // Sources whose HTTP code contains "30" pass their own rank on
                    if (false !== strpos($hostPageSource->httpCode, '30')) {

                        $hostPageRank += $hostPageSource->rank;
                    }
                }

                // Report only when the registry update call returned a truthy result
                if ($db->updateHostPageRank($hostPage->hostPageId, $hostPageRank)) {

                    CLI::warning(sprintf(_('update hostPage #%s rank from %s to %s;'), $hostPage->hostPageId, $hostPage->rank, $hostPageRank));
                }
            }
        }

        CLI::notice(_('hostPage rank fields successfully updated!'));
        CLI::break();

        exit;
    }

break;
case 'hostPageDom':

    // Usage: hostPageDom generate [selectors] | hostPageDom truncate
    //
    //   generate - extract elements matching the given selectors (";" separated, taken
    //              from $argv[3] or CRAWL_HOST_PAGE_DOM_SELECTORS) from the last stored
    //              description of every indexed text/html page and store them
    //   truncate - flush the hostPageDom table
    //
    // The sub-command check used to be inverted (empty instead of !empty).
    if (!empty($argv[2])) {

        switch ($argv[2]) {

            case 'generate':

                $selectors = [];

                foreach ((array) explode(';', !empty($argv[3]) ? $argv[3] : (string) CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

                    // Trim before the emptiness test so whitespace-only entries are skipped too
                    $selector = trim($selector);

                    if (!empty($selector)) {

                        $selectors[] = $selector;
                    }
                }

                if ($selectors) {

                    // Init variables
                    $hostPagesProcessedTotal = 0;
                    $hostPageDOMAddedTotal   = 0;

                    // Begin selectors extraction
                    foreach ($db->getHostPagesByIndexed() as $hostPage) {

                        if (false !== stripos(Filter::mime($hostPage->mime), 'text/html')) {

                            if ($hostPageDescription = $db->getLastPageDescription($hostPage->hostPageId)) {

                                $hostPagesProcessedTotal++;

                                if (!empty($hostPageDescription->data)) {

                                    $html = str_get_html(base64_decode($hostPageDescription->data));

                                    // str_get_html() gives false for documents it refuses to parse,
                                    // calling find() on that would be fatal
                                    if ($html) {

                                        foreach ($selectors as $selector) {

                                            foreach ($html->find($selector) as $element) {

                                                if (!empty($element->innertext)) {

                                                    $hostPageDOMAddedTotal++;

                                                    // With CRAWL_HOST_PAGE_DOM_STRIP_TAGS enabled: line breaks and closing
                                                    // tags get a space, whitespace is collapsed, then tags are stripped
                                                    $db->addHostPageDom($hostPage->hostPageId,
                                                                        time(),
                                                                        $selector,
                                                                        trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
                                                                                                              preg_replace('/[\s]+/',
                                                                                                                           ' ',
                                                                                                                           str_replace(['<br />', '<br/>', '<br>', '</'],
                                                                                                                                       [' ', ' ', ' ', ' </'],
                                                                                                                                       $element->innertext))) : $element->innertext));
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }

                    CLI::success(sprintf(_('Host pages processed: %s'), $hostPagesProcessedTotal));
                    CLI::success(sprintf(_('Host page DOM elements added: %s'), $hostPageDOMAddedTotal));

                    exit;
                }

                // Reached only when no selector was given on the command line or in the config
                CLI::danger(_('CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file'));
                CLI::break();

                exit;

            break;

            case 'truncate':

                $db->truncateHostPageDom();

                CLI::success(_('hostPageDom table successfully truncated'));
                CLI::break();

                exit;

            break;
        }
    }

break;
}
2023-06-25 22:11:49 +03:00
}
// Default message
// Printed whenever the dispatcher above did not terminate the script.
// Table of output rows: a string is printed with CLI::default(), null produces CLI::break().
$helpRows = [
    '__ ______________ __',
    '\ \/ / ____/ ____/___ / /',
    ' \ / / __/ / __/ __ \/ /',
    ' / / /_/ / /_/ / /_/ /_/',
    '/_/\____/\____/\____(_)',
    null,
    'available options:',
    null,
    ' help - this message',
    null,
    ' db ',
    ' optimize - optimize all tables',
    null,
    ' crontab ',
    ' crawl - execute step in crawler queue',
    ' clean - execute step in cleaner queue',
    null,
    ' hostPage ',
    ' rank ',
    ' reindex - reindex hostPage.rank fields',
    null,
    ' hostPageSnap ',
    ' repair ',
    ' db - scan database registry for new or deprecated snap files',
    ' fs - check all storages for snap files not registered in hostPageSnapStorage, cleanup filesystem',
    ' reindex - search for host pages without snap records, add found pages to the crawl queue',
    null,
    ' hostPageDom ',
    ' generate [selectors] - make hostPageDom index based on related hostPage.data field',
    ' truncate - flush hostPageDom table',
    null,
    'get support: https://github.com/YGGverse/YGGo/issues',
    null,
    null,
];

foreach ($helpRows as $helpRow) {

    if (null === $helpRow) {

        CLI::break();

    } else {

        CLI::default($helpRow);
    }
}