2023-06-25 19:11:49 +00:00
< ? php
2023-07-30 18:53:30 +00:00
// Load system dependencies
require_once ( __DIR__ . '/../config/app.php' );
require_once ( __DIR__ . '/../library/cli.php' );
require_once ( __DIR__ . '/../library/mysql.php' );
require_once ( __DIR__ . '/../library/filter.php' );
require_once ( __DIR__ . '/../library/ftp.php' );
require_once ( __DIR__ . '/../library/vendor/simple_html_dom.php' );
2023-06-25 19:11:49 +00:00
// CLI only to prevent https server connection timeout
if ( php_sapi_name () != 'cli' ) {
2023-07-29 14:49:07 +00:00
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'supported command line interface only' ));
2023-07-30 18:53:30 +00:00
CLI :: break ();
2023-06-25 19:11:49 +00:00
exit ;
}
// Lock multi-thread execution
2023-07-30 18:53:30 +00:00
$semaphore = sem_get ( crc32 ( 'crontab.crawler' ), 1 );
2023-06-25 19:11:49 +00:00
if ( false === sem_acquire ( $semaphore , true )) {
2023-07-30 18:53:30 +00:00
CLI :: danger ( _ ( 'process locked by another thread.' ));
CLI :: break ();
2023-06-25 19:11:49 +00:00
exit ;
}
2023-07-30 18:53:30 +00:00
// Stop CLI execution on cleaner process running
$semaphore = sem_get ( crc32 ( 'crontab.cleaner' ), 1 );
if ( false === sem_acquire ( $semaphore , true )) {
CLI :: danger ( _ ( 'stop crontab.cleaner is running in another thread.' ));
CLI :: break ();
exit ;
}
// Stop CLI execution on crawler process running
$semaphore = sem_get ( crc32 ( 'crontab.crawler' ), 1 );
if ( false === sem_acquire ( $semaphore , true )) {
CLI :: danger ( _ ( 'stop crontab.crawler is running in another thread.' ));
CLI :: break ();
exit ;
}
2023-06-25 19:11:49 +00:00
// Connect database
$db = new MySQL ( DB_HOST , DB_PORT , DB_NAME , DB_USERNAME , DB_PASSWORD );
// CLI begin
if ( empty ( $argv [ 1 ])) $argv [ 1 ] = 'help' ;
switch ( $argv [ 1 ]) {
2023-07-30 19:05:37 +00:00
case 'crontab' :
2023-07-12 18:30:53 +00:00
2023-07-30 19:05:37 +00:00
if ( empty ( $argv [ 2 ])) {
2023-07-29 14:49:07 +00:00
2023-07-30 19:05:37 +00:00
CLI :: danger ( _ ( 'crontab method requires action argument' ));
2023-07-12 18:30:53 +00:00
2023-07-30 19:05:37 +00:00
switch ( $argv [ 2 ]) {
case 'crawl' :
CLI :: notice ( _ ( 'crawler queue step begin...' ));
include_once ( __DIR__ . '/../crontab/crawler.php' );
2023-07-12 18:30:53 +00:00
2023-07-30 19:05:37 +00:00
CLI :: notice ( _ ( 'crawler queue step begin...' ));
break ;
2023-07-29 14:49:07 +00:00
2023-07-30 19:05:37 +00:00
case 'clean' :
2023-07-12 18:30:53 +00:00
2023-07-30 19:05:37 +00:00
CLI :: notice ( _ ( 'cleaner queue step begin...' ));
include_once ( __DIR__ . '/../crontab/cleaner.php' );
CLI :: notice ( _ ( 'cleaner queue step completed.' ));
break ;
}
}
2023-07-29 14:49:07 +00:00
break ;
2023-07-30 18:53:30 +00:00
case 'hostPageSnap' :
2023-07-29 14:49:07 +00:00
if ( empty ( $argv [ 2 ])) {
2023-07-30 18:53:30 +00:00
CLI :: danger ( _ ( 'hostPageSnap method requires action argument' ));
CLI :: break ();
exit ;
2023-07-29 14:49:07 +00:00
}
switch ( $argv [ 2 ]) {
2023-07-30 18:53:30 +00:00
case 'repair' :
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
// Normalize & cleanup DB
2023-07-29 14:49:07 +00:00
CLI :: notice ( _ ( 'scan database registry for missed snap files...' ));
foreach ( $db -> getHosts () as $host ) {
foreach ( $db -> getHostPages ( $host -> hostId ) as $hostPage ) {
foreach ( $db -> getHostPageSnaps ( $hostPage -> hostPageId ) as $hostPageSnap ) {
// Define variables
2023-07-30 09:18:35 +00:00
$snapFilesExists = false ;
2023-07-29 14:49:07 +00:00
$snapPath = chunk_split ( $hostPage -> hostPageId , 1 , '/' );
// Check file exists
2023-07-30 18:53:30 +00:00
foreach ( json_decode ( SNAP_STORAGE ) as $hostPageSnapStorageName => $storages ) {
2023-07-29 14:49:07 +00:00
2023-07-29 17:02:08 +00:00
foreach ( $storages as $i => $storage ) {
2023-07-29 14:49:07 +00:00
// Generate storage id
2023-07-30 18:53:30 +00:00
$crc32name = crc32 ( sprintf ( '%s.%s' , $hostPageSnapStorageName , $i ));
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
switch ( $hostPageSnapStorageName ) {
2023-07-29 14:49:07 +00:00
case 'localhost' :
2023-07-30 20:33:31 +00:00
// @TODO implemented, not tested
2023-07-30 18:53:30 +00:00
$hostPageSnapFilename = $storage -> directory . $snapPath . $hostPageSnap -> timeAdded . '.zip' ;
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
if ( file_exists ( $hostPageSnapFilename )) {
2023-07-29 14:49:07 +00:00
2023-07-30 09:18:35 +00:00
$snapFilesExists = true ;
2023-07-29 14:49:07 +00:00
2023-07-30 20:32:02 +00:00
if ( ! $db -> findHostPageSnapStorageByCRC32Name ( $hostPageSnap -> hostPageSnapId , $crc32name )) {
2023-07-29 14:49:07 +00:00
if ( $db -> addHostPageSnapStorage ( $hostPageSnap -> hostPageSnapId , $crc32name , $hostPageSnap -> timeAdded )) {
2023-07-30 18:53:30 +00:00
CLI :: warning ( sprintf ( _ ( 'register snap #%s file: %s storage: %s index: %s;' ), $hostPageSnap -> hostPageSnapId , $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
2023-07-29 14:49:07 +00:00
}
2023-07-29 17:23:43 +00:00
} else {
2023-07-30 18:53:30 +00:00
CLI :: success ( sprintf ( _ ( 'skip related snap #%s file: %s storage: %s index: %s;' ), $hostPageSnap -> hostPageSnapId , $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
2023-07-29 14:49:07 +00:00
}
}
break ;
2023-07-30 14:42:36 +00:00
2023-07-29 14:49:07 +00:00
case 'ftp' :
$ftp = new Ftp ();
if ( $ftp -> connect ( $storage -> host , $storage -> port , $storage -> username , $storage -> password , $storage -> directory , $storage -> timeout , $storage -> passive )) {
2023-07-30 18:53:30 +00:00
$hostPageSnapFilename = 'hp/' . $snapPath . $hostPageSnap -> timeAdded . '.zip' ;
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
if ( $ftp -> size ( $hostPageSnapFilename )) {
2023-07-29 14:49:07 +00:00
2023-07-30 09:18:35 +00:00
$snapFilesExists = true ;
2023-07-29 14:49:07 +00:00
2023-07-30 20:32:02 +00:00
if ( ! $db -> findHostPageSnapStorageByCRC32Name ( $hostPageSnap -> hostPageSnapId , $crc32name )) {
2023-07-29 14:49:07 +00:00
if ( $db -> addHostPageSnapStorage ( $hostPageSnap -> hostPageSnapId , $crc32name , $hostPageSnap -> timeAdded )) {
2023-07-30 18:53:30 +00:00
CLI :: warning ( sprintf ( _ ( 'register snap #%s file: %s storage: %s index: %s;' ), $hostPageSnap -> hostPageSnapId , $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
2023-07-29 14:49:07 +00:00
}
2023-07-29 17:23:43 +00:00
} else {
2023-07-30 18:53:30 +00:00
CLI :: success ( sprintf ( _ ( 'skip related snap #%s file: %s storage: %s index: %s;' ), $hostPageSnap -> hostPageSnapId , $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
2023-07-29 14:49:07 +00:00
}
}
2023-07-30 18:53:30 +00:00
// Prevent snap deletion from registry on FTP connection lost
} else {
CLI :: danger ( sprintf ( _ ( 'could not connect to storage %s index %s. operation stopped to prevent the data lose.' ), $hostPageSnapStorageName , $i ));
CLI :: break ();
exit ;
2023-07-29 14:49:07 +00:00
}
$ftp -> close ();
break ;
}
}
}
2023-07-30 14:42:36 +00:00
// Files not exists
2023-07-30 09:18:35 +00:00
if ( ! $snapFilesExists ) {
2023-07-29 14:49:07 +00:00
2023-07-30 14:42:36 +00:00
// Delete snap from registry
try {
2023-07-29 14:49:07 +00:00
2023-07-30 14:42:36 +00:00
$db -> beginTransaction ();
2023-07-29 14:49:07 +00:00
2023-07-30 14:59:15 +00:00
foreach ( $db -> getHostPageSnapStorages ( $hostPageSnap -> hostPageSnapId ) as $hostPageSnapStorage ) {
2023-07-29 14:49:07 +00:00
2023-07-30 14:42:36 +00:00
$db -> deleteHostPageSnapDownloads ( $hostPageSnapStorage -> hostPageSnapStorageId );
}
2023-07-29 14:49:07 +00:00
2023-07-30 14:59:15 +00:00
$db -> deleteHostPageSnapStorages ( $hostPageSnap -> hostPageSnapId );
$db -> deleteHostPageSnap ( $hostPageSnap -> hostPageSnapId );
2023-07-29 14:49:07 +00:00
2023-07-30 19:12:30 +00:00
CLI :: warning ( sprintf ( _ ( 'delete snap index: #%s timestamp: %s as not found in file storages;' ), $hostPageSnap -> hostPageSnapId , $hostPageSnap -> timeAdded ));
2023-07-29 14:49:07 +00:00
2023-07-30 14:42:36 +00:00
$db -> commit ();
2023-07-29 14:49:07 +00:00
2023-07-30 14:42:36 +00:00
} catch ( Exception $e ) {
2023-07-30 09:18:35 +00:00
2023-07-30 14:42:36 +00:00
$db -> rollBack ();
2023-07-30 09:18:35 +00:00
2023-07-30 14:42:36 +00:00
var_dump ( $e );
}
2023-07-30 09:18:35 +00:00
}
}
2023-07-29 14:49:07 +00:00
}
}
2023-07-30 18:53:30 +00:00
// Cleanup FS
CLI :: notice ( _ ( 'scan storage for snap files missed in the DB...' ));
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
foreach ( json_decode ( SNAP_STORAGE ) as $hostPageSnapStorageName => $storages ) {
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
foreach ( $storages as $i => $storage ) {
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
switch ( $hostPageSnapStorageName ) {
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
case 'localhost' :
2023-07-29 14:49:07 +00:00
2023-07-30 18:53:30 +00:00
// @TODO
break ;
case 'ftp' :
$ftp = new Ftp ();
if ( $ftp -> connect ( $storage -> host , $storage -> port , $storage -> username , $storage -> password , $storage -> directory , $storage -> timeout , $storage -> passive )) {
foreach ( $ftp -> nlistr ( $storage -> directory ) as $hostPageSnapFilename ) {
if ( false !== preg_match ( sprintf ( '!/hp/([\d/]+)/([\d]+)\.zip$!ui' , $storage -> directory ), $hostPageSnapFilename , $matches )) {
if ( ! empty ( $matches [ 1 ]) && // hostPageSnapId
! empty ( $matches [ 2 ])) { // timeAdded
if ( ! $db -> findHostPageSnapByTimeAdded ( $matches [ 1 ], $matches [ 2 ])) {
if ( $ftp -> delete ( $hostPageSnapFilename )) {
CLI :: warning ( sprintf ( _ ( 'delete snap file: #%s from storage %s index %s not found in registry;' ), $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
} else {
CLI :: danger ( sprintf ( _ ( 'delete snap file: #%s from storage %s index %s not found in registry;' ), $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
}
2023-07-30 19:24:16 +00:00
} else {
CLI :: success ( sprintf ( _ ( 'skip snap file: #%s available in storage %s index %s;' ), $hostPageSnapFilename , $hostPageSnapStorageName , $i ));
2023-07-30 18:53:30 +00:00
}
}
}
}
}
$ftp -> close ();
break ;
}
}
}
CLI :: success ( _ ( 'missed snap files successfully deleted!' ));
// Optimize DB tables
CLI :: notice ( _ ( 'optimize database tables...' ));
$db -> optimize ();
CLI :: success ( _ ( 'tables successfully optimized!' ));
2023-07-29 14:49:07 +00:00
break ;
default :
2023-07-30 18:53:30 +00:00
CLI :: danger ( _ ( 'undefined action argument!' ));
2023-07-29 14:49:07 +00:00
}
2023-07-28 09:49:43 +00:00
break ;
case 'hostPage' :
if ( empty ( $argv [ 2 ])) {
2023-07-29 14:49:07 +00:00
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'hostPage method requires action argument' ));
2023-07-28 09:49:43 +00:00
}
switch ( $argv [ 2 ]) {
case 'rank' :
if ( empty ( $argv [ 3 ])) {
2023-07-29 14:49:07 +00:00
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'hostPage rank requires action argument' ));
2023-07-28 09:49:43 +00:00
}
switch ( $argv [ 3 ]) {
case 'reindex' :
foreach ( $db -> getHosts () as $host ) {
foreach ( $db -> getHostPages ( $host -> hostId ) as $hostPage ) {
$db -> updateHostPageRank ( $hostPage -> hostPageId , $db -> getTotalExternalHostPageIdSourcesByHostPageIdTarget ( $hostPage -> hostPageId )); // @TODO add library cover
}
}
2023-07-29 14:49:07 +00:00
CLI :: success ( _ ( 'hostPage rank successfully updated' ));
2023-07-28 09:49:43 +00:00
exit ;
break ;
default :
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'undefined action argument' ));
2023-07-28 09:49:43 +00:00
}
break ;
case 'truncate' :
$db -> truncateHostPageDom ();
2023-07-29 14:49:07 +00:00
CLI :: success ( _ ( 'hostPageDom table successfully truncated' ));
2023-07-28 09:49:43 +00:00
exit ;
break ;
default :
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'undefined action argument' ));
2023-07-28 09:49:43 +00:00
}
2023-07-12 18:30:53 +00:00
break ;
2023-06-25 19:11:49 +00:00
case 'hostPageDom' :
if ( empty ( $argv [ 2 ])) {
2023-07-29 14:49:07 +00:00
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'hostPageDom method requires action argument' ));
2023-06-25 19:11:49 +00:00
}
switch ( $argv [ 2 ]) {
case 'generate' :
2023-06-26 14:15:28 +00:00
$selectors = [];
foreach (( array ) explode ( ';' , ! empty ( $argv [ 3 ]) ? $argv [ 3 ] : ( string ) CRAWL_HOST_PAGE_DOM_SELECTORS ) as $selector ) {
if ( ! empty ( $selector )) {
$selectors [] = trim ( $selector );
}
}
if ( $selectors ) {
2023-06-25 19:11:49 +00:00
// Init variables
$hostPagesProcessedTotal = 0 ;
$hostPageDOMAddedTotal = 0 ;
// Begin selectors extraction
foreach ( $db -> getHostPagesByIndexed () as $hostPage ) {
if ( false !== stripos ( Filter :: mime ( $hostPage -> mime ), 'text/html' )) {
if ( $hostPageDescription = $db -> getLastPageDescription ( $hostPage -> hostPageId )) {
$hostPagesProcessedTotal ++ ;
if ( ! empty ( $hostPageDescription -> data )) {
$html = str_get_html ( base64_decode ( $hostPageDescription -> data ));
2023-06-26 14:15:28 +00:00
foreach ( $selectors as $selector ) {
2023-06-25 19:11:49 +00:00
foreach ( $html -> find ( $selector ) as $element ) {
if ( ! empty ( $element -> innertext )) {
$hostPageDOMAddedTotal ++ ;
$db -> addHostPageDom ( $hostPage -> hostPageId ,
time (),
$selector ,
trim ( CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags (
preg_replace ( '/[\s]+/' ,
' ' ,
str_replace ([ '<br />' , '<br/>' , '<br>' , '</' ],
[ ' ' , ' ' , ' ' , ' </' ],
$element -> innertext ))) : $element -> innertext ));
}
}
}
}
}
}
}
2023-07-29 14:49:07 +00:00
CLI :: success ( sprintf ( _ ( 'Host pages processed: %s' ), $hostPagesProcessedTotal ));
CLI :: success ( sprintf ( _ ( 'Host page DOM elements added: %s' ), $hostPageDOMAddedTotal ));
2023-06-25 19:11:49 +00:00
exit ;
}
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'CRAWL_HOST_PAGE_DOM_SELECTORS not provided in the configuration file' ));
2023-06-25 19:11:49 +00:00
exit ;
break ;
case 'truncate' :
$db -> truncateHostPageDom ();
2023-07-29 14:49:07 +00:00
CLI :: success ( _ ( 'hostPageDom table successfully truncated' ));
2023-06-25 19:11:49 +00:00
exit ;
break ;
default :
2023-07-29 17:02:53 +00:00
CLI :: danger ( _ ( 'undefined action argument' ));
2023-06-25 19:11:49 +00:00
}
break ;
}
// Default message
2023-07-29 14:49:07 +00:00
CLI :: default ( '__ ______________ __' );
CLI :: default ( '\ \/ / ____/ ____/___ / /' );
CLI :: default ( ' \ / / __/ / __/ __ \/ /' );
CLI :: default ( ' / / /_/ / /_/ / /_/ /_/' );
CLI :: default ( '/_/\____/\____/\____(_)' );
2023-07-29 17:07:44 +00:00
CLI :: break ();
2023-07-29 14:49:07 +00:00
CLI :: default ( 'available options:' );
CLI :: default ( ' help - this message' );
2023-07-30 19:05:37 +00:00
CLI :: default ( ' crontab [crawl|clean] - execute crontab script queue' );
2023-07-29 14:49:07 +00:00
CLI :: default ( ' hostPage rank reindex - generate rank indexes in hostPage table' );
2023-07-30 18:53:30 +00:00
CLI :: default ( ' hostPageSnap repair - sync DB/FS relations' );
2023-07-29 14:49:07 +00:00
CLI :: default ( ' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field' );
CLI :: default ( ' hostPageDom truncate - flush hostPageDom table' );
2023-07-29 17:07:44 +00:00
CLI :: break ();
2023-07-29 14:49:07 +00:00
CLI :: default ( 'get support: https://github.com/YGGverse/YGGo/issues' );
2023-07-29 17:07:44 +00:00
CLI :: break ();
CLI :: break ();