2023-04-07 01:04:24 +00:00
< ? php
class MySQL {
private PDO $_db ;
/**
 * Connect to MySQL through PDO and configure the connection handle.
 *
 * The handle is put into exception error mode and returns rows as objects
 * by default.
 *
 * NOTE(review): PDO::ATTR_TIMEOUT is applied after the connection has been
 * opened; confirm that the MySQL driver honours it at this point.
 *
 * @param string $host     database server host
 * @param int    $port     database server port
 * @param string $database schema name
 * @param string $username login name
 * @param string $password login password
 */
public function __construct(string $host, int $port, string $database, string $username, string $password) {
    $dsn     = 'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8';
    $options = [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8'];

    $connection = new PDO($dsn, $username, $password, $options);

    $connection->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $connection->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
    $connection->setAttribute(PDO::ATTR_TIMEOUT, 600);

    $this->_db = $connection;
}
// System
/**
 * Start a transaction on the wrapped PDO connection.
 *
 * The result of PDO::beginTransaction() is not passed on to the caller.
 */
public function beginTransaction() {
    $connection = $this->_db;
    $connection->beginTransaction();
}
/**
 * Commit the current transaction on the wrapped PDO connection.
 *
 * The result of PDO::commit() is not passed on to the caller.
 */
public function commit() {
    $connection = $this->_db;
    $connection->commit();
}
/**
 * Roll back the current transaction on the wrapped PDO connection.
 *
 * The result of PDO::rollBack() is not passed on to the caller.
 */
public function rollBack() {
    $connection = $this->_db;
    $connection->rollBack();
}
2023-05-03 06:22:14 +00:00
// Manifest
2023-05-05 02:26:53 +00:00
/**
 * Count all rows of the `manifest` table.
 *
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getTotalManifests() {
    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');
    $statement->execute();

    $row = $statement->fetch();

    return $row->total;
}
/**
 * Fetch every row of the `manifest` table.
 *
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getManifests() {
    $sql = 'SELECT * FROM `manifest`';

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->fetchAll();
}
2023-05-03 06:22:14 +00:00
/**
 * Look up a single manifest by the CRC32 of its URL.
 *
 * @param int $crc32url value matched against `manifest`.`crc32url`
 * @return mixed result of PDOStatement::fetch() for the first matching row
 */
public function getManifest(int $crc32url) {
    $sql = 'SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$crc32url]);

    return $statement->fetch();
}
/**
 * Insert a new row into `manifest`.
 *
 * @param int    $crc32url    stored in `crc32url`
 * @param string $url         stored in `url`
 * @param string $status      stored in `status`
 * @param int    $timeAdded   stored in `timeAdded`
 * @param mixed  $timeUpdated stored in `timeUpdated`; null by default
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {
    $sql    = 'INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)';
    $values = [$crc32url, $url, $status, $timeAdded, $timeUpdated];

    $statement = $this->_db->prepare($sql);
    $statement->execute($values);

    return $this->_db->lastInsertId();
}
2023-05-05 02:26:53 +00:00
/**
 * Delete one manifest by its id.
 *
 * @param int $manifestId value matched against `manifest`.`manifestId`
 * @return int number of rows reported as affected
 */
public function deleteManifest(int $manifestId) {
    $sql = 'DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$manifestId]);

    return $statement->rowCount();
}
2023-04-07 01:04:24 +00:00
// Host
2023-04-23 00:01:51 +00:00
/**
 * Fetch the requested columns of every row in `host`.
 *
 * WARNING: $apiHostFields is concatenated into the statement as-is, without
 * any escaping. Only pass a trusted, hard-coded column list.
 *
 * @param string $apiHostFields SQL column list placed between SELECT and FROM
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getAPIHosts(string $apiHostFields) {
    $sql = 'SELECT ' . $apiHostFields . ' FROM `host`';

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->fetchAll();
}
2023-04-07 01:04:24 +00:00
/**
 * Look up a single host by the CRC32 of its URL.
 *
 * @param int $crc32url value matched against `host`.`crc32url`
 * @return mixed result of PDOStatement::fetch() for the first matching row
 */
public function getHost(int $crc32url) {
    $sql = 'SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$crc32url]);

    return $statement->fetch();
}
2023-04-08 21:06:28 +00:00
/**
 * Count all rows of the `host` table.
 *
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getTotalHosts() {
    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`');
    $statement->execute();

    $row = $statement->fetch();

    return $row->total;
}
2023-05-10 09:47:36 +00:00
/**
 * Insert a new row into `host`.
 *
 * Each argument is written to the column of the same name.
 *
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addHost(string $scheme,
                        string $name,
                        mixed $port,
                        int $crc32url,
                        int $timeAdded,
                        mixed $timeUpdated,
                        int $crawlPageLimit,
                        string $crawlMetaOnly,
                        string $status,
                        string $nsfw,
                        mixed $robots,
                        mixed $robotsPostfix) {
    // Assembled line by line; the resulting statement text is unchanged.
    $sql = ' INSERT INTO `host` ( `scheme` ,' . "\n"
         . '`name` ,' . "\n"
         . '`port` ,' . "\n"
         . '`crc32url` ,' . "\n"
         . '`timeAdded` ,' . "\n"
         . '`timeUpdated` ,' . "\n"
         . '`crawlPageLimit` ,' . "\n"
         . '`crawlMetaOnly` ,' . "\n"
         . '`status` ,' . "\n"
         . '`nsfw` ,' . "\n"
         . '`robots` ,' . "\n"
         . '`robotsPostfix` ) VALUES ( ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? ) ';

    // Same order as the column list above.
    $values = [
        $scheme,
        $name,
        $port,
        $crc32url,
        $timeAdded,
        $timeUpdated,
        $crawlPageLimit,
        $crawlMetaOnly,
        $status,
        $nsfw,
        $robots,
        $robotsPostfix
    ];

    $statement = $this->_db->prepare($sql);
    $statement->execute($values);

    return $this->_db->lastInsertId();
}
2023-04-08 21:06:28 +00:00
/**
 * Store new robots data and an update time for one host.
 *
 * @param int   $hostId      value matched against `host`.`hostId`
 * @param mixed $robots      written to `robots`
 * @param int   $timeUpdated written to `timeUpdated`
 * @return int number of rows reported as affected
 */
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
    $sql = 'UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$robots, $timeUpdated, $hostId]);

    return $statement->rowCount();
}
2023-04-07 01:04:24 +00:00
// Pages
/**
 * Count the `hostPage` rows that belong to one host.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getTotalHostPages(int $hostId) {
    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
    $statement->execute([$hostId]);

    $row = $statement->fetch();

    return $row->total;
}
2023-06-27 10:14:53 +00:00
/* not in use
2023-04-07 01:04:24 +00:00
public function getTotalPagesByHttpCode ( mixed $httpCode ) {
if ( is_null ( $httpCode )) {
$query = $this -> _db -> query ( 'SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL' );
} else {
$query = $this -> _db -> prepare ( 'SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?' );
$query -> execute ([ $httpCode ]);
}
return $query -> fetch () -> total ;
}
2023-06-27 10:14:53 +00:00
*/
2023-04-07 01:04:24 +00:00
/**
 * Look up a single page of a host by the CRC32 of its URI.
 *
 * @param int $hostId   value matched against `hostPage`.`hostId`
 * @param int $crc32uri value matched against `hostPage`.`crc32uri`
 * @return mixed result of PDOStatement::fetch() for the first matching row
 */
public function getHostPage(int $hostId, int $crc32uri) {
    $sql = 'SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId, $crc32uri]);

    return $statement->fetch();
}
2023-04-08 21:06:28 +00:00
/**
 * Fetch every page that belongs to one host.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPages(int $hostId) {
    $sql = 'SELECT * FROM `hostPage` WHERE `hostId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    return $statement->fetchAll();
}
2023-06-25 19:10:47 +00:00
/**
 * Fetch every page that has an update time and no ban time.
 *
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPagesByIndexed() {
    $sql = 'SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL';

    $result = $this->_db->query($sql);

    return $result->fetchAll();
}
2023-05-03 22:04:39 +00:00
/**
 * Fetch the pages of one host with the highest ids, newest id first.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 * @param int $limit  maximum number of rows; written into the statement as an integer
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPagesByLimit(int $hostId, int $limit) {
    $sql = 'SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit;

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    return $statement->fetchAll();
}
2023-05-08 22:29:32 +00:00
/**
 * Fetch the most recently added description of a page.
 *
 * @param int $hostPageId value matched against `hostPageDescription`.`hostPageId`
 * @return mixed result of PDOStatement::fetch() for the row with the greatest `timeAdded`
 */
public function getLastPageDescription(int $hostPageId) {
    $sql = 'SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->fetch();
}
2023-04-07 01:04:24 +00:00
/**
 * Fetch one page together with the scheme, name and port of its host.
 *
 * The statement text is written out as a single clean literal; no stray
 * non-SQL lines may appear inside it.
 *
 * @param int $hostPageId value matched against `hostPage`.`hostPageId`
 * @return mixed result of PDOStatement::fetch() for the joined row
 */
public function getFoundHostPage(int $hostPageId) {
    $statement = $this->_db->prepare(
        'SELECT `hostPage`.`hostPageId`,
                `hostPage`.`uri`,
                `hostPage`.`timeAdded`,
                `hostPage`.`timeUpdated`,
                `hostPage`.`mime`,
                `hostPage`.`size`,
                `host`.`scheme`,
                `host`.`name`,
                `host`.`port`
           FROM `hostPage`
           JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
          WHERE `hostPage`.`hostPageId` = ?
          LIMIT 1'
    );

    $statement->execute([$hostPageId]);

    return $statement->fetch();
}
/**
 * Insert a new row into `hostPage`.
 *
 * Each argument is written to the column of the same name. The parameter
 * list and the statement text are written out cleanly; no stray non-PHP or
 * non-SQL lines may appear inside them.
 *
 * @param int    $hostId      owning host
 * @param int    $crc32uri    CRC32 value stored next to the URI
 * @param string $uri         page URI
 * @param int    $timeAdded   stored in `timeAdded`
 * @param mixed  $timeUpdated stored in `timeUpdated`; null by default
 * @param mixed  $timeBanned  stored in `timeBanned`; null by default
 * @param mixed  $httpCode    stored in `httpCode`; null by default
 * @param mixed  $mime        stored in `mime`; null by default
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addHostPage(int $hostId,
                            int $crc32uri,
                            string $uri,
                            int $timeAdded,
                            mixed $timeUpdated = null,
                            mixed $timeBanned = null,
                            mixed $httpCode = null,
                            mixed $mime = null) {
    $statement = $this->_db->prepare(
        'INSERT INTO `hostPage` (`hostId`,
                                 `crc32uri`,
                                 `uri`,
                                 `timeAdded`,
                                 `timeUpdated`,
                                 `timeBanned`,
                                 `httpCode`,
                                 `mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
    );

    // Same order as the column list above.
    $statement->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);

    return $this->_db->lastInsertId();
}
2023-05-06 07:11:25 +00:00
/**
 * Set the ban time of one page.
 *
 * @param int $hostPageId value matched against `hostPage`.`hostPageId`
 * @param int $timeBanned written to `timeBanned`
 * @return int number of rows reported as affected
 */
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
    $sql = 'UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$timeBanned, $hostPageId]);

    return $statement->rowCount();
}
2023-05-08 11:13:53 +00:00
/**
 * Set the MIME value of one page.
 *
 * @param int    $hostPageId value matched against `hostPage`.`hostPageId`
 * @param string $mime       written to `mime`
 * @return int number of rows reported as affected
 */
public function updateHostPageMime(int $hostPageId, string $mime) {
    $sql = 'UPDATE `hostPage` SET `mime` = ? WHERE `hostPageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$mime, $hostPageId]);

    return $statement->rowCount();
}
2023-04-08 21:06:28 +00:00
/**
 * Delete one page by its id.
 *
 * @param int $hostPageId value matched against `hostPage`.`hostPageId`
 * @return int number of rows reported as affected
 */
public function deleteHostPage(int $hostPageId) {
    $sql = 'DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->rowCount();
}
2023-05-08 22:29:32 +00:00
/**
 * Delete all descriptions stored for one page.
 *
 * @param int $hostPageId value matched against `hostPageDescription`.`hostPageId`
 * @return int number of rows reported as affected
 */
public function deleteHostPageDescriptions(int $hostPageId) {
    $sql = 'DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->rowCount();
}
2023-05-10 09:47:36 +00:00
/**
 * Insert a new row into `hostPageDescription`.
 *
 * Each argument is written to the column of the same name. The parameter
 * list, statement text and value list are written out cleanly; no stray
 * non-PHP or non-SQL lines may appear inside them.
 *
 * @param int   $hostPageId  page the description belongs to
 * @param mixed $title       stored in `title`
 * @param mixed $description stored in `description`
 * @param mixed $keywords    stored in `keywords`
 * @param mixed $data        stored in `data`
 * @param int   $timeAdded   stored in `timeAdded`
 * @return int number of rows reported as affected (not the insert id)
 */
public function addHostPageDescription(int $hostPageId,
                                       mixed $title,
                                       mixed $description,
                                       mixed $keywords,
                                       mixed $data,
                                       int $timeAdded) {
    $statement = $this->_db->prepare(
        'INSERT INTO `hostPageDescription` (`hostPageId`,
                                            `title`,
                                            `description`,
                                            `keywords`,
                                            `data`,
                                            `timeAdded`) VALUES (?, ?, ?, ?, ?, ?)'
    );

    // Same order as the column list above.
    $statement->execute([
        $hostPageId,
        $title,
        $description,
        $keywords,
        $data,
        $timeAdded
    ]);

    return $statement->rowCount();
}
2023-05-10 09:47:36 +00:00
/**
 * Record a source -> target relation between two pages.
 *
 * Uses INSERT IGNORE; nothing is returned.
 *
 * @param int $hostPageIdSource written to `hostPageIdSource`
 * @param int $hostPageIdTarget written to `hostPageIdTarget`
 */
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
    $sql = 'INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageIdSource, $hostPageIdTarget]);
}
/**
 * Delete every relation in which the page is either source or target.
 *
 * @param int $hostPageId matched against both `hostPageIdSource` and `hostPageIdTarget`
 * @return int number of rows reported as affected
 */
public function deleteHostPageToHostPage(int $hostPageId) {
    $sql = 'DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?';

    // The id is bound twice, once per placeholder.
    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId, $hostPageId]);

    return $statement->rowCount();
}
2023-05-13 00:01:00 +00:00
/**
 * Count the relations that point at the given target page.
 *
 * @param int $hostPageIdTarget value matched against `hostPageToHostPage`.`hostPageIdTarget`
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {
    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');
    $statement->execute([$hostPageIdTarget]);

    $row = $statement->fetch();

    return $row->total;
}
2023-05-13 02:54:15 +00:00
/**
 * Fetch the relations that point at the given target page.
 *
 * @param int $hostPageIdTarget value matched against `hostPageToHostPage`.`hostPageIdTarget`
 * @param int $limit            maximum number of rows (default 1000); written into the statement as an integer
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {
    $sql = 'SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit;

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageIdTarget]);

    return $statement->fetchAll();
}
2023-05-13 22:45:55 +00:00
/**
 * Insert a new row into `hostPageSnap`.
 *
 * NOTE(review): $crc32data is typed string here while findHostPageSnap()
 * takes it as int; confirm which one matches the column.
 *
 * @param int    $hostPageId page the snap belongs to
 * @param string $crc32data  stored in `crc32data`
 * @param int    $timeAdded  stored in `timeAdded`
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
    // Assembled line by line; the resulting statement text is unchanged.
    $sql = ' INSERT INTO `hostPageSnap` ( `hostPageId` ,' . "\n"
         . '`crc32data` ,' . "\n"
         . '`timeAdded` ) VALUES ( ? , ? , ? ) ';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId, $crc32data, $timeAdded]);

    return $this->_db->lastInsertId();
}
/**
 * Set the `storageLocal` value of one snap.
 *
 * @param int   $hostPageSnapId value matched against `hostPageSnap`.`hostPageSnapId`
 * @param mixed $value          written to `storageLocal`
 * @return int number of rows reported as affected
 */
public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {
    $sql = 'UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$value, $hostPageSnapId]);

    return $statement->rowCount();
}
/**
 * Set the `storageMega` value of one snap.
 *
 * @param int   $hostPageSnapId value matched against `hostPageSnap`.`hostPageSnapId`
 * @param mixed $value          written to `storageMega`
 * @return int number of rows reported as affected
 */
public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {
    $sql = 'UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$value, $hostPageSnapId]);

    return $statement->rowCount();
}
2023-05-13 22:45:55 +00:00
/**
 * Delete one snap by its id.
 *
 * @param int $hostPageSnapId value matched against `hostPageSnap`.`hostPageSnapId`
 * @return int number of rows reported as affected
 */
public function deleteHostPageSnap(int $hostPageSnapId) {
    $sql = 'DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageSnapId]);

    return $statement->rowCount();
}
2023-06-25 20:29:30 +00:00
/**
 * Count the snaps of a page whose storage flags match.
 *
 * A snap is counted when `storageLocal` equals $storageLocal OR
 * `storageMega` equals $storageMega.
 *
 * @param int  $hostPageId   page the snaps belong to
 * @param bool $storageLocal value `storageLocal` is compared with
 * @param bool $storageMega  value `storageMega` is compared with
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getTotalHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true) {
    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? OR `storageMega` = ?)');

    // execute() binds every value as a string, which turns false into ''.
    // Casting to int first sends '0' / '1' instead.
    $statement->execute([$hostPageId, (int) $storageLocal, (int) $storageMega]);

    $row = $statement->fetch();

    return $row->total;
}
2023-07-07 09:30:07 +00:00
/**
 * List the snaps of a page filtered by storage flags, newest first.
 *
 * @param int    $hostPageId   page the snaps belong to
 * @param bool   $storageLocal value `storageLocal` is compared with
 * @param bool   $storageMega  value `storageMega` is compared with
 * @param string $condition    'OR' joins the two flag tests with OR; any other value joins them with AND
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true, string $condition = 'OR') {
    // Only one of two fixed keywords reaches the SQL text, never $condition itself.
    $operator = $condition === 'OR' ? 'OR' : 'AND';

    $statement = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? ' . $operator . ' `storageMega` = ?) ORDER BY `timeAdded` DESC');

    // execute() binds every value as a string, which turns false into ''.
    // Casting to int first sends '0' / '1' instead.
    $statement->execute([$hostPageId, (int) $storageLocal, (int) $storageMega]);

    return $statement->fetchAll();
}
2023-05-15 06:18:18 +00:00
/**
 * Look up a single snap by its id.
 *
 * @param int $hostPageSnapId value matched against `hostPageSnap`.`hostPageSnapId`
 * @return mixed result of PDOStatement::fetch() for the first matching row
 */
public function getHostPageSnap(int $hostPageSnapId) {
    $sql = 'SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageSnapId]);

    return $statement->fetch();
}
/**
 * Look up a snap of a page by the CRC32 of its data.
 *
 * @param int $hostPageId value matched against `hostPageSnap`.`hostPageId`
 * @param int $crc32data  value matched against `hostPageSnap`.`crc32data`
 * @return mixed result of PDOStatement::fetch() for the first matching row
 */
public function findHostPageSnap(int $hostPageId, int $crc32data) {
    $sql = 'SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId, $crc32data]);

    return $statement->fetch();
}
2023-05-15 06:18:18 +00:00
/* not in use
public function getHostPageSnapDownloads ( int $hostPageSnapId ) {
$query = $this -> _db -> prepare ( 'SELECT * FROM `hostPageSnapDownload` WHERE `hostPageSnapId` = ? LIMIT 1' );
$query -> execute ([ $hostPageSnapId ]);
return $query -> fetchAll ();
}
*/
/**
 * Insert a new row into `hostPageSnapDownload`.
 *
 * NOTE(review): $crc32ip is typed string here while
 * findHostPageSnapDownloadsTotalSize() takes it as int; confirm which one
 * matches the column.
 *
 * @param int    $hostPageSnapId snap the download belongs to
 * @param string $crc32ip        stored in `crc32ip`
 * @param int    $timeAdded      stored in `timeAdded`
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addHostPageSnapDownload(int $hostPageSnapId, string $crc32ip, int $timeAdded) {
    // Assembled line by line; the resulting statement text is unchanged.
    $sql = ' INSERT INTO `hostPageSnapDownload` ( `hostPageSnapId` ,' . "\n"
         . '`crc32ip` ,' . "\n"
         . '`timeAdded` ) VALUES ( ? , ? , ? ) ';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageSnapId, $crc32ip, $timeAdded]);

    return $this->_db->lastInsertId();
}
2023-06-12 10:34:25 +00:00
/**
 * Store storage, size and HTTP code for one download record.
 *
 * @param int    $hostPageSnapDownloadId value matched against `hostPageSnapDownload`.`hostPageSnapDownloadId`
 * @param string $storage                written to `storage`
 * @param int    $size                   written to `size`
 * @param mixed  $httpCode               written to `httpCode`; null by default
 * @return int number of rows reported as affected
 */
public function updateHostPageSnapDownload(int $hostPageSnapDownloadId, string $storage, int $size, mixed $httpCode = NULL) {
    $sql = 'UPDATE `hostPageSnapDownload` SET `storage` = ?, `size` = ?, `httpCode` = ? WHERE `hostPageSnapDownloadId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$storage, $size, $httpCode, $hostPageSnapDownloadId]);

    return $statement->rowCount();
}
/**
 * Delete every download record that belongs to one snap.
 *
 * `hostPageSnapId` is not the key of `hostPageSnapDownload` (rows are
 * updated by `hostPageSnapDownloadId`), so several rows can share it. The
 * statement therefore carries no LIMIT, like the other bulk deletes in
 * this class; a `LIMIT 1` would leave all but one row behind.
 *
 * @param int $hostPageSnapId value matched against `hostPageSnapDownload`.`hostPageSnapId`
 * @return int number of rows reported as affected
 */
public function deleteHostPageSnapDownloads(int $hostPageSnapId) {
    $statement = $this->_db->prepare('DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapId` = ?');
    $statement->execute([$hostPageSnapId]);

    return $statement->rowCount();
}
/**
 * Sum the `size` of the download records of one client.
 *
 * Only rows whose `timeAdded` is lower than $timeOffset are summed.
 * NOTE(review): if this is meant to measure recent traffic, the comparison
 * may need to be `>`; confirm against the caller.
 *
 * @param int $crc32ip    value matched against `hostPageSnapDownload`.`crc32ip`
 * @param int $timeOffset upper bound (exclusive) for `timeAdded`
 * @return mixed value of the `size` column; SQL SUM yields NULL when no row matches
 */
public function findHostPageSnapDownloadsTotalSize(int $crc32ip, int $timeOffset) {
    // No whitespace between SUM and "(": with MySQL's default sql_mode a gap
    // there stops SUM from being parsed as the built-in aggregate.
    $statement = $this->_db->prepare(
        'SELECT SUM(`size`) AS `size` FROM `hostPageSnapDownload`
          WHERE `crc32ip` = ? AND `timeAdded` < ?'
    );

    $statement->execute([$crc32ip, $timeOffset]);

    $row = $statement->fetch();

    return $row->size;
}
2023-06-25 19:10:47 +00:00
/**
 * Insert a new row into `hostPageDom`. Nothing is returned.
 *
 * @param int    $hostPageId written to `hostPageId`
 * @param int    $timeAdded  written to `timeAdded`
 * @param string $selector   written to `selector`
 * @param string $value      written to `value`
 */
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {
    $sql = 'INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId, $timeAdded, $selector, $value]);
}
/**
 * Delete all `hostPageDom` rows stored for one page.
 *
 * @param int $hostPageId value matched against `hostPageDom`.`hostPageId`
 * @return int number of rows reported as affected
 */
public function deleteHostPageDoms(int $hostPageId) {
    $sql = 'DELETE FROM `hostPageDom` WHERE `hostPageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageId]);

    return $statement->rowCount();
}
/**
 * Delete the `hostPageDom` rows added before the given time.
 *
 * @param int $timeOffset rows with `timeAdded` below this value are removed; written into the statement as an integer
 * @return int number of rows reported as affected
 */
public function deleteHostPageDomsByTimeAdded(int $timeOffset) {
    $sql = 'DELETE FROM `hostPageDom` WHERE `timeAdded` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
/**
 * Remove every row of `hostPageDom` with TRUNCATE. Nothing is returned.
 */
public function truncateHostPageDom() {
    $sql = 'TRUNCATE `hostPageDom`';

    $this->_db->query($sql);
}
2023-04-08 21:06:28 +00:00
// Cleaner tools
/**
 * Fetch the hosts that are due for cleaning, ordered by `hostId`.
 *
 * A host qualifies when it has no `timeUpdated` or one lower than
 * $timeFrom, and its `status` differs from 0. The statement text is written
 * out as a single clean literal; no stray non-SQL lines may appear inside it.
 *
 * @param int $limit    maximum number of rows; written into the statement as an integer
 * @param int $timeFrom upper bound (exclusive) for `timeUpdated`
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getCleanerQueue(int $limit, int $timeFrom) {
    $statement = $this->_db->prepare(
        'SELECT * FROM `host`
          WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `host`.`status` <> ?
          ORDER BY `hostId`
          LIMIT ' . (int) $limit
    );

    // Second value is the status that excludes a host from the queue.
    $statement->execute([$timeFrom, 0]);

    return $statement->fetchAll();
}
/**
 * Fetch every page that has a ban time set.
 *
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPagesBanned() {
    $sql = 'SELECT * FROM `hostPage` WHERE `timeBanned` IS NOT NULL';

    $result = $this->_db->query($sql);

    return $result->fetchAll();
}
2023-05-06 05:45:37 +00:00
/**
 * Clear the ban time of pages that were banned before the given time.
 *
 * @param int $timeOffset bans with `timeBanned` below this value are lifted; written into the statement as an integer
 * @return int number of rows reported as affected
 */
public function resetBannedHostPages(int $timeOffset) {
    $sql = 'UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
2023-05-09 05:19:49 +00:00
/**
 * Delete the page descriptions added before the given time.
 *
 * @param int $timeOffset rows with `timeAdded` below this value are removed; written into the statement as an integer
 * @return int number of rows reported as affected
 */
public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) {
    $sql = 'DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
2023-05-08 08:04:59 +00:00
/**
 * Insert one run summary into `logCleaner`.
 *
 * Each argument is written to the column of the same name. The parameter
 * list, statement text and value list are written out cleanly; no stray
 * non-PHP or non-SQL lines may appear inside them.
 *
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addCleanerLog(int $timeAdded,
                              int $hostsTotal,
                              int $hostsUpdated,
                              int $hostPagesDeleted,
                              int $hostPagesDescriptionsDeleted,
                              int $hostPagesDomsDeleted,
                              int $hostPagesSnapDeleted,
                              int $hostPagesToHostPageDeleted,
                              int $hostPagesBansRemoved,
                              int $manifestsTotal,
                              int $manifestsDeleted,
                              int $logsCleanerDeleted,
                              int $logsCrawlerDeleted,
                              int $httpRequestsTotal,
                              int $httpRequestsSizeTotal,
                              int $httpDownloadSizeTotal,
                              float $httpRequestsTimeTotal,
                              float $executionTimeTotal) {
    // 18 columns, 18 placeholders.
    $statement = $this->_db->prepare(
        'INSERT INTO `logCleaner` (`timeAdded`,
                                   `hostsTotal`,
                                   `hostsUpdated`,
                                   `hostPagesDeleted`,
                                   `hostPagesDescriptionsDeleted`,
                                   `hostPagesDomsDeleted`,
                                   `hostPagesSnapDeleted`,
                                   `hostPagesToHostPageDeleted`,
                                   `hostPagesBansRemoved`,
                                   `manifestsTotal`,
                                   `manifestsDeleted`,
                                   `logsCleanerDeleted`,
                                   `logsCrawlerDeleted`,
                                   `httpRequestsTotal`,
                                   `httpRequestsSizeTotal`,
                                   `httpDownloadSizeTotal`,
                                   `httpRequestsTimeTotal`,
                                   `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    );

    // Same order as the column list above.
    $statement->execute([
        $timeAdded,
        $hostsTotal,
        $hostsUpdated,
        $hostPagesDeleted,
        $hostPagesDescriptionsDeleted,
        $hostPagesDomsDeleted,
        $hostPagesSnapDeleted,
        $hostPagesToHostPageDeleted,
        $hostPagesBansRemoved,
        $manifestsTotal,
        $manifestsDeleted,
        $logsCleanerDeleted,
        $logsCrawlerDeleted,
        $httpRequestsTotal,
        $httpRequestsSizeTotal,
        $httpDownloadSizeTotal,
        $httpRequestsTimeTotal,
        $executionTimeTotal
    ]);

    return $this->_db->lastInsertId();
}
/**
 * Delete the `logCleaner` rows added before the given time.
 *
 * @param int $timeOffset rows with `timeAdded` below this value are removed; written into the statement as an integer
 * @return int number of rows reported as affected
 */
public function deleteLogCleaner(int $timeOffset) {
    $sql = 'DELETE FROM `logCleaner` WHERE `timeAdded` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
2023-04-07 01:04:24 +00:00
// Crawl tools
2023-06-30 10:28:22 +00:00
/**
 * Count the pages that are currently due for crawling.
 *
 * A page qualifies when it has no `timeUpdated`, or one lower than
 * $hostPageTimeFrom, or its URI is '/' and `timeUpdated` is lower than
 * $hostPageHomeTimeFrom. The host's `status` must differ from 0 and the
 * page must not be banned. The statement text is written out as a single
 * clean literal; no stray non-SQL lines may appear inside it.
 *
 * @param int $hostPageTimeFrom     update-time threshold for any page
 * @param int $hostPageHomeTimeFrom update-time threshold for pages whose URI is '/'
 * @return mixed value of the `total` column as delivered by the driver
 */
public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
    $statement = $this->_db->prepare(
        "SELECT COUNT(*) AS `total`
           FROM `hostPage`
           JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
          WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?))
            AND `host`.`status` <> ?
            AND `hostPage`.`timeBanned` IS NULL"
    );

    // Third value is the host status that excludes its pages from the queue.
    $statement->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);

    $row = $statement->fetch();

    return $row->total;
}
/**
 * Fetch the next pages to crawl together with their host settings.
 *
 * Uses the same selection rules as getHostPageCrawlQueueTotal(): no
 * `timeUpdated`, or one lower than $hostPageTimeFrom, or URI '/' with
 * `timeUpdated` lower than $hostPageHomeTimeFrom; host `status` differs
 * from 0; page not banned. Rows are ordered by URI length, shortest first,
 * and randomly among equal lengths. The statement text is written out as a
 * single clean literal; no stray non-SQL lines may appear inside it.
 *
 * @param int $limit                maximum number of rows; written into the statement as an integer
 * @param int $hostPageTimeFrom     update-time threshold for any page
 * @param int $hostPageHomeTimeFrom update-time threshold for pages whose URI is '/'
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {
    $statement = $this->_db->prepare(
        "SELECT `hostPage`.`hostId`,
                `hostPage`.`hostPageId`,
                `hostPage`.`uri`,
                `host`.`scheme`,
                `host`.`name`,
                `host`.`port`,
                `host`.`crawlPageLimit`,
                `host`.`crawlMetaOnly`,
                `host`.`robots`,
                `host`.`robotsPostfix`
           FROM `hostPage`
           JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
          WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?))
            AND `host`.`status` <> ?
            AND `hostPage`.`timeBanned` IS NULL
          ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
          LIMIT " . (int) $limit
    );

    // Third value is the host status that excludes its pages from the queue.
    $statement->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);

    return $statement->fetchAll();
}
2023-06-13 09:45:12 +00:00
/**
 * Store the crawl result (time, HTTP code, size) for one page.
 *
 * @param int $hostPageId  value matched against `hostPage`.`hostPageId`
 * @param int $timeUpdated written to `timeUpdated`
 * @param int $httpCode    written to `httpCode`
 * @param int $size        written to `size`
 * @return int number of rows reported as affected
 */
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {
    $sql = 'UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$timeUpdated, $httpCode, $size, $hostPageId]);

    return $statement->rowCount();
}
2023-05-04 03:45:04 +00:00
2023-05-05 02:26:53 +00:00
/**
 * Fetch manifests that are due for crawling, in random order.
 *
 * A manifest qualifies when it has no `timeUpdated` or one lower than
 * $timeFrom, and its `status` differs from 0. The statement text is written
 * out as a single clean literal; no stray non-SQL lines may appear inside it.
 *
 * @param int $limit    maximum number of rows; written into the statement as an integer
 * @param int $timeFrom upper bound (exclusive) for `timeUpdated`
 * @return array rows as returned by PDOStatement::fetchAll()
 */
public function getManifestCrawlQueue(int $limit, int $timeFrom) {
    $statement = $this->_db->prepare(
        'SELECT * FROM `manifest`
          WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?
          ORDER BY RAND()
          LIMIT ' . (int) $limit
    );

    // Second value is the status that excludes a manifest from the queue.
    $statement->execute([$timeFrom, 0]);

    return $statement->fetchAll();
}
/**
 * Store the crawl result (time, HTTP code) for one manifest.
 *
 * @param int $manifestId  value matched against `manifest`.`manifestId`
 * @param int $timeUpdated written to `timeUpdated`
 * @param int $httpCode    written to `httpCode`
 * @return int number of rows reported as affected
 */
public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {
    $sql = 'UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$timeUpdated, $httpCode, $manifestId]);

    return $statement->rowCount();
}
2023-05-08 08:04:59 +00:00
/**
 * Insert one run summary into `logCrawler`.
 *
 * Each argument is written to the column of the same name. The parameter
 * list, statement text and value list are written out cleanly; no stray
 * non-PHP or non-SQL lines may appear inside them.
 *
 * @return mixed result of PDO::lastInsertId() after the insert
 */
public function addCrawlerLog(int $timeAdded,
                              int $hostsAdded,
                              int $hostPagesProcessed,
                              int $hostPagesAdded,
                              int $hostPagesSnapAdded,
                              int $hostPagesBanned,
                              int $manifestsProcessed,
                              int $manifestsAdded,
                              int $httpRequestsTotal,
                              int $httpRequestsSizeTotal,
                              int $httpDownloadSizeTotal,
                              float $httpRequestsTimeTotal,
                              float $executionTimeTotal) {
    // 13 columns, 13 placeholders.
    $statement = $this->_db->prepare(
        'INSERT INTO `logCrawler` (`timeAdded`,
                                   `hostsAdded`,
                                   `hostPagesProcessed`,
                                   `hostPagesAdded`,
                                   `hostPagesSnapAdded`,
                                   `hostPagesBanned`,
                                   `manifestsProcessed`,
                                   `manifestsAdded`,
                                   `httpRequestsTotal`,
                                   `httpRequestsSizeTotal`,
                                   `httpDownloadSizeTotal`,
                                   `httpRequestsTimeTotal`,
                                   `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    );

    // Same order as the column list above.
    $statement->execute([
        $timeAdded,
        $hostsAdded,
        $hostPagesProcessed,
        $hostPagesAdded,
        $hostPagesSnapAdded,
        $hostPagesBanned,
        $manifestsProcessed,
        $manifestsAdded,
        $httpRequestsTotal,
        $httpRequestsSizeTotal,
        $httpDownloadSizeTotal,
        $httpRequestsTimeTotal,
        $executionTimeTotal
    ]);

    return $this->_db->lastInsertId();
}
/**
 * Delete the `logCrawler` rows added before the given time.
 *
 * @param int $timeOffset rows with `timeAdded` below this value are removed; written into the statement as an integer
 * @return int number of rows reported as affected
 */
public function deleteLogCrawler(int $timeOffset) {
    $sql = 'DELETE FROM `logCrawler` WHERE `timeAdded` < ' . (int) $timeOffset;

    $statement = $this->_db->prepare($sql);
    $statement->execute();

    return $statement->rowCount();
}
2023-05-13 23:39:32 +00:00
/**
 * Run OPTIMIZE TABLE on every table this class works with.
 *
 * Tables are processed one statement at a time, in the order listed.
 * Nothing is returned.
 */
public function optimize() {
    $tables = [
        'host',
        'hostPage',
        'hostPageDescription',
        'hostPageDom',
        'hostPageSnap',
        'hostPageSnapDownload',
        'hostPageToHostPage',
        'logCleaner',
        'logCrawler',
        'manifest'
    ];

    foreach ($tables as $table) {
        $this->_db->query('OPTIMIZE TABLE `' . $table . '`');
    }
}
2023-04-07 01:04:24 +00:00
}