<?php
/**
 * Data-access layer over a PDO MySQL connection.
 *
 * Every method runs a single statement (or, for optimize(), one statement
 * per table) against the connection created in the constructor. Rows are
 * fetched as objects (PDO::FETCH_OBJ) and PDO errors surface as exceptions
 * (PDO::ERRMODE_EXCEPTION).
 *
 * When a Memcached instance is supplied, getTotalHostPagesIndexed() and
 * getTopHostPages() cache their results for one hour.
 */
class MySQL {

    private PDO $_db;

    // Nullable with a default so the cache checks below never touch an
    // uninitialized typed property when no cache was provided.
    private ?Memcached $_memcached = null;

    /**
     * Open the MySQL connection and remember the optional cache backend.
     *
     * @param string         $host      Database host
     * @param int            $port      Database port
     * @param string         $database  Database name
     * @param string         $username  Database user
     * @param string         $password  Database password
     * @param Memcached|null $memcached Optional cache; when null, nothing is cached
     *
     * @throws PDOException when the connection cannot be established
     */
    public function __construct(string $host, int $port, string $database, string $username, string $password, ?Memcached $memcached = null) {

        $this->_db = new PDO(
            'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8',
            $username,
            $password,
            [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']
        );

        $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
        $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);

        $this->_memcached = $memcached;
    }

    // System

    /** Start a transaction on the underlying connection. */
    public function beginTransaction() {

        $this->_db->beginTransaction();
    }

    /** Commit the current transaction. */
    public function commit() {

        $this->_db->commit();
    }

    /** Roll back the current transaction. */
    public function rollBack() {

        $this->_db->rollBack();
    }

    // Manifest

    /** @return mixed Number of rows in `manifest`, as returned by the driver */
    public function getTotalManifests() {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');

        $query->execute();

        return $query->fetch()->total;
    }

    /** @return array All `manifest` rows */
    public function getManifests() {

        $query = $this->_db->prepare('SELECT * FROM `manifest`');

        $query->execute();

        return $query->fetchAll();
    }

    /** @return object|false The `manifest` row matching `crc32url`, or false when none */
    public function getManifest(int $crc32url) {

        $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');

        $query->execute([$crc32url]);

        return $query->fetch();
    }

    /** @return string|false Insert id of the new `manifest` row */
    public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {

        $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');

        $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of deleted rows */
    public function deleteManifest(int $manifestId) {

        $query = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');

        $query->execute([$manifestId]);

        return $query->rowCount();
    }

    // Host

    /**
     * Select the given column list from `host`.
     *
     * NOTE(review): $apiHostFields is concatenated into the SQL verbatim, so
     * it must only ever come from trusted configuration - confirm at callers.
     *
     * @return array
     */
    public function getAPIHosts(string $apiHostFields) {

        $query = $this->_db->prepare('SELECT ' . $apiHostFields . ' FROM `host`');

        $query->execute();

        return $query->fetchAll();
    }

    /** @return array All `host` rows */
    public function getHosts() {

        $query = $this->_db->query('SELECT * FROM `host`');

        return $query->fetchAll();
    }

    /** @return object|false The `host` row matching `crc32url`, or false when none */
    public function getHost(int $crc32url) {

        $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1');

        $query->execute([$crc32url]);

        return $query->fetch();
    }

    /** @return mixed Number of rows in `host`, as returned by the driver */
    public function getTotalHosts() {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `host`');

        $query->execute();

        return $query->fetch()->total;
    }

    /** @return string|false Insert id of the new `host` row */
    public function addHost(string $scheme,
                            string $name,
                            mixed $port,
                            int $crc32url,
                            int $timeAdded,
                            mixed $timeUpdated,
                            int $crawlPageLimit,
                            string $crawlMetaOnly,
                            string $status,
                            string $nsfw,
                            mixed $robots,
                            mixed $robotsPostfix) {

        $query = $this->_db->prepare(' INSERT INTO `host` ( `scheme` ,
`name` ,
`port` ,
`crc32url` ,
`timeAdded` ,
`timeUpdated` ,
`crawlPageLimit` ,
`crawlMetaOnly` ,
`status` ,
`nsfw` ,
`robots` ,
`robotsPostfix` ) VALUES ( ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? ) ');

        $query->execute([
            $scheme,
            $name,
            $port,
            $crc32url,
            $timeAdded,
            $timeUpdated,
            $crawlPageLimit,
            $crawlMetaOnly,
            $status,
            $nsfw,
            $robots,
            $robotsPostfix,
        ]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of updated rows */
    public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {

        $query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');

        $query->execute([$robots, $timeUpdated, $hostId]);

        return $query->rowCount();
    }

    // Pages

    /** @return mixed Number of `hostPage` rows for the host */
    public function getTotalHostPages(int $hostId) {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');

        $query->execute([$hostId]);

        return $query->fetch()->total;
    }

    /**
     * Count the host's pages with HTTP code 200, no ban time and a known MIME.
     *
     * The result is cached for one hour when a cache backend is configured.
     *
     * @return mixed
     */
    public function getTotalHostPagesIndexed(int $hostId) {

        $cacheKey = sprintf('MySQL.getTotalHostPagesIndexed.%s', $hostId);

        if ($this->_memcached !== null) {

            $cached = $this->_memcached->get($cacheKey);

            // Memcached::get() yields false on a miss; a cached zero is a valid hit.
            if ($cached !== false) {

                return $cached;
            }
        }

        // No whitespace between COUNT and "(": MySQL rejects it unless the
        // IGNORE_SPACE SQL mode is enabled.
        $query = $this->_db->prepare(' SELECT COUNT(*) AS `total` FROM `hostPage`
WHERE `hostId` = ?
AND `httpCode` = 200
AND `timeBanned` IS NULL
AND `mime` IS NOT NULL ');

        $query->execute([$hostId]);

        $result = $query->fetch()->total;

        if ($this->_memcached !== null) {

            $this->_memcached->set($cacheKey, $result, time() + 3600);
        }

        return $result;
    }

    /** @return object|false The `hostPage` row for the host and `crc32uri`, or false when none */
    public function getHostPage(int $hostId, int $crc32uri) {

        $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

        $query->execute([$hostId, $crc32uri]);

        return $query->fetch();
    }

    /** @return array All `hostPage` rows of the host */
    public function getHostPages(int $hostId) {

        $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ?');

        $query->execute([$hostId]);

        return $query->fetchAll();
    }

    /**
     * Fetch the highest ranked pages joined with their host data.
     *
     * The result is cached per limit for one hour when a cache backend is
     * configured.
     *
     * @return array
     */
    public function getTopHostPages(int $limit = 100) {

        $cacheKey = sprintf('MySQL.getTopHostPages.%s', $limit);

        if ($this->_memcached !== null) {

            $cached = $this->_memcached->get($cacheKey);

            // Memcached::get() yields false on a miss; a cached empty list is a valid hit.
            if ($cached !== false) {

                return $cached;
            }
        }

        $query = $this->_db->query(" SELECT
`hostPage` . `hostId` ,
`hostPage` . `hostPageId` ,
`hostPage` . `uri` ,
`hostPage` . `rank` ,
`host` . `scheme` ,
`host` . `name` ,
`host` . `port`
FROM `hostPage`
JOIN `host` ON ( `hostPage` . `hostId` = `host` . `hostId` )
WHERE `host` . `status` = '1'
AND `hostPage` . `httpCode` = 200
AND `hostPage` . `rank` > 0
AND `hostPage` . `timeBanned` IS NULL
AND `hostPage` . `mime` IS NOT NULL
ORDER BY `rank` DESC
LIMIT " . (int) $limit);

        $result = $query->fetchAll();

        if ($this->_memcached !== null) {

            $this->_memcached->set($cacheKey, $result, time() + 3600);
        }

        return $result;
    }

    /** @return array `hostPage` rows that have an update time and no ban time */
    public function getHostPagesByIndexed() {

        $query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL');

        return $query->fetchAll();
    }

    /** @return array Up to $limit `hostPage` rows of the host, highest `hostPageId` first */
    public function getHostPagesByLimit(int $hostId, int $limit) {

        $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);

        $query->execute([$hostId]);

        return $query->fetchAll();
    }

    /** @return object|false The most recently added `hostPageDescription` row of the page */
    public function getLastPageDescription(int $hostPageId) {

        $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');

        $query->execute([$hostPageId]);

        return $query->fetch();
    }

    /** @return object|false Page fields joined with its host's scheme, name and port */
    public function getFoundHostPage(int $hostPageId) {

        $query = $this->_db->prepare(' SELECT `hostPage` . `hostPageId` ,
`hostPage` . `uri` ,
`hostPage` . `timeAdded` ,
`hostPage` . `timeUpdated` ,
`hostPage` . `mime` ,
`hostPage` . `size` ,
`host` . `scheme` ,
`host` . `name` ,
`host` . `port`
FROM `hostPage`
JOIN `host` ON ( `host` . `hostId` = `hostPage` . `hostId` )
WHERE `hostPage` . `hostPageId` = ?
LIMIT 1 ');

        $query->execute([$hostPageId]);

        return $query->fetch();
    }

    /** @return string|false Insert id of the new `hostPage` row */
    public function addHostPage(int $hostId,
                                int $crc32uri,
                                string $uri,
                                int $timeAdded,
                                mixed $timeUpdated = null,
                                mixed $timeBanned = null,
                                mixed $httpCode = null,
                                mixed $mime = null) {

        $query = $this->_db->prepare(' INSERT INTO `hostPage` ( `hostId` ,
`crc32uri` ,
`uri` ,
`timeAdded` ,
`timeUpdated` ,
`timeBanned` ,
`httpCode` ,
`mime` ) VALUES ( ? , ? , ? , ? , ? , ? , ? , ? ) ');

        $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of updated rows */
    public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {

        $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');

        $query->execute([$timeBanned, $hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of updated rows */
    public function updateHostPageMime(int $hostPageId, string $mime) {

        $query = $this->_db->prepare('UPDATE `hostPage` SET `mime` = ? WHERE `hostPageId` = ? LIMIT 1');

        $query->execute([$mime, $hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of updated rows */
    public function updateHostPageRank(int $hostPageId, int $rank) {

        $query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = ? WHERE `hostPageId` = ? LIMIT 1');

        $query->execute([$rank, $hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of deleted rows */
    public function deleteHostPage(int $hostPageId) {

        $query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostPageId` = ? LIMIT 1');

        $query->execute([$hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of deleted `hostPageDescription` rows of the page */
    public function deleteHostPageDescriptions(int $hostPageId) {

        $query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `hostPageId` = ?');

        $query->execute([$hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of inserted rows */
    public function addHostPageDescription(int $hostPageId,
                                           mixed $title,
                                           mixed $description,
                                           mixed $keywords,
                                           mixed $data,
                                           int $timeAdded) {

        $query = $this->_db->prepare(' INSERT INTO `hostPageDescription` ( `hostPageId` ,
`title` ,
`description` ,
`keywords` ,
`data` ,
`timeAdded`
) VALUES ( ? , ? , ? , ? , ? , ? ) ');

        $query->execute([
            $hostPageId,
            $title,
            $description,
            $keywords,
            $data,
            $timeAdded,
        ]);

        return $query->rowCount();
    }

    /** Insert a source-to-target page relation; INSERT IGNORE is used, so insert conflicts are ignored. */
    public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {

        $query = $this->_db->prepare('INSERT IGNORE `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`) VALUES (?, ?)');

        $query->execute([$hostPageIdSource, $hostPageIdTarget]);
    }

    /** @return int Number of deleted relations where the page is the source or the target */
    public function deleteHostPageToHostPage(int $hostPageId) {

        $query = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?');

        $query->execute([$hostPageId, $hostPageId]);

        return $query->rowCount();
    }

    /** @return mixed Number of relations to the target page whose source page belongs to a different host */
    public function getTotalExternalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {

        // No whitespace between COUNT and "(": MySQL rejects it unless the
        // IGNORE_SPACE SQL mode is enabled.
        $query = $this->_db->prepare(' SELECT COUNT(*) AS `total`
FROM `hostPageToHostPage`
JOIN `hostPage` AS `hostPageSource` ON ( `hostPageSource` . `hostPageId` = `hostPageToHostPage` . `hostPageIdSource` )
JOIN `hostPage` AS `hostPageTarget` ON ( `hostPageTarget` . `hostPageId` = `hostPageToHostPage` . `hostPageIdTarget` )
WHERE `hostPageToHostPage` . `hostPageIdTarget` = ?
AND `hostPageSource` . `hostId` <> `hostPageTarget` . `hostId` ');

        $query->execute([$hostPageIdTarget]);

        return $query->fetch()->total;
    }

    /** @return mixed Number of relations pointing to the target page */
    public function getTotalHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget) {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ?');

        $query->execute([$hostPageIdTarget]);

        return $query->fetch()->total;
    }

    /** @return array Up to $limit relations pointing to the target page */
    public function getHostPageIdSourcesByHostPageIdTarget(int $hostPageIdTarget, int $limit = 1000) {

        $query = $this->_db->prepare('SELECT * FROM `hostPageToHostPage` WHERE `hostPageIdTarget` = ? LIMIT ' . (int) $limit);

        $query->execute([$hostPageIdTarget]);

        return $query->fetchAll();
    }

    /** @return string|false Insert id of the new `hostPageSnap` row */
    public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {

        $query = $this->_db->prepare(' INSERT INTO `hostPageSnap` ( `hostPageId` ,
`crc32data` ,
`timeAdded` ) VALUES ( ? , ? , ? ) ');

        $query->execute([$hostPageId, $crc32data, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of updated rows */
    public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {

        $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

        $query->execute([$value, $hostPageSnapId]);

        return $query->rowCount();
    }

    /** @return int Number of updated rows */
    public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {

        $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

        $query->execute([$value, $hostPageSnapId]);

        return $query->rowCount();
    }

    /** @return int Number of deleted rows */
    public function deleteHostPageSnap(int $hostPageSnapId) {

        $query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');

        $query->execute([$hostPageSnapId]);

        return $query->rowCount();
    }

    /** @return mixed Number of the page's snaps matching either storage flag */
    public function getTotalHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true) {

        $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? OR `storageMega` = ?)');

        // execute() sends every value as a string, so false would arrive as
        // ''; cast to int so the flags are sent as 1 / 0.
        $query->execute([$hostPageId, (int) $storageLocal, (int) $storageMega]);

        return $query->fetch()->total;
    }

    /**
     * Fetch the page's snaps filtered by the storage flags, newest first.
     *
     * @param string $condition 'OR' joins the two flag tests with OR; any
     *                          other value joins them with AND
     *
     * @return array
     */
    public function getHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true, string $condition = 'OR') {

        $operator = $condition === 'OR' ? 'OR' : 'AND';

        $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? ' . $operator . ' `storageMega` = ?) ORDER BY `timeAdded` DESC');

        // execute() sends every value as a string, so false would arrive as
        // ''; cast to int so the flags are sent as 1 / 0.
        $query->execute([$hostPageId, (int) $storageLocal, (int) $storageMega]);

        return $query->fetchAll();
    }

    /** @return object|false The `hostPageSnap` row, or false when none */
    public function getHostPageSnap(int $hostPageSnapId) {

        $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');

        $query->execute([$hostPageSnapId]);

        return $query->fetch();
    }

    /** @return object|false The page's snap with the given `crc32data`, or false when none */
    public function findHostPageSnap(int $hostPageId, int $crc32data) {

        $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');

        $query->execute([$hostPageId, $crc32data]);

        return $query->fetch();
    }

    /** @return string|false Insert id of the new `hostPageSnapDownload` row */
    public function addHostPageSnapDownload(int $hostPageSnapId, string $crc32ip, int $timeAdded) {

        $query = $this->_db->prepare(' INSERT INTO `hostPageSnapDownload` ( `hostPageSnapId` ,
`crc32ip` ,
`timeAdded` ) VALUES ( ? , ? , ? ) ');

        $query->execute([$hostPageSnapId, $crc32ip, $timeAdded]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of updated rows */
    public function updateHostPageSnapDownload(int $hostPageSnapDownloadId, string $storage, int $size, mixed $httpCode = null) {

        $query = $this->_db->prepare('UPDATE `hostPageSnapDownload` SET `storage` = ?, `size` = ?, `httpCode` = ? WHERE `hostPageSnapDownloadId` = ? LIMIT 1');

        $query->execute([$storage, $size, $httpCode, $hostPageSnapDownloadId]);

        return $query->rowCount();
    }

    /**
     * Delete every download record of the snap.
     *
     * No LIMIT here, in line with the other bulk deletes in this class: a
     * limit of one would leave the remaining download rows behind.
     *
     * @return int Number of deleted rows
     */
    public function deleteHostPageSnapDownloads(int $hostPageSnapId) {

        $query = $this->_db->prepare('DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapId` = ?');

        $query->execute([$hostPageSnapId]);

        return $query->rowCount();
    }

    /**
     * Sum the `size` of the downloads recorded for an IP checksum with
     * `timeAdded` below $timeOffset.
     *
     * @return mixed The sum as returned by the driver; SQL SUM yields NULL
     *               when no row matches
     */
    public function findHostPageSnapDownloadsTotalSize(int $crc32ip, int $timeOffset) {

        // No whitespace between SUM and "(": MySQL rejects it unless the
        // IGNORE_SPACE SQL mode is enabled.
        $query = $this->_db->prepare(' SELECT SUM(`size`) AS `size` FROM `hostPageSnapDownload`
WHERE `crc32ip` = ? AND `timeAdded` < ? ');

        $query->execute([$crc32ip, $timeOffset]);

        return $query->fetch()->size;
    }

    /** Insert one `hostPageDom` row for the page. */
    public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {

        $query = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?');

        $query->execute([$hostPageId, $timeAdded, $selector, $value]);
    }

    /** @return int Number of deleted `hostPageDom` rows of the page */
    public function deleteHostPageDoms(int $hostPageId) {

        $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');

        $query->execute([$hostPageId]);

        return $query->rowCount();
    }

    /** @return int Number of deleted `hostPageDom` rows with `timeAdded` below $timeOffset */
    public function deleteHostPageDomsByTimeAdded(int $timeOffset) {

        $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `timeAdded` < ' . (int) $timeOffset);

        $query->execute();

        return $query->rowCount();
    }

    /** Remove every row from `hostPageDom`. */
    public function truncateHostPageDom() {

        $this->_db->query('TRUNCATE `hostPageDom`');
    }

    // Cleaner tools

    /** @return array Up to $limit hosts never updated or updated before $timeFrom, whose status is not 0 */
    public function getCleanerQueue(int $limit, int $timeFrom) {

        $query = $this->_db->prepare(' SELECT * FROM `host`
WHERE ( `timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host` . `status` <> ?
ORDER BY `hostId`
LIMIT ' . (int) $limit);

        $query->execute([$timeFrom, 0]);

        return $query->fetchAll();
    }

    /** @return array `hostPage` rows that have a ban time */
    public function getHostPagesBanned() {

        $query = $this->_db->query('SELECT * FROM `hostPage` WHERE `timeBanned` IS NOT NULL');

        return $query->fetchAll();
    }

    /** @return int Number of pages whose ban time was below $timeOffset and has been cleared */
    public function resetBannedHostPages(int $timeOffset) {

        $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);

        $query->execute();

        return $query->rowCount();
    }

    /** @return int Number of deleted `hostPageDescription` rows with `timeAdded` below $timeOffset */
    public function deleteHostPageDescriptionsByTimeAdded(int $timeOffset) {

        $query = $this->_db->prepare('DELETE FROM `hostPageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);

        $query->execute();

        return $query->rowCount();
    }

    /** @return string|false Insert id of the new `logCleaner` row */
    public function addCleanerLog(int $timeAdded,
                                  int $hostsTotal,
                                  int $hostsUpdated,
                                  int $hostPagesDeleted,
                                  int $hostPagesDescriptionsDeleted,
                                  int $hostPagesDomsDeleted,
                                  int $hostPagesSnapDeleted,
                                  int $hostPagesToHostPageDeleted,
                                  int $hostPagesBansRemoved,
                                  int $manifestsTotal,
                                  int $manifestsDeleted,
                                  int $logsCleanerDeleted,
                                  int $logsCrawlerDeleted,
                                  int $httpRequestsTotal,
                                  int $httpRequestsSizeTotal,
                                  int $httpDownloadSizeTotal,
                                  float $httpRequestsTimeTotal,
                                  float $executionTimeTotal) {

        $query = $this->_db->prepare(' INSERT INTO `logCleaner` ( `timeAdded` ,
`hostsTotal` ,
`hostsUpdated` ,
`hostPagesDeleted` ,
`hostPagesDescriptionsDeleted` ,
`hostPagesDomsDeleted` ,
`hostPagesSnapDeleted` ,
`hostPagesToHostPageDeleted` ,
`hostPagesBansRemoved` ,
`manifestsTotal` ,
`manifestsDeleted` ,
`logsCleanerDeleted` ,
`logsCrawlerDeleted` ,
`httpRequestsTotal` ,
`httpRequestsSizeTotal` ,
`httpDownloadSizeTotal` ,
`httpRequestsTimeTotal` ,
`executionTimeTotal` ) VALUES ( ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? ) ');

        $query->execute([
            $timeAdded,
            $hostsTotal,
            $hostsUpdated,
            $hostPagesDeleted,
            $hostPagesDescriptionsDeleted,
            $hostPagesDomsDeleted,
            $hostPagesSnapDeleted,
            $hostPagesToHostPageDeleted,
            $hostPagesBansRemoved,
            $manifestsTotal,
            $manifestsDeleted,
            $logsCleanerDeleted,
            $logsCrawlerDeleted,
            $httpRequestsTotal,
            $httpRequestsSizeTotal,
            $httpDownloadSizeTotal,
            $httpRequestsTimeTotal,
            $executionTimeTotal,
        ]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of deleted `logCleaner` rows with `timeAdded` below $timeOffset */
    public function deleteLogCleaner(int $timeOffset) {

        $query = $this->_db->prepare('DELETE FROM `logCleaner` WHERE `timeAdded` < ' . (int) $timeOffset);

        $query->execute();

        return $query->rowCount();
    }

    // Crawl tools

    /**
     * Count the pages waiting in the crawl queue.
     *
     * A page qualifies when it was never updated, was updated before
     * $hostPageTimeFrom, or has URI '/' and was updated before
     * $hostPageHomeTimeFrom; its host status must not be 0 and the page must
     * have no ban time.
     *
     * @return mixed
     */
    public function getHostPageCrawlQueueTotal(int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {

        $query = $this->_db->prepare(" SELECT COUNT(*) AS `total`
FROM `hostPage`
JOIN `host` ON ( `host` . `hostId` = `hostPage` . `hostId` )
WHERE ( `hostPage` . `timeUpdated` IS NULL OR `hostPage` . `timeUpdated` < ? OR ( `hostPage` . `uri` = '/' AND `hostPage` . `timeUpdated` < ? ))
AND `host` . `status` <> ?
AND `hostPage` . `timeBanned` IS NULL ");

        $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);

        return $query->fetch()->total;
    }

    /**
     * Fetch up to $limit pages from the crawl queue together with their host
     * settings, using the same filter as getHostPageCrawlQueueTotal().
     *
     * Rows are ordered by URI length ascending, then randomly.
     *
     * @return array
     */
    public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) {

        $query = $this->_db->prepare(" SELECT `hostPage`.`hostId`,
`hostPage` . `hostPageId` ,
`hostPage` . `uri` ,
`host` . `scheme` ,
`host` . `name` ,
`host` . `port` ,
`host` . `crawlPageLimit` ,
`host` . `crawlMetaOnly` ,
`host` . `robots` ,
`host` . `robotsPostfix`
FROM `hostPage`
JOIN `host` ON ( `host` . `hostId` = `hostPage` . `hostId` )
WHERE ( `hostPage` . `timeUpdated` IS NULL OR `hostPage` . `timeUpdated` < ? OR ( `hostPage` . `uri` = '/' AND `hostPage` . `timeUpdated` < ? ))
AND `host` . `status` <> ?
AND `hostPage` . `timeBanned` IS NULL
ORDER BY LENGTH ( `hostPage` . `uri` ) ASC , RAND ()
LIMIT " . (int) $limit);

        $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);

        return $query->fetchAll();
    }

    /** @return int Number of updated rows */
    public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) {

        $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ?, `size` = ? WHERE `hostPageId` = ? LIMIT 1');

        $query->execute([$timeUpdated, $httpCode, $size, $hostPageId]);

        return $query->rowCount();
    }

    /** @return array Up to $limit randomly ordered hosts never updated or updated before $timeFrom, whose status is not 0 */
    public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {

        $query = $this->_db->prepare(' SELECT * FROM `host`
WHERE ( `timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
ORDER BY RAND ()
LIMIT ' . (int) $limit);

        $query->execute([$timeFrom, 0]);

        return $query->fetchAll();
    }

    /** @return array Up to $limit randomly ordered manifests never updated or updated before $timeFrom, whose status is not 0 */
    public function getManifestCrawlQueue(int $limit, int $timeFrom) {

        $query = $this->_db->prepare(' SELECT * FROM `manifest`
WHERE ( `timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
ORDER BY RAND ()
LIMIT ' . (int) $limit);

        $query->execute([$timeFrom, 0]);

        return $query->fetchAll();
    }

    /** @return int Number of updated rows */
    public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {

        $query = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');

        $query->execute([$timeUpdated, $httpCode, $manifestId]);

        return $query->rowCount();
    }

    /** @return string|false Insert id of the new `logCrawler` row */
    public function addCrawlerLog(int $timeAdded,
                                  int $hostsAdded,
                                  int $hostPagesProcessed,
                                  int $hostPagesAdded,
                                  int $hostPagesSnapAdded,
                                  int $hostPagesBanned,
                                  int $manifestsProcessed,
                                  int $manifestsAdded,
                                  int $httpRequestsTotal,
                                  int $httpRequestsSizeTotal,
                                  int $httpDownloadSizeTotal,
                                  float $httpRequestsTimeTotal,
                                  float $executionTimeTotal) {

        $query = $this->_db->prepare(' INSERT INTO `logCrawler` ( `timeAdded` ,
`hostsAdded` ,
`hostPagesProcessed` ,
`hostPagesAdded` ,
`hostPagesSnapAdded` ,
`hostPagesBanned` ,
`manifestsProcessed` ,
`manifestsAdded` ,
`httpRequestsTotal` ,
`httpRequestsSizeTotal` ,
`httpDownloadSizeTotal` ,
`httpRequestsTimeTotal` ,
`executionTimeTotal` ) VALUES ( ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? ) ');

        $query->execute([
            $timeAdded,
            $hostsAdded,
            $hostPagesProcessed,
            $hostPagesAdded,
            $hostPagesSnapAdded,
            $hostPagesBanned,
            $manifestsProcessed,
            $manifestsAdded,
            $httpRequestsTotal,
            $httpRequestsSizeTotal,
            $httpDownloadSizeTotal,
            $httpRequestsTimeTotal,
            $executionTimeTotal,
        ]);

        return $this->_db->lastInsertId();
    }

    /** @return int Number of deleted `logCrawler` rows with `timeAdded` below $timeOffset */
    public function deleteLogCrawler(int $timeOffset) {

        $query = $this->_db->prepare('DELETE FROM `logCrawler` WHERE `timeAdded` < ' . (int) $timeOffset);

        $query->execute();

        return $query->rowCount();
    }

    /** Run OPTIMIZE TABLE on each table used by this class, in a fixed order. */
    public function optimize() {

        $tables = [
            'host',
            'hostPage',
            'hostPageDescription',
            'hostPageDom',
            'hostPageSnap',
            'hostPageSnapDownload',
            'hostPageToHostPage',
            'logCleaner',
            'logCrawler',
            'manifest',
        ];

        foreach ($tables as $table) {

            $this->_db->query('OPTIMIZE TABLE `' . $table . '`');
        }
    }
}