|
|
@ -3,18 +3,13 @@ |
|
|
|
class MySQL { |
|
|
|
class MySQL { |
|
|
|
|
|
|
|
|
|
|
|
private PDO $_db; |
|
|
|
private PDO $_db; |
|
|
|
private Memcached $_memcached; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public function __construct(string $host, int $port, string $database, string $username, string $password, Memcached $memcached = null) { |
|
|
|
public function __construct(string $host, int $port, string $database, string $username, string $password) { |
|
|
|
|
|
|
|
|
|
|
|
$this->_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); |
|
|
|
$this->_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); |
|
|
|
$this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); |
|
|
|
|
|
|
|
|
|
|
|
if ($memcached) { |
|
|
|
|
|
|
|
$this->_memcached = $memcached; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// System |
|
|
|
// System |
|
|
@ -96,7 +91,22 @@ class MySQL { |
|
|
|
return $query->fetchAll(); |
|
|
|
return $query->fetchAll(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public function getHost(int $crc32url) { |
|
|
|
public function getHost(int $hostId) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare("SELECT *, |
|
|
|
|
|
|
|
IF (`port` IS NOT NULL, |
|
|
|
|
|
|
|
CONCAT(`scheme`, '://', `name`, ':', `port`), |
|
|
|
|
|
|
|
CONCAT(`scheme`, '://', `name`) |
|
|
|
|
|
|
|
) AS `url` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM `host` WHERE `hostId` = ? LIMIT 1"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$query->execute([$hostId]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return $query->fetch(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public function getHostByCRC32URL(int $crc32url) { |
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1'); |
|
|
|
$query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1'); |
|
|
|
|
|
|
|
|
|
|
@ -204,53 +214,57 @@ class MySQL { |
|
|
|
|
|
|
|
|
|
|
|
public function getTopHostPages(int $limit = 100) { |
|
|
|
public function getTopHostPages(int $limit = 100) { |
|
|
|
|
|
|
|
|
|
|
|
if ($this->_memcached) { |
|
|
|
// Get ID (to prevent memory over usage) |
|
|
|
|
|
|
|
$query = $this->_db->query("SELECT `hostPage`.`hostPageId` |
|
|
|
|
|
|
|
|
|
|
|
if ($result = $this->_memcached->get(sprintf('MySQL.getTopHostPages.%s', $limit))) { |
|
|
|
FROM `hostPage` |
|
|
|
|
|
|
|
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) |
|
|
|
|
|
|
|
|
|
|
|
return $result; |
|
|
|
WHERE `host`.`status` = '1' |
|
|
|
} |
|
|
|
AND `hostPage`.`httpCode` = 200 |
|
|
|
} |
|
|
|
AND `hostPage`.`rank` > 0 |
|
|
|
|
|
|
|
AND `hostPage`.`timeBanned` IS NULL |
|
|
|
|
|
|
|
AND `hostPage`.`mime` IS NOT NULL |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ORDER BY `rank` DESC |
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->query(" SELECT |
|
|
|
LIMIT " . (int) $limit); |
|
|
|
|
|
|
|
|
|
|
|
`hostPage`.`hostId`, |
|
|
|
// Get required page details |
|
|
|
`hostPage`.`hostPageId`, |
|
|
|
foreach ($query->fetchAll() as $top) { |
|
|
|
`hostPage`.`uri`, |
|
|
|
|
|
|
|
`hostPage`.`rank`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
`host`.`scheme`, |
|
|
|
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`, |
|
|
|
`host`.`name`, |
|
|
|
`hostPage`.`hostPageId`, |
|
|
|
`host`.`port`, |
|
|
|
`hostPage`.`uri`, |
|
|
|
|
|
|
|
`hostPage`.`rank`, |
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
`host`.`scheme`, |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), |
|
|
|
`host`.`name`, |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`) |
|
|
|
`host`.`port`, |
|
|
|
) AS `hostURL`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`) |
|
|
|
) AS `hostPageURL` |
|
|
|
) AS `hostURL`, |
|
|
|
|
|
|
|
|
|
|
|
FROM `hostPage` |
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) |
|
|
|
|
|
|
|
) AS `hostPageURL` |
|
|
|
|
|
|
|
|
|
|
|
WHERE `host`.`status` = '1' |
|
|
|
FROM `hostPage` |
|
|
|
AND `hostPage`.`httpCode` = 200 |
|
|
|
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) |
|
|
|
AND `hostPage`.`rank` > 0 |
|
|
|
|
|
|
|
AND `hostPage`.`timeBanned` IS NULL |
|
|
|
|
|
|
|
AND `hostPage`.`mime` IS NOT NULL |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ORDER BY `rank` DESC |
|
|
|
WHERE `hostPage`.`hostPageId` = ? |
|
|
|
|
|
|
|
|
|
|
|
LIMIT " . (int) $limit); |
|
|
|
LIMIT 1"); |
|
|
|
|
|
|
|
|
|
|
|
$result = $query->fetchAll(); |
|
|
|
$query->execute([$top->hostPageId]); |
|
|
|
|
|
|
|
|
|
|
|
if ($this->_memcached) { |
|
|
|
if ($query->rowCount()) { |
|
|
|
|
|
|
|
|
|
|
|
$this->_memcached->set(sprintf('MySQL.getTopHostPages.%s', $limit), $result, time() + 3600); |
|
|
|
$result[] = $query->fetch(); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return $result; |
|
|
|
return $result; |
|
|
@ -582,20 +596,28 @@ class MySQL { |
|
|
|
// Cleaner tools |
|
|
|
// Cleaner tools |
|
|
|
public function getCleanerQueue(int $limit, int $timeFrom) { |
|
|
|
public function getCleanerQueue(int $limit, int $timeFrom) { |
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, |
|
|
|
$result = []; |
|
|
|
CONCAT(`scheme`, '://', `name`, ':', `port`), |
|
|
|
|
|
|
|
CONCAT(`scheme`, '://', `name`) |
|
|
|
// Get ID (to prevent memory over usage) |
|
|
|
) AS `hostURL` FROM `host` |
|
|
|
$query = $this->_db->prepare("SELECT `hostId` |
|
|
|
|
|
|
|
|
|
|
|
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? |
|
|
|
FROM `host` |
|
|
|
|
|
|
|
|
|
|
|
ORDER BY `hostId` |
|
|
|
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? |
|
|
|
|
|
|
|
|
|
|
|
LIMIT " . (int) $limit); |
|
|
|
ORDER BY `hostId` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LIMIT " . (int) $limit); |
|
|
|
|
|
|
|
|
|
|
|
$query->execute([$timeFrom, 0]); |
|
|
|
$query->execute([$timeFrom, 0]); |
|
|
|
|
|
|
|
|
|
|
|
return $query->fetchAll(); |
|
|
|
// Get required page details |
|
|
|
|
|
|
|
foreach ($query->fetchAll() as $host) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$result[] = $this->getHost($host->hostId); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return (object) $result; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public function getHostPagesBanned() { |
|
|
|
public function getHostPagesBanned() { |
|
|
@ -702,7 +724,13 @@ class MySQL { |
|
|
|
FROM `hostPage` |
|
|
|
FROM `hostPage` |
|
|
|
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) |
|
|
|
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) |
|
|
|
|
|
|
|
|
|
|
|
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) |
|
|
|
WHERE ( |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` IS NULL OR |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` < ? OR ( |
|
|
|
|
|
|
|
`hostPage`.`uri` = '/' AND |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` < ? |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
AND `host`.`status` <> ? |
|
|
|
AND `host`.`status` <> ? |
|
|
|
AND `hostPage`.`timeBanned` IS NULL"); |
|
|
|
AND `hostPage`.`timeBanned` IS NULL"); |
|
|
@ -714,32 +742,22 @@ class MySQL { |
|
|
|
|
|
|
|
|
|
|
|
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { |
|
|
|
public function getHostPageCrawlQueue(int $limit, int $hostPageTimeFrom, int $hostPageHomeTimeFrom) { |
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`, |
|
|
|
$result = []; |
|
|
|
`hostPage`.`hostPageId`, |
|
|
|
|
|
|
|
`hostPage`.`uri`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
`host`.`scheme`, |
|
|
|
|
|
|
|
`host`.`name`, |
|
|
|
|
|
|
|
`host`.`port`, |
|
|
|
|
|
|
|
`host`.`crawlPageLimit`, |
|
|
|
|
|
|
|
`host`.`crawlMetaOnly`, |
|
|
|
|
|
|
|
`host`.`robots`, |
|
|
|
|
|
|
|
`host`.`robotsPostfix`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`) |
|
|
|
|
|
|
|
) AS `hostURL`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
// Get ID (to prevent memory over usage) |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), |
|
|
|
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId` |
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) |
|
|
|
|
|
|
|
) AS `hostPageURL` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM `hostPage` |
|
|
|
FROM `hostPage` |
|
|
|
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) |
|
|
|
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) |
|
|
|
|
|
|
|
|
|
|
|
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) |
|
|
|
WHERE ( |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` IS NULL OR |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` < ? |
|
|
|
|
|
|
|
OR ( |
|
|
|
|
|
|
|
`hostPage`.`uri` = '/' AND |
|
|
|
|
|
|
|
`hostPage`.`timeUpdated` < ? |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
AND `host`.`status` <> ? |
|
|
|
AND `host`.`status` <> ? |
|
|
|
AND `hostPage`.`timeBanned` IS NULL |
|
|
|
AND `hostPage`.`timeBanned` IS NULL |
|
|
@ -750,7 +768,45 @@ class MySQL { |
|
|
|
|
|
|
|
|
|
|
|
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); |
|
|
|
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); |
|
|
|
|
|
|
|
|
|
|
|
return $query->fetchAll(); |
|
|
|
// Get required page details |
|
|
|
|
|
|
|
foreach ($query->fetchAll() as $queue) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`, |
|
|
|
|
|
|
|
`hostPage`.`hostPageId`, |
|
|
|
|
|
|
|
`hostPage`.`uri`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
`host`.`scheme`, |
|
|
|
|
|
|
|
`host`.`name`, |
|
|
|
|
|
|
|
`host`.`port`, |
|
|
|
|
|
|
|
`host`.`crawlPageLimit`, |
|
|
|
|
|
|
|
`host`.`crawlMetaOnly`, |
|
|
|
|
|
|
|
`host`.`robots`, |
|
|
|
|
|
|
|
`host`.`robotsPostfix`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`) |
|
|
|
|
|
|
|
) AS `hostURL`, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IF (`host`.`port` IS NOT NULL, |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), |
|
|
|
|
|
|
|
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`) |
|
|
|
|
|
|
|
) AS `hostPageURL` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM `hostPage` |
|
|
|
|
|
|
|
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
WHERE `hostPage`.`hostPageId` = ? LIMIT 1"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$query->execute([$queue->hostPageId]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($query->rowCount()) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$result[] = $query->fetch(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return (object) $result; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) { |
|
|
|
public function updateHostPageCrawlQueue(int $hostPageId, int $timeUpdated, int $httpCode, int $size) { |
|
|
@ -764,22 +820,28 @@ class MySQL { |
|
|
|
|
|
|
|
|
|
|
|
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) { |
|
|
|
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) { |
|
|
|
|
|
|
|
|
|
|
|
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL, |
|
|
|
$result = []; |
|
|
|
CONCAT(`scheme`, '://', `name`, ':', `port`), |
|
|
|
|
|
|
|
CONCAT(`scheme`, '://', `name`) |
|
|
|
// Get ID (to prevent memory over usage) |
|
|
|
) AS `hostURL` |
|
|
|
$query = $this->_db->prepare("SELECT `hostId` |
|
|
|
|
|
|
|
|
|
|
|
FROM `host` |
|
|
|
FROM `host` |
|
|
|
|
|
|
|
|
|
|
|
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? |
|
|
|
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? |
|
|
|
|
|
|
|
|
|
|
|
ORDER BY RAND() |
|
|
|
ORDER BY RAND() |
|
|
|
|
|
|
|
|
|
|
|
LIMIT " . (int) $limit); |
|
|
|
LIMIT " . (int) $limit); |
|
|
|
|
|
|
|
|
|
|
|
$query->execute([$timeFrom, 0]); |
|
|
|
$query->execute([$timeFrom, 0]); |
|
|
|
|
|
|
|
|
|
|
|
return $query->fetchAll(); |
|
|
|
// Get required page details |
|
|
|
|
|
|
|
foreach ($query->fetchAll() as $host) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$result[] = $this->getHost($host->hostId); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return (object) $result; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public function getManifestCrawlQueue(int $limit, int $timeFrom) { |
|
|
|
public function getManifestCrawlQueue(int $limit, int $timeFrom) { |
|
|
|