mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-09-04 18:21:53 +00:00
refactor manifest crawling
This commit is contained in:
parent
cb37c57bc4
commit
71724ae33f
@ -181,23 +181,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_PAGE_LIMIT', 20);
|
define('CRAWL_PAGE_LIMIT', 20);
|
||||||
|
|
||||||
/*
|
|
||||||
* Manifest (URI) processing limit in the crawler.php queue
|
|
||||||
*
|
|
||||||
* Used to collect distributed data index
|
|
||||||
* that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
|
|
||||||
*
|
|
||||||
 * This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
|
|
||||||
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
|
|
||||||
*
|
|
||||||
* Usually up to 20 pages per minute,
|
|
||||||
* to prevent websites overload by sending GET crawling requests
|
|
||||||
*
|
|
||||||
* Set 0 to disable
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CRAWL_MANIFEST_LIMIT', 10);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Renew page index by timing offset provided
|
* Renew page index by timing offset provided
|
||||||
*
|
*
|
||||||
@ -234,19 +217,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
|
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
|
||||||
|
|
||||||
/*
|
|
||||||
* Renew manifests index by timing offset provided
|
|
||||||
*
|
|
||||||
* This option works with CRAWL_MANIFEST_LIMIT step queue
|
|
||||||
*
|
|
||||||
 * Note that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
|
|
||||||
 * must be large enough to crawl all manifests collected in the DB index
|
|
||||||
*
|
|
||||||
 * or the crawler can get stuck in the queue
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
 * Only URL addresses matching this rule will be auto-crawled
|
 * Only URL addresses matching this rule will be auto-crawled
|
||||||
*
|
*
|
||||||
@ -386,17 +356,6 @@ define('CRAWL_MANIFEST', true);
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_MANIFEST_API_VERSION', 0.12);
|
define('CRAWL_MANIFEST_API_VERSION', 0.12);
|
||||||
|
|
||||||
/*
|
|
||||||
 * Set the default auto-crawl status for newly added manifests
|
|
||||||
*
|
|
||||||
* true - crawler autostart manifest indexer
|
|
||||||
* false - requires manual validation by the moderator in the DB `manifest`.`status` field
|
|
||||||
*
|
|
||||||
 * This option applies when CRAWL_MANIFEST is enabled
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
|
|
||||||
|
|
||||||
// Cleaner settings
|
// Cleaner settings
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -46,7 +46,6 @@ $httpRequestsTimeTotal = 0;
|
|||||||
|
|
||||||
$hostPagesProcessed = 0;
|
$hostPagesProcessed = 0;
|
||||||
$manifestsProcessed = 0;
|
$manifestsProcessed = 0;
|
||||||
$manifestsAdded = 0;
|
|
||||||
$hostPagesAdded = 0;
|
$hostPagesAdded = 0;
|
||||||
$hostsAdded = 0;
|
$hostsAdded = 0;
|
||||||
$hostPagesBanned = 0;
|
$hostPagesBanned = 0;
|
||||||
@ -65,14 +64,76 @@ try {
|
|||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process manifests crawl queue
|
// Process robots crawl queue
|
||||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
|
||||||
|
|
||||||
$db->beginTransaction();
|
// Update robots
|
||||||
|
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
try {
|
// Update curl stats
|
||||||
|
$httpRequestsTotal++;
|
||||||
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
|
// Sitemap provided in robots.txt
|
||||||
|
if (200 == $curl->getCode()) {
|
||||||
|
|
||||||
|
$hostRobots = $curl->getContent();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
$hostRobots = $host->robots;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update host index
|
||||||
|
$db->updateHostRobots($host->hostId, $hostRobots, time());
|
||||||
|
|
||||||
|
// Process sitemaps when enabled
|
||||||
|
if (CRAWL_SITEMAPS) {
|
||||||
|
|
||||||
|
// Look for custom sitemap URL served in robots.txt
|
||||||
|
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||||
|
|
||||||
|
if ($hostSitemapPath = $robots->getSitemap()) {
|
||||||
|
|
||||||
|
// Replace relative paths
|
||||||
|
$hostSitemapPath = trim($hostSitemapPath, '/');
|
||||||
|
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
|
||||||
|
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
|
||||||
|
|
||||||
|
// Set default path when not exists
|
||||||
|
} else {
|
||||||
|
|
||||||
|
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init sitemap data
|
||||||
|
$sitemap = new Sitemap($hostSitemapPath);
|
||||||
|
|
||||||
|
// Process collected sitemap links
|
||||||
|
foreach ($sitemap->getLinks() as $link => $attributes) {
|
||||||
|
|
||||||
|
// Parse formatted link
|
||||||
|
$linkURI = Parser::uri($link);
|
||||||
|
$linkHostURL = Parser::hostURL($link);
|
||||||
|
|
||||||
|
// Add host page
|
||||||
|
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
||||||
|
$linkHostURL->string == $host->url && // this host links only
|
||||||
|
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||||
|
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||||
|
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||||
|
|
||||||
|
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update manifest if available for this host
|
||||||
|
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
|
||||||
|
|
||||||
|
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
@ -80,9 +141,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
|
|||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update manifest index anyway, with the current time and http code
|
|
||||||
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
|
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Skip processing non 200 code
|
||||||
if (200 != $curl->getCode()) {
|
if (200 != $curl->getCode()) {
|
||||||
|
|
||||||
@ -203,7 +261,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
|
|||||||
}
|
}
|
||||||
|
|
||||||
$hostURL = $remoteManifestHost->scheme . '://' .
|
$hostURL = $remoteManifestHost->scheme . '://' .
|
||||||
$remoteManifestHost->name .
|
$remoteManifestHost->name .
|
||||||
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
||||||
|
|
||||||
// Validate formatted link
|
// Validate formatted link
|
||||||
@ -256,87 +314,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply changes
|
|
||||||
$db->commit();
|
|
||||||
|
|
||||||
// Process update errors
|
|
||||||
} catch (Exception $e) {
|
|
||||||
|
|
||||||
// Debug std
|
|
||||||
var_dump($e);
|
|
||||||
|
|
||||||
// Skip item
|
|
||||||
$db->rollBack();
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process robots crawl queue
|
|
||||||
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
|
|
||||||
|
|
||||||
// Get robots.txt
|
|
||||||
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
|
||||||
|
|
||||||
// Update curl stats
|
|
||||||
$httpRequestsTotal++;
|
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
||||||
|
|
||||||
// Sitemap provided in robots.txt
|
|
||||||
if (200 == $curl->getCode()) {
|
|
||||||
|
|
||||||
$hostRobots = $curl->getContent();
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
$hostRobots = $host->robots;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update host index
|
|
||||||
$db->updateHostRobots($host->hostId, $hostRobots, time());
|
|
||||||
|
|
||||||
// Process sitemaps when enabled
|
|
||||||
if (CRAWL_SITEMAPS) {
|
|
||||||
|
|
||||||
// Look for custom sitemap URL served in robots.txt
|
|
||||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
|
||||||
|
|
||||||
if ($hostSitemapPath = $robots->getSitemap()) {
|
|
||||||
|
|
||||||
// Replace relative paths
|
|
||||||
$hostSitemapPath = trim($hostSitemapPath, '/');
|
|
||||||
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
|
|
||||||
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
|
|
||||||
|
|
||||||
// Set default path when not exists
|
|
||||||
} else {
|
|
||||||
|
|
||||||
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Init sitemap data
|
|
||||||
$sitemap = new Sitemap($hostSitemapPath);
|
|
||||||
|
|
||||||
// Process collected sitemap links
|
|
||||||
foreach ($sitemap->getLinks() as $link => $attributes) {
|
|
||||||
|
|
||||||
// Parse formatted link
|
|
||||||
$linkURI = Parser::uri($link);
|
|
||||||
$linkHostURL = Parser::hostURL($link);
|
|
||||||
|
|
||||||
// Add host page
|
|
||||||
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
|
||||||
$linkHostURL->string == $host->url && // this host links only
|
|
||||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
|
||||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
|
||||||
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
|
||||||
|
|
||||||
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -720,9 +697,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
|
||||||
|
|
||||||
// Define variables
|
// Define variables
|
||||||
$metaDescription = null;
|
$metaDescription = null;
|
||||||
$metaKeywords = null;
|
$metaKeywords = null;
|
||||||
$metaYggoManifest = null;
|
$metaYggoManifestURL = null;
|
||||||
|
|
||||||
// Parse page content
|
// Parse page content
|
||||||
$dom = new DomDocument();
|
$dom = new DomDocument();
|
||||||
@ -782,7 +759,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
|
|
||||||
// Grab meta yggo:manifest link when available
|
// Grab meta yggo:manifest link when available
|
||||||
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
||||||
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
|
$metaYggoManifestURL = Filter::url(@$meta->getAttribute('content'));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -835,18 +812,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update manifest registry
|
// Update manifest registry
|
||||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
if (CRAWL_MANIFEST &&
|
||||||
|
!empty($metaYggoManifestURL) &&
|
||||||
|
filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
|
||||||
|
preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {
|
||||||
|
|
||||||
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
$manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
|
||||||
|
|
||||||
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
|
||||||
$db->addManifest($metaYggoManifestCRC32,
|
|
||||||
$metaYggoManifest,
|
|
||||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
|
||||||
time());
|
|
||||||
|
|
||||||
$manifestsAdded++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Begin page links collection
|
// Begin page links collection
|
||||||
@ -1236,7 +1207,6 @@ echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
|
|||||||
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
||||||
|
|
||||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
||||||
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
|
|
||||||
|
|
||||||
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
||||||
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
||||||
|
Binary file not shown.
@ -28,52 +28,6 @@ class MySQL {
|
|||||||
$this->_db->rollBack();
|
$this->_db->rollBack();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Manifest
|
|
||||||
/**
 * Count the rows stored in the `manifest` table
 *
 * @return mixed The `total` column of the COUNT(*) result row
 */
public function getTotalManifests() {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');
  $statement->execute();

  // Single aggregate row: read its aliased column
  $row = $statement->fetch();

  return $row->total;
}
|
|
||||||
|
|
||||||
/**
 * Fetch every row of the `manifest` table
 *
 * @return mixed Result of PDOStatement::fetchAll() for the full table
 */
public function getManifests() {

  $statement = $this->_db->prepare('SELECT * FROM `manifest`');
  $statement->execute();

  $rows = $statement->fetchAll();

  return $rows;
}
|
|
||||||
|
|
||||||
/**
 * Look up one manifest by the CRC32 checksum of its URL
 *
 * @param int $crc32url Value matched against the `crc32url` column
 *
 * @return mixed Result of PDOStatement::fetch() for the first matching row
 */
public function getManifest(int $crc32url) {

  $statement = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');

  $parameters = [$crc32url];
  $statement->execute($parameters);

  return $statement->fetch();
}
|
|
||||||
|
|
||||||
/**
 * Insert a new row into the `manifest` table
 *
 * @param int    $crc32url    Stored in `crc32url`
 * @param string $url         Stored in `url`
 * @param string $status      Stored in `status`
 * @param int    $timeAdded   Stored in `timeAdded`
 * @param mixed  $timeUpdated Stored in `timeUpdated` (null by default)
 *
 * @return mixed Last insert id reported by the connection
 */
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {

  $statement = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');

  // Values are bound in the same order as the column list above
  $values = [
    $crc32url,
    $url,
    $status,
    $timeAdded,
    $timeUpdated
  ];

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
|
||||||
|
|
||||||
/**
 * Delete a manifest row by its identifier
 *
 * @param int $manifestId Value matched against the `manifestId` column
 *
 * @return int Number of rows affected, as reported by rowCount()
 */
public function deleteManifest(int $manifestId) {

  $statement = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');

  $parameters = [$manifestId];
  $statement->execute($parameters);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
|
||||||
|
|
||||||
// Host
|
// Host
|
||||||
public function getAPIHosts(string $apiHostFields) {
|
public function getAPIHosts(string $apiHostFields) {
|
||||||
|
|
||||||
@ -175,7 +129,50 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pages
|
// Host settings
|
||||||
|
/**
 * Read a single setting value stored for a host
 *
 * @param int   $hostId Host identifier matched against `hostId`
 * @param mixed $key    Setting name matched against `key`, e.g. 'MANIFEST_URL'
 *
 * @return mixed The `value` column of the matching row, or false when no row exists
 */
public function getHostSetting(int $hostId, mixed $key) {

  // Settings are written to `hostSetting` by setHostSetting(), so read from
  // the same table (the previous query targeted `hostPage`)
  $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');

  $query->execute([$hostId, $key]);

  // Test the fetched row itself: rowCount() is not guaranteed by PDO for SELECT
  $row = $query->fetch();

  return $row ? $row->value : false;
}
|
||||||
|
|
||||||
|
/**
 * Read all settings stored for a host
 *
 * @param int $hostId Host identifier matched against `hostId`
 *
 * @return mixed Result of PDOStatement::fetchAll() for the host's setting rows
 */
public function getHostSettings(int $hostId) {

  // Settings are written to `hostSetting` by setHostSetting(), so read from
  // the same table (the previous query targeted `hostPage`)
  $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ?');

  $query->execute([$hostId]);

  return $query->fetchAll();
}
|
||||||
|
|
||||||
|
/**
 * Create a host setting, or update its value when it already exists
 *
 * @param int   $hostId      Host identifier stored in `hostId`
 * @param mixed $key         Setting name stored in `key`
 * @param mixed $value       Setting value stored in `value`
 * @param int   $timeAdded   Unix time for `timeAdded`; 0 (default) means "now"
 * @param int   $timeUpdated Unix time for `timeUpdated`; 0 (default) means "now"
 *
 * @return int Row count reported by the statement
 *             NOTE(review): for MySQL upserts this is usually 1 on insert,
 *             2 on update and 0 when the value is unchanged - confirm before
 *             using it as a counter
 */
public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {

  // prepare() is required here: query() runs the SQL immediately and cannot
  // bind the `?` placeholders. The statement also needs the comma after
  // `hostId` = ? and the closing backtick on `timeAdded`.
  // NOTE(review): ON DUPLICATE KEY UPDATE presumably relies on a unique
  // index over (`hostId`, `key`) - confirm against the schema
  $query = $this->_db->prepare('INSERT INTO `hostSetting` SET `hostId`    = ?,
                                                              `key`       = ?,
                                                              `value`     = ?,
                                                              `timeAdded` = ?

                                                              ON DUPLICATE KEY UPDATE `value`       = ?,
                                                                                      `timeUpdated` = ?');

  // Use one timestamp for both defaults
  $now = time();

  $query->execute(
    [
      $hostId,
      $key,
      $value,
      $timeAdded > 0 ? $timeAdded : $now,
      $value,
      $timeUpdated > 0 ? $timeUpdated : $now
    ]
  );

  return $query->rowCount();
}
|
||||||
|
|
||||||
|
/**
 * Delete a host setting by its identifier
 *
 * @param int $hostSettingId Value matched against the `hostSettingId` column
 *
 * @return int Number of rows affected, as reported by rowCount()
 */
public function deleteHostSetting(int $hostSettingId) {

  // prepare() is required here: query() runs the SQL immediately and cannot
  // bind the `?` placeholder passed to execute()
  $query = $this->_db->prepare('DELETE FROM `hostSetting` WHERE `hostSettingId` = ?');

  $query->execute([$hostSettingId]);

  return $query->rowCount();
}
|
||||||
|
|
||||||
|
// Host pages
|
||||||
public function getTotalHostPages(int $hostId) {
|
public function getTotalHostPages(int $hostId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
|
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
|
||||||
@ -696,30 +693,6 @@ class MySQL {
|
|||||||
return (object) $result;
|
return (object) $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Select a random batch of manifests that are due for crawling
 *
 * A row qualifies when `timeUpdated` is NULL or older than $timeFrom,
 * and its `status` differs from 0.
 *
 * @param int $limit    Maximum number of rows (cast to int and appended to LIMIT)
 * @param int $timeFrom Threshold compared against `timeUpdated`
 *
 * @return mixed Result of PDOStatement::fetchAll() for the selected rows
 */
public function getManifestCrawlQueue(int $limit, int $timeFrom) {

  $sql = 'SELECT * FROM `manifest`

                   WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?

                   ORDER BY RAND()

                   LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  // Second placeholder is the excluded status value
  $parameters = [$timeFrom, 0];
  $statement->execute($parameters);

  return $statement->fetchAll();
}
|
|
||||||
|
|
||||||
/**
 * Record the outcome of a manifest crawl attempt
 *
 * @param int $manifestId  Row matched against the `manifestId` column
 * @param int $timeUpdated Value written to `timeUpdated`
 * @param int $httpCode    Value written to `httpCode`
 *
 * @return int Number of rows affected, as reported by rowCount()
 */
public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {

  $statement = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');

  // Bound in placeholder order: SET values first, row filter last
  $parameters = [
    $timeUpdated,
    $httpCode,
    $manifestId
  ];

  $statement->execute($parameters);

  $updated = $statement->rowCount();

  return $updated;
}
|
|
||||||
|
|
||||||
public function optimize() {
|
public function optimize() {
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `host`');
|
$this->_db->query('OPTIMIZE TABLE `host`');
|
||||||
@ -730,7 +703,5 @@ class MySQL {
|
|||||||
$this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `manifest`');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user