refactor manifest crawling

This commit is contained in:
ghost 2023-08-04 09:00:03 +03:00
parent cb37c57bc4
commit 71724ae33f
4 changed files with 121 additions and 221 deletions

View File

@ -181,23 +181,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/
define('CRAWL_PAGE_LIMIT', 20);
/*
* Manifest (URI) processing limit in the crawler.php queue
*
* Used to collect distributed data index
* that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
*
* This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to avoid overloading websites with GET crawling requests
*
* Set 0 to disable
*
*/
define('CRAWL_MANIFEST_LIMIT', 10);
/*
* Renew page index by timing offset provided
*
@ -234,19 +217,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
*/
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
/*
* Renew manifests index by timing offset provided
*
* This option works together with the CRAWL_MANIFEST_LIMIT step queue
*
* Note that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
* must be large enough to crawl all manifests collected in the DB index,
*
* otherwise the crawler can get stuck in the queue
*
*/
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
/*
* Only URL addresses that match this rule will be auto-crawled
*
@ -386,17 +356,6 @@ define('CRAWL_MANIFEST', true);
*/
define('CRAWL_MANIFEST_API_VERSION', 0.12);
/*
* Set the default auto-crawl status for newly added manifests
*
* true - crawler autostarts the manifest indexer
* false - requires manual validation by the moderator in the DB `manifest`.`status` field
*
* This option applies when CRAWL_MANIFEST is enabled
*
*/
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
// Cleaner settings
/*

View File

@ -46,7 +46,6 @@ $httpRequestsTimeTotal = 0;
$hostPagesProcessed = 0;
$manifestsProcessed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
@ -65,14 +64,76 @@ try {
exit;
}
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
$db->beginTransaction();
// Update robots
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
try {
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
// Sitemap provided in robots.txt
if (200 == $curl->getCode()) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = $host->robots;
}
// Update host index
$db->updateHostRobots($host->hostId, $hostRobots, time());
// Process sitemaps when enabled
if (CRAWL_SITEMAPS) {
// Look for custom sitemap URL served in robots.txt
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
if ($hostSitemapPath = $robots->getSitemap()) {
// Replace relative paths
$hostSitemapPath = trim($hostSitemapPath, '/');
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
// Set default path when not exists
} else {
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
}
// Init sitemap data
$sitemap = new Sitemap($hostSitemapPath);
// Process collected sitemap links
foreach ($sitemap->getLinks() as $link => $attributes) {
// Parse formatted link
$linkURI = Parser::uri($link);
$linkHostURL = Parser::hostURL($link);
// Add host page
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
$linkHostURL->string == $host->url && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
}
}
}
// Update manifest if available for this host
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
@ -80,9 +141,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update manifest index anyway, with the current time and http code
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
// Skip processing non 200 code
if (200 != $curl->getCode()) {
@ -203,7 +261,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
}
$hostURL = $remoteManifestHost->scheme . '://' .
$remoteManifestHost->name .
$remoteManifestHost->name .
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
// Validate formatted link
@ -256,87 +314,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
}
}
}
// Apply changes
$db->commit();
// Process update errors
} catch (Exception $e) {
// Debug std
var_dump($e);
// Skip item
$db->rollBack();
continue;
}
}
// Process robots crawl queue:
// refresh robots.txt for every queued host and, when CRAWL_SITEMAPS is enabled,
// register new pages found in the host sitemap
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // Get robots.txt
  $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Use the fresh robots.txt on HTTP 200, otherwise keep the rules already stored for this host
  if (200 == $curl->getCode()) {

    $hostRobots = $curl->getContent();

  } else {

    $hostRobots = $host->robots;
  }

  // Update host index (also stores the current time for this queue step)
  $db->updateHostRobots($host->hostId, $hostRobots, time());

  // Process sitemaps when enabled
  if (CRAWL_SITEMAPS) {

    // Build effective rules: host rules (or defaults) followed by the postfix rules
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    // Look for custom sitemap URL served in robots.txt
    if ($hostSitemapPath = $robots->getSitemap()) {

      // Reduce the value to a host-relative path, then rebuild the absolute URL
      // with exactly one separating slash. Joining without the slash turned
      // relative values like "/sitemap.xml" into "<host url>sitemap.xml".
      $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
      $hostSitemapPath = sprintf('%s/%s', $host->url, trim($hostSitemapPath, '/'));

    // Set default path when not exists
    } else {

      $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
    }

    // Init sitemap data
    $sitemap = new Sitemap($hostSitemapPath);

    // Process collected sitemap links
    foreach ($sitemap->getLinks() as $link => $attributes) {

      // Parse formatted link
      $linkURI     = Parser::uri($link);
      $linkHostURL = Parser::hostURL($link);

      // Add host page
      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
          $linkHostURL->string == $host->url && // this host links only
          $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
          !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists

        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
      }
    }
  }
}
@ -720,9 +697,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
// Define variables
$metaDescription = null;
$metaKeywords = null;
$metaYggoManifest = null;
$metaDescription = null;
$metaKeywords = null;
$metaYggoManifestURL = null;
// Parse page content
$dom = new DomDocument();
@ -782,7 +759,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Grab meta yggo:manifest link when available
if (@$meta->getAttribute('name') == 'yggo:manifest') {
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
$metaYggoManifestURL = Filter::url(@$meta->getAttribute('content'));
}
}
@ -835,18 +812,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
}
// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
if (CRAWL_MANIFEST &&
!empty($metaYggoManifestURL) &&
filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
if (!$db->getManifest($metaYggoManifestCRC32)) {
$db->addManifest($metaYggoManifestCRC32,
$metaYggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
$manifestsAdded++;
}
$manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
}
// Begin page links collection
@ -1236,7 +1207,6 @@ echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;

Binary file not shown.

View File

@ -28,52 +28,6 @@ class MySQL {
$this->_db->rollBack();
}
// Manifest
/**
 * Count all rows of the `manifest` table.
 *
 * @return mixed Value of the `total` column produced by the COUNT(*) query
 */
public function getTotalManifests() {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');

  $statement->execute();

  $row = $statement->fetch();

  return $row->total;
}
/**
 * Fetch every row of the `manifest` table.
 *
 * @return mixed Result of PDOStatement::fetchAll() for the query
 */
public function getManifests() {

  $statement = $this->_db->prepare('SELECT * FROM `manifest`');

  $statement->execute();

  $rows = $statement->fetchAll();

  return $rows;
}
/**
 * Find a single manifest row by the CRC32 checksum of its URL.
 *
 * @param int $crc32url Value matched against the `crc32url` column
 *
 * @return mixed Result of PDOStatement::fetch() for the query
 */
public function getManifest(int $crc32url) {

  $statement = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');

  $params = [$crc32url];

  $statement->execute($params);

  return $statement->fetch();
}
/**
 * Insert a new row into the `manifest` table.
 *
 * @param int    $crc32url    Value for the `crc32url` column
 * @param string $url         Value for the `url` column
 * @param string $status      Value for the `status` column
 * @param int    $timeAdded   Value for the `timeAdded` column
 * @param mixed  $timeUpdated Value for the `timeUpdated` column (null by default)
 *
 * @return mixed Result of PDO::lastInsertId() after the insert
 */
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {

  $statement = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');

  $values = [
    $crc32url,
    $url,
    $status,
    $timeAdded,
    $timeUpdated
  ];

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
/**
 * Delete one manifest row by its primary identifier.
 *
 * @param int $manifestId Value matched against the `manifestId` column
 *
 * @return int Number of rows reported by PDOStatement::rowCount()
 */
public function deleteManifest(int $manifestId) {

  $statement = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');

  $statement->execute([$manifestId]);

  $affected = $statement->rowCount();

  return $affected;
}
// Host
public function getAPIHosts(string $apiHostFields) {
@ -175,7 +129,50 @@ class MySQL {
return $query->rowCount();
}
// Pages
// Host settings
/**
 * Read a single setting value stored for the host.
 *
 * Settings live in the `hostSetting` table (the same table written by
 * setHostSetting); the previous query read from `hostPage`, which has
 * no `key` / `value` pair to return.
 *
 * @param int   $hostId Value matched against the `hostId` column
 * @param mixed $key    Value matched against the `key` column
 *
 * @return mixed The `value` column of the matching row, or false when no row exists
 */
public function getHostSetting(int $hostId, mixed $key) {

  $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');

  $query->execute([$hostId, $key]);

  // Test the fetched row itself: rowCount() is not guaranteed for SELECT statements
  $row = $query->fetch();

  return $row ? $row->value : false;
}
/**
 * Read all settings stored for the host.
 *
 * Reads from the `hostSetting` table (the same table written by
 * setHostSetting); the previous query returned `hostPage` rows instead.
 *
 * @param int $hostId Value matched against the `hostId` column
 *
 * @return mixed Result of PDOStatement::fetchAll() for the query
 */
public function getHostSettings(int $hostId) {

  $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ?');

  $query->execute([$hostId]);

  return $query->fetchAll();
}
/**
 * Create a host setting or update its value when the row already exists.
 *
 * Fixes over the previous version:
 * - the statement is prepared (PDO::query() runs the SQL immediately and cannot bind "?" placeholders)
 * - missing comma after `hostId` = ? added
 * - unterminated backtick around `timeAdded` closed
 *
 * NOTE(review): the upsert relies on a unique key covering the setting identity
 * (presumably `hostId` + `key`) - confirm against the table schema.
 *
 * @param int   $hostId      Value for the `hostId` column
 * @param mixed $key         Value for the `key` column
 * @param mixed $value       Value for the `value` column (used for both insert and update)
 * @param int   $timeAdded   Value for `timeAdded` on insert; the current time is used when not positive
 * @param int   $timeUpdated Value for `timeUpdated` on update; the current time is used when not positive
 *
 * @return int Number of rows reported by PDOStatement::rowCount()
 */
public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {

  $query = $this->_db->prepare('INSERT INTO `hostSetting` SET `hostId`    = ?,
                                                              `key`       = ?,
                                                              `value`     = ?,
                                                              `timeAdded` = ?

                                                              ON DUPLICATE KEY UPDATE `value`       = ?,
                                                                                      `timeUpdated` = ?');

  $query->execute(
    [
      $hostId,
      $key,
      $value,
      ($timeAdded > 0 ? $timeAdded : time()),
      $value,
      ($timeUpdated > 0 ? $timeUpdated : time())
    ]
  );

  return $query->rowCount();
}
/**
 * Delete a host setting by its primary identifier.
 *
 * The statement is prepared before execution: PDO::query() runs the SQL
 * immediately and cannot bind the "?" placeholder used here.
 *
 * @param int $hostSettingId Value matched against the `hostSettingId` column
 *
 * @return int Number of rows reported by PDOStatement::rowCount()
 */
public function deleteHostSetting(int $hostSettingId) {

  $query = $this->_db->prepare('DELETE FROM `hostSetting` WHERE `hostSettingId` = ?');

  $query->execute([$hostSettingId]);

  return $query->rowCount();
}
// Host pages
public function getTotalHostPages(int $hostId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');
@ -696,30 +693,6 @@ class MySQL {
return (object) $result;
}
/**
 * Select a random batch of manifests that are due for crawling.
 *
 * A row qualifies when its `timeUpdated` is NULL or older than $timeFrom
 * and its `status` differs from 0.
 *
 * @param int $limit    Maximum number of rows, appended to the query as an integer
 * @param int $timeFrom Threshold compared with the `timeUpdated` column
 *
 * @return mixed Result of PDOStatement::fetchAll() for the query
 */
public function getManifestCrawlQueue(int $limit, int $timeFrom) {

  // LIMIT is cast to int and concatenated, the remaining values are bound
  $sql = 'SELECT * FROM `manifest`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
ORDER BY RAND()
LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$timeFrom, 0]);

  return $statement->fetchAll();
}
/**
 * Store the crawl time and HTTP code for a manifest.
 *
 * @param int $manifestId  Value matched against the `manifestId` column
 * @param int $timeUpdated Value for the `timeUpdated` column
 * @param int $httpCode    Value for the `httpCode` column
 *
 * @return int Number of rows reported by PDOStatement::rowCount()
 */
public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {

  $statement = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');

  $params = [
    $timeUpdated,
    $httpCode,
    $manifestId
  ];

  $statement->execute($params);

  return $statement->rowCount();
}
public function optimize() {
$this->_db->query('OPTIMIZE TABLE `host`');
@ -730,7 +703,5 @@ class MySQL {
$this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
$this->_db->query('OPTIMIZE TABLE `manifest`');
}
}