Mirror of https://github.com/YGGverse/YGGo.git (synced 2025-01-24 13:34:25 +00:00)

refactor manifest crawling

Parent: cb37c57bc4
Commit: 71724ae33f
@@ -181,23 +181,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
 */
define('CRAWL_PAGE_LIMIT', 20);

/*
 * Manifest (URI) processing limit in the crawler.php queue
 *
 * Used to collect the distributed data index
 * that matches CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
 *
 * This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
 * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
 *
 * Usually up to 20 pages per minute,
 * to avoid overloading websites with GET crawling requests
 *
 * Set 0 to disable
 *
 */
define('CRAWL_MANIFEST_LIMIT', 10);

/*
 * Renew the page index by the timing offset provided
 *
@@ -234,19 +217,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7*30);
 */
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');

/*
 * Renew the manifests index by the timing offset provided
 *
 * This option works with the CRAWL_MANIFEST_LIMIT step queue
 *
 * Pay attention: the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
 * must allow enough throughput to crawl all manifests collected in the DB index,
 * or the crawler can get stuck in the queue
 *
 */
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
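A rough way to read the warning above: with crawler.php scheduled once per minute (an assumed frequency, based on the crontab link above), the limit and offset together define how many manifests can be revisited per offset window. A minimal sketch of that check, not part of the codebase:

<?php
// Illustrative capacity check (sketch only, not project code).
// Assumes the crawler.php crontab task runs once per minute; adjust to your setup.
$crontabIntervalSeconds = 60;
$crawlManifestLimit     = 10;                // CRAWL_MANIFEST_LIMIT
$crawlManifestOffset    = 60 * 60 * 24 * 30; // CRAWL_MANIFEST_SECONDS_OFFSET

// Manifests that can be re-crawled within one offset window
$capacityPerWindow = ($crawlManifestOffset / $crontabIntervalSeconds) * $crawlManifestLimit;

$totalManifests = 1000; // hypothetical DB total, e.g. from getTotalManifests()

echo $totalManifests > $capacityPerWindow
  ? 'Queue cannot keep up: raise CRAWL_MANIFEST_LIMIT or CRAWL_MANIFEST_SECONDS_OFFSET' . PHP_EOL
  : 'Queue capacity is sufficient' . PHP_EOL;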
/*
 * Only URL addresses that match this rule will be auto-crawled
 *
@@ -386,17 +356,6 @@ define('CRAWL_MANIFEST', true);
 */
define('CRAWL_MANIFEST_API_VERSION', 0.12);

/*
 * Set the default auto-crawl status for newly added manifests
 *
 * true  - the crawler starts the manifest indexer automatically
 * false - requires manual validation by a moderator in the DB `manifest`.`status` field
 *
 * This option applies when CRAWL_MANIFEST is enabled
 *
 */
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);

// Cleaner settings

/*
@@ -46,7 +46,6 @@ $httpRequestsTimeTotal = 0;

$hostPagesProcessed = 0;
$manifestsProcessed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;

@@ -65,14 +64,76 @@ try {
  exit;
}

// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  $db->beginTransaction();

  // Update robots
  $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  try {

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);

    // Keep the fresh robots.txt content when received, otherwise reuse the stored copy
    if (200 == $curl->getCode()) {

      $hostRobots = $curl->getContent();

    } else {

      $hostRobots = $host->robots;
    }

    // Update host index
    $db->updateHostRobots($host->hostId, $hostRobots, time());

    // Process sitemaps when enabled
    if (CRAWL_SITEMAPS) {

      // Look for a custom sitemap URL served in robots.txt
      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

      if ($hostSitemapPath = $robots->getSitemap()) {

        // Normalize relative paths
        $hostSitemapPath = trim($hostSitemapPath, '/');
        $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
        $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);

      // Set the default path when none is provided
      } else {

        $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
      }

      // Init sitemap data
      $sitemap = new Sitemap($hostSitemapPath);

      // Process collected sitemap links
      foreach ($sitemap->getLinks() as $link => $attributes) {

        // Parse formatted link
        $linkURI     = Parser::uri($link);
        $linkHostURL = Parser::hostURL($link);

        // Add host page
        if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
            $linkHostURL->string == $host->url &&                                            // this host's links only
            $robots->uriAllowed($linkURI->string) &&                                         // page allowed by robots.txt rules
            $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&                 // host page limit not reached
            !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) {          // page does not exist yet

          $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
        }
      }
    }
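For reference, a minimal standalone sketch of the sitemap discovery step above; the Robots class is the project's own library (assumed to be loaded here), and the host URL and rules below are made up for the example:

<?php
// Illustrative sketch of sitemap discovery via robots.txt (not project code).
$hostUrl    = 'http://example.ygg';
$hostRobots = "User-agent: *\nDisallow: /private/\nSitemap: http://example.ygg/sitemap_index.xml";

$robots = new Robots($hostRobots);

if ($hostSitemapPath = $robots->getSitemap()) {
  // Custom sitemap location advertised in robots.txt
  var_dump($hostSitemapPath); // expected: http://example.ygg/sitemap_index.xml
} else {
  // Fall back to the conventional location, as the crawler does
  var_dump(sprintf('%s/sitemap.xml', $hostUrl));
}

var_dump($robots->uriAllowed('/private/page')); // expected: false under the rules above
var_dump($robots->uriAllowed('/index.html'));   // expected: true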
    // Update the manifest if one is available for this host
    if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {

      $curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);

      // Update curl stats
      $httpRequestsTotal++;
@@ -80,9 +141,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
      $httpDownloadSizeTotal += $curl->getSizeDownload();
      $httpRequestsTimeTotal += $curl->getTotalTime();

// Update manifest index anyway, with the current time and http code
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

      // Skip processing on a non-200 code
      if (200 != $curl->getCode()) {

@@ -203,7 +261,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
      }

      $hostURL = $remoteManifestHost->scheme . '://' .
                 $remoteManifestHost->name .
                 (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);

      // Validate formatted link
@@ -256,87 +314,6 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
        }
      }
    }

    // Apply changes
    $db->commit();

  // Process update errors
  } catch (Exception $e) {

    // Debug std
    var_dump($e);

    // Skip item
    $db->rollBack();

    continue;
  }
}

// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {

  // Get robots.txt
  $curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Keep the fresh robots.txt content when received, otherwise reuse the stored copy
  if (200 == $curl->getCode()) {

    $hostRobots = $curl->getContent();

  } else {

    $hostRobots = $host->robots;
  }

  // Update host index
  $db->updateHostRobots($host->hostId, $hostRobots, time());

  // Process sitemaps when enabled
  if (CRAWL_SITEMAPS) {

    // Look for a custom sitemap URL served in robots.txt
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    if ($hostSitemapPath = $robots->getSitemap()) {

      // Normalize relative paths
      $hostSitemapPath = trim($hostSitemapPath, '/');
      $hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
      $hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);

    // Set the default path when none is provided
    } else {

      $hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
    }

    // Init sitemap data
    $sitemap = new Sitemap($hostSitemapPath);

    // Process collected sitemap links
    foreach ($sitemap->getLinks() as $link => $attributes) {

      // Parse formatted link
      $linkURI     = Parser::uri($link);
      $linkHostURL = Parser::hostURL($link);

      // Add host page
      if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
          $linkHostURL->string == $host->url &&                                            // this host's links only
          $robots->uriAllowed($linkURI->string) &&                                         // page allowed by robots.txt rules
          $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) &&                 // host page limit not reached
          !$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) {          // page does not exist yet

        $hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
      }
    }
  }
}

@@ -720,9 +697,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (false !== stripos(Filter::mime($contentType), 'text/html')) {

  // Define variables
  $metaDescription     = null;
  $metaKeywords        = null;
  $metaYggoManifest    = null;
  $metaYggoManifestURL = null;

  // Parse page content
  $dom = new DomDocument();
@@ -782,7 +759,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND

  // Grab the meta yggo:manifest link when available
  if (@$meta->getAttribute('name') == 'yggo:manifest') {
    $metaYggoManifest    = Filter::url(@$meta->getAttribute('content'));
    $metaYggoManifestURL = Filter::url(@$meta->getAttribute('content'));
  }
}
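A self-contained illustration of the yggo:manifest meta lookup above (sketch only; the sample HTML is made up, and Filter::url() is replaced by a plain assignment to keep the snippet standalone):

<?php
// Standalone sketch of the yggo:manifest meta discovery (not project code).
$content = '<html><head><meta name="yggo:manifest" content="http://example.ygg/manifest.json"/></head><body></body></html>';

$metaYggoManifestURL = null;

$dom = new DomDocument();

@$dom->loadHTML($content);

foreach ($dom->getElementsByTagName('meta') as $meta) {

  if (@$meta->getAttribute('name') == 'yggo:manifest') {
    $metaYggoManifestURL = @$meta->getAttribute('content'); // the crawler passes this through Filter::url()
  }
}

var_dump($metaYggoManifestURL); // string "http://example.ygg/manifest.json"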
@@ -835,18 +812,12 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
}

// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
if (CRAWL_MANIFEST &&
    !empty($metaYggoManifestURL) &&
    filter_var($metaYggoManifestURL, FILTER_VALIDATE_URL) &&
    preg_match(CRAWL_URL_REGEXP, $metaYggoManifestURL)) {

  $metaYggoManifestCRC32 = crc32($metaYggoManifest);

  if (!$db->getManifest($metaYggoManifestCRC32)) {

    $db->addManifest($metaYggoManifestCRC32,
                     $metaYggoManifest,
                     (string) CRAWL_MANIFEST_DEFAULT_STATUS,
                     time());

    $manifestsAdded++;
  }

  $manifestsProcessed += $db->setHostSetting($queueHostPage->hostId, 'MANIFEST_URL', $metaYggoManifestURL);
}

// Begin page links collection
@@ -1236,7 +1207,6 @@ echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;

echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;

echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
Binary file not shown.
@@ -28,52 +28,6 @@ class MySQL {

    $this->_db->rollBack();
  }

  // Manifest
  public function getTotalManifests() {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');

    $query->execute();

    return $query->fetch()->total;
  }

  public function getManifests() {

    $query = $this->_db->prepare('SELECT * FROM `manifest`');

    $query->execute();

    return $query->fetchAll();
  }

  public function getManifest(int $crc32url) {

    $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');

    $query->execute([$crc32url]);

    return $query->fetch();
  }

  public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {

    $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');

    $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);

    return $this->_db->lastInsertId();
  }

  public function deleteManifest(int $manifestId) {

    $query = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');

    $query->execute([$manifestId]);

    return $query->rowCount();
  }

  // Host
  public function getAPIHosts(string $apiHostFields) {

@@ -175,7 +129,50 @@ class MySQL {

    return $query->rowCount();
  }

  // Pages
  // Host settings
  public function getHostSetting(int $hostId, mixed $key) {

    $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ? AND `key` = ? LIMIT 1');

    $query->execute([$hostId, $key]);

    return $query->rowCount() ? $query->fetch()->value : false;
  }

  public function getHostSettings(int $hostId) {

    $query = $this->_db->prepare('SELECT * FROM `hostSetting` WHERE `hostId` = ?');

    $query->execute([$hostId]);

    return $query->fetchAll();
  }

  public function setHostSetting(int $hostId, mixed $key, mixed $value, int $timeAdded = 0, int $timeUpdated = 0) {

    $query = $this->_db->prepare('INSERT INTO `hostSetting` SET `hostId`    = ?,
                                                                `key`       = ?,
                                                                `value`     = ?,
                                                                `timeAdded` = ?

                                  ON DUPLICATE KEY UPDATE `value`       = ?,
                                                          `timeUpdated` = ?');

    $query->execute([$hostId, $key, $value, ($timeAdded > 0 ? $timeAdded : time()), $value, ($timeUpdated > 0 ? $timeUpdated : time())]);

    return $query->rowCount();
  }

  public function deleteHostSetting(int $hostSettingId) {

    $query = $this->_db->prepare('DELETE FROM `hostSetting` WHERE `hostSettingId` = ?');

    $query->execute([$hostSettingId]);

    return $query->rowCount();
  }
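A hypothetical round trip through the host settings API added above (a sketch; $db is assumed to be a connected instance of this MySQL class, and host 1 an existing `host` row):

<?php
// Usage sketch for the host settings methods (illustrative, not project code).
// $db is assumed to be an already constructed, connected MySQL instance.

// Store or refresh a setting for host 1 (upsert via ON DUPLICATE KEY UPDATE)
$db->setHostSetting(1, 'MANIFEST_URL', 'http://example.ygg/manifest.json');

// Returns the stored value, or false when the key is not set for this host
var_dump($db->getHostSetting(1, 'MANIFEST_URL'));

// All settings rows for the host
var_dump($db->getHostSettings(1));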
  // Host pages
  public function getTotalHostPages(int $hostId) {

    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?');

@@ -696,30 +693,6 @@ class MySQL {

    return (object) $result;
  }

  public function getManifestCrawlQueue(int $limit, int $timeFrom) {

    $query = $this->_db->prepare('SELECT * FROM `manifest`

                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?

                                  ORDER BY RAND()

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom, 0]);

    return $query->fetchAll();
  }

  public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {

    $query = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');

    $query->execute([$timeUpdated, $httpCode, $manifestId]);

    return $query->rowCount();
  }
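crawler.php above now reads its queue via $db->getHostRobotsCrawlQueue(), which is not included in this diff. A plausible shape for it, modeled on the removed getManifestCrawlQueue(); this is an assumption (including the column names on `host`), not the committed code:

  public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {

    // Hypothetical sketch: select hosts whose robots.txt has not been refreshed recently
    $query = $this->_db->prepare('SELECT * FROM `host`

                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> ?

                                  ORDER BY RAND()

                                  LIMIT ' . (int) $limit);

    $query->execute([$timeFrom, 0]);

    return $query->fetchAll();
  }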
  public function optimize() {

    $this->_db->query('OPTIMIZE TABLE `host`');
@@ -730,7 +703,5 @@ class MySQL {
    $this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
    $this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
    $this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');

    $this->_db->query('OPTIMIZE TABLE `manifest`');
  }
}