mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 18:35:04 +00:00
implement hosts crawl queue, move robots, sitemaps, manifests to this task
This commit is contained in:
parent
6ee5e53ef4
commit
ab6c0379c8
@ -270,7 +270,7 @@ define('CRAWL_HOST_DEFAULT_NSFW', false);
|
|||||||
/*
|
/*
|
||||||
* Collect sitemap index when available
|
* Collect sitemap index when available
|
||||||
*
|
*
|
||||||
* At this moment, works with CRAWL_ROBOTS_SECONDS_OFFSET/CRAWL_ROBOTS_LIMIT options enabled only
|
* At this moment, works with CRAWL_HOST_SECONDS_OFFSET/CRAWL_HOST_LIMIT options enabled only
|
||||||
*
|
*
|
||||||
* When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
|
* When sitemap path not provided in robots.txt, crawler scans default /sitemap.xml
|
||||||
*
|
*
|
||||||
@ -290,18 +290,23 @@ define('CRAWL_SITEMAPS', true);
|
|||||||
define('CRAWL_PAGE_RANK_UPDATE', true);
|
define('CRAWL_PAGE_RANK_UPDATE', true);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Renew robots.txt index by timing offset provided
|
* Renew hosts index by timing offset provided
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
define('CRAWL_ROBOTS_SECONDS_OFFSET', 60*60*24*7);
|
define('CRAWL_HOST_SECONDS_OFFSET', 60*60*24*7);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Hosts Robots.txt processing limit in the crawler.php queue
|
* Hosts processing limit in the crawler.php queue
|
||||||
*
|
*
|
||||||
* Set 0 to disable
|
* Set 0 to disable
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
define('CRAWL_ROBOTS_LIMIT', 1);
|
define('CRAWL_HOST_LIMIT', 1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Crawl robots.txt
|
||||||
|
*/
|
||||||
|
define('CRAWL_ROBOTS', true); // true|false
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Default robots.txt rules on remote file not exists
|
* Default robots.txt rules on remote file not exists
|
||||||
|
@ -44,11 +44,12 @@ $httpRequestsSizeTotal = 0;
|
|||||||
$httpDownloadSizeTotal = 0;
|
$httpDownloadSizeTotal = 0;
|
||||||
$httpRequestsTimeTotal = 0;
|
$httpRequestsTimeTotal = 0;
|
||||||
|
|
||||||
|
$hostsProcessed = 0;
|
||||||
$hostsAdded = 0;
|
$hostsAdded = 0;
|
||||||
$hostPagesBanned = 0;
|
|
||||||
$hostPagesSnapAdded = 0;
|
|
||||||
|
|
||||||
$hostPagesProcessed = 0;
|
$hostPagesProcessed = 0;
|
||||||
|
$hostPagesBanned = 0;
|
||||||
|
$hostPagesSnapAdded = 0;
|
||||||
$hostPagesAdded = 0;
|
$hostPagesAdded = 0;
|
||||||
|
|
||||||
$manifestsProcessed = 0;
|
$manifestsProcessed = 0;
|
||||||
@ -67,261 +68,288 @@ try {
|
|||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process robots crawl queue
|
// Process hosts crawl queue
|
||||||
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
|
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $host) {
|
||||||
|
|
||||||
// Update robots
|
$db->beginTransaction();
|
||||||
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
|
||||||
|
|
||||||
// Update curl stats
|
try {
|
||||||
$httpRequestsTotal++;
|
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
||||||
|
|
||||||
// Sitemap provided in robots.txt
|
// Update host crawl queue
|
||||||
if (200 == $curl->getCode()) {
|
$hostsProcessed += $db->updateHostCrawlQueue($host->hostId);
|
||||||
|
|
||||||
$hostRobots = $curl->getContent();
|
// Crawl robots.txt
|
||||||
|
if (CRAWL_ROBOTS) {
|
||||||
|
|
||||||
} else {
|
// Update robots
|
||||||
|
$curl = new Curl($host->url . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
$hostRobots = $host->robots;
|
// Update curl stats
|
||||||
}
|
$httpRequestsTotal++;
|
||||||
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Update host index
|
// Sitemap provided in robots.txt
|
||||||
$db->updateHostRobots($host->hostId, $hostRobots, time());
|
if (200 == $curl->getCode()) {
|
||||||
|
|
||||||
// Process sitemaps when enabled
|
$hostRobots = $curl->getContent();
|
||||||
if (CRAWL_SITEMAPS) {
|
|
||||||
|
|
||||||
// Look for custom sitemap URL served in robots.txt
|
} else {
|
||||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
|
||||||
|
|
||||||
if ($hostSitemapPath = $robots->getSitemap()) {
|
$hostRobots = $host->robots;
|
||||||
|
}
|
||||||
|
|
||||||
// Replace relative paths
|
// Update host index
|
||||||
$hostSitemapPath = trim($hostSitemapPath, '/');
|
$db->updateHostRobots($host->hostId, $hostRobots, time());
|
||||||
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
|
|
||||||
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
|
|
||||||
|
|
||||||
// Set default path when not exists
|
|
||||||
} else {
|
|
||||||
|
|
||||||
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init sitemap data
|
// Process sitemaps when enabled
|
||||||
$sitemap = new Sitemap($hostSitemapPath);
|
if (CRAWL_SITEMAPS) {
|
||||||
|
|
||||||
if ($sitemapLinks = $sitemap->getLinks()) {
|
// Look for custom sitemap URL served in robots.txt
|
||||||
|
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||||
|
|
||||||
$sitemapsProcessed++;
|
if ($hostSitemapPath = $robots->getSitemap()) {
|
||||||
|
|
||||||
// Process collected sitemap links
|
// Replace relative paths
|
||||||
foreach ($sitemapLinks as $link => $attributes) {
|
$hostSitemapPath = trim($hostSitemapPath, '/');
|
||||||
|
$hostSitemapPath = str_replace($host->url, '', $hostSitemapPath);
|
||||||
|
$hostSitemapPath = sprintf('%s%s', $host->url, $hostSitemapPath);
|
||||||
|
|
||||||
// Parse formatted link
|
// Set default path when not exists
|
||||||
$linkURI = Parser::uri($link);
|
} else {
|
||||||
$linkHostURL = Parser::hostURL($link);
|
|
||||||
|
|
||||||
// Add host page
|
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->url);
|
||||||
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
}
|
||||||
$linkHostURL->string == $host->url && // this host links only
|
|
||||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
|
||||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
|
||||||
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
|
||||||
|
|
||||||
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
// Init sitemap data
|
||||||
|
$sitemap = new Sitemap($hostSitemapPath);
|
||||||
|
|
||||||
|
if ($sitemapLinks = $sitemap->getLinks()) {
|
||||||
|
|
||||||
|
$sitemapsProcessed++;
|
||||||
|
|
||||||
|
// Process collected sitemap links
|
||||||
|
foreach ($sitemapLinks as $link => $attributes) {
|
||||||
|
|
||||||
|
// Parse formatted link
|
||||||
|
$linkURI = Parser::uri($link);
|
||||||
|
$linkHostURL = Parser::hostURL($link);
|
||||||
|
|
||||||
|
// Add host page
|
||||||
|
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
||||||
|
$linkHostURL->string == $host->url && // this host links only
|
||||||
|
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||||
|
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||||
|
!$db->findHostPageByCRC32URI($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||||
|
|
||||||
|
$hostPagesAdded += $db->addHostPage($host->hostId, crc32($linkURI->string), $linkURI->string, time());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Update manifest if available for this host
|
// Update manifests
|
||||||
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
|
if (CRAWL_MANIFEST) {
|
||||||
|
if ($manifestURL = $db->getHostSetting($host->hostId, 'MANIFEST_URL')) {
|
||||||
|
|
||||||
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($manifestURL, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Skip processing non 200 code
|
||||||
if (200 != $curl->getCode()) {
|
if (200 != $curl->getCode()) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing without returned data
|
// Skip processing without returned data
|
||||||
if (!$remoteManifest = $curl->getContent()) {
|
if (!$remoteManifest = $curl->getContent()) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on json encoding error
|
// Skip processing on json encoding error
|
||||||
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on required fields missed
|
// Skip processing on required fields missed
|
||||||
if (empty($remoteManifest->status) ||
|
if (empty($remoteManifest->status) ||
|
||||||
empty($remoteManifest->result->config->crawlUrlRegexp) ||
|
empty($remoteManifest->result->config->crawlUrlRegexp) ||
|
||||||
empty($remoteManifest->result->api->version) ||
|
empty($remoteManifest->result->api->version) ||
|
||||||
empty($remoteManifest->result->api->hosts)) {
|
empty($remoteManifest->result->api->hosts)) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on API version not compatible
|
// Skip processing on API version not compatible
|
||||||
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on host API not available
|
// Skip processing on host API not available
|
||||||
if (!$remoteManifest->result->api->hosts) {
|
if (!$remoteManifest->result->api->hosts) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
|
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
|
||||||
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on host link does not match condition
|
// Skip processing on host link does not match condition
|
||||||
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
|
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Begin hosts collection
|
// Begin hosts collection
|
||||||
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
|
$curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Update curl stats
|
// Update curl stats
|
||||||
$httpRequestsTotal++;
|
$httpRequestsTotal++;
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
// Skip processing non 200 code
|
// Skip processing non 200 code
|
||||||
if (200 != $curl->getCode()) {
|
if (200 != $curl->getCode()) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing without returned data
|
// Skip processing without returned data
|
||||||
if (!$remoteManifestHosts = $curl->getContent()) {
|
if (!$remoteManifestHosts = $curl->getContent()) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on json encoding error
|
// Skip processing on json encoding error
|
||||||
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
|
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip processing on required fields missed
|
// Skip processing on required fields missed
|
||||||
if (empty($remoteManifestHosts->status) ||
|
if (empty($remoteManifestHosts->status) ||
|
||||||
empty($remoteManifestHosts->result)) {
|
empty($remoteManifestHosts->result)) {
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Begin hosts processing
|
// Begin hosts processing
|
||||||
foreach ($remoteManifestHosts->result as $remoteManifestHost) {
|
foreach ($remoteManifestHosts->result as $remoteManifestHost) {
|
||||||
|
|
||||||
// Skip processing on required fields missed
|
// Skip processing on required fields missed
|
||||||
if (empty($remoteManifestHost->scheme) ||
|
if (empty($remoteManifestHost->scheme) ||
|
||||||
empty($remoteManifestHost->name)) {
|
empty($remoteManifestHost->name)) {
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
|
|
||||||
$hostURL = $remoteManifestHost->scheme . '://' .
|
|
||||||
$remoteManifestHost->name .
|
|
||||||
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
|
||||||
|
|
||||||
// Validate formatted link
|
|
||||||
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
|
|
||||||
|
|
||||||
// Host not exists
|
|
||||||
if (!$db->getHostByCRC32URL(crc32($hostURL))) {
|
|
||||||
|
|
||||||
// Get robots.txt if exists
|
|
||||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
|
||||||
|
|
||||||
// Update curl stats
|
|
||||||
$httpRequestsTotal++;
|
|
||||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
||||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
||||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
||||||
|
|
||||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
|
||||||
$hostRobots = $curl->getContent();
|
|
||||||
} else {
|
|
||||||
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
$hostURL = $remoteManifestHost->scheme . '://' .
|
||||||
|
$remoteManifestHost->name .
|
||||||
|
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
||||||
|
|
||||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
// Validate formatted link
|
||||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
|
||||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
|
||||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
|
||||||
|
|
||||||
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
|
// Host not exists
|
||||||
$remoteManifestHosts->result->name,
|
if (!$db->getHostByCRC32URL(crc32($hostURL))) {
|
||||||
$remoteManifestHosts->result->port,
|
|
||||||
crc32($hostURL),
|
|
||||||
time(),
|
|
||||||
null,
|
|
||||||
$hostPageLimit,
|
|
||||||
(string) $hostMetaOnly,
|
|
||||||
(string) $hostStatus,
|
|
||||||
(string) $hostNsfw,
|
|
||||||
$hostRobots,
|
|
||||||
$hostRobotsPostfix);
|
|
||||||
|
|
||||||
// Add web root host page to make host visible in the crawl queue
|
// Get robots.txt if exists
|
||||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||||
|
|
||||||
// Increase counters
|
// Update curl stats
|
||||||
$hostPagesAdded++;
|
$httpRequestsTotal++;
|
||||||
$hostsAdded++;
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||||
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||||
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||||
|
|
||||||
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||||
|
$hostRobots = $curl->getContent();
|
||||||
|
} else {
|
||||||
|
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
||||||
|
}
|
||||||
|
|
||||||
|
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||||
|
|
||||||
|
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||||
|
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||||
|
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||||
|
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||||
|
|
||||||
|
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
|
||||||
|
$remoteManifestHosts->result->name,
|
||||||
|
$remoteManifestHosts->result->port,
|
||||||
|
crc32($hostURL),
|
||||||
|
time(),
|
||||||
|
null,
|
||||||
|
$hostPageLimit,
|
||||||
|
(string) $hostMetaOnly,
|
||||||
|
(string) $hostStatus,
|
||||||
|
(string) $hostNsfw,
|
||||||
|
$hostRobots,
|
||||||
|
$hostRobotsPostfix);
|
||||||
|
|
||||||
|
// Add web root host page to make host visible in the crawl queue
|
||||||
|
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||||
|
|
||||||
|
// Increase counters
|
||||||
|
$hostPagesAdded++;
|
||||||
|
$hostsAdded++;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$db->commit();
|
||||||
|
|
||||||
|
// Process update errors
|
||||||
|
} catch (Exception $e) {
|
||||||
|
|
||||||
|
// Debug std
|
||||||
|
var_dump($e);
|
||||||
|
|
||||||
|
// Skip item
|
||||||
|
$db->rollBack();
|
||||||
|
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1207,20 +1235,21 @@ $executionTimeTotal = microtime(true) - $timeStart;
|
|||||||
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;
|
||||||
|
|
||||||
// Debug output
|
// Debug output
|
||||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
echo 'Hosts processed: ' . $hostsProcessed . PHP_EOL;
|
||||||
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL . PHP_EOL;
|
||||||
|
|
||||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||||
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
|
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
|
||||||
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL . PHP_EOL;
|
||||||
|
|
||||||
echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL;
|
echo 'Sitemaps processed: ' . $sitemapsProcessed . PHP_EOL . PHP_EOL;
|
||||||
|
|
||||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL . PHP_EOL;
|
||||||
|
|
||||||
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
|
||||||
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
|
||||||
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
|
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
|
||||||
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
|
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL . PHP_EOL;
|
||||||
|
|
||||||
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
|
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
|
||||||
|
@ -667,7 +667,7 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
|
public function getHostCrawlQueue(int $limit, int $timeFrom) {
|
||||||
|
|
||||||
$result = [];
|
$result = [];
|
||||||
|
|
||||||
@ -693,9 +693,19 @@ class MySQL {
|
|||||||
return (object) $result;
|
return (object) $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Stamp a host row with the time it was last handled by the hosts crawl queue.
 *
 * Writes the timestamp into `host`.`timeUpdated` for the row whose `hostId`
 * matches.
 *
 * @param int      $hostId      Value matched against `host`.`hostId`
 * @param int|null $timeUpdated Timestamp to store. When omitted (or null) the
 *                              current time() is used, so callers that pass
 *                              only the host id do not fail with an
 *                              ArgumentCountError.
 *
 * @return int Row count reported by the executed statement
 */
public function updateHostCrawlQueue(int $hostId, ?int $timeUpdated = null) {

  $query = $this->_db->prepare('UPDATE `host` SET `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');

  // Fall back to "now" when the caller did not supply an explicit timestamp
  $query->execute([$timeUpdated ?? time(), $hostId]);

  return $query->rowCount();
}
|
||||||
|
|
||||||
public function optimize() {
|
public function optimize() {
|
||||||
|
|
||||||
$this->_db->query('OPTIMIZE TABLE `host`');
|
$this->_db->query('OPTIMIZE TABLE `host`');
|
||||||
|
$this->_db->query('OPTIMIZE TABLE `hostSetting`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPage`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageDom`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageDom`');
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 162 KiB After Width: | Height: | Size: 144 KiB |
Loading…
x
Reference in New Issue
Block a user