mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-03-13 05:41:02 +00:00
build host/host page URL in SQL query
This commit is contained in:
parent
b13293988a
commit
307eb03600
@ -49,11 +49,8 @@ try {
|
||||
// Get cleaner queue
|
||||
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
|
||||
|
||||
// Parse host info
|
||||
$hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
$curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
|
@ -194,7 +194,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
|
||||
}
|
||||
|
||||
$hostURL = $remoteManifestHost->scheme . '://' .
|
||||
$remoteManifestHost->name .
|
||||
$remoteManifestHost->name .
|
||||
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
||||
|
||||
// Validate formatted link
|
||||
@ -267,13 +267,8 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
|
||||
// Process robots crawl queue
|
||||
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
|
||||
|
||||
// Build web root URL
|
||||
$hostURL = $host->scheme . '://' .
|
||||
$host->name .
|
||||
($host->port ? ':' . $host->port : '');
|
||||
|
||||
// Get robots.txt
|
||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
$curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
@ -304,13 +299,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
|
||||
|
||||
// Replace relative paths
|
||||
$hostSitemapPath = trim($hostSitemapPath, '/');
|
||||
$hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
|
||||
$hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath);
|
||||
$hostSitemapPath = str_replace($host->hostURL, '', $hostSitemapPath);
|
||||
$hostSitemapPath = sprintf('%s%s', $host->hostURL, $hostSitemapPath);
|
||||
|
||||
// Set default path when it does not exist
|
||||
} else {
|
||||
|
||||
$hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
|
||||
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->hostURL);
|
||||
}
|
||||
|
||||
// Init sitemap data
|
||||
@ -325,7 +320,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
|
||||
|
||||
// Add host page
|
||||
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
|
||||
$linkHostURL->string == $hostURL && // this host links only
|
||||
$linkHostURL->string == $host->hostURL && // this host links only
|
||||
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
|
||||
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
|
||||
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
|
||||
@ -343,11 +338,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
|
||||
try {
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
||||
|
||||
// Init page request
|
||||
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
|
||||
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
@ -368,7 +360,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
// Try to retrieve the target page location if a redirect is available
|
||||
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
|
||||
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
@ -392,10 +384,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
// Make relative links absolute
|
||||
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
|
||||
|
||||
$url = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
|
||||
$url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted link
|
||||
@ -693,7 +682,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) {
|
||||
|
||||
// Done
|
||||
$zip->close();
|
||||
@ -1055,10 +1044,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
// Make relative links absolute
|
||||
if (!parse_url($link['ref'], PHP_URL_HOST)) {
|
||||
|
||||
$link['ref'] = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
|
||||
$link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted link
|
||||
|
@ -242,7 +242,17 @@ class MySQL {
|
||||
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
`host`.`port`
|
||||
`host`.`port`,
|
||||
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`)
|
||||
) AS `hostURL`,
|
||||
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
|
||||
) AS `hostPageURL`
|
||||
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
|
||||
@ -294,22 +304,33 @@ class MySQL {
|
||||
|
||||
public function getFoundHostPage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
|
||||
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`,
|
||||
`hostPage`.`uri`,
|
||||
`hostPage`.`timeAdded`,
|
||||
`hostPage`.`timeUpdated`,
|
||||
`hostPage`.`mime`,
|
||||
`hostPage`.`size`,
|
||||
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
`host`.`port`
|
||||
`host`.`port`,
|
||||
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`)
|
||||
) AS `hostURL`,
|
||||
|
||||
WHERE `hostPage`.`hostPageId` = ?
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
|
||||
) AS `hostPageURL`
|
||||
|
||||
LIMIT 1');
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
|
||||
WHERE `hostPage`.`hostPageId` = ?
|
||||
|
||||
LIMIT 1");
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
|
||||
@ -623,13 +644,16 @@ class MySQL {
|
||||
// Cleaner tools
|
||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `host`
|
||||
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
|
||||
CONCAT(`scheme`, '://', `name`, ':', `port`),
|
||||
CONCAT(`scheme`, '://', `name`)
|
||||
) AS `hostURL` FROM `host`
|
||||
|
||||
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ?
|
||||
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ?
|
||||
|
||||
ORDER BY `hostId`
|
||||
ORDER BY `hostId`
|
||||
|
||||
LIMIT ' . (int) $limit);
|
||||
LIMIT " . (int) $limit);
|
||||
|
||||
$query->execute([$timeFrom, 0]);
|
||||
|
||||
@ -755,25 +779,36 @@ class MySQL {
|
||||
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
|
||||
`hostPage`.`hostPageId`,
|
||||
`hostPage`.`uri`,
|
||||
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
`host`.`port`,
|
||||
`host`.`crawlPageLimit`,
|
||||
`host`.`crawlMetaOnly`,
|
||||
`host`.`robots`,
|
||||
`host`.`robotsPostfix`
|
||||
`host`.`robotsPostfix`,
|
||||
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`)
|
||||
) AS `hostURL`,
|
||||
|
||||
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?))
|
||||
IF (`host`.`port` IS NOT NULL,
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
|
||||
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
|
||||
) AS `hostPageURL`
|
||||
|
||||
AND `host`.`status` <> ?
|
||||
AND `hostPage`.`timeBanned` IS NULL
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
|
||||
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
|
||||
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?))
|
||||
|
||||
LIMIT " . (int) $limit);
|
||||
AND `host`.`status` <> ?
|
||||
AND `hostPage`.`timeBanned` IS NULL
|
||||
|
||||
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
|
||||
|
||||
LIMIT " . (int) $limit);
|
||||
|
||||
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
|
||||
|
||||
@ -791,13 +826,18 @@ class MySQL {
|
||||
|
||||
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `host`
|
||||
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
|
||||
CONCAT(`scheme`, '://', `name`, ':', `port`),
|
||||
CONCAT(`scheme`, '://', `name`)
|
||||
) AS `hostURL`
|
||||
|
||||
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
|
||||
FROM `host`
|
||||
|
||||
ORDER BY RAND()
|
||||
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
|
||||
|
||||
LIMIT ' . (int) $limit);
|
||||
ORDER BY RAND()
|
||||
|
||||
LIMIT " . (int) $limit);
|
||||
|
||||
$query->execute([$timeFrom, 0]);
|
||||
|
||||
|
@ -216,9 +216,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<span><?php echo htmlentities($hostPageDescription->keywords) ?></span>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
|
||||
<a href="<?php echo $hostPage->hostPageURL ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . urldecode($hostPage->uri)) ?>
|
||||
<?php echo htmlentities(urldecode($hostPage->hostURL) . urldecode($hostPage->uri)) ?>
|
||||
</a>
|
||||
</div>
|
||||
<div>
|
||||
@ -256,10 +256,10 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
|
||||
<p>
|
||||
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>"
|
||||
<a href="<?php echo $hostPage->hostPageURL ?>"
|
||||
title="<?php echo (!empty($hostPageDescription->title) ? $hostPageDescription->title : (!empty($hostPageDescription->description) ? $hostPageDescription->description : false)) ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
|
||||
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
|
||||
</a>
|
||||
|
|
||||
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $hostPage->hostPageId ?>">
|
||||
|
@ -339,9 +339,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
<span><?php echo $hostPageDescription->keywords ?></span>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
|
||||
<a href="<?php echo $hostPage->hostPageURL ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
|
||||
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
|
||||
</a>
|
||||
|
|
||||
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
|
||||
@ -359,9 +359,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||
<?php $i++ ?>
|
||||
<p>
|
||||
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
|
||||
<a href="<?php echo $hostPage->hostPageURL ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
|
||||
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
|
||||
</a>
|
||||
<!--
|
||||
|
|
||||
|
@ -257,9 +257,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<?php $title = false ?>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false) . $topHostPage->uri ?>"title="<?php echo trim($title) ?>">
|
||||
<a href="<?php echo $topHostPage->hostPageURL ?>"title="<?php echo trim($title) ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($topHostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false))) ?>
|
||||
<?php echo htmlentities(urldecode($topHostPage->hostURL)) ?>
|
||||
</a>
|
||||
</td>
|
||||
<td>
|
||||
|
Loading…
x
Reference in New Issue
Block a user