Browse Source

build host/host page URL in SQL query

main
ghost 1 year ago
parent
commit
307eb03600
  1. 5
      crontab/cleaner.php
  2. 36
      crontab/crawler.php
  3. 86
      library/mysql.php
  4. 8
      public/explore.php
  5. 8
      public/search.php
  6. 4
      public/top.php

5
crontab/cleaner.php

@ -49,11 +49,8 @@ try {
// Get cleaner queue // Get cleaner queue
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
// Parse host info
$hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
// Get robots.txt if exists // Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;

36
crontab/crawler.php

@ -194,7 +194,7 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
} }
$hostURL = $remoteManifestHost->scheme . '://' . $hostURL = $remoteManifestHost->scheme . '://' .
$remoteManifestHost->name . $remoteManifestHost->name .
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
// Validate formatted link // Validate formatted link
@ -267,13 +267,8 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
// Process robots crawl queue // Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) { foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
// Build web root URL
$hostURL = $host->scheme . '://' .
$host->name .
($host->port ? ':' . $host->port : '');
// Get robots.txt // Get robots.txt
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
@ -304,13 +299,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
// Replace relative paths // Replace relative paths
$hostSitemapPath = trim($hostSitemapPath, '/'); $hostSitemapPath = trim($hostSitemapPath, '/');
$hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath); $hostSitemapPath = str_replace($host->hostURL, '', $hostSitemapPath);
$hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath); $hostSitemapPath = sprintf('%s%s', $host->hostURL, $hostSitemapPath);
// Set default path when not exists // Set default path when not exists
} else { } else {
$hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL); $hostSitemapPath = sprintf('%s/sitemap.xml', $host->hostURL);
} }
// Init sitemap data // Init sitemap data
@ -325,7 +320,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
// Add host page // Add host page
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
$linkHostURL->string == $hostURL && // this host links only $linkHostURL->string == $host->hostURL && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit $host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists !$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
@ -343,11 +338,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
try { try {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
// Init page request // Init page request
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT); $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
@ -368,7 +360,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available // Try to receive target page location on page redirect available
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true); $curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
@ -392,10 +384,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
//Make relative links absolute //Make relative links absolute
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
$url = $queueHostPage->scheme . '://' . $url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
} }
// Validate formatted link // Validate formatted link
@ -693,7 +682,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
sprintf('CRC32: %s', $crc32data . PHP_EOL . sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) {
// Done // Done
$zip->close(); $zip->close();
@ -1055,10 +1044,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
//Make relative links absolute //Make relative links absolute
if (!parse_url($link['ref'], PHP_URL_HOST)) { if (!parse_url($link['ref'], PHP_URL_HOST)) {
$link['ref'] = $queueHostPage->scheme . '://' . $link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
} }
// Validate formatted link // Validate formatted link

86
library/mysql.php

@ -242,7 +242,17 @@ class MySQL {
`host`.`scheme`, `host`.`scheme`,
`host`.`name`, `host`.`name`,
`host`.`port` `host`.`port`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
FROM `hostPage` FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`) JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
@ -294,22 +304,33 @@ class MySQL {
public function getFoundHostPage(int $hostPageId) { public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`, $query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`,
`hostPage`.`uri`, `hostPage`.`uri`,
`hostPage`.`timeAdded`, `hostPage`.`timeAdded`,
`hostPage`.`timeUpdated`, `hostPage`.`timeUpdated`,
`hostPage`.`mime`, `hostPage`.`mime`,
`hostPage`.`size`, `hostPage`.`size`,
`host`.`scheme`, `host`.`scheme`,
`host`.`name`, `host`.`name`,
`host`.`port` `host`.`port`,
FROM `hostPage` IF (`host`.`port` IS NOT NULL,
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE `hostPage`.`hostPageId` = ? WHERE `hostPage`.`hostPageId` = ?
LIMIT 1'); LIMIT 1");
$query->execute([$hostPageId]); $query->execute([$hostPageId]);
@ -623,13 +644,16 @@ class MySQL {
// Cleaner tools // Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) { public function getCleanerQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `host` $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
CONCAT(`scheme`, '://', `name`, ':', `port`),
CONCAT(`scheme`, '://', `name`)
) AS `hostURL` FROM `host`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ? WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ?
ORDER BY `hostId` ORDER BY `hostId`
LIMIT ' . (int) $limit); LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]); $query->execute([$timeFrom, 0]);
@ -755,25 +779,36 @@ class MySQL {
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`, $query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`, `hostPage`.`hostPageId`,
`hostPage`.`uri`, `hostPage`.`uri`,
`host`.`scheme`, `host`.`scheme`,
`host`.`name`, `host`.`name`,
`host`.`port`, `host`.`port`,
`host`.`crawlPageLimit`, `host`.`crawlPageLimit`,
`host`.`crawlMetaOnly`, `host`.`crawlMetaOnly`,
`host`.`robots`, `host`.`robots`,
`host`.`robotsPostfix` `host`.`robotsPostfix`,
FROM `hostPage` IF (`host`.`port` IS NOT NULL,
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?)) IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
AND `host`.`status` <> ? FROM `hostPage`
AND `hostPage`.`timeBanned` IS NULL JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? OR (`hostPage`.`uri` = '/' AND `hostPage`.`timeUpdated` < ?))
ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND() AND `host`.`status` <> ?
AND `hostPage`.`timeBanned` IS NULL
LIMIT " . (int) $limit); ORDER BY LENGTH(`hostPage`.`uri`) ASC, RAND()
LIMIT " . (int) $limit);
$query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]); $query->execute([$hostPageTimeFrom, $hostPageHomeTimeFrom, 0]);
@ -791,13 +826,18 @@ class MySQL {
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) { public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `host` $query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
CONCAT(`scheme`, '://', `name`, ':', `port`),
CONCAT(`scheme`, '://', `name`)
) AS `hostURL`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ? FROM `host`
ORDER BY RAND() WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
LIMIT ' . (int) $limit); ORDER BY RAND()
LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]); $query->execute([$timeFrom, 0]);

8
public/explore.php

@ -216,9 +216,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<span><?php echo htmlentities($hostPageDescription->keywords) ?></span> <span><?php echo htmlentities($hostPageDescription->keywords) ?></span>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>"> <a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . urldecode($hostPage->uri)) ?> <?php echo htmlentities(urldecode($hostPage->hostURL) . urldecode($hostPage->uri)) ?>
</a> </a>
</div> </div>
<div> <div>
@ -256,10 +256,10 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?> <?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
<p> <p>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>" <a href="<?php echo $hostPage->hostPageURL ?>"
title="<?php echo (!empty($hostPageDescription->title) ? $hostPageDescription->title : (!empty($hostPageDescription->description) ? $hostPageDescription->description : false)) ?>"> title="<?php echo (!empty($hostPageDescription->title) ? $hostPageDescription->title : (!empty($hostPageDescription->description) ? $hostPageDescription->description : false)) ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?> <?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
</a> </a>
| |
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $hostPage->hostPageId ?>"> <a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $hostPage->hostPageId ?>">

8
public/search.php

@ -339,9 +339,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<span><?php echo $hostPageDescription->keywords ?></span> <span><?php echo $hostPageDescription->keywords ?></span>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>"> <a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?> <?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
</a> </a>
| |
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>"> <a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
@ -359,9 +359,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $i++ ?> <?php $i++ ?>
<p> <p>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>"> <a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?> <?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
</a> </a>
<!-- <!--
| |

4
public/top.php

@ -257,9 +257,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php $title = false ?> <?php $title = false ?>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<a href="<?php echo $topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false) . $topHostPage->uri ?>"title="<?php echo trim($title) ?>"> <a href="<?php echo $topHostPage->hostPageURL ?>"title="<?php echo trim($title) ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($topHostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" /> <img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($topHostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false))) ?> <?php echo htmlentities(urldecode($topHostPage->hostURL)) ?>
</a> </a>
</td> </td>
<td> <td>

Loading…
Cancel
Save