Browse Source

build host/host page URL in SQL query

main
ghost 1 year ago
parent
commit
307eb03600
  1. 5
      crontab/cleaner.php
  2. 34
      crontab/crawler.php
  3. 58
      library/mysql.php
  4. 8
      public/explore.php
  5. 8
      public/search.php
  6. 4
      public/top.php

5
crontab/cleaner.php

@ -49,11 +49,8 @@ try { @@ -49,11 +49,8 @@ try {
// Get cleaner queue
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
// Parse host info
$hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
$curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;

34
crontab/crawler.php

@ -267,13 +267,8 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES @@ -267,13 +267,8 @@ foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFES
// Process robots crawl queue
foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_SECONDS_OFFSET) as $host) {
// Build web root URL
$hostURL = $host->scheme . '://' .
$host->name .
($host->port ? ':' . $host->port : '');
// Get robots.txt
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
$curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
@ -304,13 +299,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ @@ -304,13 +299,13 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
// Replace relative paths
$hostSitemapPath = trim($hostSitemapPath, '/');
$hostSitemapPath = str_replace($hostURL, '', $hostSitemapPath);
$hostSitemapPath = sprintf('%s%s', $hostURL, $hostSitemapPath);
$hostSitemapPath = str_replace($host->hostURL, '', $hostSitemapPath);
$hostSitemapPath = sprintf('%s%s', $host->hostURL, $hostSitemapPath);
// Set default path when not exists
} else {
$hostSitemapPath = sprintf('%s/sitemap.xml', $hostURL);
$hostSitemapPath = sprintf('%s/sitemap.xml', $host->hostURL);
}
// Init sitemap data
@ -325,7 +320,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_ @@ -325,7 +320,7 @@ foreach ($db->getHostRobotsCrawlQueue(CRAWL_ROBOTS_LIMIT, time() - CRAWL_ROBOTS_
// Add host page
if (filter_var($link, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link) && // validate link format
$linkHostURL->string == $hostURL && // this host links only
$linkHostURL->string == $host->hostURL && // this host links only
$robots->uriAllowed($linkURI->string) && // page allowed by robots.txt rules
$host->crawlPageLimit > $db->getTotalHostPages($host->hostId) && // pages quantity not reached host limit
!$db->getHostPage($host->hostId, crc32($linkURI->string))) { // page does not exists
@ -343,11 +338,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -343,11 +338,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
try {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
// Init page request
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
@ -368,7 +360,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -368,7 +360,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Try to receive target page location on page redirect available
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
$curl = new Curl($queueHostPage->hostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
// Update curl stats
$httpRequestsTotal++;
@ -392,10 +384,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -392,10 +384,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
//Make relative links absolute
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
$url = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
$url = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
}
// Validate formatted link
@ -693,7 +682,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -693,7 +682,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) {
// Done
$zip->close();
@ -1055,10 +1044,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -1055,10 +1044,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
//Make relative links absolute
if (!parse_url($link['ref'], PHP_URL_HOST)) {
$link['ref'] = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
$link['ref'] = $queueHostPage->hostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
}
// Validate formatted link

58
library/mysql.php

@ -242,7 +242,17 @@ class MySQL { @@ -242,7 +242,17 @@ class MySQL {
`host`.`scheme`,
`host`.`name`,
`host`.`port`
`host`.`port`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
FROM `hostPage`
JOIN `host` ON (`hostPage`.`hostId` = `host`.`hostId`)
@ -294,22 +304,33 @@ class MySQL { @@ -294,22 +304,33 @@ class MySQL {
public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
$query = $this->_db->prepare("SELECT `hostPage`.`hostPageId`,
`hostPage`.`uri`,
`hostPage`.`timeAdded`,
`hostPage`.`timeUpdated`,
`hostPage`.`mime`,
`hostPage`.`size`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
`host`.`port`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
WHERE `hostPage`.`hostPageId` = ?
LIMIT 1');
LIMIT 1");
$query->execute([$hostPageId]);
@ -623,13 +644,16 @@ class MySQL { @@ -623,13 +644,16 @@ class MySQL {
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `host`
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
CONCAT(`scheme`, '://', `name`, ':', `port`),
CONCAT(`scheme`, '://', `name`)
) AS `hostURL` FROM `host`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `host`.`status` <> ?
ORDER BY `hostId`
LIMIT ' . (int) $limit);
LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]);
@ -755,13 +779,24 @@ class MySQL { @@ -755,13 +779,24 @@ class MySQL {
$query = $this->_db->prepare("SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`,
`hostPage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`
`host`.`robotsPostfix`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`),
CONCAT(`host`.`scheme`, '://', `host`.`name`)
) AS `hostURL`,
IF (`host`.`port` IS NOT NULL,
CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`),
CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)
) AS `hostPageURL`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
@ -791,13 +826,18 @@ class MySQL { @@ -791,13 +826,18 @@ class MySQL {
public function getHostRobotsCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `host`
$query = $this->_db->prepare("SELECT *, IF (`port` IS NOT NULL,
CONCAT(`scheme`, '://', `name`, ':', `port`),
CONCAT(`scheme`, '://', `name`)
) AS `hostURL`
FROM `host`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> ?
ORDER BY RAND()
LIMIT ' . (int) $limit);
LIMIT " . (int) $limit);
$query->execute([$timeFrom, 0]);

8
public/explore.php

@ -216,9 +216,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -216,9 +216,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<span><?php echo htmlentities($hostPageDescription->keywords) ?></span>
<?php } ?>
<?php } ?>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
<a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . urldecode($hostPage->uri)) ?>
<?php echo htmlentities(urldecode($hostPage->hostURL) . urldecode($hostPage->uri)) ?>
</a>
</div>
<div>
@ -256,10 +256,10 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -256,10 +256,10 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $hostPageDescription = $db->getLastPageDescription($hostPageIdSource->hostPageIdSource); ?>
<p>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>"
<a href="<?php echo $hostPage->hostPageURL ?>"
title="<?php echo (!empty($hostPageDescription->title) ? $hostPageDescription->title : (!empty($hostPageDescription->description) ? $hostPageDescription->description : false)) ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 32 ? '...' . mb_substr(urldecode($hostPage->uri), -32) : urldecode($hostPage->uri))) ?>
</a>
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $hostPage->hostPageId ?>">

8
public/search.php

@ -339,9 +339,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -339,9 +339,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<span><?php echo $hostPageDescription->keywords ?></span>
<?php } ?>
<?php } ?>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
<a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
</a>
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
@ -359,9 +359,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -359,9 +359,9 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<?php $i++ ?>
<p>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
<a href="<?php echo $hostPage->hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($hostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
<?php echo htmlentities(urldecode($hostPage->hostURL) . (mb_strlen(urldecode($hostPage->uri)) > 28 ? '...' . mb_substr(urldecode($hostPage->uri), -28) : urldecode($hostPage->uri))) ?>
</a>
<!--
|

4
public/top.php

@ -257,9 +257,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -257,9 +257,9 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php $title = false ?>
<?php } ?>
<?php } ?>
<a href="<?php echo $topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false) . $topHostPage->uri ?>"title="<?php echo trim($title) ?>">
<a href="<?php echo $topHostPage->hostPageURL ?>"title="<?php echo trim($title) ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/file.php?type=identicon&query=<?php echo urlencode($topHostPage->name) ?>" alt="identicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($topHostPage->scheme . '://' . $topHostPage->name . ($topHostPage->port ? ':' . $topHostPage->port : false))) ?>
<?php echo htmlentities(urldecode($topHostPage->hostURL)) ?>
</a>
</td>
<td>

Loading…
Cancel
Save