Browse Source

implement proxied image search #1

main
ghost 2 years ago
parent
commit
6b18202588
  1. 3
      README.md
  2. 34
      library/mysql.php
  3. 28
      library/sphinxql.php
  4. 36
      public/api.php
  5. 91
      public/search.php

3
README.md

@ -62,6 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option @@ -62,6 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
```
GET action=search - required
GET query={string} - optional, search request, empty if not provided
GET type={string} - optional, search type, image|default or empty
GET page={int} - optional, search results page, 1 if not provided
GET mode=SphinxQL - optional, enable extended SphinxQL syntax
```
@ -142,7 +143,7 @@ GET m=SphinxQL @@ -142,7 +143,7 @@ GET m=SphinxQL
* [x] Add robots.txt support (Issue #2)
* [ ] Improve yggdrasil links detection, add .ygg domain zone support
* [ ] Make page description visible - based on the cached content dump when the website description tag is not available; add condition highlights
* [ ] Images search (basically implemented but requires testing and some performance optimization)
* [x] Images search (basically implemented but requires testing and some performance optimization)
* [x] Index cleaner
* [ ] Crawl queue balancer that depends on the available CPU
* [ ] Implement a smart queue algorithm that indexes the homepages of new sites with higher priority

34
library/mysql.php

@ -224,6 +224,15 @@ class MySQL { @@ -224,6 +224,15 @@ class MySQL {
return $query->fetch();
}
/**
 * Fetch every `hostImageToHostPage` row linked to the given image.
 *
 * @param int $hostImageId Value matched against the `hostImageId` column.
 *
 * @return mixed Whatever `fetchAll()` yields for the prepared statement
 *               (row shape depends on the connection's fetch mode, which is
 *               configured outside this method).
 */
public function getHostImageHostPages(int $hostImageId) {
$sql = 'SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ?';
$statement = $this->_db->prepare($sql);
$statement->execute([$hostImageId]);
$rows = $statement->fetchAll();
return $rows;
}
public function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
@ -346,6 +355,31 @@ class MySQL { @@ -346,6 +355,31 @@ class MySQL {
return $query->fetch();
}
/**
 * Load the display data for a single image found by the search index.
 *
 * Selects the image `uri` and `rank`, the owning host's `scheme`, `name` and
 * `port`, plus a `description` column aggregated from all matching
 * `hostImageDescription` rows (each row contributes its `alt` and `title`
 * joined with " | ").
 *
 * @param int $hostImageId Value matched against `hostImage`.`hostImageId`.
 *
 * @return mixed Result of `fetch()` on the statement: the single selected row,
 *               or the driver's "no row" value when nothing matches.
 */
public function getFoundHostImage(int $hostImageId) {
// The statement text is kept exactly as it was; only moved into a named variable
$sql = 'SELECT `hostImage`.`uri`,
`hostImage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
(SELECT GROUP_CONCAT(CONCAT_WS(" | ", `hostImageDescription`.`alt`, `hostImageDescription`.`title`))
FROM `hostImageDescription`
WHERE `hostImageDescription`.`hostImageId` = `hostImage`.`hostImageId`) AS `description`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE `hostImage`.`hostImageId` = ?
LIMIT 1';
$statement = $this->_db->prepare($sql);
$statement->execute([$hostImageId]);
$row = $statement->fetch();
return $row;
}
public function addHostPage(int $hostId,
int $crc32uri,
string $uri,

28
library/sphinxql.php

@ -30,6 +30,25 @@ class SphinxQL { @@ -30,6 +30,25 @@ class SphinxQL {
return $query->fetchAll();
}
/**
 * Run a full-text search over the `hostImage` index.
 *
 * Results are ordered by `rank`, then by match weight, both descending, and
 * each row carries the match weight in a `weight` column.
 *
 * @param string $keyword    Expression bound to `MATCH(?)`.
 * @param int    $start      Requested result offset (zero-based).
 * @param int    $limit      Maximum number of rows to return.
 * @param int    $maxMatches Value for the `max_matches` option; values below 1
 *                           are raised to 1.
 *
 * @return mixed Result of `fetchAll()` on the statement.
 */
public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) {
// Keep `max_matches` at 1 or more, exactly as before
$maxMatchesOption = max(1, $maxMatches);
// Clamp the offset into [0, max_matches - 1].
// The previous check (`$start > $maxMatches`) let an offset equal to
// `max_matches` through, and passed negative offsets (e.g. from a negative
// page number) straight into the LIMIT clause; both make the query fail.
$offset = min(max(0, $start), $maxMatchesOption - 1);
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostImage`
WHERE MATCH(?)
ORDER BY `rank` DESC, WEIGHT() DESC
LIMIT ' . $offset . ',' . $limit . '
OPTION `max_matches`=' . $maxMatchesOption);
$query->execute([$keyword]);
return $query->fetchAll();
}
public function searchHostPagesTotal(string $keyword) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
@ -38,4 +57,13 @@ class SphinxQL { @@ -38,4 +57,13 @@ class SphinxQL {
return $query->fetch()->total;
}
/**
 * Count the `hostImage` index entries that match the given expression.
 *
 * @param string $keyword Expression bound to `MATCH(?)`.
 *
 * @return mixed The `total` column of the single `COUNT(*)` row.
 */
public function searchHostImagesTotal(string $keyword) {
$sql = 'SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)';
$statement = $this->_sphinx->prepare($sql);
$statement->execute([$keyword]);
$row = $statement->fetch();
return $row->total;
}
}

36
public/api.php

@ -30,24 +30,48 @@ if (API_ENABLED) { @@ -30,24 +30,48 @@ if (API_ENABLED) {
// Filter request data
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'page';
$mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default';
$query = !empty($_GET['query']) ? Filter::url($_GET['query']) : '';
$page = !empty($_GET['page']) ? (int) $_GET['page'] : 1;
// Make search request
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Make image search request
if (!empty($type) && $type == 'image') {
$sphinxResultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Make default search request
} else {
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
}
// Generate results
$dbResults = [];
foreach ($sphinxResults as $i => $sphinxResult) {
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
// Image
if (!empty($type) && $type == 'image') {
if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) {
$dbResults[$i] = $hostImage;
$dbResults[$i]->weight = $sphinxResult->weight;
}
// Default
} else {
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
$dbResults[$i] = $hostPage;
$dbResults[$i] = $hostPage;
$dbResults[$i]->weight = $sphinxResult->weight;
$dbResults[$i]->weight = $sphinxResult->weight;
}
}
}

91
public/search.php

@ -24,6 +24,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -24,6 +24,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
]);
// Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'page';
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
$q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
$p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
@ -107,8 +108,16 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { @@ -107,8 +108,16 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
// Search request
if (!empty($q)) {
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal);
if (!empty($t) && $t == 'image') {
$resultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostImages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal);
} else {
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal);
}
} else {
@ -177,6 +186,14 @@ if (!empty($q)) { @@ -177,6 +186,14 @@ if (!empty($q)) {
color: #fff;
}
h3 {
display: block;
font-size: 16px;
font-weight: normal;
margin: 8px 0;
color: #fff;
}
form {
display: block;
max-width: 678px;
@ -208,6 +225,19 @@ if (!empty($q)) { @@ -208,6 +225,19 @@ if (!empty($q)) {
color: #090808
}
label {
font-size: 14px;
position: fixed;
top: 30px;
right: 120px;
color: #fff
}
label > input {
width: auto;
margin: 0 4px;
}
button {
padding: 12px 16px;
border-radius: 4px;
@ -235,12 +265,17 @@ if (!empty($q)) { @@ -235,12 +265,17 @@ if (!empty($q)) {
color: #54a3f7;
}
img {
img.icon {
float: left;
border-radius: 50%;
margin-right: 8px;
}
img.image {
max-width: 100%;
border-radius: 3px;
}
div {
max-width: 640px;
margin: 0 auto;
@ -262,6 +297,7 @@ if (!empty($q)) { @@ -262,6 +297,7 @@ if (!empty($q)) {
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
<h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1>
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" />
<label><input type="checkbox" name="t" value="image" <?php echo (!empty($t) && $t == 'image' ? 'checked="checked"' : false) ?>/> <?php echo _('Images') ?></label>
<button type="submit"><?php echo _('Search'); ?></button>
</form>
</header>
@ -274,15 +310,58 @@ if (!empty($q)) { @@ -274,15 +310,58 @@ if (!empty($q)) {
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<?php if (!empty($t) && $t == 'image' &&
$hostImage = $db->getFoundHostImage($result->id)) { ?>
<?php
// Build the image URL from the stored host parts
$hostImageURL = $hostImage->scheme . '://' .
$hostImage->name .
($hostImage->port ? ':' . $hostImage->port : false) .
$hostImage->uri;
// Convert the remote image to a base64 data URI for privacy reasons
// (the visitor's browser never contacts the remote host for the preview);
// results whose image cannot be resolved or downloaded are skipped
if (!$hostImageType = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
if (!$hostImageData = @file_get_contents($hostImageURL)) continue;
if (!$hostImageBase64 = @base64_encode($hostImageData)) continue;
$hostImageURLencoded = 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64;
// Everything printed below originates from crawled pages and is untrusted:
// escape it before placing it into HTML text or attribute values
$escape = static function ($value) {
return htmlentities((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};
?>
<div>
<a href="<?php echo $escape($hostImageURL) ?>">
<img src="<?php echo $escape($hostImageURLencoded) ?>" alt="<?php echo $escape($hostImage->description) ?>" title="<?php echo $escape($hostImageURL) ?>" class="image" />
</a>
<?php // List every page that references this image; the relation row gets its own name so it no longer shadows $hostPage ?>
<?php foreach ((array) $db->getHostImageHostPages($result->id) as $hostImageHostPage) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostImageHostPage->hostPageId)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<h3><?php echo $escape($hostPage->metaTitle) ?></h3>
<?php if (!empty($hostImage->description)) { ?>
<span><?php echo $escape($hostImage->description) ?></span>
<?php } ?>
<a href="<?php echo $escape($hostPageURL) ?>">
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
<?php echo $escape($hostPageURL) ?>
</a>
<?php } ?>
<?php } ?>
</div>
<?php } else if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php
$hostPageURL = $hostPage->scheme . '://' .
$hostPage->name .
($hostPage->port ? ':' . $hostPage->port : false) .
$hostPage->uri;
?>
<div>
<h2><?php echo $hostPage->metaTitle ?></h2>
<?php if (!empty($hostPage->metaDescription)) { ?>
<span><?php echo $hostPage->metaDescription ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" />
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
<?php echo $hostPageURL ?>
</a>
</div>

Loading…
Cancel
Save