From 6b18202588be17a1d84c280411fb13aa2a064d61 Mon Sep 17 00:00:00 2001 From: ghost Date: Thu, 4 May 2023 03:48:57 +0300 Subject: [PATCH] implement proxied image search #1 --- README.md | 3 +- library/mysql.php | 34 +++++++++++++++++ library/sphinxql.php | 28 ++++++++++++++ public/api.php | 36 +++++++++++++++--- public/search.php | 91 +++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 179 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1da6253..5163a3c 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option ``` GET action=search - required GET query={string} - optional, search request, empty if not provided +GET type={string} - optional, search type, image|default or empty GET page={int} - optional, search results page, 1 if not provided GET mode=SphinxQL - optional, enable extended SphinxQL syntax ``` @@ -142,7 +143,7 @@ GET m=SphinxQL * [x] Add robots.txt support (Issue #2) * [ ] Improve yggdrasil links detection, add .ygg domain zone support * [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights -* [ ] Images search (basically implemented but requires testing and some performance optimization) +* [x] Images search (basically implemented but requires testing and some performance optimization) * [x] Index cleaner * [ ] Crawl queue balancer, that depends from CPU available * [ ] Implement smart queue algorithm that indexing new sites homepage in higher priority diff --git a/library/mysql.php b/library/mysql.php index 24e782d..1abd47a 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -224,6 +224,15 @@ class MySQL { return $query->fetch(); } + public function getHostImageHostPages(int $hostImageId) { + /* Returns every `hostImageToHostPage` row whose `hostImageId` equals the argument (fetchAll, so possibly an empty list) */ + $query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ?'); + + $query->execute([$hostImageId]); + + return $query->fetchAll(); + } + public
function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) { $query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`, @@ -346,6 +355,31 @@ class MySQL { return $query->fetch(); } + public function getFoundHostImage(int $hostImageId) { + /* One row per image: its uri and rank, the scheme/name/port of its host, and the alt/title pairs from `hostImageDescription` concatenated into `description` */ + $query = $this->_db->prepare('SELECT `hostImage`.`uri`, + `hostImage`.`rank`, + `host`.`scheme`, + `host`.`name`, + `host`.`port`, + + (SELECT GROUP_CONCAT(CONCAT_WS(" | ", `hostImageDescription`.`alt`, `hostImageDescription`.`title`)) + + FROM `hostImageDescription` + WHERE `hostImageDescription`.`hostImageId` = `hostImage`.`hostImageId`) AS `description` + + FROM `hostImage` + JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`) + + WHERE `hostImage`.`hostImageId` = ? + + LIMIT 1'); + + $query->execute([$hostImageId]); + + return $query->fetch(); + } + public function addHostPage(int $hostId, int $crc32uri, string $uri, diff --git a/library/sphinxql.php b/library/sphinxql.php index 0859c84..0e27362 100644 --- a/library/sphinxql.php +++ b/library/sphinxql.php @@ -30,6 +30,25 @@ class SphinxQL { return $query->fetchAll(); } + public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) { + /* Searches the `hostImage` index, rank first and match weight second. The offset is clamped into 0 .. max_matches - 1 because searchd refuses a negative offset or one that is not below max_matches */ + $query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight` + + FROM `hostImage` + + WHERE MATCH(?) + + ORDER BY `rank` DESC, WEIGHT() DESC + + LIMIT ' . max(0, min($start, max(1, $maxMatches) - 1)) . ',' . (int) $limit . ' + + OPTION `max_matches`=' . (int) ($maxMatches > 1 ?
$maxMatches : 1)); + + $query->execute([$keyword]); + + return $query->fetchAll(); + } + public function searchHostPagesTotal(string $keyword) { $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)'); @@ -38,4 +57,13 @@ class SphinxQL { return $query->fetch()->total; } + + public function searchHostImagesTotal(string $keyword) { + /* Returns the `total` column of a COUNT(*) over the `hostImage` index documents matching the expression */ + $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)'); + + $query->execute([$keyword]); + + return $query->fetch()->total; + } } diff --git a/public/api.php b/public/api.php index ded5442..164ef7f 100644 --- a/public/api.php +++ b/public/api.php @@ -30,24 +30,48 @@ if (API_ENABLED) { // Filter request data + $type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'page'; $mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default'; $query = !empty($_GET['query']) ? Filter::url($_GET['query']) : ''; $page = !empty($_GET['page']) ? (int) $_GET['page'] : 1; - // Make search request - $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode)); - $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); + // Make image search request + if (!empty($type) && $type == 'image') { + + $sphinxResultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($query, $mode)); + $sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); + + // Make default search request + } else { + + $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode)); + $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT -
API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); + } // Generate results $dbResults = []; foreach ($sphinxResults as $i => $sphinxResult) { - if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) { + // Image + if (!empty($type) && $type == 'image') { + + if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) { + /* Results keep their Sphinx position as key; a match without a database row is skipped */ + $dbResults[$i] = $hostImage; + + $dbResults[$i]->weight = $sphinxResult->weight; + } + + // Default + } else { + + if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) { - $dbResults[$i] = $hostPage; + $dbResults[$i] = $hostPage; - $dbResults[$i]->weight = $sphinxResult->weight; + $dbResults[$i]->weight = $sphinxResult->weight; + } } } diff --git a/public/search.php b/public/search.php index 2c20bad..dbbc433 100644 --- a/public/search.php +++ b/public/search.php @@ -24,6 +24,7 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the ]); // Filter request data +$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'page'; $m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default'; $q = !empty($_GET['q']) ? Filter::url($_GET['q']) : ''; $p = !empty($_GET['p']) ?
(int) $_GET['p'] : 1; @@ -107,8 +108,16 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { // Search request if (!empty($q)) { - $resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m)); - $results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal); + if (!empty($t) && $t == 'image') { + /* t=image: total and current result window are taken from the hostImage index; the total doubles as the max_matches argument */ + $resultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($q, $m)); + $results = $sphinx->searchHostImages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal); + + } else { + /* Any other value of t, including the 'page' default, searches the hostPage index */ + $resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m)); + $results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, $resultsTotal); + } } else { @@ -177,6 +186,14 @@ if (!empty($q)) { color: #fff; } + h3 { + display: block; + font-size: 16px; + font-weight: normal; + margin: 8px 0; + color: #fff; + } + form { display: block; max-width: 678px; @@ -208,6 +225,19 @@ if (!empty($q)) { color: #090808 } + label { + font-size: 14px; + position: fixed; + top: 30px; + right: 120px; + color: #fff + } + + label > input { + width: auto; + margin: 0 4px; + } + button { padding: 12px 16px; border-radius: 4px; @@ -235,12 +265,17 @@ if (!empty($q)) { color: #54a3f7; } - img { + img.icon { float: left; border-radius: 50%; margin-right: 8px; } + img.image { + max-width: 100%; + border-radius: 3px; + } + div { max-width: 640px; margin: 0 auto; @@ -262,6 +297,7 @@ if (!empty($q)) {

+
@@ -274,15 +310,58 @@ if (!empty($q)) { - getFoundHostPage($result->id)) { ?> - scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?> + getFoundHostImage($result->id)) { ?> + scheme . '://' . + $hostImage->name . + ($hostImage->port ? ':' . $hostImage->port : false) . + $hostImage->uri; + + // Convert remote image to base64 string for the privacy reasons + if (!$hostImageType = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue; + if (!$hostImageData = @file_get_contents($hostImageURL)) continue; + if (!$hostImageBase64 = @base64_encode($hostImageData)) continue; + + $hostImageURLencoded = 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64; + + ?> +
+ + <?php echo $hostImage->description ?> + + getHostImageHostPages($result->id) as $hostPage) { ?> + getFoundHostPage($hostPage->hostPageId)) { ?> + scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?> +

metaTitle ?>

+ description)) { ?> + description ?> + + + favicon + + + + +
+ getFoundHostPage($result->id)) { ?> + scheme . '://' . + $hostPage->name . + ($hostPage->port ? ':' . $hostPage->port : false) . + $hostPage->uri; + + ?>

metaTitle ?>

metaDescription)) { ?> metaDescription ?> - favicon + favicon