YGGo/public/search.php

396 lines
14 KiB
PHP
Raw Normal View History

2023-04-01 16:29:39 +00:00
<?php

// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
require_once('../library/sphinxql.php');

// Connect Sphinx search server
$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Filter request data:
//   t - page type (mime group) filter, m - search mode, q - query, p - page number
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'text';
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
$q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';

// Page numbers start at 1; clamp so that p=0 or a negative value
// cannot turn into a negative result offset below
$p = !empty($_GET['p']) ? max(1, (int) $_GET['p']) : 1;

// Build the search expression once, it is shared by all index lookups below
$searchQuery = Filter::searchQuery($q, $m);

// Search request
if (!empty($q)) {

  $resultsTotal = $sphinx->searchHostPagesTotal($searchQuery, $t);

  $results = $sphinx->searchHostPages($searchQuery,
                                      $t,
                                      ($p - 1) * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, // offset of the first result on page $p
                                      WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT,
                                      $resultsTotal);

} else {

  $resultsTotal = 0;
  $results      = [];
}

// Mime list (rendered as the type switcher in the search form)
$hostPagesMime = $sphinx->searchHostPagesMime($searchQuery);

// Define page basics
$totalPages = $sphinx->getHostPagesTotal();

$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
                                            sprintf(_('Over %s pages or enter the new one...'), $totalPages),
                                            sprintf(_('Over %s pages or enter the new one...'), $totalPages),
                                            ]);
2023-04-01 16:29:39 +00:00
// Crawl request: when the query itself is a crawlable URL, register its host
// (if unknown) and put the page into the crawl queue
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {

  $db->beginTransaction();

  try {

    // Parse host info
    if ($hostURL = Parser::hostURL($q)) {

      // Stays false when the host is unknown and may not be registered (disk quota
      // reached), so the page-saving step never runs on undefined host settings
      $hostResolved = false;

      // Host exists
      if ($host = $db->getHost(crc32($hostURL->string))) {

        $hostStatus        = $host->status;
        $hostNsfw          = $host->nsfw;
        $hostPageLimit     = $host->crawlPageLimit;
        $hostMetaOnly      = $host->crawlMetaOnly;
        $hostId            = $host->hostId;
        $hostRobots        = $host->robots;
        $hostRobotsPostfix = $host->robotsPostfix;

        $hostResolved = true;

      // Register new host while the disk quota is not reached
      } else if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) {

        // Get robots.txt if exists
        $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
          $hostRobots = $curl->getContent();
        } else {
          $hostRobots = null;
        }

        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

        $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
        $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
        $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
        $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

        $hostId = $db->addHost( $hostURL->scheme,
                                $hostURL->name,
                                $hostURL->port,
                                crc32($hostURL->string),
                                time(),
                                null,
                                $hostPageLimit,
                                (string) $hostMetaOnly,
                                (string) $hostStatus,
                                (string) $hostNsfw,
                                $hostRobots,
                                $hostRobotsPostfix);

        // Add web root host page to make host visible in the crawl queue
        $db->addHostPage($hostId, crc32('/'), '/', time());

        $hostResolved = true;
      }

      if ($hostResolved) {

        // Parse page URI
        $hostPageURI = Parser::uri($q);

        // Init robots parser: the host's own rules when it has any,
        // the configured defaults otherwise, followed by the postfix rules
        $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);

        // Save page info
        if ($hostStatus && // host enabled
            $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
            $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
           !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists

            $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
        }
      }
    }

    $db->commit();

  } catch (Exception $e) {

    $db->rollBack();

    // Report server-side only: dumping the exception into the page would expose
    // internals (queries, paths, stack trace) to every visitor
    error_log(sprintf('Crawl request failed: %s', $e->getMessage()));
  }
}
?>
<!DOCTYPE html>
<html lang="<?php echo _('en-US'); ?>">
  <head>
    <?php // The encoding declaration goes first: the title below carries the user query, whose length is not bounded here ?>
    <meta charset="utf-8" />
    <?php // Title variants: empty request, first result page, following result pages (with page number) ?>
    <title><?php echo (empty($q) ? _('Empty request - YGGo!') : ($p > 1 ? sprintf(_('%s - #%s - YGGo!'), htmlentities($q), $p) : sprintf(_('%s - YGGo!'), htmlentities($q)))) ?></title>
    <meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
    <meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
<style>
* {
border: 0;
margin: 0;
padding: 0;
font-family: Sans-serif;
}
body {
background-color: #2e3436;
}
header {
background-color: #34393b;
position: fixed;
top: 0;
left: 0;
right: 0;
}
main {
2023-05-04 06:52:08 +00:00
margin-top: 110px;
2023-04-01 16:29:39 +00:00
margin-bottom: 76px;
padding: 0 20px;
2023-04-01 16:29:39 +00:00
}
h1 {
2023-04-02 22:55:26 +00:00
position: fixed;
top: 8px;
left: 24px;
}
h1 > a,
h1 > a:visited,
h1 > a:active,
h1 > a:hover {
2023-04-01 16:29:39 +00:00
color: #fff;
font-weight: normal;
2023-05-04 06:52:08 +00:00
font-size: 24px;
margin: 10px 0;
2023-04-02 22:55:26 +00:00
text-decoration: none;
2023-04-01 16:29:39 +00:00
}
h2 {
display: block;
font-size: 16px;
font-weight: normal;
margin: 4px 0;
color: #fff;
}
form {
display: block;
2023-04-02 22:55:26 +00:00
max-width: 678px;
2023-04-01 16:29:39 +00:00
margin: 0 auto;
text-align: center;
}
input {
width: 100%;
2023-05-04 06:52:08 +00:00
margin: 12px 0;
padding: 10px 0;
2023-04-01 16:29:39 +00:00
border-radius: 32px;
background-color: #000;
color: #fff;
font-size: 16px;
text-align: center;
}
input:hover {
background-color: #111
}
input:focus {
outline: none;
background-color: #111
}
input:focus::placeholder {
color: #090808
}
2023-05-04 00:48:57 +00:00
label {
font-size: 14px;
2023-05-04 06:52:08 +00:00
color: #fff;
float: left;
margin-left: 16px;
margin-bottom: 14px;
2023-05-04 00:48:57 +00:00
}
label > input {
width: auto;
margin: 0 4px;
}
2023-04-01 16:29:39 +00:00
button {
2023-05-04 06:52:08 +00:00
padding: 8px 16px;
2023-04-01 16:29:39 +00:00
border-radius: 4px;
cursor: pointer;
background-color: #3394fb;
color: #fff;
font-size: 14px;
position: fixed;
2023-05-04 06:52:08 +00:00
top: 15px;
2023-04-01 16:29:39 +00:00
right: 24px;
}
button:hover {
background-color: #4b9df4;
}
a, a:visited, a:active {
2023-04-02 22:55:26 +00:00
color: #9ba2ac;
display: inline-block;
2023-04-02 22:30:09 +00:00
font-size: 12px;
2023-04-02 22:55:26 +00:00
margin-top: 8px;
2023-04-01 16:29:39 +00:00
}
a:hover {
color: #54a3f7;
}
2023-05-04 00:48:57 +00:00
img.icon {
2023-04-02 22:30:09 +00:00
float: left;
border-radius: 50%;
margin-right: 8px;
}
2023-05-04 00:48:57 +00:00
img.image {
max-width: 100%;
border-radius: 3px;
}
2023-04-01 16:29:39 +00:00
div {
max-width: 640px;
margin: 0 auto;
padding: 16px 0;
border-top: 1px #000 dashed;
font-size: 14px
}
span {
2023-04-02 19:02:53 +00:00
color: #ccc;
2023-04-01 16:29:39 +00:00
display: block;
margin: 8px 0;
}
p {
margin: 16px 0;
text-align: right;
2023-05-13 00:01:00 +00:00
font-size: 11px;
}
2023-05-13 02:54:15 +00:00
p > a, p > a:visited, p > a:active {
2023-05-13 00:01:00 +00:00
font-size: 11px;
}
2023-04-01 16:29:39 +00:00
</style>
</head>
<body>
<header>
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
2023-04-02 22:55:26 +00:00
<h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1>
2023-04-01 16:29:39 +00:00
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" />
<?php foreach ($hostPagesMime as $hostPageMime) { ?>
2023-05-10 22:34:09 +00:00
<label><input type="radio" name="t" value="<?php echo $hostPageMime->mime ?>" <?php echo ($t == $hostPageMime->mime ? 'checked="checked"' : false) ?>/> <?php echo $hostPageMime->mime ?> <!--(<?php // echo $sphinx->searchHostPagesTotalByMime(Filter::searchQuery($q, $m), $hostPageMime->mime) ?>)--></label>
<?php } ?>
2023-04-01 16:29:39 +00:00
<button type="submit"><?php echo _('Search'); ?></button>
</form>
</header>
<main>
<?php if ($results) { ?>
<div>
2023-04-02 19:02:53 +00:00
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
2023-04-01 22:30:50 +00:00
<?php if ($queueTotal = $db->getTotalPagesByHttpCode(null)) { ?>
2023-04-02 19:02:53 +00:00
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
2023-04-01 22:30:50 +00:00
<?php } ?>
2023-04-01 16:29:39 +00:00
</div>
<?php foreach ($results as $result) { ?>
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<div>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
2023-05-11 04:40:09 +00:00
<?php if (!empty($hostPageDescription->title)) { ?>
<h2><?php echo $hostPageDescription->title ?></h2>
<?php } ?>
<?php if (!empty($hostPageDescription->description)) { ?>
<span><?php echo $hostPageDescription->description ?></span>
<?php } ?>
2023-05-11 04:40:09 +00:00
<?php if (!empty($hostPageDescription->keywords)) { ?>
<span><?php echo $hostPageDescription->keywords ?></span>
<?php } ?>
<?php } ?>
2023-05-13 00:01:00 +00:00
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
2023-05-04 00:48:57 +00:00
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
2023-05-13 02:54:15 +00:00
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 36 ? '...' . mb_substr(urldecode($hostPage->uri), -36) : urldecode($hostPage->uri))) ?>
</a>
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>">
<?php echo _('explore'); ?>
</a>
2023-05-13 00:01:00 +00:00
<?php if ($result->mime != 'text' && $totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($result->id)) { ?>
2023-05-13 00:51:34 +00:00
<p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
]) ?>
</p>
2023-05-13 02:54:15 +00:00
<?php $i = 1 ?>
<?php foreach ($db->getHostPageIdSourcesByHostPageIdTarget($result->id, 5) as $hostPageIdSource) { ?>
2023-05-13 00:01:00 +00:00
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
2023-05-13 02:54:15 +00:00
<?php $i++ ?>
2023-05-13 00:01:00 +00:00
<p>
<a href="<?php echo $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>">
<img src="<?php echo WEBSITE_DOMAIN; ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
2023-05-13 02:54:15 +00:00
<?php echo htmlentities(urldecode($hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false)) . (mb_strlen(urldecode($hostPage->uri)) > 36 ? '...' . mb_substr(urldecode($hostPage->uri), -36) : urldecode($hostPage->uri))) ?>
</a>
<!--
|
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $hostPage->hostPageId ?>">
<?php echo _('explore'); ?>
2023-05-13 00:01:00 +00:00
</a>
2023-05-13 02:54:15 +00:00
-->
2023-05-13 00:01:00 +00:00
</p>
<?php } ?>
<?php } ?>
2023-05-13 02:54:15 +00:00
<?php if ($i < $totalHostPageIdSources) { ?>
<p>
<a href="<?php echo WEBSITE_DOMAIN; ?>/explore.php?hp=<?php echo $result->id ?>#referrers">
<?php echo _('view all'); ?>
</a>
</p>
<?php } ?>
2023-05-13 00:01:00 +00:00
<?php } ?>
</div>
<?php } ?>
2023-04-01 16:29:39 +00:00
<?php } ?>
2023-05-13 02:54:15 +00:00
<?php if ($p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT <= $resultsTotal) { ?>
2023-04-02 20:36:35 +00:00
<div>
2023-05-13 00:51:34 +00:00
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&t=<?php echo $t ?>&m=<?php echo $m ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
2023-04-02 20:36:35 +00:00
</div>
<?php } ?>
2023-04-01 16:29:39 +00:00
<?php } else { ?>
<div style="text-align:center">
2023-04-02 19:02:53 +00:00
<span><?php echo sprintf(_('Total found: %s'), $resultsTotal) ?></span>
2023-04-01 22:30:50 +00:00
<?php if ($q && $queueTotal = $db->getTotalPagesByHttpCode(null)) { ?>
2023-04-02 19:02:53 +00:00
<span><?php echo sprintf(_('* Please wait for all pages crawl to complete (%s in queue).'), $queueTotal) ?></span>
2023-04-01 22:30:50 +00:00
<?php } ?>
2023-04-01 16:29:39 +00:00
</div>
<?php } ?>
</main>
</body>
</html>