
refactor to mime-based content index #1

main
ghost 2 years ago
parent
commit
db0e66c846
  1. 17
      README.md
  2. 88
      config/app.php.txt
  3. 51
      config/sphinx.conf.txt
  4. 65
      crontab/cleaner.php
  5. 408
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 18
      library/filter.php
  8. 488
      library/mysql.php
  9. 48
      library/sphinxql.php
  10. BIN
      media/db-prototype.png
  11. 36
      public/api.php
  12. 1
      public/index.php
  13. 225
      public/search.php

17
README.md

@@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
```
GET action=search - required
GET query={string} - optional, search request, empty if not provided
GET type={string} - optional, search type, image|default or empty
GET type={string} - optional, filter by MIME type from the available list, or empty
GET page={int} - optional, search results page, 1 if not provided
GET mode=SphinxQL - optional, enable extended SphinxQL syntax
```
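For example, a MIME-filtered search request could look like the following; the `query` value and the `png` token are illustrative only (`type` takes one of the tokens reported by the MIME list):

```
GET /api.php?action=search&query=yggdrasil&type=png&page=1
```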
@@ -141,7 +141,7 @@ GET m=SphinxQL
##### Basic features
* [x] Web pages full text ranking search
* [x] Images search with safe proxy preview support
* [x] MIME filtering search with safe proxy images preview
* [x] Extended syntax support
* [x] Flexible settings compatible with IPv4/IPv6 networks
@@ -159,18 +159,14 @@ GET m=SphinxQL
* [ ] Index API
+ [x] Manifest
+ [x] Search
+ [x] Pages
+ [x] Images
+ [x] Hosts
+ [ ] Pages
+ [ ] Images
+ [ ] MIME list
* [ ] Context advertising API
##### Crawler
* [x] Auto crawl links by regular expression rules
+ [x] Pages
+ [x] Images
+ [x] Manifests
* [x] Robots.txt / robots meta tags support (#2)
* [x] Specific rules configuration for every host
@@ -181,8 +177,6 @@ GET m=SphinxQL
* [x] Ban non-condition links to prevent extra requests
* [x] Debug log
* [x] History snaps
+ [x] Pages
+ [x] Images
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter
@@ -191,17 +185,12 @@ GET m=SphinxQL
##### Cleaner
* [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages
+ [x] Images
+ [x] Manifests
+ [x] Logs
+ [x] Crawler
+ [x] Cleaner
* [x] Deprecated history snaps removing
+ [x] Pages
+ [x] Images
* [x] Banned resources reset by timeout
+ [x] Pages
+ [x] Images
* [x] Debug log
##### Other

88
config/app.php.txt

@@ -47,7 +47,7 @@ error_reporting(E_ALL);
* Project domain, without slash on postfix
*
*/
define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));
define('WEBSITE_DOMAIN', '');
/*
* Page search results before show the read more link
@@ -55,18 +55,6 @@ define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on'
*/
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
/*
* Image search results before show the read more link
*
*/
define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);
/*
* Quantity of related pages for each image in the search results
*
*/
define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
/*
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
*
@@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
define('DB_HOST', 'localhost');
define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
@@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/
define('CRAWL_PAGE_LIMIT', 20);
/*
* Images (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_IMAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
*/
define('CRAWL_IMAGE_LIMIT', 10);
/*
* Manifest (URI) processing limit in the crawler.php queue
*
@@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated
*
*/
define('CRAWL_PAGE_MIME', 'text/html');
/*
* Index images match MIME types
*
* comma separated
*
*/
define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/*
* Renew image index by timing offset provided
*
* This option works with CRAWL_IMAGE_LIMIT step queue
*
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
* must have enough value to crawl all images collected in the DB index
*
* or the crawler can stuck in queue
*
*/
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
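The comma-separated whitelist is matched against the response Content-Type header before a page is indexed; a minimal sketch of such a check, modelled on the strpos loop the crawler already uses (the helper name is illustrative, not part of this commit):

```php
<?php
// Sketch: accept a Content-Type when it matches one of the comma-separated
// MIME types configured above (e.g. in CRAWL_PAGE_MIME).
function mimeAllowed(string $contentType, string $allowedList): bool
{
  foreach (explode(',', $allowedList) as $mime) {
    if (false !== stripos($contentType, trim($mime))) {
      return true;
    }
  }
  return false;
}

// mimeAllowed('text/html; charset=UTF-8', 'text/html,image/png'); // true
```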
/*
* Renew manifests index by timing offset provided
@@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
* Only URL addresses match this rule will be auto-crawled
*
*/
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
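As a quick sanity check, the narrowed pattern accepts bracketed IPv6 (Yggdrasil) hosts only; the addresses below are made up for illustration:

```php
<?php
// Sketch: the IPv6-only pattern matches bracketed hosts and rejects plain domains.
$pattern = '/^http:\/\/\[[\w:]+\].*$/ui';

var_dump(preg_match($pattern, 'http://[200:2d1a::1]/index.html')); // int(1)
var_dump(preg_match($pattern, 'http://example.com/index.html'));   // int(0)
```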
/*
* Pages limit per new host by default
@@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
*
*/
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
/*
* Set default auto-crawl status for new host added
@@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* This option able to change search results relevance
* This option enables image data caching in base64
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
@@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false);
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
/*
* Not suitable/safe for work status for new host by default
*
* Could be filtered in crawl conditions or search results
*
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
*
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
/*
* Default robots.txt rules on remote file not exists
* The crawler able to overwrite these rules
@@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.7);
define('CRAWL_MANIFEST_API_VERSION', 0.8);
/*
* Set default auto-crawl status for new manifest added
@@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
/*
* Remove image ban after following time
*
* This option used in crawler and search page
* to prevent extra http requests to unavailable or not condition resources
*
*/
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove image description history after following time
*
*/
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
// API settings
@@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS',
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
/*
* Manifest API

51
config/sphinx.conf.txt

@@ -12,36 +12,24 @@ source common
source hostPage : common
{
sql_query = \
SELECT hostPage.hostPageId, \
hostPage.rank, \
hostPage.uri, \
host.name, \
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
hostPageDescription.metaDescription, \
hostPageDescription.metaKeywords) \
FROM hostPageDescription \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
LIMIT 1) AS pageDescription \
FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
sql_attr_uint = rank
}
source hostImage : common
{
sql_query = \
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
SELECT `hostPage`.`hostPageId`, \
`hostPage`.`uri`, \
`host`.`name`, \
REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \
(SELECT COUNT(*) FROM `hostPageToHostPage` \
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
`hostPageDescription`.`description`, \
`hostPageDescription`.`keywords`)) \
FROM `hostPageDescription` \
WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
FROM `hostPage` \
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL
sql_attr_uint = rank
sql_attr_string = mime
}
index hostPage
@@ -50,10 +38,3 @@ index hostPage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostPage
}
index hostImage
{
source = hostImage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostImage
}
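The `mime` attribute indexed by the new source keeps only the subtype of the stored Content-Type, so search filters can use short tokens such as `html` or `png`. An equivalent of the REGEXP_REPLACE() extraction in PHP, for illustration only:

```php
<?php
// Sketch: reduce a stored Content-Type value to the subtype token that the
// REGEXP_REPLACE() expression above writes into the Sphinx `mime` attribute.
foreach (['text/html; charset=UTF-8', 'image/svg+xml', 'application/xhtml+xml'] as $contentType) {
  echo preg_replace('/^[A-z-]+\/([A-z-]+).*/', '$1', $contentType), PHP_EOL;
}
// Prints: html, svg, xhtml
```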

65
crontab/cleaner.php

@@ -31,11 +31,8 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0;
$hostImagesDeleted = 0;
$hostImageDescriptionsDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
$logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0;
@@ -69,22 +66,6 @@ try {
// Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
// Apply host images limits
$totalHostImages = $db->getTotalHostImages($host->hostId);
if ($totalHostImages > $host->crawlImageLimit) {
foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
// Apply host pages limits
$totalHostPages = $db->getTotalHostPages($host->hostId);
@@ -92,55 +73,31 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
}
// Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
foreach ($db->getHostImages($host->hostId) as $hostImage) {
if (!$robots->uriAllowed($hostImage->uri)) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
// Clean up host images unrelated to host pages
foreach ($db->getUnrelatedHostImages() as $hostImage) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
@@ -207,12 +164,6 @@ try {
// Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Reset banned images
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
// Delete image description history
$hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
// Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
@@ -238,9 +189,6 @@ if (CLEAN_LOG_ENABLED) {
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
$logsCleanerDeleted,
@@ -256,15 +204,12 @@ if (CLEAN_LOG_ENABLED) {
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;

408
crontab/crawler.php

@@ -33,16 +33,12 @@ $httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
$hostImagesBanned = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@@ -167,19 +163,8 @@ try {
// Validate formatted link
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
// Host exists
if ($host = $db->getHost(crc32($hostURL))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
// Host not exists
if (!$db->getHost(crc32($hostURL))) {
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
@@ -198,159 +183,34 @@ try {
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($remoteManifestHosts->result->scheme,
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
$remoteManifestHosts->result->name,
$remoteManifestHosts->result->port,
crc32($hostURL),
time(),
null,
$hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
if ($hostId) {
$hostsAdded++;
} else {
continue;
}
}
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save home page info
// Until page API not implemented, save at least home page to have ability to crawl
// @TODO
if ($hostStatus && // host enabled
$robots->uriAllowed('/') && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32('/'))) { // page not exists
if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
}
}
}
}
}
// Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
// Init image request
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip image processing non 200 code
if (200 != $curl->getCode()) {
$db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Skip image processing on MIME type not allowed in settings
$hostImageBanned = true;
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
if (false !== strpos($hostImageContentType, trim($mime))) {
$hostImageBanned = false;
break;
}
}
if ($hostImageBanned) {
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Convert remote image data to base64 string
if (!$queueHostImage->crawlMetaOnly) {
// Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
// Set host image description
// On link collection we knew meta but data,
// this step use latest description slice and insert the data received by curl request
if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
$db->setHostImageDescription($queueHostImage->hostImageId,
crc32($lastHostImageDescription->alt .
$lastHostImageDescription->title .
$hostImageData),
$lastHostImageDescription->alt,
$lastHostImageDescription->title,
$hostImageData,
time(),
time());
}
}
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
Filter::mime($hostImageContentType),
time());
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
@@ -476,12 +336,11 @@ try {
time());
// Add queued page description if not exists
$db->setHostPageDescription($queueHostPage->hostPageId,
crc32($content),
$db->addHostPageDescription($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
$queueHostPage->crawlMetaOnly ? null : Filter::string($content),
$queueHostPage->crawlMetaOnly ? null : base64_encode($content),
time());
// Update manifest registry
@@ -499,155 +358,42 @@ try {
}
}
// Collect page images
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
// Init links registry
$links = [];
// Collect image links
foreach (@$dom->getElementsByTagName('img') as $img) {
// Skip images without src attribute
if (!$imageSrc = @$img->getAttribute('src')) {
if (!$src = @$img->getAttribute('src')) {
continue;
}
// Skip images without alt attribute
if (!$imageAlt = @$img->getAttribute('alt')) {
if (!$alt = @$img->getAttribute('alt')) {
continue;
}
if (!$imageTitle = @$img->getAttribute('title')) {
$imageTitle = null;
if (!$title = @$img->getAttribute('title')) {
$title = null;
}
// Add domain to the relative src links
if (!parse_url($imageSrc, PHP_URL_HOST)) {
$imageSrc = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
}
// Validate formatted src link
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {
// Parse formatted src link
$hostImageURL = Parser::hostURL($imageSrc);
$hostImageURI = Parser::uri($imageSrc);
// Host exists
if ($host = $db->getHost(crc32($hostImageURL->string))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($hostImageURL->scheme,
$hostImageURL->name,
$hostImageURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
if ($hostId) {
$hostsAdded++;
} else {
continue;
}
}
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save new image info
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
if (!$hostImageId && // image not exists
$hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
// Add host image
if ($hostImageId = $db->addHostImage($hostId,
crc32($hostImageURI->string),
$hostImageURI->string,
time())) {
$hostImagesAdded++;
} else {
// Skip encoded content
if (false !== strpos($src, 'data:')) {
continue;
}
}
// Add/update host image description
$imageAlt = Filter::imageAlt($imageAlt);
$imageTitle = Filter::imageTitle($imageTitle);
$db->setHostImageDescription($hostImageId,
crc32($imageAlt . $imageTitle),
$imageAlt,
$imageTitle,
null,
time(),
null);
// Relate host image with host page was found
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
// Increase image rank when link does not match the current host
if ($hostImageURL->scheme . '://' .
$hostImageURL->name .
($hostImageURL->port ? ':' . $hostImageURL->port : '')
!=
$queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
}
}
}
// Add link to queue
$links[] = [
'title' => null,
'description' => null,
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
'data' => null,
'ref' => $src,
];
}
// Collect internal links from page content
@@ -659,6 +405,11 @@ try {
continue;
}
// Get title attribute if available
if (!$title = @$a->getAttribute('title')) {
$title = null;
}
// Skip anchor links
if (false !== strpos($href, '#')) {
@@ -683,23 +434,34 @@ try {
continue;
}
// @TODO skip other apps
// Add link to queue
$links[] = [
'title' => null,
'description' => null,
'keywords' => Filter::pageKeywords($title),
'data' => null,
'ref' => $href,
];
}
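Both the `<img>` and `<a>` collectors above now feed a single queue of link records; for reference, each entry carries the fields shown below (a descriptive sketch of the array built in this commit, values are placeholders):

```php
<?php
// Sketch: shape of one record in the unified $links queue built above.
$links = [];
$links[] = [
  'title'       => null,                   // page title, when known at collection time
  'description' => null,                   // meta description, when known
  'keywords'    => 'alt text,title text',  // derived from alt/title attributes of the tag
  'data'        => null,                   // raw content, kept only when crawlMetaOnly is off
  'ref'         => '/relative/or/absolute/link',
];
```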
// Process links collected
foreach ($links as $link) {
// Add absolute URL prefixes to the relative links found
if (!parse_url($href, PHP_URL_HOST)) {
//Make relative links absolute
if (!parse_url($link['ref'], PHP_URL_HOST)) {
$href = $queueHostPage->scheme . '://' .
$link['ref'] = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
}
// Validate formatted link
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
// Parse formatted link
$hostURL = Parser::hostURL($href);
$hostPageURI = Parser::uri($href);
$hostURL = Parser::hostURL($link['ref']);
$hostPageURI = Parser::uri($link['ref']);
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
@@ -707,7 +469,7 @@ try {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
@@ -731,30 +493,33 @@ try {
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
if ($hostId) {
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
} else {
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
continue;
}
@@ -766,25 +531,27 @@ try {
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
} else {
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
$db->addHostPageDescription($hostPageId,
$link['title'],
$link['description'],
$link['keywords'],
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
time());
$hostPagesAdded++;
}
}
// Increase page rank when link does not match the current host
if ($hostURL->scheme . '://' .
$hostURL->name .
($hostURL->port ? ':' . $hostURL->port : '')
!=
$queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
}
}
}
@@ -811,10 +578,6 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesBanned,
$hostImagesIndexed,
$hostImagesProcessed,
$hostImagesAdded,
$hostImagesBanned,
$manifestsProcessed,
$manifestsAdded,
$httpRequestsTotal,
@@ -832,11 +595,6 @@ echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

18
library/filter.php

@@ -54,24 +54,6 @@ class Filter {
return $keywords;
}
static public function imageAlt(mixed $alt) {
$alt = (string) $alt;
$alt = trim($alt);
return $alt;
}
static public function imageTitle(mixed $title) {
$title = (string) $title;
$title = trim($title);
return $title;
}
static public function pageData(mixed $data) {
$data = (string) $data;

488
library/mysql.php

@@ -102,267 +102,53 @@ class MySQL {
return $query->fetch()->total;
}
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);
return $this->_db->lastInsertId();
}
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
$query->execute([$robots, $timeUpdated, $hostId]);
return $query->rowCount();
}
// Images
public function getTotalHostImages(int $hostId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?');
$query->execute([$hostId]);
return $query->fetch()->total;
}
public function getHostImageId(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount() ? $query->fetch()->hostImageId : 0;
}
public function getHostImages(int $hostId) {
$query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?');
$query->execute([$hostId]);
return $query->fetchAll();
}
public function getUnrelatedHostImages() {
$query = $this->_db->prepare('SELECT * FROM `hostImage`
WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
FROM `hostImageToHostPage`
WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)');
$query->execute();
return $query->fetchAll();
}
public function getHostImagesByLimit(int $hostId, int $limit) {
$query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit);
$query->execute([$hostId]);
return $query->fetchAll();
}
public function addHostImage(int $hostId,
int $crc32uri,
string $uri,
public function addHost(string $scheme,
string $name,
mixed $port,
int $crc32url,
int $timeAdded,
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null) {
$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
`crc32uri`,
`uri`,
mixed $timeUpdated,
int $crawlPageLimit,
string $crawlMetaOnly,
string $status,
string $nsfw,
mixed $robots,
mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
`name`,
`port`,
`crc32url`,
`timeAdded`,
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
return $this->_db->lastInsertId();
}
public function updateHostImageRank(int $hostId,
int $crc32uri,
int $increment) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = `rank` + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount();
}
public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeBanned, $hostImageId]);
return $query->rowCount();
}
public function updateHostImageHttpCode(int $hostImageId,
int $httpCode,
int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `httpCode` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$httpCode, $timeUpdated, $hostImageId]);
return $query->rowCount();
}
public function updateHostImageMime(int $hostImageId,
string $mime,
int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $timeUpdated, $hostImageId]);
return $query->rowCount();
}
public function updateHostImage(int $hostImageId,
string $mime,
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]);
return $query->rowCount();
}
public function deleteHostImage(int $hostImageId) {
$query = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$hostImageId]);
return $query->rowCount();
}
public function setHostImageDescription(int $hostImageId,
int $crc32id,
string $alt,
string $title,
mixed $data,
int $timeAdded,
mixed $timeUpdated) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`alt`,
`title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `alt` = ?,
`title` = ?,
`timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
return $this->_db->lastInsertId();
}
public function setHostImageDescriptionData(int $hostImageId,
int $crc32id,
mixed $data,
int $timeAdded,
mixed $timeUpdated) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`data`,
`timeAdded`) VALUES (?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $data, $timeAdded, $timeUpdated]);
`crawlPageLimit`,
`crawlMetaOnly`,
`status`,
`nsfw`,
`robots`,
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $scheme,
$name,
$port,
$crc32url,
$timeAdded,
$timeUpdated,
$crawlPageLimit,
$crawlMetaOnly,
$status,
$nsfw,
$robots,
$robotsPostfix]);
return $this->_db->lastInsertId();
}
public function deleteHostImageDescription(int $hostImageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?');
$query->execute([$hostImageId]);
return $query->rowCount();
}
public function getLastHostImageDescription(int $hostImageId) {
$query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
$query->execute([$hostImageId]);
return $query->fetch();
}
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
JOIN `hostPage` ON (`hostPage`.`hostPageId` = `hostImageToHostPage`.`hostPageId`)
WHERE `hostImageId` = ?
ORDER BY `hostPage`.`rank` DESC, RAND(`hostPage`.`hostId`)
LIMIT ' . (int) $limit);
$query->execute([$hostImageId]);
return $query->fetchAll();
}
public function getHostImageHostPagesTotal(int $hostImageId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
$query->execute([$hostImageId]);
return $query->fetch()->total;
}
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`,
`timeAdded`,
`timeUpdated`,
`quantity`) VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
`quantity` = `quantity` + ' . (int) $quantity);
$query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]);
return $query->rowCount(); // no primary key
}
public function deleteHostImageToHostPage(int $hostImageId) {
public function updateHostRobots(int $hostId, mixed $robots, int $timeUpdated) {
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
$query = $this->_db->prepare('UPDATE `host` SET `robots` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');
$query->execute([$hostImageId]);
$query->execute([$robots, $timeUpdated, $hostId]);
return $query->rowCount();
}
@@ -421,18 +207,9 @@ class MySQL {
return $query->fetchAll();
}
public function getHostPageDescription(int $hostPageId, int $crc32data) {
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
$query->execute([$hostPageId, $crc32data]);
return $query->fetch();
}
public function getLastPageDescription(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
$query->execute([$hostPageId]);
@@ -442,7 +219,6 @@ class MySQL {
public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
`hostPage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
@@ -459,28 +235,6 @@ class MySQL {
return $query->fetch();
}
public function getFoundHostImage(int $hostImageId) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostImageId`,
`hostImage`.`uri`,
`hostImage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE `hostImage`.`hostImageId` = ?
LIMIT 1');
$query->execute([$hostImageId]);
return $query->fetch();
}
public function addHostPage(int $hostId,
int $crc32uri,
string $uri,
@@ -488,8 +242,7 @@ class MySQL {
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null) {
mixed $mime = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`,
@@ -498,10 +251,9 @@ class MySQL {
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
`mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);
return $this->_db->lastInsertId();
}
@@ -515,22 +267,6 @@ class MySQL {
return $query->rowCount();
}
public function updateHostPageRank(int $hostId,
int $crc32uri,
int $increment) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = `rank` + ' . (int) $increment . '
WHERE `hostId` = ?
AND `crc32uri` = ?
LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount();
}
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
@@ -576,48 +312,52 @@ class MySQL {
return $query->rowCount();
}
public function deleteHostPageToHostImage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
$query->execute([$hostPageId]);
return $query->rowCount();
}
public function setHostPageDescription(int $hostPageId,
int $crc32data,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
public function addHostPageDescription(int $hostPageId,
mixed $title,
mixed $description,
mixed $keywords,
mixed $data,
int $time) {
int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
`crc32data`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`title`,
`description`,
`keywords`,
`data`,
`timeAdded`
) VALUES (?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
) VALUES (?, ?, ?, ?, ?, ?)');
$query->execute([
$hostPageId,
$crc32data,
$metaTitle,
$metaDescription,
$metaKeywords,
$title,
$description,
$keywords,
$data,
$time,
$time
$timeAdded,
]);
return $query->rowCount();
}
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('INSERT INTO `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`, `quantity`) VALUES (?, ?, 0)
ON DUPLICATE KEY UPDATE `quantity` = `quantity` + 1');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
}
public function deleteHostPageToHostPage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?');
$query->execute([$hostPageId, $hostPageId]);
return $query->rowCount();
}
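The new `hostPageToHostPage` relation replaces the stored per-page `rank` column: every cross-host link found by the crawler adds (or increments) a source-to-target row, and the Sphinx source derives `rank` by counting those rows at index time. A minimal usage sketch, assuming the config constants are already loaded and using placeholder IDs:

```php
<?php
// Sketch: register a link from the page being crawled to the page it references.
// The Sphinx `rank` attribute is later derived by counting these relations.
require_once 'config/app.php';     // path is illustrative
require_once 'library/mysql.php';  // path is illustrative

$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

$sourceHostPageId = 1; // page where the link was found (placeholder)
$targetHostPageId = 2; // page the link points to (placeholder)

$db->addHostPageToHostPage($sourceHostPageId, $targetHostPageId);
```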
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {
@@ -652,33 +392,12 @@ class MySQL {
return $query->rowCount();
}
public function resetBannedHostImages(int $timeOffset) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute();
return $query->rowCount();
}
public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {
$query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
$query->execute();
return $query->rowCount();
}
public function addCleanerLog(int $timeAdded,
int $hostsTotal,
int $hostsUpdated,
int $hostPagesDeleted,
int $hostPageDescriptionsDeleted,
int $hostPagesBansRemoved,
int $hostImagesDeleted,
int $hostImageDescriptionsDeleted,
int $hostImagesBansRemoved,
int $manifestsTotal,
int $manifestsDeleted,
int $logsCleanerDeleted,
@@ -695,9 +414,6 @@ class MySQL {
`hostPagesDeleted`,
`hostPageDescriptionsDeleted`,
`hostPagesBansRemoved`,
`hostImagesDeleted`,
`hostImageDescriptionsDeleted`,
`hostImagesBansRemoved`,
`manifestsTotal`,
`manifestsDeleted`,
`logsCleanerDeleted`,
@@ -706,7 +422,7 @@ class MySQL {
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
@@ -715,9 +431,6 @@ class MySQL {
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
$logsCleanerDeleted,
@@ -751,7 +464,6 @@ class MySQL {
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`
@@ -762,7 +474,7 @@ class MySQL {
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostPage`.`timeBanned` IS NULL
ORDER BY `hostPage`.`rank` DESC, RAND()
ORDER BY RAND()
LIMIT ' . (int) $limit);
@@ -780,40 +492,6 @@ class MySQL {
return $query->rowCount();
}
public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
`hostImage`.`hostImageId`,
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostImage`.`timeBanned` IS NULL
ORDER BY `hostImage`.`rank` DESC, RAND()
LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $hostImageId]);
return $query->rowCount();
}
public function getManifestCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `manifest`
@@ -844,10 +522,6 @@ class MySQL {
int $hostPagesIndexed,
int $hostPagesAdded,
int $hostPagesBanned,
int $hostImagesIndexed,
int $hostImagesProcessed,
int $hostImagesAdded,
int $hostImagesBanned,
int $manifestsProcessed,
int $manifestsAdded,
int $httpRequestsTotal,
@@ -862,17 +536,13 @@ class MySQL {
`hostPagesIndexed`,
`hostPagesAdded`,
`hostPagesBanned`,
`hostImagesIndexed`,
`hostImagesProcessed`,
`hostImagesAdded`,
`hostImagesBanned`,
`manifestsProcessed`,
`manifestsAdded`,
`httpRequestsTotal`,
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
@@ -881,10 +551,6 @@ class MySQL {
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesBanned,
$hostImagesIndexed,
$hostImagesProcessed,
$hostImagesAdded,
$hostImagesBanned,
$manifestsProcessed,
$manifestsAdded,
$httpRequestsTotal,

48
library/sphinxql.php

@@ -11,13 +11,13 @@ class SphinxQL {
$this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
}
public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) {
public function searchHostPages(string $keyword, string $mime, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostPage`
WHERE MATCH(?)
WHERE MATCH(?) AND `mime` = ?
ORDER BY `rank` DESC, WEIGHT() DESC
@@ -25,26 +25,7 @@ class SphinxQL {
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
$query->execute([$keyword]);
return $query->fetchAll();
}
public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostImage`
WHERE MATCH(?)
ORDER BY `rank` DESC, WEIGHT() DESC
LIMIT ' . (int) ($start >= $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . '
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
$query->execute([$keyword]);
$query->execute([$keyword, $mime]);
return $query->fetchAll();
}
@@ -58,29 +39,20 @@ class SphinxQL {
return $query->fetch()->total;
}
public function searchHostPagesTotal(string $keyword) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
$query->execute([$keyword]);
return $query->fetch()->total;
}
public function searchHostImagesTotal(string $keyword) {
public function getHostPagesMime() {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)');
$query = $this->_sphinx->prepare('SELECT `mime` FROM `hostPage` GROUP BY `mime` ORDER BY `mime` ASC');
$query->execute([$keyword]);
$query->execute();
return $query->fetch()->total;
return $query->fetchAll();
}
public function getHostImagesTotal() {
public function searchHostPagesTotal(string $keyword, string $mime) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage`');
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?) AND `mime` = ?');
$query->execute();
$query->execute([$keyword, $mime]);
return $query->fetch()->total;
}
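A usage sketch of the reworked SphinxQL helpers; the paths, host constants, and the `html` filter value are assumptions for illustration, and production callers wrap the keyword with `Filter::searchQuery()` first:

```php
<?php
require_once 'config/app.php';        // path is illustrative
require_once 'library/sphinxql.php';  // path is illustrative

$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);

// List the MIME groups currently present in the index (feeds the type filter)
foreach ($sphinx->getHostPagesMime() as $row) {
  echo $row->mime, PHP_EOL;
}

// Count and fetch matches restricted to a single MIME group
$total   = $sphinx->searchHostPagesTotal('yggdrasil', 'html');
$results = $sphinx->searchHostPages('yggdrasil', 'html', 0, 100, $total);
```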

BIN
media/db-prototype.png

Binary file not shown.

Size: 134 KiB (before), 147 KiB (after)

36
public/api.php

@@ -1,7 +1,7 @@
<?php
// Current version
define('API_VERSION', 0.7);
define('API_VERSION', 0.8);
// Load system dependencies
require_once('../config/app.php');
@@ -30,42 +30,20 @@ if (API_ENABLED) {
// Filter request data
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'page';
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'html';
$mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default';
$query = !empty($_GET['query']) ? Filter::url($_GET['query']) : '';
$page = !empty($_GET['page']) ? (int) $_GET['page'] : 1;
// Make image search request
if (!empty($type) && $type == 'image') {
$sphinxResultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Make default search request
} else {
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
}
// Make search request
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode), $type);
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $type, $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Generate results
$dbResults = [];
foreach ($sphinxResults as $i => $sphinxResult) {
// Image
if (!empty($type) && $type == 'image') {
if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) {
$dbResults[$i] = $hostImage;
$dbResults[$i]->weight = $sphinxResult->weight;
}
// Default
} else {
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
$dbResults[$i] = $hostPage;
@@ -73,7 +51,6 @@ if (API_ENABLED) {
$dbResults[$i]->weight = $sphinxResult->weight;
}
}
}
// Make response
$response = [
@@ -129,13 +106,10 @@ if (API_ENABLED) {
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMime' => CRAWL_PAGE_MIME,
'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
'crawlHostImageMime' => CRAWL_IMAGE_MIME,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

1
public/index.php

@@ -24,7 +24,6 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php if (API_MANIFEST_ENABLED) { ?>
<meta name="yggo:manifest" content="<?php echo sprintf('%s/api.php?action=manifest', WEBSITE_DOMAIN) ?>" />
<?php } ?>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
<style>

225
public/search.php

@@ -16,34 +16,34 @@ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'page';
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'html';
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
$q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
$p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
// Define page basics
switch ($t) {
// Search request
if (!empty($q)) {
case 'image':
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m), $t);
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $t, $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
$totalPages = $sphinx->getHostImagesTotal();
} else {
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s image or enter the new one...'), $totalPages),
sprintf(_('Over %s images or enter the new one...'), $totalPages),
sprintf(_('Over %s images or enter the new one...'), $totalPages),
]);
$resultsTotal = 0;
$results = [];
}
// Mime list
$hostPagesMime = $sphinx->getHostPagesMime();
break;
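// assumption: getHostPagesMime() returns the distinct MIME types currently present in the index;
// the list is used below to render one filter option per type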
default:
// Define page basics
$totalPages = $sphinx->getHostPagesTotal();
$totalPages = $sphinx->getHostPagesTotal();
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
}
// Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
@@ -61,6 +61,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
@@ -82,21 +83,26 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostId = $db->addHost( $hostURL->scheme,
$hostURL->name,
$hostURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
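// crc32() of the URI appears to be stored as an integer hash next to the page row for fast
// duplicate lookups; registering '/' here seeds the host's web root into the crawl queue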
}
}
@@ -120,28 +126,10 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
} catch(Exception $e){
$db->rollBack();
}
}
var_dump($e);
// Search request
if (!empty($q)) {
if ($t == 'image') {
$resultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostImages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, $resultsTotal);
} else {
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
$db->rollBack();
}
} else {
$resultsTotal = 0;
$results = [];
}
?>
@@ -151,7 +139,6 @@ if (!empty($q)) {
<head>
<title><?php echo (empty($q) ? _('Empty request - YGGo!') : ($p > 1 ? sprintf(_('%s - #%s - YGGo!'), htmlentities($q), $p) : sprintf(_('%s - YGGo!'), htmlentities($q)))) ?></title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
<style>
@@ -322,8 +309,9 @@ if (!empty($q)) {
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
<h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1>
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" />
<label><input type="radio" name="t" value="page" <?php echo ($t == 'page' ? 'checked="checked"' : false) ?>/> <?php echo _('Pages') ?></label>
<label><input type="radio" name="t" value="image" <?php echo ($t == 'image' ? 'checked="checked"' : false) ?>/> <?php echo _('Images') ?></label>
<?php foreach ($hostPagesMime as $hostPageMime) { ?>
<label><input type="radio" name="t" value="<?php echo $hostPageMime->mime ?>" <?php echo ($t == $hostPageMime->mime ? 'checked="checked"' : false) ?>/> <?php echo $hostPageMime->mime ?></label>
<?php } ?>
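<?php /* one radio filter per MIME type known to the index; the selected value is submitted back as the t parameter */ ?>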
<button type="submit"><?php echo _('Search'); ?></button>
</form>
</header>
@@ -336,156 +324,13 @@ if (!empty($q)) {
<?php } ?>
</div>
<?php foreach ($results as $result) { ?>
<?php if ($t == 'image' && $hostImage = $db->getFoundHostImage($result->id)) { ?>
<?php
// Built image url
$hostImageURL = $hostImage->scheme . '://' .
$hostImage->name .
($hostImage->port ? ':' . $hostImage->port : false) .
$hostImage->uri;
// Get local image data
$lastHostImageDescription = $db->getLastHostImageDescription($result->id);
if (!empty($lastHostImageDescription->data)) {
$hostImageURLencoded = $lastHostImageDescription->data;
// Get remote if local index not found or CRAWL_HOST_DEFAULT_META_ONLY enabled
} else {
// Init image request
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
// Skip item render on timeout
$hostImageHttpCode = $hostImageCurl->getCode();
$db->updateHostImageHttpCode($result->id, (int) $hostImageHttpCode, time());
if (200 != $hostImageHttpCode) {
$db->updateHostImageHttpCode($result->id, $hostImageHttpCode, time());
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $hostImageCurl->getContentType()) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Skip image processing on MIME type not allowed in settings
$hostImageBanned = true;
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
if (false !== strpos($hostImageContentType, trim($mime))) {
$hostImageBanned = false;
break;
}
}
if ($hostImageBanned) {
$db->updateHostImageMime($result->id, $hostImageContentType, time());
$hostImagesBanned += $db->updateHostImageTimeBanned($result->id, time());
continue;
}
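// CRAWL_IMAGE_MIME is treated as a comma-separated allow list: a Content-Type matching none of
// its entries bans the image and skips it in the results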
// Skip image processing without returned content
if (!$hostImageContent = $hostImageCurl->getContent()) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
$hostImageURLencoded = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
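// data URIs take the form data:<mime>;base64,<payload>; the 'svg' extension is mapped to the
// registered subtype 'svg+xml' so browsers can render the inline preview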
// Save image content on data settings enabled
$db->updateHostImage($result->id,
Filter::mime($hostImageContentType),
time());
$db->setHostImageDescriptionData($result->id,
crc32($hostImageURLencoded),
$hostImage->crawlMetaOnly ? null : $hostImageURLencoded,
time(),
null);
}
?>
<div>
<a href="<?php echo $hostImageURL ?>">
<img src="<?php echo $hostImageURLencoded ?>" alt="<?php echo htmlentities($hostImageURL) ?>" title="<?php echo htmlentities($hostImageURL) ?>" class="image" />
</a>
<br />
<?php $hostImageHostPagesTotal = $db->getHostImageHostPagesTotal($result->id) ?>
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
<?php } ?>
<?php if ($lastHostImageDescription) { ?>
<span><?php echo $lastHostImageDescription->title ?> <?php echo $lastHostImageDescription->alt ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPageURL)) ?>
</a>
<br />
<?php } ?>
<?php } ?>
<?php if ($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT > 0) { ?>
<p>
<small>
<?php echo Filter::plural($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT,
[
sprintf(_('+%s other page'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
]); ?>
</small>
</p>
<?php } ?>
</div>
<?php } else if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php
$hostPageURL = $hostPage->scheme . '://' .
$hostPage->name .
($hostPage->port ? ':' . $hostPage->port : false) .
$hostPage->uri;
?>
<div>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h2><?php echo $hostPageDescription->metaTitle ?></h2>
<?php if (!empty($hostPageDescription->metaDescription)) { ?>
<span><?php echo $hostPageDescription->metaDescription ?></span>
<h2><?php echo $hostPageDescription->title ?></h2>
<?php if (!empty($hostPageDescription->description)) { ?>
<span><?php echo $hostPageDescription->description ?></span>
<?php } ?>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
@@ -495,7 +340,7 @@ if (!empty($q)) {
</div>
<?php } ?>
<?php } ?>
<?php if ($p * ($t == 'image' ? WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT : WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT) <= $resultsTotal) { ?>
<?php if ($p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT <= $resultsTotal) { ?>
<div>
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&t=<?php echo $t ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
</div>
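<?php /* the "Next page" link above is shown while the next offset ($p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT) still falls within $resultsTotal */ ?>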
