Browse Source

refactor to mime-based content index #1

main
ghost 2 years ago
parent
commit
db0e66c846
  1. 17
      README.md
  2. 88
      config/app.php.txt
  3. 53
      config/sphinx.conf.txt
  4. 73
      crontab/cleaner.php
  5. 492
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 18
      library/filter.php
  8. 488
      library/mysql.php
  9. 48
      library/sphinxql.php
  10. BIN
      media/db-prototype.png
  11. 42
      public/api.php
  12. 1
      public/index.php
  13. 251
      public/search.php

17
README.md

@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
``` ```
GET action=search - required GET action=search - required
GET query={string} - optional, search request, empty if not provided GET query={string} - optional, search request, empty if not provided
GET type={string} - optional, search type, image|default or empty GET type={string} - optional, filter by one of the available MIME types, or empty
GET page={int} - optional, search results page, 1 if not provided GET page={int} - optional, search results page, 1 if not provided
GET mode=SphinxQL - optional, enable extended SphinxQL syntax GET mode=SphinxQL - optional, enable extended SphinxQL syntax
``` ```
@ -141,7 +141,7 @@ GET m=SphinxQL
##### Basic features ##### Basic features
* [x] Web pages full text ranking search * [x] Web pages full text ranking search
* [x] Images search with safe proxy preview support * [x] MIME filtering search with safe proxy images preview
* [x] Extended syntax support * [x] Extended syntax support
* [x] Flexible settings compatible with IPv4/IPv6 networks * [x] Flexible settings compatible with IPv4/IPv6 networks
@ -159,18 +159,14 @@ GET m=SphinxQL
* [ ] Index API * [ ] Index API
+ [x] Manifest + [x] Manifest
+ [x] Search + [x] Search
+ [x] Pages
+ [x] Images
+ [x] Hosts + [x] Hosts
+ [ ] Pages + [ ] MIME list
+ [ ] Images
* [ ] Context advertising API * [ ] Context advertising API
##### Crawler ##### Crawler
* [x] Auto crawl links by regular expression rules * [x] Auto crawl links by regular expression rules
+ [x] Pages + [x] Pages
+ [x] Images
+ [x] Manifests + [x] Manifests
* [x] Robots.txt / robots meta tags support (#2) * [x] Robots.txt / robots meta tags support (#2)
* [x] Specific rules configuration for every host * [x] Specific rules configuration for every host
@ -181,8 +177,6 @@ GET m=SphinxQL
* [x] Ban non-condition links to prevent extra requests * [x] Ban non-condition links to prevent extra requests
* [x] Debug log * [x] Debug log
* [x] History snaps * [x] History snaps
+ [x] Pages
+ [x] Images
* [ ] Indexing new sites homepage in higher priority * [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing * [ ] Redirect codes extended processing
* [ ] Palette image index / filter * [ ] Palette image index / filter
@ -191,17 +185,12 @@ GET m=SphinxQL
##### Cleaner ##### Cleaner
* [x] Deprecated DB items auto deletion / host settings update * [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages + [x] Pages
+ [x] Images
+ [x] Manifests + [x] Manifests
+ [x] Logs + [x] Logs
+ [x] Crawler + [x] Crawler
+ [x] Cleaner + [x] Cleaner
* [x] Deprecated history snaps removing * [x] Deprecated history snaps removing
+ [x] Pages
+ [x] Images
* [x] Banned resources reset by timeout * [x] Banned resources reset by timeout
+ [x] Pages
+ [x] Images
* [x] Debug log * [x] Debug log
##### Other ##### Other

88
config/app.php.txt

@ -47,7 +47,7 @@ error_reporting(E_ALL);
* Project domain, without a trailing slash * Project domain, without a trailing slash
* *
*/ */
define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '')); define('WEBSITE_DOMAIN', '');
/* /*
* Number of page search results shown before the read more link * Number of page search results shown before the read more link
@ -55,18 +55,6 @@ define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on'
*/ */
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100); define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
/*
* Image search results before show the read more link
*
*/
define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);
/*
* Quantity of related pages for each image in the search results
*
*/
define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
/* /*
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload * Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
* *
@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true); define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database // Database
define('DB_HOST', 'localhost'); define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306); define('DB_PORT', 3306);
define('DB_NAME', ''); define('DB_NAME', '');
define('DB_USERNAME', ''); define('DB_USERNAME', '');
@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/ */
define('CRAWL_PAGE_LIMIT', 20); define('CRAWL_PAGE_LIMIT', 20);
/*
* Images (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_IMAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
*/
define('CRAWL_IMAGE_LIMIT', 10);
/* /*
* Manifest (URI) processing limit in the crawler.php queue * Manifest (URI) processing limit in the crawler.php queue
* *
@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated * comma separated
* *
*/ */
define('CRAWL_PAGE_MIME', 'text/html'); define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/*
* Index images match MIME types
*
* comma separated
*
*/
define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/*
* Renew image index by timing offset provided
*
* This option works with CRAWL_IMAGE_LIMIT step queue
*
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
* must have enough value to crawl all images collected in the DB index
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
/* /*
* Renew manifests index by timing offset provided * Renew manifests index by timing offset provided
@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
* Only URL addresses matching this rule will be auto-crawled * Only URL addresses matching this rule will be auto-crawled
* *
*/ */
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui' define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
/* /*
* Pages limit per new host by default * Pages limit per new host by default
@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field * Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
* *
*/ */
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000); define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
/* /*
* Set default auto-crawl status for new host added * Set default auto-crawl status for new host added
@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
* *
* This option can change search results relevance * This option can change search results relevance
* This option enables image data caching in base64
* *
*/ */
define('CRAWL_HOST_DEFAULT_META_ONLY', false); define('CRAWL_HOST_DEFAULT_META_ONLY', false);
@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false);
*/ */
define('CRAWL_HOST_DEFAULT_NSFW', false); define('CRAWL_HOST_DEFAULT_NSFW', false);
/*
* Not suitable/safe for work status for new host by default
*
* Could be filtered in crawl conditions or search results
*
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
*
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
/* /*
* Default robots.txt rules used when the remote file does not exist * Default robots.txt rules used when the remote file does not exist
* The crawler is able to overwrite these rules * The crawler is able to overwrite these rules
@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility * Manifest API version compatibility
* *
*/ */
define('CRAWL_MANIFEST_API_VERSION', 0.7); define('CRAWL_MANIFEST_API_VERSION', 0.8);
/* /*
* Set default auto-crawl status for new manifest added * Set default auto-crawl status for new manifest added
@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/ */
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10); define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
/*
* Remove image ban after following time
*
* This option used in crawler and search page
* to prevent extra http requests to unavailable or not condition resources
*
*/
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
/*
* Remove image description history after following time
*
*/
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
// API settings // API settings
@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS',
`host`.`name`, `host`.`name`,
`host`.`port`, `host`.`port`,
`host`.`crawlPageLimit`, `host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`robots`, `host`.`robots`,
`host`.`robotsPostfix`, `host`.`robotsPostfix`,
`host`.`nsfw`, `host`.`nsfw`,
`host`.`timeAdded`, `host`.`timeAdded`,
`host`.`timeUpdated`, `host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`, (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
/* /*
* Manifest API * Manifest API

53
config/sphinx.conf.txt

@ -12,36 +12,24 @@ source common
source hostPage : common source hostPage : common
{ {
sql_query = \ sql_query = \
SELECT hostPage.hostPageId, \ SELECT `hostPage`.`hostPageId`, \
hostPage.rank, \ `hostPage`.`uri`, \
hostPage.uri, \ `host`.`name`, \
host.name, \ REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \ (SELECT COUNT(*) FROM `hostPageToHostPage` \
hostPageDescription.metaDescription, \ WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
hostPageDescription.metaKeywords) \ AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \
FROM hostPageDescription \ (SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \ `hostPageDescription`.`description`, \
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \ `hostPageDescription`.`keywords`)) \
LIMIT 1) AS pageDescription \ FROM `hostPageDescription` \
FROM hostPage \ WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
JOIN host ON (host.hostId = hostPage.hostId) \ FROM `hostPage` \
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL
sql_attr_uint = rank sql_attr_uint = rank
} sql_attr_string = mime
source hostImage : common
{
sql_query = \
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
sql_attr_uint = rank
} }
index hostPage index hostPage
@ -49,11 +37,4 @@ index hostPage
source = hostPage source = hostPage
morphology = stem_enru, stem_cz, stem_ar morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostPage path = /var/lib/sphinxsearch/data/hostPage
}
index hostImage
{
source = hostImage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostImage
} }

73
crontab/cleaner.php

@ -31,11 +31,8 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostPagesDeleted = 0; $hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0; $hostPageDescriptionsDeleted = 0;
$hostImagesDeleted = 0;
$hostImageDescriptionsDeleted = 0;
$manifestsDeleted = 0; $manifestsDeleted = 0;
$hostPagesBansRemoved = 0; $hostPagesBansRemoved = 0;
$hostImagesBansRemoved = 0;
$logsCleanerDeleted = 0; $logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0; $logsCrawlerDeleted = 0;
@ -56,7 +53,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@ -69,22 +66,6 @@ try {
// Update host data // Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
// Apply host images limits
$totalHostImages = $db->getTotalHostImages($host->hostId);
if ($totalHostImages > $host->crawlImageLimit) {
foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
// Apply host pages limits // Apply host pages limits
$totalHostPages = $db->getTotalHostPages($host->hostId); $totalHostPages = $db->getTotalHostPages($host->hostId);
@ -92,56 +73,32 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId); $db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
} }
} }
// Apply new robots.txt rules // Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
foreach ($db->getHostImages($host->hostId) as $hostImage) {
if (!$robots->uriAllowed($hostImage->uri)) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) { foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) { if (!$robots->uriAllowed($hostPage->uri)) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId); $db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
} }
} }
// Clean up host images unrelated to host pages
foreach ($db->getUnrelatedHostImages() as $hostImage) {
// Delete foreign key relations
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
} }
// Clean up deprecated manifests // Clean up deprecated manifests
@ -207,12 +164,6 @@ try {
// Delete page description history // Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); $hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Reset banned images
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
// Delete image description history
$hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
// Delete deprecated logs // Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET); $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
@ -238,9 +189,6 @@ if (CLEAN_LOG_ENABLED) {
$hostPagesDeleted, $hostPagesDeleted,
$hostPageDescriptionsDeleted, $hostPageDescriptionsDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal, $manifestsTotal,
$manifestsDeleted, $manifestsDeleted,
$logsCleanerDeleted, $logsCleanerDeleted,
@ -256,15 +204,12 @@ if (CLEAN_LOG_ENABLED) {
echo 'Hosts total: ' . $hostsTotal . PHP_EOL; echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL; echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL; echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;

492
crontab/crawler.php

@ -33,16 +33,12 @@ $httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0; $httpRequestsTimeTotal = 0;
$hostPagesProcessed = 0; $hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$manifestsProcessed = 0; $manifestsProcessed = 0;
$hostPagesIndexed = 0; $hostPagesIndexed = 0;
$hostImagesIndexed = 0;
$manifestsAdded = 0; $manifestsAdded = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
$hostPagesBanned = 0; $hostPagesBanned = 0;
$hostImagesBanned = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -121,7 +117,7 @@ try {
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@ -167,26 +163,15 @@ try {
// Validate formatted link // Validate formatted link
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) { if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
// Host exists // Host does not exist
if ($host = $db->getHost(crc32($hostURL))) { if (!$db->getHost(crc32($hostURL))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
// Get robots.txt if exists // Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats // Update curl stats
$httpRequestsTotal++; $httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest(); $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload(); $httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime(); $httpRequestsTimeTotal += $curl->getTotalTime();
@ -198,158 +183,33 @@ try {
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS; $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW; $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($remoteManifestHosts->result->scheme,
$remoteManifestHosts->result->name,
$remoteManifestHosts->result->port,
crc32($hostURL),
time(),
null,
$hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
if ($hostId) {
$hostsAdded++;
} else { $hostId = $db->addHost( $remoteManifestHosts->result->scheme,
$remoteManifestHosts->result->name,
continue; $remoteManifestHosts->result->port,
} crc32($hostURL),
time(),
null,
$hostPageLimit,
(string) $hostMetaOnly,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
// Increase counters
$hostPagesAdded++;
$hostsAdded++;
} }
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save home page info
// Until page API not implemented, save at least home page to have ability to crawl
// @TODO
if ($hostStatus && // host enabled
$robots->uriAllowed('/') && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32('/'))) { // page not exists
if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
$hostPagesAdded++;
}
}
}
}
}
// Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
// Init image request
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip image processing non 200 code
if (200 != $curl->getCode()) {
$db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Skip image processing on MIME type not allowed in settings
$hostImageBanned = true;
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
if (false !== strpos($hostImageContentType, trim($mime))) {
$hostImageBanned = false;
break;
}
}
if ($hostImageBanned) {
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
// Convert remote image data to base64 string
if (!$queueHostImage->crawlMetaOnly) {
// Skip image processing without returned content
if (!$hostImageContent = $curl->getContent()) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
continue;
}
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
// Set host image description
// On link collection we knew meta but data,
// this step use latest description slice and insert the data received by curl request
if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
$db->setHostImageDescription($queueHostImage->hostImageId,
crc32($lastHostImageDescription->alt .
$lastHostImageDescription->title .
$hostImageData),
$lastHostImageDescription->alt,
$lastHostImageDescription->title,
$hostImageData,
time(),
time());
} }
} }
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
Filter::mime($hostImageContentType),
time());
} }
// Process pages crawl queue // Process pages crawl queue
@ -476,12 +336,11 @@ try {
time()); time());
// Add queued page description if not exists // Add queued page description if not exists
$db->setHostPageDescription($queueHostPage->hostPageId, $db->addHostPageDescription($queueHostPage->hostPageId,
crc32($content),
Filter::pageTitle($title->item(0)->nodeValue), Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords), Filter::pageKeywords($metaKeywords),
$queueHostPage->crawlMetaOnly ? null : Filter::string($content), $queueHostPage->crawlMetaOnly ? null : base64_encode($content),
time()); time());
// Update manifest registry // Update manifest registry
@ -499,155 +358,42 @@ try {
} }
} }
// Collect page images // Init links registry
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { $links = [];
foreach (@$dom->getElementsByTagName('img') as $img) {
// Skip images without src attribute
if (!$imageSrc = @$img->getAttribute('src')) {
continue;
}
// Skip images without alt attribute
if (!$imageAlt = @$img->getAttribute('alt')) {
continue;
}
if (!$imageTitle = @$img->getAttribute('title')) {
$imageTitle = null;
}
// Add domain to the relative src links
if (!parse_url($imageSrc, PHP_URL_HOST)) {
$imageSrc = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
}
// Validate formatted src link
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {
// Parse formatted src link
$hostImageURL = Parser::hostURL($imageSrc);
$hostImageURI = Parser::uri($imageSrc);
// Host exists
if ($host = $db->getHost(crc32($hostImageURL->string))) {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
// Register new host
} else {
// Get robots.txt if exists
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
$httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($hostImageURL->scheme,
$hostImageURL->name,
$hostImageURL->port,
crc32($hostURL->string),
time(),
null,
$hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus,
(string) $hostNsfw,
$hostRobots,
$hostRobotsPostfix);
if ($hostId) {
$hostsAdded++;
} else { // Collect image links
foreach (@$dom->getElementsByTagName('img') as $img) {
continue; // Skip images without src attribute
} if (!$src = @$img->getAttribute('src')) {
}
// Init robots parser continue;
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); }
// Save new image info
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
if (!$hostImageId && // image not exists
$hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
// Add host image // Skip images without alt attribute
if ($hostImageId = $db->addHostImage($hostId, if (!$alt = @$img->getAttribute('alt')) {
crc32($hostImageURI->string),
$hostImageURI->string,
time())) {
$hostImagesAdded++; continue;
}
} else { if (!$title = @$img->getAttribute('title')) {
$title = null;
}
continue; // Skip encoded content
} if (false !== strpos($src, 'data:')) {
}
// Add/update host image description continue;
$imageAlt = Filter::imageAlt($imageAlt);
$imageTitle = Filter::imageTitle($imageTitle);
$db->setHostImageDescription($hostImageId,
crc32($imageAlt . $imageTitle),
$imageAlt,
$imageTitle,
null,
time(),
null);
// Relate host image with host page was found
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
// Increase image rank when link does not match the current host
if ($hostImageURL->scheme . '://' .
$hostImageURL->name .
($hostImageURL->port ? ':' . $hostImageURL->port : '')
!=
$queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
}
}
} }
// Add link to queue
$links[] = [
'title' => null,
'description' => null,
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
'data' => null,
'ref' => $src,
];
} }
// Collect internal links from page content // Collect internal links from page content
@ -659,6 +405,11 @@ try {
continue; continue;
} }
// Get title attribute if available
if (!$title = @$a->getAttribute('title')) {
$title = null;
}
// Skip anchor links // Skip anchor links
if (false !== strpos($href, '#')) { if (false !== strpos($href, '#')) {
@ -683,23 +434,34 @@ try {
continue; continue;
} }
// @TODO skip other apps // Add link to queue
$links[] = [
'title' => null,
'description' => null,
'keywords' => Filter::pageKeywords($title),
'data' => null,
'ref' => $href,
];
}
// Process links collected
foreach ($links as $link) {
// Add absolute URL prefixes to the relative links found //Make relative links absolute
if (!parse_url($href, PHP_URL_HOST)) { if (!parse_url($link['ref'], PHP_URL_HOST)) {
$href = $queueHostPage->scheme . '://' . $link['ref'] = $queueHostPage->scheme . '://' .
$queueHostPage->name . $queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') . ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.'); '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
} }
// Validate formatted link // Validate formatted link
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
// Parse formatted link // Parse formatted link
$hostURL = Parser::hostURL($href); $hostURL = Parser::hostURL($link['ref']);
$hostPageURI = Parser::uri($href); $hostPageURI = Parser::uri($link['ref']);
// Host exists // Host exists
if ($host = $db->getHost(crc32($hostURL->string))) { if ($host = $db->getHost(crc32($hostURL->string))) {
@ -707,7 +469,7 @@ try {
$hostStatus = $host->status; $hostStatus = $host->status;
$hostNsfw = $host->nsfw; $hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit; $hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit; $hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId; $hostId = $host->hostId;
$hostRobots = $host->robots; $hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix; $hostRobotsPostfix = $host->robotsPostfix;
@ -731,30 +493,33 @@ try {
} }
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS; $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW; $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme, $hostId = $db->addHost( $hostURL->scheme,
$hostURL->name, $hostURL->name,
$hostURL->port, $hostURL->port,
crc32($hostURL->string), crc32($hostURL->string),
time(), time(),
null, null,
$hostPageLimit, $hostPageLimit,
$hostImageLimit, (string) $hostMetaOnly,
(string) CRAWL_HOST_DEFAULT_META_ONLY, (string) $hostStatus,
(string) $hostStatus, (string) $hostNsfw,
(string) $hostNsfw, $hostRobots,
$hostRobots, $hostRobotsPostfix);
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
if ($hostId) { $db->addHostPage($hostId, crc32('/'), '/', time());
$hostsAdded++; // Increase counters
$hostPagesAdded++;
} else { $hostsAdded++;
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
continue; continue;
} }
@ -766,25 +531,27 @@ try {
// Save page info // Save page info
if ($hostStatus && // host enabled if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
$hostPageId = $hostPage->hostPageId;
} else {
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
$db->addHostPageDescription($hostPageId,
$link['title'],
$link['description'],
$link['keywords'],
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
time());
$hostPagesAdded++; $hostPagesAdded++;
} }
}
// Increase page rank when link does not match the current host $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
if ($hostURL->scheme . '://' .
$hostURL->name .
($hostURL->port ? ':' . $hostURL->port : '')
!=
$queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
} }
} }
} }
@ -811,10 +578,6 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesBanned, $hostPagesBanned,
$hostImagesIndexed,
$hostImagesProcessed,
$hostImagesAdded,
$hostImagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,
$httpRequestsTotal, $httpRequestsTotal,
@ -832,11 +595,6 @@ echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL; echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

18
library/filter.php

@ -54,24 +54,6 @@ class Filter {
return $keywords; return $keywords;
} }
static public function imageAlt(mixed $alt) {
$alt = (string) $alt;
$alt = trim($alt);
return $alt;
}
static public function imageTitle(mixed $title) {
$title = (string) $title;
$title = trim($title);
return $title;
}
static public function pageData(mixed $data) { static public function pageData(mixed $data) {
$data = (string) $data; $data = (string) $data;

488
library/mysql.php

@ -102,11 +102,44 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) { public function addHost(string $scheme,
string $name,
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); mixed $port,
int $crc32url,
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]); int $timeAdded,
mixed $timeUpdated,
int $crawlPageLimit,
string $crawlMetaOnly,
string $status,
string $nsfw,
mixed $robots,
mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
`name`,
`port`,
`crc32url`,
`timeAdded`,
`timeUpdated`,
`crawlPageLimit`,
`crawlMetaOnly`,
`status`,
`nsfw`,
`robots`,
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $scheme,
$name,
$port,
$crc32url,
$timeAdded,
$timeUpdated,
$crawlPageLimit,
$crawlMetaOnly,
$status,
$nsfw,
$robots,
$robotsPostfix]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -120,253 +153,6 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
// Images
public function getTotalHostImages(int $hostId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?');
$query->execute([$hostId]);
return $query->fetch()->total;
}
public function getHostImageId(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount() ? $query->fetch()->hostImageId : 0;
}
public function getHostImages(int $hostId) {
$query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?');
$query->execute([$hostId]);
return $query->fetchAll();
}
public function getUnrelatedHostImages() {
$query = $this->_db->prepare('SELECT * FROM `hostImage`
WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
FROM `hostImageToHostPage`
WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)');
$query->execute();
return $query->fetchAll();
}
public function getHostImagesByLimit(int $hostId, int $limit) {
$query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit);
$query->execute([$hostId]);
return $query->fetchAll();
}
public function addHostImage(int $hostId,
int $crc32uri,
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null) {
$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
`crc32uri`,
`uri`,
`timeAdded`,
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
return $this->_db->lastInsertId();
}
public function updateHostImageRank(int $hostId,
int $crc32uri,
int $increment) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = `rank` + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount();
}
public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeBanned, $hostImageId]);
return $query->rowCount();
}
public function updateHostImageHttpCode(int $hostImageId,
int $httpCode,
int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `httpCode` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$httpCode, $timeUpdated, $hostImageId]);
return $query->rowCount();
}
public function updateHostImageMime(int $hostImageId,
string $mime,
int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $timeUpdated, $hostImageId]);
return $query->rowCount();
}
public function updateHostImage(int $hostImageId,
string $mime,
int $timeUpdated,
mixed $timeBanned = null) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]);
return $query->rowCount();
}
public function deleteHostImage(int $hostImageId) {
$query = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$hostImageId]);
return $query->rowCount();
}
public function setHostImageDescription(int $hostImageId,
int $crc32id,
string $alt,
string $title,
mixed $data,
int $timeAdded,
mixed $timeUpdated) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`alt`,
`title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `alt` = ?,
`title` = ?,
`timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
return $this->_db->lastInsertId();
}
public function setHostImageDescriptionData(int $hostImageId,
int $crc32id,
mixed $data,
int $timeAdded,
mixed $timeUpdated) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`data`,
`timeAdded`) VALUES (?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $data, $timeAdded, $timeUpdated]);
return $this->_db->lastInsertId();
}
public function deleteHostImageDescription(int $hostImageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?');
$query->execute([$hostImageId]);
return $query->rowCount();
}
public function getLastHostImageDescription(int $hostImageId) {
$query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
$query->execute([$hostImageId]);
return $query->fetch();
}
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
JOIN `hostPage` ON (`hostPage`.`hostPageId` = `hostImageToHostPage`.`hostPageId`)
WHERE `hostImageId` = ?
ORDER BY `hostPage`.`rank` DESC, RAND(`hostPage`.`hostId`)
LIMIT ' . (int) $limit);
$query->execute([$hostImageId]);
return $query->fetchAll();
}
public function getHostImageHostPagesTotal(int $hostImageId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
$query->execute([$hostImageId]);
return $query->fetch()->total;
}
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`,
`timeAdded`,
`timeUpdated`,
`quantity`) VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
`quantity` = `quantity` + ' . (int) $quantity);
$query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]);
return $query->rowCount(); // no primary key
}
public function deleteHostImageToHostPage(int $hostImageId) {
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
$query->execute([$hostImageId]);
return $query->rowCount();
}
// Pages // Pages
public function getTotalHostPages(int $hostId) { public function getTotalHostPages(int $hostId) {
@ -421,18 +207,9 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function getHostPageDescription(int $hostPageId, int $crc32data) {
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
$query->execute([$hostPageId, $crc32data]);
return $query->fetch();
}
public function getLastPageDescription(int $hostPageId) { public function getLastPageDescription(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
$query->execute([$hostPageId]); $query->execute([$hostPageId]);
@ -442,7 +219,6 @@ class MySQL {
public function getFoundHostPage(int $hostPageId) { public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`uri`, $query = $this->_db->prepare('SELECT `hostPage`.`uri`,
`hostPage`.`rank`,
`host`.`scheme`, `host`.`scheme`,
`host`.`name`, `host`.`name`,
`host`.`port` `host`.`port`
@ -459,28 +235,6 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function getFoundHostImage(int $hostImageId) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostImageId`,
`hostImage`.`uri`,
`hostImage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE `hostImage`.`hostImageId` = ?
LIMIT 1');
$query->execute([$hostImageId]);
return $query->fetch();
}
public function addHostPage(int $hostId, public function addHostPage(int $hostId,
int $crc32uri, int $crc32uri,
string $uri, string $uri,
@ -488,8 +242,7 @@ class MySQL {
mixed $timeUpdated = null, mixed $timeUpdated = null,
mixed $timeBanned = null, mixed $timeBanned = null,
mixed $httpCode = null, mixed $httpCode = null,
mixed $mime = null, mixed $mime = null) {
mixed $rank = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`, $query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`, `crc32uri`,
@ -498,10 +251,9 @@ class MySQL {
`timeUpdated`, `timeUpdated`,
`timeBanned`, `timeBanned`,
`httpCode`, `httpCode`,
`mime`, `mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]); $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -515,22 +267,6 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function updateHostPageRank(int $hostId,
int $crc32uri,
int $increment) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = `rank` + ' . (int) $increment . '
WHERE `hostId` = ?
AND `crc32uri` = ?
LIMIT 1');
$query->execute([$hostId, $crc32uri]);
return $query->rowCount();
}
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) { public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1'); $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
@ -576,48 +312,52 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function deleteHostPageToHostImage(int $hostPageId) { public function addHostPageDescription(int $hostPageId,
mixed $title,
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?'); mixed $description,
mixed $keywords,
$query->execute([$hostPageId]);
return $query->rowCount();
}
public function setHostPageDescription(int $hostPageId,
int $crc32data,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
mixed $data, mixed $data,
int $time) { int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`, $query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
`crc32data`, `title`,
`metaTitle`, `description`,
`metaDescription`, `keywords`,
`metaKeywords`,
`data`, `data`,
`timeAdded` `timeAdded`
) VALUES (?, ?, ?, ?, ?, ?, ?) ) VALUES (?, ?, ?, ?, ?, ?)');
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
$query->execute([ $query->execute([
$hostPageId, $hostPageId,
$crc32data, $title,
$metaTitle, $description,
$metaDescription, $keywords,
$metaKeywords,
$data, $data,
$time, $timeAdded,
$time
]); ]);
return $query->rowCount(); return $query->rowCount();
} }
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
$query = $this->_db->prepare('INSERT INTO `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`, `quantity`) VALUES (?, ?, 0)
ON DUPLICATE KEY UPDATE `quantity` = `quantity` + 1');
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
}
public function deleteHostPageToHostPage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?');
$query->execute([$hostPageId, $hostPageId]);
return $query->rowCount();
}
// Cleaner tools // Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) { public function getCleanerQueue(int $limit, int $timeFrom) {
@ -652,33 +392,12 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function resetBannedHostImages(int $timeOffset) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
$query->execute();
return $query->rowCount();
}
public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {
$query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
$query->execute();
return $query->rowCount();
}
public function addCleanerLog(int $timeAdded, public function addCleanerLog(int $timeAdded,
int $hostsTotal, int $hostsTotal,
int $hostsUpdated, int $hostsUpdated,
int $hostPagesDeleted, int $hostPagesDeleted,
int $hostPageDescriptionsDeleted, int $hostPageDescriptionsDeleted,
int $hostPagesBansRemoved, int $hostPagesBansRemoved,
int $hostImagesDeleted,
int $hostImageDescriptionsDeleted,
int $hostImagesBansRemoved,
int $manifestsTotal, int $manifestsTotal,
int $manifestsDeleted, int $manifestsDeleted,
int $logsCleanerDeleted, int $logsCleanerDeleted,
@ -695,9 +414,6 @@ class MySQL {
`hostPagesDeleted`, `hostPagesDeleted`,
`hostPageDescriptionsDeleted`, `hostPageDescriptionsDeleted`,
`hostPagesBansRemoved`, `hostPagesBansRemoved`,
`hostImagesDeleted`,
`hostImageDescriptionsDeleted`,
`hostImagesBansRemoved`,
`manifestsTotal`, `manifestsTotal`,
`manifestsDeleted`, `manifestsDeleted`,
`logsCleanerDeleted`, `logsCleanerDeleted`,
@ -706,7 +422,7 @@ class MySQL {
`httpRequestsSizeTotal`, `httpRequestsSizeTotal`,
`httpDownloadSizeTotal`, `httpDownloadSizeTotal`,
`httpRequestsTimeTotal`, `httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $query->execute([
$timeAdded, $timeAdded,
@ -715,9 +431,6 @@ class MySQL {
$hostPagesDeleted, $hostPagesDeleted,
$hostPageDescriptionsDeleted, $hostPageDescriptionsDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$hostImagesDeleted,
$hostImageDescriptionsDeleted,
$hostImagesBansRemoved,
$manifestsTotal, $manifestsTotal,
$manifestsDeleted, $manifestsDeleted,
$logsCleanerDeleted, $logsCleanerDeleted,
@ -751,7 +464,6 @@ class MySQL {
`host`.`name`, `host`.`name`,
`host`.`port`, `host`.`port`,
`host`.`crawlPageLimit`, `host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`crawlMetaOnly`, `host`.`crawlMetaOnly`,
`host`.`robots`, `host`.`robots`,
`host`.`robotsPostfix` `host`.`robotsPostfix`
@ -762,7 +474,7 @@ class MySQL {
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0 WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostPage`.`timeBanned` IS NULL AND `hostPage`.`timeBanned` IS NULL
ORDER BY `hostPage`.`rank` DESC, RAND() ORDER BY RAND()
LIMIT ' . (int) $limit); LIMIT ' . (int) $limit);
@ -780,40 +492,6 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
`hostImage`.`hostImageId`,
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`
FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostImage`.`timeBanned` IS NULL
ORDER BY `hostImage`.`rank` DESC, RAND()
LIMIT ' . (int) $limit);
$query->execute([$timeFrom]);
return $query->fetchAll();
}
public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$timeUpdated, $httpCode, $hostImageId]);
return $query->rowCount();
}
public function getManifestCrawlQueue(int $limit, int $timeFrom) { public function getManifestCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `manifest` $query = $this->_db->prepare('SELECT * FROM `manifest`
@ -844,10 +522,6 @@ class MySQL {
int $hostPagesIndexed, int $hostPagesIndexed,
int $hostPagesAdded, int $hostPagesAdded,
int $hostPagesBanned, int $hostPagesBanned,
int $hostImagesIndexed,
int $hostImagesProcessed,
int $hostImagesAdded,
int $hostImagesBanned,
int $manifestsProcessed, int $manifestsProcessed,
int $manifestsAdded, int $manifestsAdded,
int $httpRequestsTotal, int $httpRequestsTotal,
@ -862,17 +536,13 @@ class MySQL {
`hostPagesIndexed`, `hostPagesIndexed`,
`hostPagesAdded`, `hostPagesAdded`,
`hostPagesBanned`, `hostPagesBanned`,
`hostImagesIndexed`,
`hostImagesProcessed`,
`hostImagesAdded`,
`hostImagesBanned`,
`manifestsProcessed`, `manifestsProcessed`,
`manifestsAdded`, `manifestsAdded`,
`httpRequestsTotal`, `httpRequestsTotal`,
`httpRequestsSizeTotal`, `httpRequestsSizeTotal`,
`httpDownloadSizeTotal`, `httpDownloadSizeTotal`,
`httpRequestsTimeTotal`, `httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $query->execute([
$timeAdded, $timeAdded,
@ -881,10 +551,6 @@ class MySQL {
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesBanned, $hostPagesBanned,
$hostImagesIndexed,
$hostImagesProcessed,
$hostImagesAdded,
$hostImagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,
$httpRequestsTotal, $httpRequestsTotal,

48
library/sphinxql.php

@ -11,13 +11,13 @@ class SphinxQL {
$this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); $this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
} }
public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) { public function searchHostPages(string $keyword, string $mime, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight` $query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostPage` FROM `hostPage`
WHERE MATCH(?) WHERE MATCH(?) AND `mime` = ?
ORDER BY `rank` DESC, WEIGHT() DESC ORDER BY `rank` DESC, WEIGHT() DESC
@ -25,26 +25,7 @@ class SphinxQL {
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1)); OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
$query->execute([$keyword]); $query->execute([$keyword, $mime]);
return $query->fetchAll();
}
public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostImage`
WHERE MATCH(?)
ORDER BY `rank` DESC, WEIGHT() DESC
LIMIT ' . (int) ($start >= $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . '
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
$query->execute([$keyword]);
return $query->fetchAll(); return $query->fetchAll();
} }
@ -58,29 +39,20 @@ class SphinxQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function searchHostPagesTotal(string $keyword) { public function getHostPagesMime() {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
$query->execute([$keyword]);
return $query->fetch()->total;
}
public function searchHostImagesTotal(string $keyword) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)'); $query = $this->_sphinx->prepare('SELECT `mime` FROM `hostPage` GROUP BY `mime` ORDER BY `mime` ASC');
$query->execute([$keyword]); $query->execute();
return $query->fetch()->total; return $query->fetchAll();
} }
public function getHostImagesTotal() { public function searchHostPagesTotal(string $keyword, string $mime) {
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage`'); $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?) AND `mime` = ?');
$query->execute(); $query->execute([$keyword, $mime]);
return $query->fetch()->total; return $query->fetch()->total;
} }

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 134 KiB

After

Width:  |  Height:  |  Size: 147 KiB

42
public/api.php

@ -1,7 +1,7 @@
<?php <?php
// Current version // Current version
define('API_VERSION', 0.7); define('API_VERSION', 0.8);
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
@ -30,48 +30,25 @@ if (API_ENABLED) {
// Filter request data // Filter request data
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'page'; $type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'html';
$mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default'; $mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default';
$query = !empty($_GET['query']) ? Filter::url($_GET['query']) : ''; $query = !empty($_GET['query']) ? Filter::url($_GET['query']) : '';
$page = !empty($_GET['page']) ? (int) $_GET['page'] : 1; $page = !empty($_GET['page']) ? (int) $_GET['page'] : 1;
// Make image search request // Make search request
if (!empty($type) && $type == 'image') { $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode), $type);
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $type, $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
$sphinxResultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Make default search request
} else {
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
}
// Generate results // Generate results
$dbResults = []; $dbResults = [];
foreach ($sphinxResults as $i => $sphinxResult) { foreach ($sphinxResults as $i => $sphinxResult) {
// Image if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
if (!empty($type) && $type == 'image') {
if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) {
$dbResults[$i] = $hostImage;
$dbResults[$i]->weight = $sphinxResult->weight;
}
// Default
} else {
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
$dbResults[$i] = $hostPage; $dbResults[$i] = $hostPage;
$dbResults[$i]->weight = $sphinxResult->weight; $dbResults[$i]->weight = $sphinxResult->weight;
}
} }
} }
@ -129,13 +106,10 @@ if (API_ENABLED) {
'crawlUrlRegexp' => CRAWL_URL_REGEXP, 'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW, 'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMime' => CRAWL_PAGE_MIME, 'crawlHostPageMime' => CRAWL_PAGE_MIME,
'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
'crawlHostImageMime' => CRAWL_IMAGE_MIME,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

1
public/index.php

@ -24,7 +24,6 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<?php if (API_MANIFEST_ENABLED) { ?> <?php if (API_MANIFEST_ENABLED) { ?>
<meta name="yggo:manifest" content="<?php echo sprintf('%s/api.php?action=manifest', WEBSITE_DOMAIN) ?>" /> <meta name="yggo:manifest" content="<?php echo sprintf('%s/api.php?action=manifest', WEBSITE_DOMAIN) ?>" />
<?php } ?> <?php } ?>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" /> <meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" /> <meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
<style> <style>

251
public/search.php

@ -16,34 +16,34 @@ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Filter request data // Filter request data
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'page'; $t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'html';
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default'; $m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
$q = !empty($_GET['q']) ? Filter::url($_GET['q']) : ''; $q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
$p = !empty($_GET['p']) ? (int) $_GET['p'] : 1; $p = !empty($_GET['p']) ? (int) $_GET['p'] : 1;
// Define page basics // Search request
switch ($t) { if (!empty($q)) {
case 'image': $resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m), $t);
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $t, $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
$totalPages = $sphinx->getHostImagesTotal(); } else {
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s image or enter the new one...'), $totalPages), $resultsTotal = 0;
sprintf(_('Over %s images or enter the new one...'), $totalPages), $results = [];
sprintf(_('Over %s images or enter the new one...'), $totalPages), }
]);
break; // Mime list
default: $hostPagesMime = $sphinx->getHostPagesMime();
$totalPages = $sphinx->getHostPagesTotal(); // Define page basics
$totalPages = $sphinx->getHostPagesTotal();
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
}
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
]);
// Crawl request // Crawl request
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
@ -61,6 +61,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostStatus = $host->status; $hostStatus = $host->status;
$hostNsfw = $host->nsfw; $hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit; $hostPageLimit = $host->crawlPageLimit;
$hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId; $hostId = $host->hostId;
$hostRobots = $host->robots; $hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix; $hostRobotsPostfix = $host->robotsPostfix;
@ -82,21 +83,26 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
$hostStatus = CRAWL_HOST_DEFAULT_STATUS; $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW; $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme,
$hostURL->name, $hostId = $db->addHost( $hostURL->scheme,
$hostURL->port, $hostURL->name,
crc32($hostURL->string), $hostURL->port,
time(), crc32($hostURL->string),
null, time(),
$hostPageLimit, null,
(string) CRAWL_HOST_DEFAULT_META_ONLY, $hostPageLimit,
(string) $hostStatus, (string) $hostMetaOnly,
(string) $hostNsfw, (string) $hostStatus,
$hostRobots, (string) $hostNsfw,
$hostRobotsPostfix); $hostRobots,
$hostRobotsPostfix);
// Add web root host page to make host visible in the crawl queue
$db->addHostPage($hostId, crc32('/'), '/', time());
} }
} }
@ -120,28 +126,10 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
} catch(Exception $e){ } catch(Exception $e){
$db->rollBack(); var_dump($e);
}
}
// Search request
if (!empty($q)) {
if ($t == 'image') {
$resultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($q, $m)); $db->rollBack();
$results = $sphinx->searchHostImages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, $resultsTotal);
} else {
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m));
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
} }
} else {
$resultsTotal = 0;
$results = [];
} }
?> ?>
@ -151,7 +139,6 @@ if (!empty($q)) {
<head> <head>
<title><?php echo (empty($q) ? _('Empty request - YGGo!') : ($p > 1 ? sprintf(_('%s - #%s - YGGo!'), htmlentities($q), $p) : sprintf(_('%s - YGGo!'), htmlentities($q)))) ?></title> <title><?php echo (empty($q) ? _('Empty request - YGGo!') : ($p > 1 ? sprintf(_('%s - #%s - YGGo!'), htmlentities($q), $p) : sprintf(_('%s - YGGo!'), htmlentities($q)))) ?></title>
<meta charset="utf-8" /> <meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" /> <meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" /> <meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
<style> <style>
@ -322,8 +309,9 @@ if (!empty($q)) {
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php"> <form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
<h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1> <h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1>
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" /> <input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" />
<label><input type="radio" name="t" value="page" <?php echo ($t == 'page' ? 'checked="checked"' : false) ?>/> <?php echo _('Pages') ?></label> <?php foreach ($hostPagesMime as $hostPageMime) { ?>
<label><input type="radio" name="t" value="image" <?php echo ($t == 'image' ? 'checked="checked"' : false) ?>/> <?php echo _('Images') ?></label> <label><input type="radio" name="t" value="<?php echo $hostPageMime->mime ?>" <?php echo ($t == $hostPageMime->mime ? 'checked="checked"' : false) ?>/> <?php echo $hostPageMime->mime ?></label>
<?php } ?>
<button type="submit"><?php echo _('Search'); ?></button> <button type="submit"><?php echo _('Search'); ?></button>
</form> </form>
</header> </header>
@ -336,156 +324,13 @@ if (!empty($q)) {
<?php } ?> <?php } ?>
</div> </div>
<?php foreach ($results as $result) { ?> <?php foreach ($results as $result) { ?>
<?php if ($t == 'image' && $hostImage = $db->getFoundHostImage($result->id)) { ?> <?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php <?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
// Built image url
$hostImageURL = $hostImage->scheme . '://' .
$hostImage->name .
($hostImage->port ? ':' . $hostImage->port : false) .
$hostImage->uri;
// Get local image data
$lastHostImageDescription = $db->getLastHostImageDescription($result->id);
if (!empty($lastHostImageDescription->data)) {
$hostImageURLencoded = $lastHostImageDescription->data;
// Get remote if local index not found or CRAWL_HOST_DEFAULT_META_ONLY enabled
} else {
// Init image request
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
// Skip item render on timeout
$hostImageHttpCode = $hostImageCurl->getCode();
$db->updateHostImageHttpCode($result->id, (int) $hostImageHttpCode, time());
if (200 != $hostImageHttpCode) {
$db->updateHostImageHttpCode($result->id, $hostImageHttpCode, time());
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $hostImageCurl->getContentType()) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Skip image processing on MIME type not allowed in settings
$hostImageBanned = true;
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
if (false !== strpos($hostImageContentType, trim($mime))) {
$hostImageBanned = false;
break;
}
}
if ($hostImageBanned) {
$db->updateHostImageMime($result->id, $hostImageContentType, time());
$hostImagesBanned += $db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Skip image processing without returned content
if (!$hostImageContent = $hostImageCurl->getContent()) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
$db->updateHostImageTimeBanned($result->id, time());
continue;
}
$hostImageURLencoded = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
// Save image content on data settings enabled
$db->updateHostImage($result->id,
Filter::mime($hostImageContentType),
time());
$db->setHostImageDescriptionData($result->id,
crc32($hostImageURLencoded),
$hostImage->crawlMetaOnly ? null : $hostImageURLencoded,
time(),
null);
}
?>
<div>
<a href="<?php echo $hostImageURL ?>">
<img src="<?php echo $hostImageURLencoded ?>" alt="<?php echo htmlentities($hostImageURL) ?>" title="<?php echo htmlentities($hostImageURL) ?>" class="image" />
</a>
<br />
<?php $hostImageHostPagesTotal = $db->getHostImageHostPagesTotal($result->id) ?>
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
<?php } ?>
<?php if ($lastHostImageDescription) { ?>
<span><?php echo $lastHostImageDescription->title ?> <?php echo $lastHostImageDescription->alt ?></span>
<?php } ?>
<a href="<?php echo $hostPageURL ?>">
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
<?php echo htmlentities(urldecode($hostPageURL)) ?>
</a>
<br />
<?php } ?>
<?php } ?>
<?php if ($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT > 0) { ?>
<p>
<small>
<?php echo Filter::plural($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT,
[
sprintf(_('+%s other page'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
]); ?>
</small>
</p>
<?php } ?>
</div>
<?php } else if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
<?php
$hostPageURL = $hostPage->scheme . '://' .
$hostPage->name .
($hostPage->port ? ':' . $hostPage->port : false) .
$hostPage->uri;
?>
<div> <div>
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?> <?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
<h2><?php echo $hostPageDescription->metaTitle ?></h2> <h2><?php echo $hostPageDescription->title ?></h2>
<?php if (!empty($hostPageDescription->metaDescription)) { ?> <?php if (!empty($hostPageDescription->description)) { ?>
<span><?php echo $hostPageDescription->metaDescription ?></span> <span><?php echo $hostPageDescription->description ?></span>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<a href="<?php echo $hostPageURL ?>"> <a href="<?php echo $hostPageURL ?>">
@ -495,7 +340,7 @@ if (!empty($q)) {
</div> </div>
<?php } ?> <?php } ?>
<?php } ?> <?php } ?>
<?php if ($p * ($t == 'image' ? WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT : WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT) <= $resultsTotal) { ?> <?php if ($p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT <= $resultsTotal) { ?>
<div> <div>
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&t=<?php echo $t ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a> <a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&t=<?php echo $t ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
</div> </div>

Loading…
Cancel
Save