diff --git a/README.md b/README.md
index e6f03bc..6448927 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
```
GET action=search - required
GET query={string} - optional, search request, empty if not provided
-GET type={string} - optional, search type, image|default or empty
+GET type={string} - optional, MIME type filter (one of the available types), or empty
GET page={int} - optional, search results page, 1 if not provided
GET mode=SphinxQL - optional, enable extended SphinxQL syntax
```
@@ -141,7 +141,7 @@ GET m=SphinxQL
##### Basic features
* [x] Web pages full text ranking search
-* [x] Images search with safe proxy preview support
+* [x] Search with MIME type filtering and safe proxy image preview
* [x] Extended syntax support
* [x] Flexible settings compatible with IPv4/IPv6 networks
@@ -159,18 +159,14 @@ GET m=SphinxQL
* [ ] Index API
+ [x] Manifest
+ [x] Search
- + [x] Pages
- + [x] Images
+ [x] Hosts
- + [ ] Pages
- + [ ] Images
+ + [ ] MIME list
* [ ] Context advertising API
##### Crawler
* [x] Auto crawl links by regular expression rules
+ [x] Pages
- + [x] Images
+ [x] Manifests
* [x] Robots.txt / robots meta tags support (#2)
* [x] Specific rules configuration for every host
@@ -181,8 +177,6 @@ GET m=SphinxQL
* [x] Ban non-condition links to prevent extra requests
* [x] Debug log
* [x] History snaps
- + [x] Pages
- + [x] Images
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter
@@ -191,17 +185,12 @@ GET m=SphinxQL
##### Cleaner
* [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages
- + [x] Images
+ [x] Manifests
+ [x] Logs
+ [x] Crawler
+ [x] Cleaner
* [x] Deprecated history snaps removing
- + [x] Pages
- + [x] Images
* [x] Banned resources reset by timeout
- + [x] Pages
- + [x] Images
* [x] Debug log
##### Other
diff --git a/config/app.php.txt b/config/app.php.txt
index 8469469..724a8fe 100644
--- a/config/app.php.txt
+++ b/config/app.php.txt
@@ -47,7 +47,7 @@ error_reporting(E_ALL);
* Project domain, without slash on postfix
*
*/
-define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));
+define('WEBSITE_DOMAIN', '');
/*
* Page search results before show the read more link
@@ -55,18 +55,6 @@ define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on'
*/
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
-/*
- * Image search results before show the read more link
- *
- */
-define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);
-
-/*
- * Quantity of related pages for each image in the search results
- *
- */
-define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
-
/*
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
*
@@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
// Database
-define('DB_HOST', 'localhost');
+define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
@@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/
define('CRAWL_PAGE_LIMIT', 20);
-/*
- * Images (URI) processing limit in the crawler.php queue
- *
- * This option related to CRAWL_IMAGE_SECONDS_OFFSET value
- * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
- *
- * Usually up to 20 pages per minute,
- * to prevent websites overload by sending GET crawling requests
- *
- * Set 0 to disable
- *
- */
-define('CRAWL_IMAGE_LIMIT', 10);
-
/*
* Manifest (URI) processing limit in the crawler.php queue
*
@@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated
*
*/
-define('CRAWL_PAGE_MIME', 'text/html');
-
-/*
- * Index images match MIME types
- *
- * comma separated
- *
- */
-define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
-
-/*
- * Renew image index by timing offset provided
- *
- * This option works with CRAWL_IMAGE_LIMIT step queue
- *
- * Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
- * must have enough value to crawl all images collected in the DB index
- *
- * or the crawler can stuck in queue
- *
- */
-define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
+define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/*
* Renew manifests index by timing offset provided
@@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
* Only URL addresses match this rule will be auto-crawled
*
*/
-define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
+define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
/*
* Pages limit per new host by default
@@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
*
*/
-define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
+define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
/*
* Set default auto-crawl status for new host added
@@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* This option able to change search results relevance
- * This option enables image data caching in base64
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
@@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false);
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);
-/*
- * Not suitable/safe for work status for new host by default
- *
- * Could be filtered in crawl conditions or search results
- *
- * Custom rule for specified host could be provided in the DB `host`.`nsfw` field
- *
- */
-define('CRAWL_HOST_DEFAULT_NSFW', false);
-
/*
* Default robots.txt rules on remote file not exists
* The crawler able to overwrite these rules
@@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
-define('CRAWL_MANIFEST_API_VERSION', 0.7);
+define('CRAWL_MANIFEST_API_VERSION', 0.8);
/*
* Set default auto-crawl status for new manifest added
@@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
-/*
- * Remove image ban after following time
- *
- * This option used in crawler and search page
- * to prevent extra http requests to unavailable or not condition resources
- *
- */
-define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
-
-/*
- * Remove image description history after following time
- *
- */
-define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
// API settings
@@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS',
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
- `host`.`crawlImageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
- (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
- (SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
+ (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
/*
* Manifest API
diff --git a/config/sphinx.conf.txt b/config/sphinx.conf.txt
index 1e06c9d..06ef702 100644
--- a/config/sphinx.conf.txt
+++ b/config/sphinx.conf.txt
@@ -12,36 +12,24 @@ source common
source hostPage : common
{
sql_query = \
- SELECT hostPage.hostPageId, \
- hostPage.rank, \
- hostPage.uri, \
- host.name, \
- (SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
- hostPageDescription.metaDescription, \
- hostPageDescription.metaKeywords) \
- FROM hostPageDescription \
- WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
- ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
- LIMIT 1) AS pageDescription \
- FROM hostPage \
- JOIN host ON (host.hostId = hostPage.hostId) \
- WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
+ SELECT `hostPage`.`hostPageId`, \
+ `hostPage`.`uri`, \
+ `host`.`name`, \
+ REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \
+ (SELECT COUNT(*) FROM `hostPageToHostPage` \
+ WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
+ AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \
+ (SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
+ `hostPageDescription`.`description`, \
+ `hostPageDescription`.`keywords`)) \
+ FROM `hostPageDescription` \
+ WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
+ FROM `hostPage` \
+ JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
+ WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL
- sql_attr_uint = rank
-}
-
-source hostImage : common
-{
- sql_query = \
- SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
- (SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
- FROM hostImageDescription \
- WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \
- FROM hostImage \
- JOIN host ON (host.hostId = hostImage.hostId) \
- WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
-
- sql_attr_uint = rank
+ sql_attr_uint = rank
+ sql_attr_string = mime
}
index hostPage
@@ -49,11 +37,4 @@ index hostPage
source = hostPage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostPage
-}
-
-index hostImage
-{
- source = hostImage
- morphology = stem_enru, stem_cz, stem_ar
- path = /var/lib/sphinxsearch/data/hostImage
}
\ No newline at end of file
diff --git a/crontab/cleaner.php b/crontab/cleaner.php
index 8e30731..21a48e4 100644
--- a/crontab/cleaner.php
+++ b/crontab/cleaner.php
@@ -31,11 +31,8 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0;
-$hostImagesDeleted = 0;
-$hostImageDescriptionsDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
-$hostImagesBansRemoved = 0;
$logsCleanerDeleted = 0;
$logsCrawlerDeleted = 0;
@@ -56,7 +53,7 @@ try {
// Update curl stats
$httpRequestsTotal++;
- $httpRequestsSizeTotal += $curl->getSizeRequest();
+ $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
@@ -69,22 +66,6 @@ try {
// Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
- // Apply host images limits
- $totalHostImages = $db->getTotalHostImages($host->hostId);
-
- if ($totalHostImages > $host->crawlImageLimit) {
-
- foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
-
- // Delete foreign key relations
- $db->deleteHostImageDescription($hostImage->hostImageId);
- $db->deleteHostImageToHostPage($hostImage->hostImageId);
-
- // Delete host image
- $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
- }
- }
-
// Apply host pages limits
$totalHostPages = $db->getTotalHostPages($host->hostId);
@@ -92,56 +73,32 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
- // Delete foreign key relations
- $db->deleteHostPageToHostImage($hostPage->hostPageId);
-
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
+ $db->deleteHostPageToHostPage($hostPage->hostPageId);
- $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
+ if ($hostPage->uri != '/') {
+ $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
+ }
}
}
// Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
- foreach ($db->getHostImages($host->hostId) as $hostImage) {
-
- if (!$robots->uriAllowed($hostImage->uri)) {
-
- // Delete foreign key relations
- $db->deleteHostImageDescription($hostImage->hostImageId);
- $db->deleteHostImageToHostPage($hostImage->hostImageId);
-
- // Delete host image
- $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
- }
- }
-
foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) {
- // Delete foreign key relations
- $db->deleteHostPageToHostImage($hostPage->hostPageId);
-
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
+ $db->deleteHostPageToHostPage($hostPage->hostPageId);
- $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
+ if ($hostPage->uri != '/') {
+ $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
+ }
}
}
-
- // Clean up host images unrelated to host pages
- foreach ($db->getUnrelatedHostImages() as $hostImage) {
-
- // Delete foreign key relations
- $db->deleteHostImageDescription($hostImage->hostImageId);
- $db->deleteHostImageToHostPage($hostImage->hostImageId);
-
- // Delete host image
- $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
- }
}
// Clean up deprecated manifests
@@ -207,12 +164,6 @@ try {
// Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
- // Reset banned images
- $hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
-
- // Delete image description history
- $hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
-
// Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
@@ -238,9 +189,6 @@ if (CLEAN_LOG_ENABLED) {
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
- $hostImagesDeleted,
- $hostImageDescriptionsDeleted,
- $hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
$logsCleanerDeleted,
@@ -256,15 +204,12 @@ if (CLEAN_LOG_ENABLED) {
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
-echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
-echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
-echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
diff --git a/crontab/crawler.php b/crontab/crawler.php
index 914a233..150ad14 100644
--- a/crontab/crawler.php
+++ b/crontab/crawler.php
@@ -33,16 +33,12 @@ $httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;
$hostPagesProcessed = 0;
-$hostImagesProcessed = 0;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
-$hostImagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
-$hostImagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
-$hostImagesBanned = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@@ -121,7 +117,7 @@ try {
// Update curl stats
$httpRequestsTotal++;
- $httpRequestsSizeTotal += $curl->getSizeRequest();
+ $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
@@ -167,26 +163,15 @@ try {
// Validate formatted link
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
- // Host exists
- if ($host = $db->getHost(crc32($hostURL))) {
-
- $hostStatus = $host->status;
- $hostNsfw = $host->nsfw;
- $hostPageLimit = $host->crawlPageLimit;
- $hostImageLimit = $host->crawlImageLimit;
- $hostId = $host->hostId;
- $hostRobots = $host->robots;
- $hostRobotsPostfix = $host->robotsPostfix;
-
- // Register new host
- } else {
+ // Host does not exist
+ if (!$db->getHost(crc32($hostURL))) {
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$httpRequestsTotal++;
- $httpRequestsSizeTotal += $curl->getSizeRequest();
+ $httpRequestsSizeTotal += $curl->getSizeRequest();
$httpDownloadSizeTotal += $curl->getSizeDownload();
$httpRequestsTimeTotal += $curl->getTotalTime();
@@ -198,158 +183,33 @@ try {
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
- $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
- $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
+ $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+ $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+ $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
- $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
-
- $hostId = $db->addHost($remoteManifestHosts->result->scheme,
- $remoteManifestHosts->result->name,
- $remoteManifestHosts->result->port,
- crc32($hostURL),
- time(),
- null,
- $hostPageLimit,
- $hostImageLimit,
- (string) CRAWL_HOST_DEFAULT_META_ONLY,
- (string) $hostStatus,
- (string) $hostNsfw,
- $hostRobots,
- $hostRobotsPostfix);
-
- if ($hostId) {
-
- $hostsAdded++;
- } else {
-
- continue;
- }
+ $hostId = $db->addHost( $remoteManifestHosts->result->scheme,
+ $remoteManifestHosts->result->name,
+ $remoteManifestHosts->result->port,
+ crc32($hostURL),
+ time(),
+ null,
+ $hostPageLimit,
+ (string) $hostMetaOnly,
+ (string) $hostStatus,
+ (string) $hostNsfw,
+ $hostRobots,
+ $hostRobotsPostfix);
+
+ // Add web root host page to make host visible in the crawl queue
+ $db->addHostPage($hostId, crc32('/'), '/', time());
+
+ // Increase counters
+ $hostPagesAdded++;
+ $hostsAdded++;
}
-
- // Init robots parser
- $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-
- // Save home page info
- // Until page API not implemented, save at least home page to have ability to crawl
- // @TODO
- if ($hostStatus && // host enabled
- $robots->uriAllowed('/') && // page allowed by robots.txt rules
- $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
- !$db->getHostPage($hostId, crc32('/'))) { // page not exists
-
- if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
-
- $hostPagesAdded++;
- }
- }
- }
- }
- }
-
- // Process images crawl queue
- foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
-
- // Build URL from the DB
- $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
-
- // Init image request
- $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
-
- // Update curl stats
- $httpRequestsTotal++;
- $httpRequestsSizeTotal += $curl->getSizeRequest();
- $httpDownloadSizeTotal += $curl->getSizeDownload();
- $httpRequestsTimeTotal += $curl->getTotalTime();
-
- // Update image index anyway, with the current time and http code
- $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
-
- // Skip image processing non 200 code
- if (200 != $curl->getCode()) {
-
- $db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- // Skip image processing on MIME type not provided
- if (!$hostImageContentType = $curl->getContentType()) {
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- // Skip image processing on MIME type not allowed in settings
- $hostImageBanned = true;
- foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
-
- if (false !== strpos($hostImageContentType, trim($mime))) {
-
- $hostImageBanned = false;
- break;
- }
- }
-
- if ($hostImageBanned) {
-
- $db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- // Convert remote image data to base64 string
- if (!$queueHostImage->crawlMetaOnly) {
-
- // Skip image processing without returned content
- if (!$hostImageContent = $curl->getContent()) {
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
-
- $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
-
- continue;
- }
-
- $hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
-
- // Set host image description
- // On link collection we knew meta but data,
- // this step use latest description slice and insert the data received by curl request
- if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
-
- $db->setHostImageDescription($queueHostImage->hostImageId,
- crc32($lastHostImageDescription->alt .
- $lastHostImageDescription->title .
- $hostImageData),
- $lastHostImageDescription->alt,
- $lastHostImageDescription->title,
- $hostImageData,
- time(),
- time());
}
}
-
- $hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
- Filter::mime($hostImageContentType),
- time());
}
// Process pages crawl queue
@@ -476,12 +336,11 @@ try {
time());
// Add queued page description if not exists
- $db->setHostPageDescription($queueHostPage->hostPageId,
- crc32($content),
+ $db->addHostPageDescription($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
- $queueHostPage->crawlMetaOnly ? null : Filter::string($content),
+ $queueHostPage->crawlMetaOnly ? null : base64_encode($content),
time());
// Update manifest registry
@@ -499,155 +358,42 @@ try {
}
}
- // Collect page images
- if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
-
- foreach (@$dom->getElementsByTagName('img') as $img) {
-
- // Skip images without src attribute
- if (!$imageSrc = @$img->getAttribute('src')) {
-
- continue;
- }
-
- // Skip images without alt attribute
- if (!$imageAlt = @$img->getAttribute('alt')) {
-
- continue;
- }
-
- if (!$imageTitle = @$img->getAttribute('title')) {
- $imageTitle = null;
- }
-
- // Add domain to the relative src links
- if (!parse_url($imageSrc, PHP_URL_HOST)) {
-
- $imageSrc = $queueHostPage->scheme . '://' .
- $queueHostPage->name .
- ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
- '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
- }
-
- // Validate formatted src link
- if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {
-
- // Parse formatted src link
- $hostImageURL = Parser::hostURL($imageSrc);
- $hostImageURI = Parser::uri($imageSrc);
-
- // Host exists
- if ($host = $db->getHost(crc32($hostImageURL->string))) {
-
- $hostStatus = $host->status;
- $hostNsfw = $host->nsfw;
- $hostPageLimit = $host->crawlPageLimit;
- $hostImageLimit = $host->crawlImageLimit;
- $hostId = $host->hostId;
- $hostRobots = $host->robots;
- $hostRobotsPostfix = $host->robotsPostfix;
-
- // Register new host
- } else {
-
- // Get robots.txt if exists
- $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
-
- // Update curl stats
- $httpRequestsTotal++;
- $httpRequestsSizeTotal += $curl->getSizeRequest();
- $httpDownloadSizeTotal += $curl->getSizeDownload();
- $httpRequestsTimeTotal += $curl->getTotalTime();
-
- if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
- $hostRobots = $curl->getContent();
- } else {
- $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
- }
-
- $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-
- $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
- $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
- $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
- $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
- $hostId = $db->addHost($hostImageURL->scheme,
- $hostImageURL->name,
- $hostImageURL->port,
- crc32($hostURL->string),
- time(),
- null,
- $hostPageLimit,
- $hostImageLimit,
- (string) CRAWL_HOST_DEFAULT_META_ONLY,
- (string) $hostStatus,
- (string) $hostNsfw,
- $hostRobots,
- $hostRobotsPostfix);
-
- if ($hostId) {
-
- $hostsAdded++;
+ // Init links registry
+ $links = [];
- } else {
+ // Collect image links
+ foreach (@$dom->getElementsByTagName('img') as $img) {
- continue;
- }
- }
+ // Skip images without src attribute
+ if (!$src = @$img->getAttribute('src')) {
- // Init robots parser
- $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
-
- // Save new image info
- $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
-
- if (!$hostImageId && // image not exists
- $hostStatus && // host enabled
- $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
- $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
+ continue;
+ }
- // Add host image
- if ($hostImageId = $db->addHostImage($hostId,
- crc32($hostImageURI->string),
- $hostImageURI->string,
- time())) {
+ // Skip images without alt attribute
+ if (!$alt = @$img->getAttribute('alt')) {
- $hostImagesAdded++;
+ continue;
+ }
- } else {
+ if (!$title = @$img->getAttribute('title')) {
+ $title = null;
+ }
- continue;
- }
- }
+ // Skip encoded content
+ if (false !== strpos($src, 'data:')) {
- // Add/update host image description
- $imageAlt = Filter::imageAlt($imageAlt);
- $imageTitle = Filter::imageTitle($imageTitle);
-
- $db->setHostImageDescription($hostImageId,
- crc32($imageAlt . $imageTitle),
- $imageAlt,
- $imageTitle,
- null,
- time(),
- null);
-
- // Relate host image with host page was found
- $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
-
- // Increase image rank when link does not match the current host
- if ($hostImageURL->scheme . '://' .
- $hostImageURL->name .
- ($hostImageURL->port ? ':' . $hostImageURL->port : '')
- !=
- $queueHostPage->scheme . '://' .
- $queueHostPage->name .
- ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
-
- $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
- }
- }
+ continue;
}
+
+ // Add link to queue
+ $links[] = [
+ 'title' => null,
+ 'description' => null,
+ 'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
+ 'data' => null,
+ 'ref' => $src,
+ ];
}
// Collect internal links from page content
@@ -659,6 +405,11 @@ try {
continue;
}
+ // Get title attribute if available
+ if (!$title = @$a->getAttribute('title')) {
+ $title = null;
+ }
+
// Skip anchor links
if (false !== strpos($href, '#')) {
@@ -683,23 +434,34 @@ try {
continue;
}
- // @TODO skip other apps
+ // Add link to queue
+ $links[] = [
+ 'title' => null,
+ 'description' => null,
+ 'keywords' => Filter::pageKeywords($title),
+ 'data' => null,
+ 'ref' => $href,
+ ];
+ }
+
+ // Process links collected
+ foreach ($links as $link) {
- // Add absolute URL prefixes to the relative links found
- if (!parse_url($href, PHP_URL_HOST)) {
+ // Make relative links absolute
+ if (!parse_url($link['ref'], PHP_URL_HOST)) {
- $href = $queueHostPage->scheme . '://' .
- $queueHostPage->name .
- ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
- '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
+ $link['ref'] = $queueHostPage->scheme . '://' .
+ $queueHostPage->name .
+ ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
+ '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
}
// Validate formatted link
- if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
+ if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
// Parse formatted link
- $hostURL = Parser::hostURL($href);
- $hostPageURI = Parser::uri($href);
+ $hostURL = Parser::hostURL($link['ref']);
+ $hostPageURI = Parser::uri($link['ref']);
// Host exists
if ($host = $db->getHost(crc32($hostURL->string))) {
@@ -707,7 +469,7 @@ try {
$hostStatus = $host->status;
$hostNsfw = $host->nsfw;
$hostPageLimit = $host->crawlPageLimit;
- $hostImageLimit = $host->crawlImageLimit;
+ $hostMetaOnly = $host->crawlMetaOnly;
$hostId = $host->hostId;
$hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix;
@@ -731,30 +493,33 @@ try {
}
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
-
- $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
- $hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
- $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
- $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
- $hostId = $db->addHost($hostURL->scheme,
- $hostURL->name,
- $hostURL->port,
- crc32($hostURL->string),
- time(),
- null,
- $hostPageLimit,
- $hostImageLimit,
- (string) CRAWL_HOST_DEFAULT_META_ONLY,
- (string) $hostStatus,
- (string) $hostNsfw,
- $hostRobots,
- $hostRobotsPostfix);
-
- if ($hostId) {
-
- $hostsAdded++;
-
- } else {
+ $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
+ $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
+ $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
+ $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+
+ $hostId = $db->addHost( $hostURL->scheme,
+ $hostURL->name,
+ $hostURL->port,
+ crc32($hostURL->string),
+ time(),
+ null,
+ $hostPageLimit,
+ (string) $hostMetaOnly,
+ (string) $hostStatus,
+ (string) $hostNsfw,
+ $hostRobots,
+ $hostRobotsPostfix);
+
+ // Add web root host page to make host visible in the crawl queue
+ $db->addHostPage($hostId, crc32('/'), '/', time());
+
+ // Increase counters
+ $hostPagesAdded++;
+ $hostsAdded++;
+
+ // When the link points to the web root (just added above), skip further processing
+ if ($hostPageURI->string == '/') {
continue;
}
@@ -766,25 +531,27 @@ try {
// Save page info
if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
- $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
- !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
+ $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
- if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
+ if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
+
+ $hostPageId = $hostPage->hostPageId;
+
+ } else {
+
+ $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
+
+ $db->addHostPageDescription($hostPageId,
+ $link['title'],
+ $link['description'],
+ $link['keywords'],
+ $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
+ time());
$hostPagesAdded++;
}
- }
- // Increase page rank when link does not match the current host
- if ($hostURL->scheme . '://' .
- $hostURL->name .
- ($hostURL->port ? ':' . $hostURL->port : '')
- !=
- $queueHostPage->scheme . '://' .
- $queueHostPage->name .
- ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
-
- $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
+ $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
}
}
}
@@ -811,10 +578,6 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesBanned,
- $hostImagesIndexed,
- $hostImagesProcessed,
- $hostImagesAdded,
- $hostImagesBanned,
$manifestsProcessed,
$manifestsAdded,
$httpRequestsTotal,
@@ -832,11 +595,6 @@ echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
-echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
-echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
-echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
-echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
-
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
diff --git a/database/yggo.mwb b/database/yggo.mwb
index 35ee90c..da1d4f6 100644
Binary files a/database/yggo.mwb and b/database/yggo.mwb differ
diff --git a/library/filter.php b/library/filter.php
index 1570de3..edaeff5 100644
--- a/library/filter.php
+++ b/library/filter.php
@@ -54,24 +54,6 @@ class Filter {
return $keywords;
}
- static public function imageAlt(mixed $alt) {
-
- $alt = (string) $alt;
-
- $alt = trim($alt);
-
- return $alt;
- }
-
- static public function imageTitle(mixed $title) {
-
- $title = (string) $title;
-
- $title = trim($title);
-
- return $title;
- }
-
static public function pageData(mixed $data) {
$data = (string) $data;
diff --git a/library/mysql.php b/library/mysql.php
index f0aaa64..605b892 100644
--- a/library/mysql.php
+++ b/library/mysql.php
@@ -102,11 +102,44 @@ class MySQL {
return $query->fetch()->total;
}
- public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {
-
- $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
-
- $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);
+ public function addHost(string $scheme,
+ string $name,
+ mixed $port,
+ int $crc32url,
+ int $timeAdded,
+ mixed $timeUpdated,
+ int $crawlPageLimit,
+ string $crawlMetaOnly,
+ string $status,
+ string $nsfw,
+ mixed $robots,
+ mixed $robotsPostfix) {
+
+ $query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
+ `name`,
+ `port`,
+ `crc32url`,
+ `timeAdded`,
+ `timeUpdated`,
+ `crawlPageLimit`,
+ `crawlMetaOnly`,
+ `status`,
+ `nsfw`,
+ `robots`,
+ `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+
+ $query->execute([ $scheme,
+ $name,
+ $port,
+ $crc32url,
+ $timeAdded,
+ $timeUpdated,
+ $crawlPageLimit,
+ $crawlMetaOnly,
+ $status,
+ $nsfw,
+ $robots,
+ $robotsPostfix]);
return $this->_db->lastInsertId();
}
@@ -120,253 +153,6 @@ class MySQL {
return $query->rowCount();
}
- // Images
- public function getTotalHostImages(int $hostId) {
-
- $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?');
-
- $query->execute([$hostId]);
-
- return $query->fetch()->total;
- }
-
- public function getHostImageId(int $hostId, int $crc32uri) {
-
- $query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
-
- $query->execute([$hostId, $crc32uri]);
-
- return $query->rowCount() ? $query->fetch()->hostImageId : 0;
- }
-
- public function getHostImages(int $hostId) {
-
- $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?');
-
- $query->execute([$hostId]);
-
- return $query->fetchAll();
- }
-
- public function getUnrelatedHostImages() {
-
- $query = $this->_db->prepare('SELECT * FROM `hostImage`
- WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
- FROM `hostImageToHostPage`
-
- WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)');
-
- $query->execute();
-
- return $query->fetchAll();
- }
-
- public function getHostImagesByLimit(int $hostId, int $limit) {
-
- $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit);
-
- $query->execute([$hostId]);
-
- return $query->fetchAll();
- }
-
- public function addHostImage(int $hostId,
- int $crc32uri,
- string $uri,
- int $timeAdded,
- mixed $timeUpdated = null,
- mixed $timeBanned = null,
- mixed $httpCode = null,
- mixed $mime = null,
- mixed $rank = null) {
-
- $query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
- `crc32uri`,
- `uri`,
- `timeAdded`,
- `timeUpdated`,
- `timeBanned`,
- `httpCode`,
- `mime`,
- `rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
-
- $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
-
- return $this->_db->lastInsertId();
- }
-
- public function updateHostImageRank(int $hostId,
- int $crc32uri,
- int $increment) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = `rank` + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
-
- $query->execute([$hostId, $crc32uri]);
-
- return $query->rowCount();
- }
-
- public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$timeBanned, $hostImageId]);
-
- return $query->rowCount();
- }
-
- public function updateHostImageHttpCode(int $hostImageId,
- int $httpCode,
- int $timeUpdated) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `httpCode` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$httpCode, $timeUpdated, $hostImageId]);
-
- return $query->rowCount();
- }
-
- public function updateHostImageMime(int $hostImageId,
- string $mime,
- int $timeUpdated) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$mime, $timeUpdated, $hostImageId]);
-
- return $query->rowCount();
- }
-
- public function updateHostImage(int $hostImageId,
- string $mime,
- int $timeUpdated,
- mixed $timeBanned = null) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]);
-
- return $query->rowCount();
- }
-
- public function deleteHostImage(int $hostImageId) {
-
- $query = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$hostImageId]);
-
- return $query->rowCount();
- }
-
- public function setHostImageDescription(int $hostImageId,
- int $crc32id,
- string $alt,
- string $title,
- mixed $data,
- int $timeAdded,
- mixed $timeUpdated) {
-
- $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
- `crc32id`,
- `alt`,
- `title`,
- `timeAdded`) VALUES (?, ?, ?, ?, ?)
-
- ON DUPLICATE KEY UPDATE `alt` = ?,
- `title` = ?,
- `timeUpdated` = ?');
-
- $query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
-
- return $this->_db->lastInsertId();
- }
-
- public function setHostImageDescriptionData(int $hostImageId,
- int $crc32id,
- mixed $data,
- int $timeAdded,
- mixed $timeUpdated) {
-
- $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
- `crc32id`,
- `data`,
- `timeAdded`) VALUES (?, ?, ?, ?)
-
- ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
-
- $query->execute([$hostImageId, $crc32id, $data, $timeAdded, $timeUpdated]);
-
- return $this->_db->lastInsertId();
- }
-
- public function deleteHostImageDescription(int $hostImageId) {
-
- $query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?');
-
- $query->execute([$hostImageId]);
-
- return $query->rowCount();
- }
-
- public function getLastHostImageDescription(int $hostImageId) {
-
- $query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
-
- $query->execute([$hostImageId]);
-
- return $query->fetch();
- }
-
- public function getHostImageHostPages(int $hostImageId, int $limit = 5) {
-
- $query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage`
- JOIN `hostPage` ON (`hostPage`.`hostPageId` = `hostImageToHostPage`.`hostPageId`)
-
- WHERE `hostImageId` = ?
-
- ORDER BY `hostPage`.`rank` DESC, RAND(`hostPage`.`hostId`)
-
- LIMIT ' . (int) $limit);
-
- $query->execute([$hostImageId]);
-
- return $query->fetchAll();
- }
-
- public function getHostImageHostPagesTotal(int $hostImageId) {
-
- $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
-
- $query->execute([$hostImageId]);
-
- return $query->fetch()->total;
- }
-
- public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {
-
- $query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
- `hostPageId`,
- `timeAdded`,
- `timeUpdated`,
- `quantity`) VALUES (?, ?, ?, ?, ?)
-
- ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
- `quantity` = `quantity` + ' . (int) $quantity);
-
- $query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]);
-
- return $query->rowCount(); // no primary key
- }
-
- public function deleteHostImageToHostPage(int $hostImageId) {
-
- $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
-
- $query->execute([$hostImageId]);
-
- return $query->rowCount();
- }
-
// Pages
public function getTotalHostPages(int $hostId) {
@@ -421,18 +207,9 @@ class MySQL {
return $query->fetchAll();
}
- public function getHostPageDescription(int $hostPageId, int $crc32data) {
-
- $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
-
- $query->execute([$hostPageId, $crc32data]);
-
- return $query->fetch();
- }
-
public function getLastPageDescription(int $hostPageId) {
- $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
+ $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
$query->execute([$hostPageId]);
@@ -442,7 +219,6 @@ class MySQL {
public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
- `hostPage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`
@@ -459,28 +235,6 @@ class MySQL {
return $query->fetch();
}
- public function getFoundHostImage(int $hostImageId) {
-
- $query = $this->_db->prepare('SELECT `hostImage`.`hostImageId`,
- `hostImage`.`uri`,
- `hostImage`.`rank`,
- `host`.`scheme`,
- `host`.`name`,
- `host`.`port`,
- `host`.`crawlMetaOnly`
-
- FROM `hostImage`
- JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
-
- WHERE `hostImage`.`hostImageId` = ?
-
- LIMIT 1');
-
- $query->execute([$hostImageId]);
-
- return $query->fetch();
- }
-
public function addHostPage(int $hostId,
int $crc32uri,
string $uri,
@@ -488,8 +242,7 @@ class MySQL {
mixed $timeUpdated = null,
mixed $timeBanned = null,
mixed $httpCode = null,
- mixed $mime = null,
- mixed $rank = null) {
+ mixed $mime = null) {
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
`crc32uri`,
@@ -498,10 +251,9 @@ class MySQL {
`timeUpdated`,
`timeBanned`,
`httpCode`,
- `mime`,
- `rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
+ `mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
- $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
+ $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);
return $this->_db->lastInsertId();
}
@@ -515,22 +267,6 @@ class MySQL {
return $query->rowCount();
}
- public function updateHostPageRank(int $hostId,
- int $crc32uri,
- int $increment) {
-
- $query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = `rank` + ' . (int) $increment . '
-
- WHERE `hostId` = ?
- AND `crc32uri` = ?
-
- LIMIT 1');
-
- $query->execute([$hostId, $crc32uri]);
-
- return $query->rowCount();
- }
-
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
@@ -576,48 +312,52 @@ class MySQL {
return $query->rowCount();
}
- public function deleteHostPageToHostImage(int $hostPageId) {
-
- $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
-
- $query->execute([$hostPageId]);
-
- return $query->rowCount();
- }
-
- public function setHostPageDescription(int $hostPageId,
- int $crc32data,
- mixed $metaTitle,
- mixed $metaDescription,
- mixed $metaKeywords,
+ public function addHostPageDescription(int $hostPageId,
+ mixed $title,
+ mixed $description,
+ mixed $keywords,
mixed $data,
- int $time) {
+ int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
- `crc32data`,
- `metaTitle`,
- `metaDescription`,
- `metaKeywords`,
+ `title`,
+ `description`,
+ `keywords`,
`data`,
`timeAdded`
- ) VALUES (?, ?, ?, ?, ?, ?, ?)
-
- ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
+ ) VALUES (?, ?, ?, ?, ?, ?)');
$query->execute([
$hostPageId,
- $crc32data,
- $metaTitle,
- $metaDescription,
- $metaKeywords,
+ $title,
+ $description,
+ $keywords,
$data,
- $time,
- $time
+ $timeAdded,
]);
return $query->rowCount();
}
+ public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
+
+ $query = $this->_db->prepare('INSERT INTO `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`, `quantity`) VALUES (?, ?, 0)
+
+ ON DUPLICATE KEY UPDATE `quantity` = `quantity` + 1');
+
+ $query->execute([$hostPageIdSource, $hostPageIdTarget]);
+
+ }
+
+ public function deleteHostPageToHostPage(int $hostPageId) {
+
+ $query = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?');
+
+ $query->execute([$hostPageId, $hostPageId]);
+
+ return $query->rowCount();
+ }
+
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {
@@ -652,33 +392,12 @@ class MySQL {
return $query->rowCount();
}
- public function resetBannedHostImages(int $timeOffset) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset);
-
- $query->execute();
-
- return $query->rowCount();
- }
-
- public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {
-
- $query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset);
-
- $query->execute();
-
- return $query->rowCount();
- }
-
public function addCleanerLog(int $timeAdded,
int $hostsTotal,
int $hostsUpdated,
int $hostPagesDeleted,
int $hostPageDescriptionsDeleted,
int $hostPagesBansRemoved,
- int $hostImagesDeleted,
- int $hostImageDescriptionsDeleted,
- int $hostImagesBansRemoved,
int $manifestsTotal,
int $manifestsDeleted,
int $logsCleanerDeleted,
@@ -695,9 +414,6 @@ class MySQL {
`hostPagesDeleted`,
`hostPageDescriptionsDeleted`,
`hostPagesBansRemoved`,
- `hostImagesDeleted`,
- `hostImageDescriptionsDeleted`,
- `hostImagesBansRemoved`,
`manifestsTotal`,
`manifestsDeleted`,
`logsCleanerDeleted`,
@@ -706,7 +422,7 @@ class MySQL {
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
- `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+ `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
@@ -715,9 +431,6 @@ class MySQL {
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesBansRemoved,
- $hostImagesDeleted,
- $hostImageDescriptionsDeleted,
- $hostImagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
$logsCleanerDeleted,
@@ -751,7 +464,6 @@ class MySQL {
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
- `host`.`crawlImageLimit`,
`host`.`crawlMetaOnly`,
`host`.`robots`,
`host`.`robotsPostfix`
@@ -762,7 +474,7 @@ class MySQL {
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostPage`.`timeBanned` IS NULL
- ORDER BY `hostPage`.`rank` DESC, RAND()
+ ORDER BY RAND()
LIMIT ' . (int) $limit);
@@ -780,40 +492,6 @@ class MySQL {
return $query->rowCount();
}
- public function getHostImageCrawlQueue(int $limit, int $timeFrom) {
-
- $query = $this->_db->prepare('SELECT `hostImage`.`hostId`,
- `hostImage`.`hostImageId`,
- `hostImage`.`uri`,
- `host`.`scheme`,
- `host`.`name`,
- `host`.`port`,
- `host`.`crawlMetaOnly`
-
- FROM `hostImage`
- JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)
-
- WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
- AND `hostImage`.`timeBanned` IS NULL
-
- ORDER BY `hostImage`.`rank` DESC, RAND()
-
- LIMIT ' . (int) $limit);
-
- $query->execute([$timeFrom]);
-
- return $query->fetchAll();
- }
-
- public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {
-
- $query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');
-
- $query->execute([$timeUpdated, $httpCode, $hostImageId]);
-
- return $query->rowCount();
- }
-
public function getManifestCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT * FROM `manifest`
@@ -844,10 +522,6 @@ class MySQL {
int $hostPagesIndexed,
int $hostPagesAdded,
int $hostPagesBanned,
- int $hostImagesIndexed,
- int $hostImagesProcessed,
- int $hostImagesAdded,
- int $hostImagesBanned,
int $manifestsProcessed,
int $manifestsAdded,
int $httpRequestsTotal,
@@ -862,17 +536,13 @@ class MySQL {
`hostPagesIndexed`,
`hostPagesAdded`,
`hostPagesBanned`,
- `hostImagesIndexed`,
- `hostImagesProcessed`,
- `hostImagesAdded`,
- `hostImagesBanned`,
`manifestsProcessed`,
`manifestsAdded`,
`httpRequestsTotal`,
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
- `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
+ `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
@@ -881,10 +551,6 @@ class MySQL {
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesBanned,
- $hostImagesIndexed,
- $hostImagesProcessed,
- $hostImagesAdded,
- $hostImagesBanned,
$manifestsProcessed,
$manifestsAdded,
$httpRequestsTotal,
diff --git a/library/sphinxql.php b/library/sphinxql.php
index 41379d8..8062761 100644
--- a/library/sphinxql.php
+++ b/library/sphinxql.php
@@ -11,13 +11,13 @@ class SphinxQL {
$this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
}
- public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) {
+ public function searchHostPages(string $keyword, string $mime, int $start, int $limit, int $maxMatches) {
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
FROM `hostPage`
- WHERE MATCH(?)
+ WHERE MATCH(?) AND `mime` = ?
ORDER BY `rank` DESC, WEIGHT() DESC
@@ -25,26 +25,7 @@ class SphinxQL {
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
- $query->execute([$keyword]);
-
- return $query->fetchAll();
- }
-
- public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) {
-
- $query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
-
- FROM `hostImage`
-
- WHERE MATCH(?)
-
- ORDER BY `rank` DESC, WEIGHT() DESC
-
- LIMIT ' . (int) ($start >= $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . '
-
- OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
-
- $query->execute([$keyword]);
+ $query->execute([$keyword, $mime]);
return $query->fetchAll();
}
@@ -58,29 +39,20 @@ class SphinxQL {
return $query->fetch()->total;
}
- public function searchHostPagesTotal(string $keyword) {
-
- $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
-
- $query->execute([$keyword]);
-
- return $query->fetch()->total;
- }
-
- public function searchHostImagesTotal(string $keyword) {
+ public function getHostPagesMime() {
- $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)');
+ $query = $this->_sphinx->prepare('SELECT `mime` FROM `hostPage` GROUP BY `mime` ORDER BY `mime` ASC');
- $query->execute([$keyword]);
+ $query->execute();
- return $query->fetch()->total;
+ return $query->fetchAll();
}
- public function getHostImagesTotal() {
+ public function searchHostPagesTotal(string $keyword, string $mime) {
- $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage`');
+ $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?) AND `mime` = ?');
- $query->execute();
+ $query->execute([$keyword, $mime]);
return $query->fetch()->total;
}
diff --git a/media/db-prototype.png b/media/db-prototype.png
index 4e23c5b..0ce16fa 100644
Binary files a/media/db-prototype.png and b/media/db-prototype.png differ
diff --git a/public/api.php b/public/api.php
index becf6ae..e96b39d 100644
--- a/public/api.php
+++ b/public/api.php
@@ -1,7 +1,7 @@
searchHostImagesTotal(Filter::searchQuery($query, $mode));
- $sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
-
- // Make default search request
- } else {
-
- $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
- $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
- }
+ // Make search request
+ $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode), $type);
+ $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $type, $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
// Generate results
$dbResults = [];
foreach ($sphinxResults as $i => $sphinxResult) {
- // Image
- if (!empty($type) && $type == 'image') {
-
- if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) {
-
- $dbResults[$i] = $hostImage;
-
- $dbResults[$i]->weight = $sphinxResult->weight;
- }
-
- // Default
- } else {
-
- if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
+ if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
- $dbResults[$i] = $hostPage;
+ $dbResults[$i] = $hostPage;
- $dbResults[$i]->weight = $sphinxResult->weight;
- }
+ $dbResults[$i]->weight = $sphinxResult->weight;
}
}
@@ -129,13 +106,10 @@ if (API_ENABLED) {
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
- 'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMime' => CRAWL_PAGE_MIME,
- 'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
- 'crawlHostImageMime' => CRAWL_IMAGE_MIME,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
diff --git a/public/index.php b/public/index.php
index a5dbfb7..6a76ada 100644
--- a/public/index.php
+++ b/public/index.php
@@ -24,7 +24,6 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
-