diff --git a/README.md b/README.md index e6f03bc..6448927 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option ``` GET action=search - required GET query={string} - optional, search request, empty if not provided -GET type={string} - optional, search type, image|default or empty +GET type={string} - optional, filter by one of the available MIME types, empty if not provided GET page={int} - optional, search results page, 1 if not provided GET mode=SphinxQL - optional, enable extended SphinxQL syntax ``` @@ -141,7 +141,7 @@ GET m=SphinxQL ##### Basic features * [x] Web pages full text ranking search -* [x] Images search with safe proxy preview support +* [x] Search with MIME type filtering and safe proxy image preview * [x] Extended syntax support * [x] Flexible settings compatible with IPv4/IPv6 networks @@ -159,18 +159,14 @@ GET m=SphinxQL * [ ] Index API + [x] Manifest + [x] Search - + [x] Pages - + [x] Images + [x] Hosts - + [ ] Pages - + [ ] Images + + [ ] MIME list * [ ] Context advertising API ##### Crawler * [x] Auto crawl links by regular expression rules + [x] Pages - + [x] Images + [x] Manifests * [x] Robots.txt / robots meta tags support (#2) * [x] Specific rules configuration for every host @@ -181,8 +177,6 @@ GET m=SphinxQL * [x] Ban non-condition links to prevent extra requests * [x] Debug log * [x] History snaps - + [x] Pages - + [x] Images * [ ] Indexing new sites homepage in higher priority * [ ] Redirect codes extended processing * [ ] Palette image index / filter @@ -191,17 +185,12 @@ GET m=SphinxQL ##### Cleaner * [x] Deprecated DB items auto deletion / host settings update + [x] Pages - + [x] Images + [x] Manifests + [x] Logs + [x] Crawler + [x] Cleaner * [x] Deprecated history snaps removing - + [x] Pages - + [x] Images * [x] Banned resources reset by timeout - + [x] Pages - + [x] Images * [x] Debug log ##### Other diff --git a/config/app.php.txt b/config/app.php.txt index 8469469..724a8fe 100644 --- 
a/config/app.php.txt +++ b/config/app.php.txt @@ -47,7 +47,7 @@ error_reporting(E_ALL); * Project domain, without slash on postfix * */ -define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '')); +define('WEBSITE_DOMAIN', ''); /* * Page search results before show the read more link @@ -55,18 +55,6 @@ define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' */ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100); -/* - * Image search results before show the read more link - * - */ -define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10); - -/* - * Quantity of related pages for each image in the search results - * - */ -define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5); - /* * Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload * @@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5); define('WEBSITE_IDENTICON_IMAGE_CACHE', true); // Database -define('DB_HOST', 'localhost'); +define('DB_HOST', '127.0.0.1'); define('DB_PORT', 3306); define('DB_NAME', ''); define('DB_USERNAME', ''); @@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500); */ define('CRAWL_PAGE_LIMIT', 20); -/* - * Images (URI) processing limit in the crawler.php queue - * - * This option related to CRAWL_IMAGE_SECONDS_OFFSET value - * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) - * - * Usually up to 20 pages per minute, - * to prevent websites overload by sending GET crawling requests - * - * Set 0 to disable - * - */ -define('CRAWL_IMAGE_LIMIT', 10); - /* * Manifest (URI) processing limit in the crawler.php queue * @@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); * comma separated * */ -define('CRAWL_PAGE_MIME', 'text/html'); - -/* - * Index images match MIME types - * - * comma separated - * - */ 
-define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml'); - -/* - * Renew image index by timing offset provided - * - * This option works with CRAWL_IMAGE_LIMIT step queue - * - * Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair - * must have enough value to crawl all images collected in the DB index - * - * or the crawler can stuck in queue - * - */ -define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12); +define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml'); /* * Renew manifests index by timing offset provided @@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30); * Only URL addresses match this rule will be auto-crawled * */ -define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui' +define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui'); /* * Pages limit per new host by default @@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u * Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field * */ -define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000); +define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000); /* * Set default auto-crawl status for new host added @@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true); * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field * * This option able to change search results relevance - * This option enables image data caching in base64 * */ define('CRAWL_HOST_DEFAULT_META_ONLY', false); @@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false); */ define('CRAWL_HOST_DEFAULT_NSFW', false); -/* - * Not suitable/safe for work status for new host by default - * - * Could be filtered in crawl conditions or search results - * - * Custom rule for specified host could be provided in the DB `host`.`nsfw` field - * - */ 
-define('CRAWL_HOST_DEFAULT_NSFW', false); - /* * Default robots.txt rules on remote file not exists * The crawler able to overwrite these rules @@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true); * Manifest API version compatibility * */ -define('CRAWL_MANIFEST_API_VERSION', 0.7); +define('CRAWL_MANIFEST_API_VERSION', 0.8); /* * Set default auto-crawl status for new manifest added @@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30); */ define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10); -/* - * Remove image ban after following time - * - * This option used in crawler and search page - * to prevent extra http requests to unavailable or not condition resources - * - */ -define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30); - -/* - * Remove image description history after following time - * - */ -define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10); // API settings @@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS', `host`.`name`, `host`.`port`, `host`.`crawlPageLimit`, - `host`.`crawlImageLimit`, `host`.`robots`, `host`.`robotsPostfix`, `host`.`nsfw`, `host`.`timeAdded`, `host`.`timeUpdated`, - (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`, - (SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated + (SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); /* * Manifest API diff --git a/config/sphinx.conf.txt b/config/sphinx.conf.txt index 1e06c9d..06ef702 100644 --- a/config/sphinx.conf.txt +++ b/config/sphinx.conf.txt @@ -12,36 +12,24 @@ source common source hostPage : common { sql_query = \ - SELECT hostPage.hostPageId, \ - hostPage.rank, \ - hostPage.uri, \ - host.name, \ - (SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \ - hostPageDescription.metaDescription, \ - hostPageDescription.metaKeywords) \ - FROM hostPageDescription \ - WHERE 
hostPageDescription.hostPageId = hostPage.hostPageId \ - ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \ - LIMIT 1) AS pageDescription \ - FROM hostPage \ - JOIN host ON (host.hostId = hostPage.hostId) \ - WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL + SELECT `hostPage`.`hostPageId`, \ + `hostPage`.`uri`, \ + `host`.`name`, \ + REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \ + (SELECT COUNT(*) FROM `hostPageToHostPage` \ + WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \ + AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \ + (SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \ + `hostPageDescription`.`description`, \ + `hostPageDescription`.`keywords`)) \ + FROM `hostPageDescription` \ + WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \ + FROM `hostPage` \ + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \ + WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL - sql_attr_uint = rank -} - -source hostImage : common -{ - sql_query = \ - SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \ - (SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \ - FROM hostImageDescription \ - WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \ - FROM hostImage \ - JOIN host ON (host.hostId = hostImage.hostId) \ - WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \ - - sql_attr_uint = rank + sql_attr_uint = rank + sql_attr_string = mime } index hostPage @@ -49,11 +37,4 @@ index hostPage source = hostPage morphology = stem_enru, stem_cz, stem_ar path = /var/lib/sphinxsearch/data/hostPage -} - -index hostImage -{ - source = hostImage - morphology = stem_enru, stem_cz, stem_ar - path = 
/var/lib/sphinxsearch/data/hostImage } \ No newline at end of file diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 8e30731..21a48e4 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -31,11 +31,8 @@ $manifestsTotal = $db->getTotalManifests(); $hostsUpdated = 0; $hostPagesDeleted = 0; $hostPageDescriptionsDeleted = 0; -$hostImagesDeleted = 0; -$hostImageDescriptionsDeleted = 0; $manifestsDeleted = 0; $hostPagesBansRemoved = 0; -$hostImagesBansRemoved = 0; $logsCleanerDeleted = 0; $logsCrawlerDeleted = 0; @@ -56,7 +53,7 @@ try { // Update curl stats $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); + $httpRequestsSizeTotal += $curl->getSizeRequest(); $httpDownloadSizeTotal += $curl->getSizeDownload(); $httpRequestsTimeTotal += $curl->getTotalTime(); @@ -69,22 +66,6 @@ try { // Update host data $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); - // Apply host images limits - $totalHostImages = $db->getTotalHostImages($host->hostId); - - if ($totalHostImages > $host->crawlImageLimit) { - - foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) { - - // Delete foreign key relations - $db->deleteHostImageDescription($hostImage->hostImageId); - $db->deleteHostImageToHostPage($hostImage->hostImageId); - - // Delete host image - $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); - } - } - // Apply host pages limits $totalHostPages = $db->getTotalHostPages($host->hostId); @@ -92,56 +73,32 @@ try { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { - // Delete foreign key relations - $db->deleteHostPageToHostImage($hostPage->hostPageId); - // Delete host page $db->deleteHostPageDescriptions($hostPage->hostPageId); + $db->deleteHostPageToHostPage($hostPage->hostPageId); - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + if ($hostPage->uri != '/') 
{ + $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + } } } // Apply new robots.txt rules $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - foreach ($db->getHostImages($host->hostId) as $hostImage) { - - if (!$robots->uriAllowed($hostImage->uri)) { - - // Delete foreign key relations - $db->deleteHostImageDescription($hostImage->hostImageId); - $db->deleteHostImageToHostPage($hostImage->hostImageId); - - // Delete host image - $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); - } - } - foreach ($db->getHostPages($host->hostId) as $hostPage) { if (!$robots->uriAllowed($hostPage->uri)) { - // Delete foreign key relations - $db->deleteHostPageToHostImage($hostPage->hostPageId); - // Delete host page $db->deleteHostPageDescriptions($hostPage->hostPageId); + $db->deleteHostPageToHostPage($hostPage->hostPageId); - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + if ($hostPage->uri != '/') { + $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + } } } - - // Clean up host images unrelated to host pages - foreach ($db->getUnrelatedHostImages() as $hostImage) { - - // Delete foreign key relations - $db->deleteHostImageDescription($hostImage->hostImageId); - $db->deleteHostImageToHostPage($hostImage->hostImageId); - - // Delete host image - $hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); - } } // Clean up deprecated manifests @@ -207,12 +164,6 @@ try { // Delete page description history $hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); - // Reset banned images - $hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET); - - // Delete image description history - $hostImageDescriptionsDeleted += 
$db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET); - // Delete deprecated logs $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET); @@ -238,9 +189,6 @@ if (CLEAN_LOG_ENABLED) { $hostPagesDeleted, $hostPageDescriptionsDeleted, $hostPagesBansRemoved, - $hostImagesDeleted, - $hostImageDescriptionsDeleted, - $hostImagesBansRemoved, $manifestsTotal, $manifestsDeleted, $logsCleanerDeleted, @@ -256,15 +204,12 @@ if (CLEAN_LOG_ENABLED) { echo 'Hosts total: ' . $hostsTotal . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL; -echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL; echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL; -echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL; -echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . 
PHP_EOL; diff --git a/crontab/crawler.php b/crontab/crawler.php index 914a233..150ad14 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -33,16 +33,12 @@ $httpDownloadSizeTotal = 0; $httpRequestsTimeTotal = 0; $hostPagesProcessed = 0; -$hostImagesProcessed = 0; $manifestsProcessed = 0; $hostPagesIndexed = 0; -$hostImagesIndexed = 0; $manifestsAdded = 0; $hostPagesAdded = 0; -$hostImagesAdded = 0; $hostsAdded = 0; $hostPagesBanned = 0; -$hostImagesBanned = 0; // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); @@ -121,7 +117,7 @@ try { // Update curl stats $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); + $httpRequestsSizeTotal += $curl->getSizeRequest(); $httpDownloadSizeTotal += $curl->getSizeDownload(); $httpRequestsTimeTotal += $curl->getTotalTime(); @@ -167,26 +163,15 @@ try { // Validate formatted link if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) { - // Host exists - if ($host = $db->getHost(crc32($hostURL))) { - - $hostStatus = $host->status; - $hostNsfw = $host->nsfw; - $hostPageLimit = $host->crawlPageLimit; - $hostImageLimit = $host->crawlImageLimit; - $hostId = $host->hostId; - $hostRobots = $host->robots; - $hostRobotsPostfix = $host->robotsPostfix; - - // Register new host - } else { + // Host does not exist + if (!$db->getHost(crc32($hostURL))) { // Get robots.txt if exists $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); // Update curl stats $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); + $httpRequestsSizeTotal += $curl->getSizeRequest(); $httpDownloadSizeTotal += $curl->getSizeDownload(); $httpRequestsTimeTotal += $curl->getTotalTime(); @@ -198,158 +183,33 @@ try { $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - $hostStatus = CRAWL_HOST_DEFAULT_STATUS; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW; + $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; + $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 
1 : 0; + $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; - - $hostId = $db->addHost($remoteManifestHosts->result->scheme, - $remoteManifestHosts->result->name, - $remoteManifestHosts->result->port, - crc32($hostURL), - time(), - null, - $hostPageLimit, - $hostImageLimit, - (string) CRAWL_HOST_DEFAULT_META_ONLY, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - if ($hostId) { - - $hostsAdded++; - } else { - - continue; - } + $hostId = $db->addHost( $remoteManifestHosts->result->scheme, + $remoteManifestHosts->result->name, + $remoteManifestHosts->result->port, + crc32($hostURL), + time(), + null, + $hostPageLimit, + (string) $hostMetaOnly, + (string) $hostStatus, + (string) $hostNsfw, + $hostRobots, + $hostRobotsPostfix); + + // Add web root host page to make host visible in the crawl queue + $db->addHostPage($hostId, crc32('/'), '/', time()); + + // Increase counters + $hostPagesAdded++; + $hostsAdded++; } - - // Init robots parser - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? 
(string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - // Save home page info - // Until page API not implemented, save at least home page to have ability to crawl - // @TODO - if ($hostStatus && // host enabled - $robots->uriAllowed('/') && // page allowed by robots.txt rules - $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit - !$db->getHostPage($hostId, crc32('/'))) { // page not exists - - if ($db->addHostPage($hostId, crc32('/'), '/', time())) { - - $hostPagesAdded++; - } - } - } - } - } - - // Process images crawl queue - foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { - - // Build URL from the DB - $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; - - // Init image request - $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - // Update image index anyway, with the current time and http code - $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode()); - - // Skip image processing non 200 code - if (200 != $curl->getCode()) { - - $db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time()); - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - // Skip image processing on MIME type not provided - if (!$hostImageContentType = $curl->getContentType()) { - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - // Skip image processing on MIME type not allowed in settings - $hostImageBanned = true; - foreach 
((array) explode(',', CRAWL_IMAGE_MIME) as $mime) { - - if (false !== strpos($hostImageContentType, trim($mime))) { - - $hostImageBanned = false; - break; - } - } - - if ($hostImageBanned) { - - $db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time()); - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - // Convert remote image data to base64 string - if (!$queueHostImage->crawlMetaOnly) { - - // Skip image processing without returned content - if (!$hostImageContent = $curl->getContent()) { - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - if (!$hostImageBase64 = @base64_encode($hostImageContent)) { - - $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time()); - - continue; - } - - $hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64; - - // Set host image description - // On link collection we knew meta but data, - // this step use latest description slice and insert the data received by curl request - if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) { - - $db->setHostImageDescription($queueHostImage->hostImageId, - crc32($lastHostImageDescription->alt . - $lastHostImageDescription->title . 
- $hostImageData), - $lastHostImageDescription->alt, - $lastHostImageDescription->title, - $hostImageData, - time(), - time()); } } - - $hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId, - Filter::mime($hostImageContentType), - time()); } // Process pages crawl queue @@ -476,12 +336,11 @@ try { time()); // Add queued page description if not exists - $db->setHostPageDescription($queueHostPage->hostPageId, - crc32($content), + $db->addHostPageDescription($queueHostPage->hostPageId, Filter::pageTitle($title->item(0)->nodeValue), Filter::pageDescription($metaDescription), Filter::pageKeywords($metaKeywords), - $queueHostPage->crawlMetaOnly ? null : Filter::string($content), + $queueHostPage->crawlMetaOnly ? null : base64_encode($content), time()); // Update manifest registry @@ -499,155 +358,42 @@ try { } } - // Collect page images - if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { - - foreach (@$dom->getElementsByTagName('img') as $img) { - - // Skip images without src attribute - if (!$imageSrc = @$img->getAttribute('src')) { - - continue; - } - - // Skip images without alt attribute - if (!$imageAlt = @$img->getAttribute('alt')) { - - continue; - } - - if (!$imageTitle = @$img->getAttribute('title')) { - $imageTitle = null; - } - - // Add domain to the relative src links - if (!parse_url($imageSrc, PHP_URL_HOST)) { - - $imageSrc = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . 
trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.'); - } - - // Validate formatted src link - if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { - - // Parse formatted src link - $hostImageURL = Parser::hostURL($imageSrc); - $hostImageURI = Parser::uri($imageSrc); - - // Host exists - if ($host = $db->getHost(crc32($hostImageURL->string))) { - - $hostStatus = $host->status; - $hostNsfw = $host->nsfw; - $hostPageLimit = $host->crawlPageLimit; - $hostImageLimit = $host->crawlImageLimit; - $hostId = $host->hostId; - $hostRobots = $host->robots; - $hostRobotsPostfix = $host->robotsPostfix; - - // Register new host - } else { - - // Get robots.txt if exists - $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT); - - // Update curl stats - $httpRequestsTotal++; - $httpRequestsSizeTotal += $curl->getSizeRequest(); - $httpDownloadSizeTotal += $curl->getSizeDownload(); - $httpRequestsTimeTotal += $curl->getTotalTime(); - - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; - } - - $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - - $hostStatus = CRAWL_HOST_DEFAULT_STATUS; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; - $hostId = $db->addHost($hostImageURL->scheme, - $hostImageURL->name, - $hostImageURL->port, - crc32($hostURL->string), - time(), - null, - $hostPageLimit, - $hostImageLimit, - (string) CRAWL_HOST_DEFAULT_META_ONLY, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - if ($hostId) { - - $hostsAdded++; + // Init links registry + $links = []; - } else { + // Collect image links + foreach (@$dom->getElementsByTagName('img') as $img) { - continue; - } - } + // Skip images without src attribute + if (!$src = 
@$img->getAttribute('src')) { - // Init robots parser - $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); - - // Save new image info - $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string)); - - if (!$hostImageId && // image not exists - $hostStatus && // host enabled - $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules - $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit + continue; + } - // Add host image - if ($hostImageId = $db->addHostImage($hostId, - crc32($hostImageURI->string), - $hostImageURI->string, - time())) { + // Skip images without alt attribute + if (!$alt = @$img->getAttribute('alt')) { - $hostImagesAdded++; + continue; + } - } else { + if (!$title = @$img->getAttribute('title')) { + $title = null; + } - continue; - } - } + // Skip encoded content + if (false !== strpos($src, 'data:')) { - // Add/update host image description - $imageAlt = Filter::imageAlt($imageAlt); - $imageTitle = Filter::imageTitle($imageTitle); - - $db->setHostImageDescription($hostImageId, - crc32($imageAlt . $imageTitle), - $imageAlt, - $imageTitle, - null, - time(), - null); - - // Relate host image with host page was found - $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1); - - // Increase image rank when link does not match the current host - if ($hostImageURL->scheme . '://' . - $hostImageURL->name . - ($hostImageURL->port ? ':' . $hostImageURL->port : '') - != - $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '')) { - - $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); - } - } + continue; } + + // Add link to queue + $links[] = [ + 'title' => null, + 'description' => null, + 'keywords' => Filter::pageKeywords($alt . 
($title ? ',' . $title : '')), + 'data' => null, + 'ref' => $src, + ]; } // Collect internal links from page content @@ -659,6 +405,11 @@ try { continue; } + // Get title attribute if available + if (!$title = @$a->getAttribute('title')) { + $title = null; + } + // Skip anchor links if (false !== strpos($href, '#')) { @@ -683,23 +434,34 @@ try { continue; } - // @TODO skip other apps + // Add link to queue + $links[] = [ + 'title' => null, + 'description' => null, + 'keywords' => Filter::pageKeywords($title), + 'data' => null, + 'ref' => $href, + ]; + } + + // Process links collected + foreach ($links as $link) { - // Add absolute URL prefixes to the relative links found - if (!parse_url($href, PHP_URL_HOST)) { + //Make relative links absolute + if (!parse_url($link['ref'], PHP_URL_HOST)) { - $href = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.'); + $link['ref'] = $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '') . + '/' . 
trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); } // Validate formatted link - if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { + if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) { // Parse formatted link - $hostURL = Parser::hostURL($href); - $hostPageURI = Parser::uri($href); + $hostURL = Parser::hostURL($link['ref']); + $hostPageURI = Parser::uri($link['ref']); // Host exists if ($host = $db->getHost(crc32($hostURL->string))) { @@ -707,7 +469,7 @@ try { $hostStatus = $host->status; $hostNsfw = $host->nsfw; $hostPageLimit = $host->crawlPageLimit; - $hostImageLimit = $host->crawlImageLimit; + $hostMetaOnly = $host->crawlMetaOnly; $hostId = $host->hostId; $hostRobots = $host->robots; $hostRobotsPostfix = $host->robotsPostfix; @@ -731,30 +493,33 @@ try { } $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; - - $hostStatus = CRAWL_HOST_DEFAULT_STATUS; - $hostNsfw = CRAWL_HOST_DEFAULT_NSFW; - $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; - $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; - $hostId = $db->addHost($hostURL->scheme, - $hostURL->name, - $hostURL->port, - crc32($hostURL->string), - time(), - null, - $hostPageLimit, - $hostImageLimit, - (string) CRAWL_HOST_DEFAULT_META_ONLY, - (string) $hostStatus, - (string) $hostNsfw, - $hostRobots, - $hostRobotsPostfix); - - if ($hostId) { - - $hostsAdded++; - - } else { + $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; + $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; + $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 
1 : 0; + $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + + $hostId = $db->addHost( $hostURL->scheme, + $hostURL->name, + $hostURL->port, + crc32($hostURL->string), + time(), + null, + $hostPageLimit, + (string) $hostMetaOnly, + (string) $hostStatus, + (string) $hostNsfw, + $hostRobots, + $hostRobotsPostfix); + + // Add web root host page to make host visible in the crawl queue + $db->addHostPage($hostId, crc32('/'), '/', time()); + + // Increase counters + $hostPagesAdded++; + $hostsAdded++; + + // When page is root, skip next operations + if ($hostPageURI->string == '/') { continue; } @@ -766,25 +531,27 @@ try { // Save page info if ($hostStatus && // host enabled $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules - $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit - !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists + $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit - if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { + if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) { + + $hostPageId = $hostPage->hostPageId; + + } else { + + $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); + + $db->addHostPageDescription($hostPageId, + $link['title'], + $link['description'], + $link['keywords'], + $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null), + time()); $hostPagesAdded++; } - } - // Increase page rank when link does not match the current host - if ($hostURL->scheme . '://' . - $hostURL->name . - ($hostURL->port ? ':' . $hostURL->port : '') - != - $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . 
$queueHostPage->port : '')) { - - $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1); + $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId); } } } @@ -811,10 +578,6 @@ if (CRAWL_LOG_ENABLED) { $hostPagesIndexed, $hostPagesAdded, $hostPagesBanned, - $hostImagesIndexed, - $hostImagesProcessed, - $hostImagesAdded, - $hostImagesBanned, $manifestsProcessed, $manifestsAdded, $httpRequestsTotal, @@ -832,11 +595,6 @@ echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; -echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; -echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; -echo 'Images added: ' . $hostImagesAdded . PHP_EOL; -echo 'Images banned: ' . $hostImagesBanned . PHP_EOL; - echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; echo 'Manifests added: ' . $manifestsAdded . PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index 35ee90c..da1d4f6 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/filter.php b/library/filter.php index 1570de3..edaeff5 100644 --- a/library/filter.php +++ b/library/filter.php @@ -54,24 +54,6 @@ class Filter { return $keywords; } - static public function imageAlt(mixed $alt) { - - $alt = (string) $alt; - - $alt = trim($alt); - - return $alt; - } - - static public function imageTitle(mixed $title) { - - $title = (string) $title; - - $title = trim($title); - - return $title; - } - static public function pageData(mixed $data) { $data = (string) $data; diff --git a/library/mysql.php b/library/mysql.php index f0aaa64..605b892 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -102,11 +102,44 @@ class MySQL { return $query->fetch()->total; } - public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, 
string $nsfw, mixed $robots, mixed $robotsPostfix) { - - $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); - - $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]); + public function addHost(string $scheme, + string $name, + mixed $port, + int $crc32url, + int $timeAdded, + mixed $timeUpdated, + int $crawlPageLimit, + string $crawlMetaOnly, + string $status, + string $nsfw, + mixed $robots, + mixed $robotsPostfix) { + + $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, + `name`, + `port`, + `crc32url`, + `timeAdded`, + `timeUpdated`, + `crawlPageLimit`, + `crawlMetaOnly`, + `status`, + `nsfw`, + `robots`, + `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + + $query->execute([ $scheme, + $name, + $port, + $crc32url, + $timeAdded, + $timeUpdated, + $crawlPageLimit, + $crawlMetaOnly, + $status, + $nsfw, + $robots, + $robotsPostfix]); return $this->_db->lastInsertId(); } @@ -120,253 +153,6 @@ class MySQL { return $query->rowCount(); } - // Images - public function getTotalHostImages(int $hostId) { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?'); - - $query->execute([$hostId]); - - return $query->fetch()->total; - } - - public function getHostImageId(int $hostId, int $crc32uri) { - - $query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); - - $query->execute([$hostId, $crc32uri]); - - return $query->rowCount() ? 
$query->fetch()->hostImageId : 0; - } - - public function getHostImages(int $hostId) { - - $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?'); - - $query->execute([$hostId]); - - return $query->fetchAll(); - } - - public function getUnrelatedHostImages() { - - $query = $this->_db->prepare('SELECT * FROM `hostImage` - WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId` - FROM `hostImageToHostPage` - - WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)'); - - $query->execute(); - - return $query->fetchAll(); - } - - public function getHostImagesByLimit(int $hostId, int $limit) { - - $query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit); - - $query->execute([$hostId]); - - return $query->fetchAll(); - } - - public function addHostImage(int $hostId, - int $crc32uri, - string $uri, - int $timeAdded, - mixed $timeUpdated = null, - mixed $timeBanned = null, - mixed $httpCode = null, - mixed $mime = null, - mixed $rank = null) { - - $query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`, - `crc32uri`, - `uri`, - `timeAdded`, - `timeUpdated`, - `timeBanned`, - `httpCode`, - `mime`, - `rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'); - - $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]); - - return $this->_db->lastInsertId(); - } - - public function updateHostImageRank(int $hostId, - int $crc32uri, - int $increment) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = `rank` + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); - - $query->execute([$hostId, $crc32uri]); - - return $query->rowCount(); - } - - public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? 
LIMIT 1'); - - $query->execute([$timeBanned, $hostImageId]); - - return $query->rowCount(); - } - - public function updateHostImageHttpCode(int $hostImageId, - int $httpCode, - int $timeUpdated) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `httpCode` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1'); - - $query->execute([$httpCode, $timeUpdated, $hostImageId]); - - return $query->rowCount(); - } - - public function updateHostImageMime(int $hostImageId, - string $mime, - int $timeUpdated) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1'); - - $query->execute([$mime, $timeUpdated, $hostImageId]); - - return $query->rowCount(); - } - - public function updateHostImage(int $hostImageId, - string $mime, - int $timeUpdated, - mixed $timeBanned = null) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1'); - - $query->execute([$mime, $timeUpdated, $timeBanned, $hostImageId]); - - return $query->rowCount(); - } - - public function deleteHostImage(int $hostImageId) { - - $query = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1'); - - $query->execute([$hostImageId]); - - return $query->rowCount(); - } - - public function setHostImageDescription(int $hostImageId, - int $crc32id, - string $alt, - string $title, - mixed $data, - int $timeAdded, - mixed $timeUpdated) { - - $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`, - `crc32id`, - `alt`, - `title`, - `timeAdded`) VALUES (?, ?, ?, ?, ?) 
- - ON DUPLICATE KEY UPDATE `alt` = ?, - `title` = ?, - `timeUpdated` = ?'); - - $query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]); - - return $this->_db->lastInsertId(); - } - - public function setHostImageDescriptionData(int $hostImageId, - int $crc32id, - mixed $data, - int $timeAdded, - mixed $timeUpdated) { - - $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`, - `crc32id`, - `data`, - `timeAdded`) VALUES (?, ?, ?, ?) - - ON DUPLICATE KEY UPDATE `timeUpdated` = ?'); - - $query->execute([$hostImageId, $crc32id, $data, $timeAdded, $timeUpdated]); - - return $this->_db->lastInsertId(); - } - - public function deleteHostImageDescription(int $hostImageId) { - - $query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?'); - - $query->execute([$hostImageId]); - - return $query->rowCount(); - } - - public function getLastHostImageDescription(int $hostImageId) { - - $query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1'); - - $query->execute([$hostImageId]); - - return $query->fetch(); - } - - public function getHostImageHostPages(int $hostImageId, int $limit = 5) { - - $query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` - JOIN `hostPage` ON (`hostPage`.`hostPageId` = `hostImageToHostPage`.`hostPageId`) - - WHERE `hostImageId` = ? - - ORDER BY `hostPage`.`rank` DESC, RAND(`hostPage`.`hostId`) - - LIMIT ' . 
(int) $limit); - - $query->execute([$hostImageId]); - - return $query->fetchAll(); - } - - public function getHostImageHostPagesTotal(int $hostImageId) { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImageToHostPage` WHERE `hostImageId` = ?'); - - $query->execute([$hostImageId]); - - return $query->fetch()->total; - } - - public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) { - - $query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`, - `hostPageId`, - `timeAdded`, - `timeUpdated`, - `quantity`) VALUES (?, ?, ?, ?, ?) - - ON DUPLICATE KEY UPDATE `timeUpdated` = ?, - `quantity` = `quantity` + ' . (int) $quantity); - - $query->execute([$hostImageId, $hostPageId, $time, null, $quantity, $time]); - - return $query->rowCount(); // no primary key - } - - public function deleteHostImageToHostPage(int $hostImageId) { - - $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?'); - - $query->execute([$hostImageId]); - - return $query->rowCount(); - } - // Pages public function getTotalHostPages(int $hostId) { @@ -421,18 +207,9 @@ class MySQL { return $query->fetchAll(); } - public function getHostPageDescription(int $hostPageId, int $crc32data) { - - $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1'); - - $query->execute([$hostPageId, $crc32data]); - - return $query->fetch(); - } - public function getLastPageDescription(int $hostPageId) { - $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1'); + $query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? 
ORDER BY `timeAdded` DESC LIMIT 1'); $query->execute([$hostPageId]); @@ -442,7 +219,6 @@ class MySQL { public function getFoundHostPage(int $hostPageId) { $query = $this->_db->prepare('SELECT `hostPage`.`uri`, - `hostPage`.`rank`, `host`.`scheme`, `host`.`name`, `host`.`port` @@ -459,28 +235,6 @@ class MySQL { return $query->fetch(); } - public function getFoundHostImage(int $hostImageId) { - - $query = $this->_db->prepare('SELECT `hostImage`.`hostImageId`, - `hostImage`.`uri`, - `hostImage`.`rank`, - `host`.`scheme`, - `host`.`name`, - `host`.`port`, - `host`.`crawlMetaOnly` - - FROM `hostImage` - JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`) - - WHERE `hostImage`.`hostImageId` = ? - - LIMIT 1'); - - $query->execute([$hostImageId]); - - return $query->fetch(); - } - public function addHostPage(int $hostId, int $crc32uri, string $uri, @@ -488,8 +242,7 @@ class MySQL { mixed $timeUpdated = null, mixed $timeBanned = null, mixed $httpCode = null, - mixed $mime = null, - mixed $rank = null) { + mixed $mime = null) { $query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`, `crc32uri`, @@ -498,10 +251,9 @@ class MySQL { `timeUpdated`, `timeBanned`, `httpCode`, - `mime`, - `rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'); + `mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'); - $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]); + $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]); return $this->_db->lastInsertId(); } @@ -515,22 +267,6 @@ class MySQL { return $query->rowCount(); } - public function updateHostPageRank(int $hostId, - int $crc32uri, - int $increment) { - - $query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = `rank` + ' . (int) $increment . ' - - WHERE `hostId` = ? - AND `crc32uri` = ? 
- - LIMIT 1'); - - $query->execute([$hostId, $crc32uri]); - - return $query->rowCount(); - } - public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) { $query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1'); @@ -576,48 +312,52 @@ class MySQL { return $query->rowCount(); } - public function deleteHostPageToHostImage(int $hostPageId) { - - $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?'); - - $query->execute([$hostPageId]); - - return $query->rowCount(); - } - - public function setHostPageDescription(int $hostPageId, - int $crc32data, - mixed $metaTitle, - mixed $metaDescription, - mixed $metaKeywords, + public function addHostPageDescription(int $hostPageId, + mixed $title, + mixed $description, + mixed $keywords, mixed $data, - int $time) { + int $timeAdded) { $query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`, - `crc32data`, - `metaTitle`, - `metaDescription`, - `metaKeywords`, + `title`, + `description`, + `keywords`, `data`, `timeAdded` - ) VALUES (?, ?, ?, ?, ?, ?, ?) - - ON DUPLICATE KEY UPDATE `timeUpdated` = ?'); + ) VALUES (?, ?, ?, ?, ?, ?)'); $query->execute([ $hostPageId, - $crc32data, - $metaTitle, - $metaDescription, - $metaKeywords, + $title, + $description, + $keywords, $data, - $time, - $time + $timeAdded, ]); return $query->rowCount(); } + public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) { + + $query = $this->_db->prepare('INSERT INTO `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`, `quantity`) VALUES (?, ?, 0) + + ON DUPLICATE KEY UPDATE `quantity` = `quantity` + 1'); + + $query->execute([$hostPageIdSource, $hostPageIdTarget]); + + } + + public function deleteHostPageToHostPage(int $hostPageId) { + + $query = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? 
OR `hostPageIdTarget` = ?'); + + $query->execute([$hostPageId, $hostPageId]); + + return $query->rowCount(); + } + // Cleaner tools public function getCleanerQueue(int $limit, int $timeFrom) { @@ -652,33 +392,12 @@ class MySQL { return $query->rowCount(); } - public function resetBannedHostImages(int $timeOffset) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset); - - $query->execute(); - - return $query->rowCount(); - } - - public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) { - - $query = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset); - - $query->execute(); - - return $query->rowCount(); - } - public function addCleanerLog(int $timeAdded, int $hostsTotal, int $hostsUpdated, int $hostPagesDeleted, int $hostPageDescriptionsDeleted, int $hostPagesBansRemoved, - int $hostImagesDeleted, - int $hostImageDescriptionsDeleted, - int $hostImagesBansRemoved, int $manifestsTotal, int $manifestsDeleted, int $logsCleanerDeleted, @@ -695,9 +414,6 @@ class MySQL { `hostPagesDeleted`, `hostPageDescriptionsDeleted`, `hostPagesBansRemoved`, - `hostImagesDeleted`, - `hostImageDescriptionsDeleted`, - `hostImagesBansRemoved`, `manifestsTotal`, `manifestsDeleted`, `logsCleanerDeleted`, @@ -706,7 +422,7 @@ class MySQL { `httpRequestsSizeTotal`, `httpDownloadSizeTotal`, `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query->execute([ $timeAdded, @@ -715,9 +431,6 @@ class MySQL { $hostPagesDeleted, $hostPageDescriptionsDeleted, $hostPagesBansRemoved, - $hostImagesDeleted, - $hostImageDescriptionsDeleted, - $hostImagesBansRemoved, $manifestsTotal, $manifestsDeleted, $logsCleanerDeleted, @@ -751,7 +464,6 @@ class MySQL { `host`.`name`, `host`.`port`, `host`.`crawlPageLimit`, 
- `host`.`crawlImageLimit`, `host`.`crawlMetaOnly`, `host`.`robots`, `host`.`robotsPostfix` @@ -762,7 +474,7 @@ class MySQL { WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0 AND `hostPage`.`timeBanned` IS NULL - ORDER BY `hostPage`.`rank` DESC, RAND() + ORDER BY RAND() LIMIT ' . (int) $limit); @@ -780,40 +492,6 @@ class MySQL { return $query->rowCount(); } - public function getHostImageCrawlQueue(int $limit, int $timeFrom) { - - $query = $this->_db->prepare('SELECT `hostImage`.`hostId`, - `hostImage`.`hostImageId`, - `hostImage`.`uri`, - `host`.`scheme`, - `host`.`name`, - `host`.`port`, - `host`.`crawlMetaOnly` - - FROM `hostImage` - JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`) - - WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0 - AND `hostImage`.`timeBanned` IS NULL - - ORDER BY `hostImage`.`rank` DESC, RAND() - - LIMIT ' . (int) $limit); - - $query->execute([$timeFrom]); - - return $query->fetchAll(); - } - - public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) { - - $query = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? 
LIMIT 1'); - - $query->execute([$timeUpdated, $httpCode, $hostImageId]); - - return $query->rowCount(); - } - public function getManifestCrawlQueue(int $limit, int $timeFrom) { $query = $this->_db->prepare('SELECT * FROM `manifest` @@ -844,10 +522,6 @@ class MySQL { int $hostPagesIndexed, int $hostPagesAdded, int $hostPagesBanned, - int $hostImagesIndexed, - int $hostImagesProcessed, - int $hostImagesAdded, - int $hostImagesBanned, int $manifestsProcessed, int $manifestsAdded, int $httpRequestsTotal, @@ -862,17 +536,13 @@ class MySQL { `hostPagesIndexed`, `hostPagesAdded`, `hostPagesBanned`, - `hostImagesIndexed`, - `hostImagesProcessed`, - `hostImagesAdded`, - `hostImagesBanned`, `manifestsProcessed`, `manifestsAdded`, `httpRequestsTotal`, `httpRequestsSizeTotal`, `httpDownloadSizeTotal`, `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query->execute([ $timeAdded, @@ -881,10 +551,6 @@ class MySQL { $hostPagesIndexed, $hostPagesAdded, $hostPagesBanned, - $hostImagesIndexed, - $hostImagesProcessed, - $hostImagesAdded, - $hostImagesBanned, $manifestsProcessed, $manifestsAdded, $httpRequestsTotal, diff --git a/library/sphinxql.php b/library/sphinxql.php index 41379d8..8062761 100644 --- a/library/sphinxql.php +++ b/library/sphinxql.php @@ -11,13 +11,13 @@ class SphinxQL { $this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); } - public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) { + public function searchHostPages(string $keyword, string $mime, int $start, int $limit, int $maxMatches) { $query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight` FROM `hostPage` - WHERE MATCH(?) + WHERE MATCH(?) AND `mime` = ? ORDER BY `rank` DESC, WEIGHT() DESC @@ -25,26 +25,7 @@ class SphinxQL { OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? 
$maxMatches : 1)); - $query->execute([$keyword]); - - return $query->fetchAll(); - } - - public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) { - - $query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight` - - FROM `hostImage` - - WHERE MATCH(?) - - ORDER BY `rank` DESC, WEIGHT() DESC - - LIMIT ' . (int) ($start >= $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . ' - - OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1)); - - $query->execute([$keyword]); + $query->execute([$keyword, $mime]); return $query->fetchAll(); } @@ -58,29 +39,20 @@ class SphinxQL { return $query->fetch()->total; } - public function searchHostPagesTotal(string $keyword) { - - $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)'); - - $query->execute([$keyword]); - - return $query->fetch()->total; - } - - public function searchHostImagesTotal(string $keyword) { + public function getHostPagesMime() { - $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)'); + $query = $this->_sphinx->prepare('SELECT `mime` FROM `hostPage` GROUP BY `mime` ORDER BY `mime` ASC'); - $query->execute([$keyword]); + $query->execute(); - return $query->fetch()->total; + return $query->fetchAll(); } - public function getHostImagesTotal() { + public function searchHostPagesTotal(string $keyword, string $mime) { - $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage`'); + $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?) 
AND `mime` = ?'); - $query->execute(); + $query->execute([$keyword, $mime]); return $query->fetch()->total; } diff --git a/media/db-prototype.png b/media/db-prototype.png index 4e23c5b..0ce16fa 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/public/api.php b/public/api.php index becf6ae..e96b39d 100644 --- a/public/api.php +++ b/public/api.php @@ -1,7 +1,7 @@ searchHostImagesTotal(Filter::searchQuery($query, $mode)); - $sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); - - // Make default search request - } else { - - $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode)); - $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); - } + // Make search request + $sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode), $type); + $sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $type, $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal); // Generate results $dbResults = []; foreach ($sphinxResults as $i => $sphinxResult) { - // Image - if (!empty($type) && $type == 'image') { - - if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) { - - $dbResults[$i] = $hostImage; - - $dbResults[$i]->weight = $sphinxResult->weight; - } - - // Default - } else { - - if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) { + if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) { - $dbResults[$i] = $hostPage; + $dbResults[$i] = $hostPage; - $dbResults[$i]->weight = $sphinxResult->weight; - } + $dbResults[$i]->weight = 
$sphinxResult->weight; } } @@ -129,13 +106,10 @@ if (API_ENABLED) { 'crawlUrlRegexp' => CRAWL_URL_REGEXP, 'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW, 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, - 'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageMime' => CRAWL_PAGE_MIME, - 'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET, - 'crawlHostImageMime' => CRAWL_IMAGE_MIME, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, diff --git a/public/index.php b/public/index.php index a5dbfb7..6a76ada 100644 --- a/public/index.php +++ b/public/index.php @@ -24,7 +24,6 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the -