diff --git a/README.md b/README.md index 3687354..b1162b6 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ php-pdo php-curl php-gd php-mbstring +php-zip php-mysql sphinxsearch ``` @@ -37,7 +38,7 @@ sphinxsearch * Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder * Install [Sphinx Search Server](https://sphinxsearch.com) * Configuration examples are placed at `/config` folder -* Make sure `/storage` folder is writable +* Make sure the `/storage` and `/public/storage` folders are writable * Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt) #### JSON API @@ -147,7 +148,9 @@ GET m=SphinxQL * [x] Index explorer * [x] Safe images preview * [x] Extended search syntax support -* [ ] Page history snaps +* [ ] Page content snaps history + + [x] Local + + [ ] Remote ##### UI @@ -180,7 +183,9 @@ GET m=SphinxQL * [x] MIME Content-type settings * [x] Ban non-condition links to prevent extra requests * [x] Debug log -* [x] History snaps +* [ ] Page content snaps generation + + [x] Local + + [ ] Remote * [ ] Indexing new sites homepage in higher priority * [ ] Redirect codes extended processing * [ ] Palette image index / filter diff --git a/config/app.php.txt b/config/app.php.txt index a48b78d..9419fb3 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -168,7 +168,15 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); * comma separated * */ -define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,/video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac'); +define('CRAWL_PAGE_MIME_INDEX', 
'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac'); + +/* + * Save local snaps of pages matching these MIME types + * + * comma separated | false to disable + * + */ +define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml'); /* * Renew manifests index by timing offset provided @@ -266,7 +274,7 @@ define('CRAWL_MANIFEST', true); * Manifest API version compatibility * */ -define('CRAWL_MANIFEST_API_VERSION', 0.8); +define('CRAWL_MANIFEST_API_VERSION', 0.9); /* * Set default auto-crawl status for new manifest added diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 21a48e4..0b86180 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -30,7 +30,9 @@ $hostsTotal = $db->getTotalHosts(); $manifestsTotal = $db->getTotalManifests(); $hostsUpdated = 0; $hostPagesDeleted = 0; -$hostPageDescriptionsDeleted = 0; +$hostPagesDescriptionsDeleted = 0; +$hostPagesSnapUrlDeleted = 0; +$hostPagesToHostPageDeleted = 0; $manifestsDeleted = 0; $hostPagesBansRemoved = 0; @@ -74,8 +76,9 @@ try { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { // Delete host page - $db->deleteHostPageDescriptions($hostPage->hostPageId); - $db->deleteHostPageToHostPage($hostPage->hostPageId); + $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); + $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file + $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); if ($hostPage->uri != '/') { $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); @@ -91,8 +94,9 @@ try { if (!$robots->uriAllowed($hostPage->uri)) { // Delete host page - 
$db->deleteHostPageDescriptions($hostPage->hostPageId); - $db->deleteHostPageToHostPage($hostPage->hostPageId); + $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); + $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file + $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); if ($hostPage->uri != '/') { $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); @@ -162,7 +166,7 @@ try { $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); // Delete page description history - $hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); + $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); // Delete deprecated logs $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); @@ -187,7 +191,9 @@ if (CLEAN_LOG_ENABLED) { $hostsTotal, $hostsUpdated, $hostPagesDeleted, - $hostPageDescriptionsDeleted, + $hostPagesDescriptionsDeleted, + $hostPagesSnapUrlDeleted, + $hostPagesToHostPageDeleted, $hostPagesBansRemoved, $manifestsTotal, $manifestsDeleted, @@ -209,7 +215,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; -echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL; +echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL; +echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL; +echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . 
PHP_EOL; diff --git a/crontab/crawler.php b/crontab/crawler.php index 297a69e..9336fc4 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -39,6 +39,7 @@ $manifestsAdded = 0; $hostPagesAdded = 0; $hostsAdded = 0; $hostPagesBanned = 0; +$hostPagesSnapUrlAdded = 0; // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); @@ -251,11 +252,11 @@ try { continue; } - // Parse MIME + // Parse index MIME $hostPageIsDom = false; $hostPageInMime = false; - foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { + foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { $mime = Filter::mime($mime); @@ -383,6 +384,48 @@ try { } } + // Save local snap + if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { + + foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { + + $mime = Filter::mime($mime); + + // MIME type allowed in settings + if (false !== stripos(Filter::mime($contentType), $mime)) { + + $crc32data = crc32($content); + $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host + + // Create not duplicated data snaps only for each storage host + if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) { + + $time = time(); + // Directory mode must be an octal literal: decimal 755 would be read as octal 01363 + @mkdir('../public/storage/snap/hp/' . $queueHostPage->hostPageId, 0755, true); + + $zip = new ZipArchive(); + + if (true === $zip->open('../public/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . '.zip', ZipArchive::CREATE)) { + + if (true === $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content)) { + + $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId, + $crc32data, // do not create duplicated content snaps + $crc32host, // multi host storage with same timestamp / crc32data + '/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . 
'.zip', // public url + $time); + + $zip->close(); + + break; + } + } + } + } + } + } + // Begin page links collection $links = []; @@ -700,6 +743,7 @@ if (CRAWL_LOG_ENABLED) { $hostPagesProcessed, $hostPagesIndexed, $hostPagesAdded, + $hostPagesSnapUrlAdded, $hostPagesBanned, $manifestsProcessed, $manifestsAdded, @@ -716,6 +760,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; +echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index e6c58ae..ba5317c 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/mysql.php b/library/mysql.php index 6cd213c..2fdc331 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -360,6 +360,63 @@ class MySQL { return $query->fetchAll(); } + public function addHostPageSnapURL(int $hostPageId, + int $crc32data, + int $crc32host, + string $url, + int $timeAdded) { + + $query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`, + `crc32data`, + `crc32host`, + `url`, + `timeAdded`) VALUES (?, ?, ?, ?, ?)'); + + $query->execute([$hostPageId, + $crc32data, + $crc32host, + $url, + $timeAdded]); + + return $query->rowCount(); + } + + public function deleteHostPageSnapURL(int $hostPageId) { + + $query = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); + + $query->execute([$hostPageId]); + + return $query->rowCount(); + } + + public function getTotalHostPageSnapURLs(int $hostPageId) { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); + + $query->execute([$hostPageId]); + + return $query->fetch()->total; + } + + public function getHostPageSnapURLs(int $hostPageId) 
{ + + $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC'); + + $query->execute([$hostPageId]); + + return $query->fetchAll(); + } + + public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) { + // Placeholders bind in order to hostPageId, crc32data, crc32host, so the content checksum column must be matched here + $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `crc32data` = ? AND `crc32host` = ? LIMIT 1'); + + $query->execute([$hostPageId, $crc32data, $crc32host]); + + return $query->fetch(); + } + // Cleaner tools public function getCleanerQueue(int $limit, int $timeFrom) { @@ -398,7 +455,9 @@ class MySQL { int $hostsTotal, int $hostsUpdated, int $hostPagesDeleted, - int $hostPageDescriptionsDeleted, + int $hostPagesDescriptionsDeleted, + int $hostPagesSnapUrlDeleted, + int $hostPagesToHostPageDeleted, int $hostPagesBansRemoved, int $manifestsTotal, int $manifestsDeleted, @@ -414,7 +473,9 @@ class MySQL { `hostsTotal`, `hostsUpdated`, `hostPagesDeleted`, - `hostPageDescriptionsDeleted`, + `hostPagesDescriptionsDeleted`, + `hostPagesSnapUrlDeleted`, + `hostPagesToHostPageDeleted`, `hostPagesBansRemoved`, `manifestsTotal`, `manifestsDeleted`, @@ -424,14 +485,16 @@ class MySQL { `httpRequestsSizeTotal`, `httpDownloadSizeTotal`, `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query->execute([ $timeAdded, $hostsTotal, $hostsUpdated, $hostPagesDeleted, - $hostPageDescriptionsDeleted, + $hostPagesDescriptionsDeleted, + $hostPagesSnapUrlDeleted, + $hostPagesToHostPageDeleted, $hostPagesBansRemoved, $manifestsTotal, $manifestsDeleted, @@ -523,6 +586,7 @@ class MySQL { int $hostPagesProcessed, int $hostPagesIndexed, int $hostPagesAdded, + int $hostPagesSnapUrlAdded, int $hostPagesBanned, int $manifestsProcessed, int $manifestsAdded, @@ -537,6 +601,7 @@ class MySQL { `hostPagesProcessed`, `hostPagesIndexed`, 
`hostPagesAdded`, + `hostPagesSnapUrlAdded`, `hostPagesBanned`, `manifestsProcessed`, `manifestsAdded`, @@ -544,7 +609,7 @@ class MySQL { `httpRequestsSizeTotal`, `httpDownloadSizeTotal`, `httpRequestsTimeTotal`, - `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query->execute([ $timeAdded, @@ -552,6 +617,7 @@ class MySQL { $hostPagesProcessed, $hostPagesIndexed, $hostPagesAdded, + $hostPagesSnapUrlAdded, $hostPagesBanned, $manifestsProcessed, $manifestsAdded, diff --git a/media/db-prototype.png b/media/db-prototype.png index 0ce16fa..55b62a4 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/public/api.php b/public/api.php index f05c384..cd6f9b1 100644 --- a/public/api.php +++ b/public/api.php @@ -1,7 +1,7 @@ CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, - 'crawlHostPageMime' => CRAWL_PAGE_MIME, + 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX, + 'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, diff --git a/public/explore.php b/public/explore.php index de1b471..d65f308 100644 --- a/public/explore.php +++ b/public/explore.php @@ -234,13 +234,30 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the

timeAdded) ?>

timeUpdated) ?>

- getTotalHostPageIdSourcesByHostPageIdTarget($hp)) { ?> -

- -

+ getTotalHostPageSnapURLs($hp); ?> +

+ +

+ + getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?> +

+ + timeAdded) ?> + +

+ + + getTotalHostPageIdSourcesByHostPageIdTarget($hp); ?> +

+ +

+ getHostPageIdSourcesByHostPageIdTarget($hp) as $hostPageIdSource) { ?> getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>