diff --git a/.gitignore b/.gitignore index 8d252b7..133e487 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,3 @@ config/app.php config/sphinx.conf database/yggo.mwb.bak - -storage -public/snap diff --git a/README.md b/README.md index c179686..311963e 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ sphinxsearch * Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder * Install [Sphinx Search Server](https://sphinxsearch.com) * Configuration examples are placed at `/config` folder -* Make sure `/storage`, `/public/snap` folders writable +* Make sure `/storage/cache`, `/storage/tmp`, `/public/snap` folders writable * Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt) #### JSON API @@ -148,9 +148,11 @@ GET m=SphinxQL * [x] Index explorer * [x] Safe images preview * [x] Extended search syntax support -* [ ] Compressed page snaps history +* [x] Compressed, configurable page history snaps with multi-provider storage + [x] Local - + [ ] Remote + + [x] Remote + + [x] MEGAcmd/FTP + + [ ] Yggdrasil ##### UI @@ -159,7 +161,7 @@ GET m=SphinxQL * [x] Content genre tabs (#1) * [x] Page index explorer + [x] Meta - + [x] Snaps + + [x] Snaps history + [x] Referrers * [ ] Results with found matches highlight * [ ] The time machine feature by content snaps history @@ -170,6 +172,7 @@ GET m=SphinxQL + [x] Manifest + [x] Search + [x] Hosts + + [ ] Snaps + [ ] MIME list * [ ] Context advertising API @@ -186,9 +189,6 @@ GET m=SphinxQL * [x] MIME Content-type settings * [x] Ban non-condition links to prevent extra requests * [x] Debug log -* [ ] Page content snaps generation - + [x] Local - + [ ] Remote * [ ] Indexing new sites homepage in higher priority * [ ] Redirect codes extended processing * [ ] Palette image index / filter diff --git a/config/app.php.txt b/config/app.php.txt index f9da00e..4f54080 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -74,6 +74,20 @@ define('DB_PASSWORD', ''); define('SPHINX_HOST', '127.0.0.1'); define('SPHINX_PORT', 9306); +// Third-party connections (optional) + +/* + * Mega.nz remote storage + * + * FTP storage integration through MEGAcmd (https://mega.io/cmd) + * + * Connect mega-ftp instance on CRAWL_PAGE_MIME_SNAP_MEGA enabled + * + */ +define('MEGA_FTP_HOST', '127.0.0.1'); +define('MEGA_FTP_PORT', 4990); +define('MEGA_FTP_DIRECTORY', ''); + // Proxy settings /* @@ -176,7 +190,21 @@ define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,imag * comma separated | false to disable * */ -define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml'); +define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html'); + +/* + * Snap pages to mega.nz match MIME types + * + * comma separated | false to disable + * + * Requires connection: + * + * MEGA_FTP_HOST + * MEGA_FTP_PORT + * MEGA_FTP_DIRECTORY + * + */ +define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico'); /* * Renew manifests index by timing offset provided diff --git a/crontab/cleaner.php b/crontab/cleaner.php index e61c408..91716b4 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -84,8 +84,35 @@ try { $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); // Delete host page snaps + $snapFilePath = chunk_split($hostPage->hostPageId, 1, '/'); + foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { - if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileLocalExists = (bool) $hostPageSnap->storageLocal; + $snapFileMegaExists = (bool) $hostPageSnap->storageMega; + + if ($snapFileLocalExists) { + + if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileLocalExists = false; + } + } + + if ($snapFileMegaExists) { + + $ftp = new Ftp(); + + if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { + + if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileMegaExists = false; + } + } + } + + if (!$snapFileLocalExists && !$snapFileMegaExists) { $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); } } @@ -110,8 +137,35 @@ try { $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); // Delete host page snaps + $snapFilePath = chunk_split($hostPage->hostPageId, 1, '/'); + foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { - if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileLocalExists = (bool) $hostPageSnap->storageLocal; + $snapFileMegaExists = (bool) $hostPageSnap->storageMega; + + if ($snapFileLocalExists) { + + if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileLocalExists = false; + } + } + + if ($snapFileMegaExists) { + + $ftp = new Ftp(); + + if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { + + if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) { + + $snapFileMegaExists = false; + } + } + } + + if (!$snapFileLocalExists && !$snapFileMegaExists) { $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); } } diff --git a/crontab/crawler.php b/crontab/crawler.php index ecdcf72..d25ba0c 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -11,6 +11,7 @@ if (false === sem_acquire($semaphore, true)) { // Load system dependencies require_once('../config/app.php'); +require_once('../library/ftp.php'); require_once('../library/curl.php'); require_once('../library/robots.php'); require_once('../library/filter.php'); @@ -384,7 +385,11 @@ try { } } - // Save local snap + // Begin snaps + $snapLocal = false; + $snapMega = false; + + // Snap local enabled and MIME in white list if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { @@ -394,40 +399,103 @@ try { // MIME type allowed in settings if (false !== stripos(Filter::mime($contentType), $mime)) { - $crc32data = crc32($content); + $snapLocal = true; + break; + } + } + } - // Create not duplicated data snaps only for each storage host - if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) { + // Snap MEGA enabled and MIME in white list + if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) { - $time = time(); + foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { - $directory = chunk_split($queueHostPage->hostPageId, 1, '/'); + $mime = Filter::mime($mime); - @mkdir('../public/snap/hp/' . $directory, 0755, true); + // MIME type allowed in settings + if (false !== stripos(Filter::mime($contentType), $mime)) { - $zip = new ZipArchive(); + $snapMega = true; + break; + } + } + } - // Create new container - if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) { + // At least one snap storage match settings condition + if ($snapLocal || $snapMega) { - // Insert compressed snap data - if (true === $zip->addFromString('DATA', $content) && - true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL . - sprintf('CRC32: %s', $crc32data . PHP_EOL . - sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . - sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL . - sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { + $crc32data = crc32($content); - // Update DB registry - $hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time); - } - } + // Create not duplicated data snaps only, even new time + if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) { + + $snapTime = time(); + $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); + + $snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; + @mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); + + // Create new ZIP container + $zip = new ZipArchive(); + + if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { + + // Insert compressed snap data into the tmp storage + if (true === $zip->addFromString('DATA', $content) && + true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . + sprintf('CRC32: %s', $crc32data . PHP_EOL . + sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . + sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . + sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { + // Done $zip->close(); - } - break; + // Temporarily snap file exists + if (file_exists($snapTmp)) { + + // Register snap in DB + if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { + + $hostPagesSnapAdded++; + + // Copy tmp snap to the permanent local storage + if ($snapLocal) { + + @mkdir('../public/snap/hp/' . $snapPath, 0755, true); + + if (copy($snapTmp, '../public/snap/hp/' . $snapPath . $snapTime . '.zip')) { + + // Update snap location info + $db->updateHostPageSnapStorageLocal($hostPageSnapId, true); + } + } + + // Copy tmp snap to the permanent MEGA storage + if ($snapMega) { + + $ftp = new Ftp(); + + if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { + + $ftp->mkdir('hp/' . $snapPath, true); + + if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { + + // Update snap location info + $db->updateHostPageSnapStorageMega($hostPageSnapId, true); + } + + $ftp->close(); + } + } + } + } + } } + + // Remove tmp + @unlink($snapTmp); } } diff --git a/database/yggo.mwb b/database/yggo.mwb index e029b60..0a4d9e3 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/ftp.php b/library/ftp.php new file mode 100644 index 0000000..1b51f0f --- /dev/null +++ b/library/ftp.php @@ -0,0 +1,90 @@ +_passive = $passive; + } + + public function connect(string $host, + int $port, + mixed $login = null, + mixed $password = null, + string $directory = '/', + int $timeout = 90) { + + if (!$this->_connection = ftp_connect($host, $port, $timeout)) { + + return false; + } + + if (!ftp_pasv($this->_connection, $this->_passive)) { + + return false; + } + + if (!empty($login) && !empty($password)) { + + if (!ftp_login($this->_connection, $login, $password)) { + + return false; + } + } + + return ftp_chdir($this->_connection, $directory); + } + + public function delete(string $target) { + + return ftp_delete($this->_connection, $target); + } + + public function copy(string $source, string $target) { + + return ftp_put($this->_connection, $target, $source); + } + + public function get(string $source, string $target) { + + return ftp_get($this->_connection, $source, $target); + } + + public function mkdir(string $name, bool $recursive = false) { + + if ($recursive) { + + $path = []; + + foreach ((array) explode('/', trim($name, '/')) as $directory) { + + $path[] = $directory; + + @ftp_mkdir($this->_connection, implode('/', $path)); + } + + } else { + + @ftp_mkdir($this->_connection, $name); + } + } + + public function size(string $target) { + + if (-1 !== $size = ftp_size($this->_connection, $target)) { + + return $size; + + } + + return false; + } + + public function close() { + + return ftp_close($this->_connection); + } +} \ No newline at end of file diff --git a/library/mysql.php b/library/mysql.php index 501a2f4..4d0f159 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -362,12 +362,30 @@ class MySQL { public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) { - $query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`, - `crc32data`, - `timeAdded`) VALUES (?, ?, ?)'); + $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, + `crc32data`, + `timeAdded`) VALUES (?, ?, ?)'); $query->execute([$hostPageId, $crc32data, $timeAdded]); + return $this->_db->lastInsertId(); + } + + public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) { + + $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1'); + + $query->execute([$value, $hostPageSnapId]); + + return $query->rowCount(); + } + + public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) { + + $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1'); + + $query->execute([$value, $hostPageSnapId]); + return $query->rowCount(); } diff --git a/media/db-prototype.png b/media/db-prototype.png index 0ea432b..da660df 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/storage/tmp/index.html b/storage/tmp/index.html new file mode 100644 index 0000000..e69de29