Browse Source

integrate optional MEGA/cmd snap storage

main
ghost 2 years ago
parent
commit
1969707eeb
  1. 3
      .gitignore
  2. 14
      README.md
  3. 30
      config/app.php.txt
  4. 58
      crontab/cleaner.php
  5. 114
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 90
      library/ftp.php
  8. 24
      library/mysql.php
  9. BIN
      media/db-prototype.png
  10. 0
      storage/tmp/index.html

3
.gitignore vendored

@ -5,6 +5,3 @@ config/app.php
config/sphinx.conf config/sphinx.conf
database/yggo.mwb.bak database/yggo.mwb.bak
storage
public/snap

14
README.md

@ -38,7 +38,7 @@ sphinxsearch
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder * Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com) * Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples are placed at `/config` folder * Configuration examples are placed at `/config` folder
* Make sure `/storage`, `/public/snap` folders writable * Make sure `/storage/cache`, `/storage/tmp`, `/public/snap` folders writable
* Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt) * Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
#### JSON API #### JSON API
@ -148,9 +148,11 @@ GET m=SphinxQL
* [x] Index explorer * [x] Index explorer
* [x] Safe images preview * [x] Safe images preview
* [x] Extended search syntax support * [x] Extended search syntax support
* [ ] Compressed page snaps history * [x] Compressed, configurable page history snaps with multi-provider storage
+ [x] Local + [x] Local
+ [ ] Remote + [x] Remote
+ [x] MEGAcmd/FTP
+ [ ] Yggdrasil
##### UI ##### UI
@ -159,7 +161,7 @@ GET m=SphinxQL
* [x] Content genre tabs (#1) * [x] Content genre tabs (#1)
* [x] Page index explorer * [x] Page index explorer
+ [x] Meta + [x] Meta
+ [x] Snaps + [x] Snaps history
+ [x] Referrers + [x] Referrers
* [ ] Results with found matches highlight * [ ] Results with found matches highlight
* [ ] The time machine feature by content snaps history * [ ] The time machine feature by content snaps history
@ -170,6 +172,7 @@ GET m=SphinxQL
+ [x] Manifest + [x] Manifest
+ [x] Search + [x] Search
+ [x] Hosts + [x] Hosts
+ [ ] Snaps
+ [ ] MIME list + [ ] MIME list
* [ ] Context advertising API * [ ] Context advertising API
@ -186,9 +189,6 @@ GET m=SphinxQL
* [x] MIME Content-type settings * [x] MIME Content-type settings
* [x] Ban non-condition links to prevent extra requests * [x] Ban non-condition links to prevent extra requests
* [x] Debug log * [x] Debug log
* [ ] Page content snaps generation
+ [x] Local
+ [ ] Remote
* [ ] Indexing new sites homepage in higher priority * [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing * [ ] Redirect codes extended processing
* [ ] Palette image index / filter * [ ] Palette image index / filter

30
config/app.php.txt

@ -74,6 +74,20 @@ define('DB_PASSWORD', '');
define('SPHINX_HOST', '127.0.0.1'); define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306); define('SPHINX_PORT', 9306);
// Third-party connections (optional)
/*
* Mega.nz remote storage
*
* FTP storage integration through MEGAcmd (https://mega.io/cmd)
*
* Connects to the mega-ftp instance when CRAWL_PAGE_MIME_SNAP_MEGA is enabled
*
*/
define('MEGA_FTP_HOST', '127.0.0.1');
define('MEGA_FTP_PORT', 4990);
define('MEGA_FTP_DIRECTORY', '');
// Proxy settings // Proxy settings
/* /*
@ -176,7 +190,21 @@ define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,imag
* comma separated | false to disable * comma separated | false to disable
* *
*/ */
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml'); define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html');
/*
* Snap pages to mega.nz when they match these MIME types
*
* comma separated | false to disable
*
* Requires connection:
*
* MEGA_FTP_HOST
* MEGA_FTP_PORT
* MEGA_FTP_DIRECTORY
*
*/
define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico');
/* /*
* Renew manifests index by timing offset provided * Renew manifests index by timing offset provided

58
crontab/cleaner.php

@ -84,8 +84,35 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps // Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = (bool) $hostPageSnap->storageLocal;
$snapFileMegaExists = (bool) $hostPageSnap->storageMega;
if ($snapFileLocalExists) {
if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = false;
}
}
if ($snapFileMegaExists) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileMegaExists = false;
}
}
}
if (!$snapFileLocalExists && !$snapFileMegaExists) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }
@ -110,8 +137,35 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps // Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = (bool) $hostPageSnap->storageLocal;
$snapFileMegaExists = (bool) $hostPageSnap->storageMega;
if ($snapFileLocalExists) {
if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = false;
}
}
if ($snapFileMegaExists) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileMegaExists = false;
}
}
}
if (!$snapFileLocalExists && !$snapFileMegaExists) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }

114
crontab/crawler.php

@ -11,6 +11,7 @@ if (false === sem_acquire($semaphore, true)) {
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
require_once('../library/ftp.php');
require_once('../library/curl.php'); require_once('../library/curl.php');
require_once('../library/robots.php'); require_once('../library/robots.php');
require_once('../library/filter.php'); require_once('../library/filter.php');
@ -384,7 +385,11 @@ try {
} }
} }
// Save local snap // Begin snaps
$snapLocal = false;
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
@ -394,40 +399,103 @@ try {
// MIME type allowed in settings // MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) { if (false !== stripos(Filter::mime($contentType), $mime)) {
$crc32data = crc32($content); $snapLocal = true;
break;
}
}
}
// Create not duplicated data snaps only for each storage host // Snap MEGA enabled and MIME in white list
if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) { if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
$time = time(); foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
$directory = chunk_split($queueHostPage->hostPageId, 1, '/'); $mime = Filter::mime($mime);
@mkdir('../public/snap/hp/' . $directory, 0755, true); // MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) {
$zip = new ZipArchive(); $snapMega = true;
break;
}
}
}
// Create new container // At least one snap storage match settings condition
if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) { if ($snapLocal || $snapMega) {
// Insert compressed snap data $crc32data = crc32($content);
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Update DB registry // Create not duplicated data snaps only, even new time
$hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time); if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
}
} $snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done
$zip->close(); $zip->close();
}
break; // Temporarily snap file exists
if (file_exists($snapTmp)) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
$hostPagesSnapAdded++;
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
@mkdir('../public/snap/hp/' . $snapPath, 0755, true);
if (copy($snapTmp, '../public/snap/hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
}
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
$ftp->close();
}
}
}
}
}
} }
// Remove tmp
@unlink($snapTmp);
} }
} }

BIN
database/yggo.mwb

Binary file not shown.

90
library/ftp.php

@ -0,0 +1,90 @@
<?php
/**
 * Thin wrapper around PHP's ftp_* functions that keeps a single connection handle.
 *
 * Every method except connect() forwards straight to the matching ftp_* function,
 * so they must only be called after connect() has returned true.
 */
class Ftp {

  // Handle returned by ftp_connect(); false after a failed or aborted connect()
  private $_connection;

  // Passive mode flag handed to ftp_pasv() during connect()
  private $_passive;

  /**
   * @param bool $passive Turn passive mode on (true) or off (false) once connected
   */
  public function __construct(bool $passive = true) {

    $this->_passive = $passive;
  }

  /**
   * Open the connection, optionally log in, apply the passive mode flag
   * and change into the working directory.
   *
   * If any step after the socket is opened fails, the connection is closed again
   * so a failed connect() does not leave an open handle behind.
   *
   * @param string $host      FTP server address
   * @param int    $port      FTP server port
   * @param mixed  $login     User name; login is skipped when this or $password is empty
   * @param mixed  $password  Password; login is skipped when this or $login is empty
   * @param string $directory Directory passed to ftp_chdir() after connecting
   * @param int    $timeout   Network timeout in seconds passed to ftp_connect()
   *
   * @return bool true when every step succeeded, false otherwise
   */
  public function connect(string $host,
                          int $port,
                          mixed $login = null,
                          mixed $password = null,
                          string $directory = '/',
                          int $timeout = 90): bool {

    if (!$this->_connection = ftp_connect($host, $port, $timeout)) {

      return false;
    }

    // Authenticate before switching modes: the PHP manual states that
    // ftp_pasv() can only be called after a successful login
    if (!empty($login) && !empty($password)) {

      if (!ftp_login($this->_connection, $login, $password)) {

        return $this->_abort();
      }
    }

    if (!ftp_pasv($this->_connection, $this->_passive)) {

      return $this->_abort();
    }

    if (!ftp_chdir($this->_connection, $directory)) {

      return $this->_abort();
    }

    return true;
  }

  /**
   * Delete a file on the server.
   *
   * @param string $target Remote path passed to ftp_delete()
   *
   * @return bool Result of ftp_delete()
   */
  public function delete(string $target): bool {

    return ftp_delete($this->_connection, $target);
  }

  /**
   * Upload a local file to the server.
   *
   * @param string $source Local file path
   * @param string $target Remote file path
   *
   * @return bool Result of ftp_put()
   */
  public function copy(string $source, string $target): bool {

    return ftp_put($this->_connection, $target, $source);
  }

  /**
   * Download a file through ftp_get().
   *
   * $source is forwarded as ftp_get()'s local file argument and $target as its
   * remote file argument.
   *
   * NOTE(review): this is the opposite direction to copy(), where $source is the
   * local file - confirm against callers before relying on the parameter names.
   *
   * @param string $source Forwarded as the local file name
   * @param string $target Forwarded as the remote file name
   *
   * @return bool Result of ftp_get()
   */
  public function get(string $source, string $target): bool {

    return ftp_get($this->_connection, $source, $target);
  }

  /**
   * Create a directory on the server, best effort.
   *
   * Errors from ftp_mkdir() are suppressed and not reported to the caller.
   *
   * @param string $name      Directory path
   * @param bool   $recursive Create each path segment in turn, parents first
   */
  public function mkdir(string $name, bool $recursive = false): void {

    if ($recursive) {

      $path = [];

      // Leading and trailing slashes are trimmed, so segments are created
      // relative to the current remote directory
      foreach ((array) explode('/', trim($name, '/')) as $directory) {

        $path[] = $directory;

        @ftp_mkdir($this->_connection, implode('/', $path));
      }

    } else {

      @ftp_mkdir($this->_connection, $name);
    }
  }

  /**
   * Get the size of a remote file.
   *
   * @param string $target Remote file path
   *
   * @return int|false Size reported by ftp_size(), false when it reports -1
   */
  public function size(string $target): int|false {

    if (-1 !== $size = ftp_size($this->_connection, $target)) {

      return $size;
    }

    return false;
  }

  /**
   * Close the connection.
   *
   * @return bool Result of ftp_close()
   */
  public function close(): bool {

    return ftp_close($this->_connection);
  }

  // Close a connection that could not be fully set up and report the failure
  private function _abort(): bool {

    @ftp_close($this->_connection);

    $this->_connection = false;

    return false;
  }
}

24
library/mysql.php

@ -362,12 +362,30 @@ class MySQL {
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) { public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`, $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`,
`crc32data`, `crc32data`,
`timeAdded`) VALUES (?, ?, ?)'); `timeAdded`) VALUES (?, ?, ?)');
$query->execute([$hostPageId, $crc32data, $timeAdded]); $query->execute([$hostPageId, $crc32data, $timeAdded]);
return $this->_db->lastInsertId();
}
/**
 * Set the `storageLocal` column of one `hostPageSnap` row.
 *
 * @param int   $hostPageSnapId Row to update, matched on `hostPageSnapId`
 * @param mixed $value          Value written to `storageLocal`
 *
 * @return int Row count reported by the executed statement
 */
public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {

  $statement = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

  // Placeholder order: the new column value first, then the row identifier
  $statement->execute([$value, $hostPageSnapId]);

  return $statement->rowCount();
}
public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {
$query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1');
$query->execute([$value, $hostPageSnapId]);
return $query->rowCount(); return $query->rowCount();
} }

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 159 KiB

After

Width:  |  Height:  |  Size: 161 KiB

0
storage/tmp/index.html

Loading…
Cancel
Save