Browse Source

integrate optional MEGA/cmd snap storage

main
ghost 2 years ago
parent
commit
1969707eeb
  1. 3
      .gitignore
  2. 14
      README.md
  3. 30
      config/app.php.txt
  4. 58
      crontab/cleaner.php
  5. 98
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 90
      library/ftp.php
  8. 20
      library/mysql.php
  9. BIN
      media/db-prototype.png
  10. 0
      storage/tmp/index.html

3
.gitignore vendored

@ -5,6 +5,3 @@ config/app.php @@ -5,6 +5,3 @@ config/app.php
config/sphinx.conf
database/yggo.mwb.bak
storage
public/snap

14
README.md

@ -38,7 +38,7 @@ sphinxsearch @@ -38,7 +38,7 @@ sphinxsearch
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples are placed at `/config` folder
* Make sure `/storage`, `/public/snap` folders writable
* Make sure the `/storage/cache`, `/storage/tmp` and `/public/snap` folders are writable
* Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
#### JSON API
@ -148,9 +148,11 @@ GET m=SphinxQL @@ -148,9 +148,11 @@ GET m=SphinxQL
* [x] Index explorer
* [x] Safe images preview
* [x] Extended search syntax support
* [ ] Compressed page snaps history
* [x] Compressed, configurable page history snaps with multi-provider storage
+ [x] Local
+ [ ] Remote
+ [x] Remote
+ [x] MEGAcmd/FTP
+ [ ] Yggdrasil
##### UI
@ -159,7 +161,7 @@ GET m=SphinxQL @@ -159,7 +161,7 @@ GET m=SphinxQL
* [x] Content genre tabs (#1)
* [x] Page index explorer
+ [x] Meta
+ [x] Snaps
+ [x] Snaps history
+ [x] Referrers
* [ ] Results with found matches highlight
* [ ] The time machine feature by content snaps history
@ -170,6 +172,7 @@ GET m=SphinxQL @@ -170,6 +172,7 @@ GET m=SphinxQL
+ [x] Manifest
+ [x] Search
+ [x] Hosts
+ [ ] Snaps
+ [ ] MIME list
* [ ] Context advertising API
@ -186,9 +189,6 @@ GET m=SphinxQL @@ -186,9 +189,6 @@ GET m=SphinxQL
* [x] MIME Content-type settings
* [x] Ban non-condition links to prevent extra requests
* [x] Debug log
* [ ] Page content snaps generation
+ [x] Local
+ [ ] Remote
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter

30
config/app.php.txt

@ -74,6 +74,20 @@ define('DB_PASSWORD', ''); @@ -74,6 +74,20 @@ define('DB_PASSWORD', '');
define('SPHINX_HOST', '127.0.0.1');
define('SPHINX_PORT', 9306);
// Third-party connections (optional)
/*
* Mega.nz remote storage
*
* FTP storage integration through MEGAcmd (https://mega.io/cmd)
*
* The mega-ftp instance is connected when CRAWL_PAGE_MIME_SNAP_MEGA is enabled
*
*/
define('MEGA_FTP_HOST', '127.0.0.1');
define('MEGA_FTP_PORT', 4990);
define('MEGA_FTP_DIRECTORY', '');
// Proxy settings
/*
@ -176,7 +190,21 @@ define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,imag @@ -176,7 +190,21 @@ define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,imag
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html');
/*
* Snap pages whose MIME type matches this list to mega.nz
*
* comma separated | false to disable
*
* Requires connection:
*
* MEGA_FTP_HOST
* MEGA_FTP_PORT
* MEGA_FTP_DIRECTORY
*
*/
define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico');
/*
* Renew manifests index by timing offset provided

58
crontab/cleaner.php

@ -84,8 +84,35 @@ try { @@ -84,8 +84,35 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = (bool) $hostPageSnap->storageLocal;
$snapFileMegaExists = (bool) $hostPageSnap->storageMega;
if ($snapFileLocalExists) {
if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = false;
}
}
if ($snapFileMegaExists) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileMegaExists = false;
}
}
}
if (!$snapFileLocalExists && !$snapFileMegaExists) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
@ -110,8 +137,35 @@ try { @@ -110,8 +137,35 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = (bool) $hostPageSnap->storageLocal;
$snapFileMegaExists = (bool) $hostPageSnap->storageMega;
if ($snapFileLocalExists) {
if (unlink('../public/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileLocalExists = false;
}
}
if ($snapFileMegaExists) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
if ($ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip')) {
$snapFileMegaExists = false;
}
}
}
if (!$snapFileLocalExists && !$snapFileMegaExists) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}

98
crontab/crawler.php

@ -11,6 +11,7 @@ if (false === sem_acquire($semaphore, true)) { @@ -11,6 +11,7 @@ if (false === sem_acquire($semaphore, true)) {
// Load system dependencies
require_once('../config/app.php');
require_once('../library/ftp.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
@ -384,7 +385,11 @@ try { @@ -384,7 +385,11 @@ try {
}
}
// Save local snap
// Begin snaps
$snapLocal = false;
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
@ -394,42 +399,105 @@ try { @@ -394,42 +399,105 @@ try {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) {
$snapLocal = true;
break;
}
}
}
// Snap MEGA enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
$mime = Filter::mime($mime);
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), $mime)) {
$snapMega = true;
break;
}
}
}
// At least one snap storage match settings condition
if ($snapLocal || $snapMega) {
$crc32data = crc32($content);
// Create not duplicated data snaps only for each storage host
// Create not duplicated data snaps only, even new time
if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$time = time();
$snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$directory = chunk_split($queueHostPage->hostPageId, 1, '/');
@mkdir('../public/snap/hp/' . $directory, 0755, true);
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
// Create new container
if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) {
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
// Insert compressed snap data
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL .
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Update DB registry
$hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time);
// Done
$zip->close();
// Temporarily snap file exists
if (file_exists($snapTmp)) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
$hostPagesSnapAdded++;
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
@mkdir('../public/snap/hp/' . $snapPath, 0755, true);
if (copy($snapTmp, '../public/snap/hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
}
$zip->close();
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
break;
$ftp->close();
}
}
}
}
}
}
// Remove tmp
@unlink($snapTmp);
}
}
// Begin page links collection
$links = [];

BIN
database/yggo.mwb

Binary file not shown.

90
library/ftp.php

@ -0,0 +1,90 @@ @@ -0,0 +1,90 @@
<?php
/**
 * Minimal wrapper around PHP's ftp_* functions.
 *
 * Holds a single connection handle and exposes the few operations used for
 * snap storage: upload, download, delete, mkdir, size and close.
 *
 * All path arguments named "remote" are resolved by the FTP server relative
 * to the working directory selected in connect().
 */
class Ftp {

  /**
   * Connection handle returned by ftp_connect(), or null when not connected
   */
  private $_connection;

  /**
   * Passive mode flag applied to the connection in connect()
   */
  private $_passive;

  /**
   * @param bool $passive Turn passive mode on (true) or off (false) on connect
   */
  public function __construct(bool $passive = true) {

    $this->_passive = $passive;
  }

  /**
   * Open the connection, optionally log in, apply the passive mode flag and
   * change to the working directory.
   *
   * On any failure after the socket was opened, the connection is closed
   * again so no half-initialized handle is kept.
   *
   * @param string $host      FTP server host
   * @param int    $port      FTP server port
   * @param mixed  $login     User name; login is skipped when login or password is empty
   * @param mixed  $password  Password; login is skipped when login or password is empty
   * @param string $directory Working directory; an empty string keeps the server default
   * @param int    $timeout   Network timeout in seconds passed to ftp_connect()
   *
   * @return bool true when every step succeeded, false otherwise
   */
  public function connect(string $host,
                          int    $port,
                          mixed  $login = null,
                          mixed  $password = null,
                          string $directory = '/',
                          int    $timeout = 90) {

    $connection = ftp_connect($host, $port, $timeout);

    if (!$connection) {

      $this->_connection = null;

      return false;
    }

    $this->_connection = $connection;

    // Authenticate first: ftp_pasv() is documented to work only after a successful login
    if (!empty($login) && !empty($password)) {

      if (!ftp_login($this->_connection, $login, $password)) {

        return $this->_fail();
      }
    }

    if (!ftp_pasv($this->_connection, $this->_passive)) {

      return $this->_fail();
    }

    // No directory configured: stay where the server put us instead of sending CWD without argument
    if ('' === $directory) {

      return true;
    }

    if (!ftp_chdir($this->_connection, $directory)) {

      return $this->_fail();
    }

    return true;
  }

  /**
   * Delete a remote file
   *
   * @param string $target Remote file path
   *
   * @return bool Result of ftp_delete()
   */
  public function delete(string $target) {

    return ftp_delete($this->_connection, $target);
  }

  /**
   * Upload a local file to the server
   *
   * @param string $source Local file path
   * @param string $target Remote file path
   *
   * @return bool Result of ftp_put()
   */
  public function copy(string $source, string $target) {

    return ftp_put($this->_connection, $target, $source);
  }

  /**
   * Download a remote file to the local file system
   *
   * @param string $source Remote file path
   * @param string $target Local file path
   *
   * @return bool Result of ftp_get()
   */
  public function get(string $source, string $target) {

    // ftp_get() expects the local file name first, then the remote one
    return ftp_get($this->_connection, $target, $source);
  }

  /**
   * Create a remote directory
   *
   * Failures are suppressed on purpose: in recursive mode the leading path
   * segments usually exist already and ftp_mkdir() would warn for each one.
   *
   * @param string $name      Remote directory path
   * @param bool   $recursive Create every segment of the path one by one
   *
   * @return void
   */
  public function mkdir(string $name, bool $recursive = false) {

    if ($recursive) {

      $path = [];

      foreach ((array) explode('/', trim($name, '/')) as $directory) {

        $path[] = $directory;

        @ftp_mkdir($this->_connection, implode('/', $path));
      }

    } else {

      @ftp_mkdir($this->_connection, $name);
    }
  }

  /**
   * Get the size of a remote file
   *
   * @param string $target Remote file path
   *
   * @return int|false Size in bytes, or false when ftp_size() reports -1
   */
  public function size(string $target) {

    if (-1 !== $size = ftp_size($this->_connection, $target)) {

      return $size;
    }

    return false;
  }

  /**
   * Close the connection
   *
   * @return bool Result of ftp_close(), or false when there is no open connection
   */
  public function close() {

    if (!$this->_connection) {

      return false;
    }

    $result = ftp_close($this->_connection);

    // Drop the handle so a closed connection is never reused
    $this->_connection = null;

    return $result;
  }

  /**
   * Close the current connection after a failed connect() step
   *
   * @return bool Always false, so it can be returned directly by connect()
   */
  private function _fail() {

    ftp_close($this->_connection);

    $this->_connection = null;

    return false;
  }
}

20
library/mysql.php

@ -362,12 +362,30 @@ class MySQL { @@ -362,12 +362,30 @@ class MySQL {
/**
 * Register a snap for a host page in the `hostPageSnap` table
 *
 * Binds $hostPageId, $crc32data and $timeAdded to the three placeholders and
 * returns whatever lastInsertId() reports on the connection afterwards.
 *
 * NOTE(review): two prepare() openers are visible in this span
 * (INSERT IGNORE INTO / INSERT INTO); this looks like the removed and the added
 * diff line rendered together - confirm only one of them exists in the real file.
 */
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`,
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`,
`crc32data`,
`timeAdded`) VALUES (?, ?, ?)');
$query->execute([$hostPageId, $crc32data, $timeAdded]);
return $this->_db->lastInsertId();
}
/**
 * Set the `storageLocal` flag of a single `hostPageSnap` row
 *
 * @param int   $hostPageSnapId Identifier of the snap row to update
 * @param mixed $value          New flag value; booleans are stored as 1 / 0
 *
 * @return int Row count reported by the statement after execution
 */
public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {

  // Values passed to execute() are bound as strings (assuming $this->_db is PDO,
  // as the prepare/execute/rowCount usage suggests), so a boolean false would be
  // sent as '' instead of 0
  if (is_bool($value)) {

    $value = (int) $value;
  }

  $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

  $query->execute([$value, $hostPageSnapId]);

  return $query->rowCount();
}
/**
 * Set the `storageMega` flag of a single `hostPageSnap` row
 *
 * @param int   $hostPageSnapId Identifier of the snap row to update
 * @param mixed $value          New flag value; booleans are stored as 1 / 0
 *
 * @return int Row count reported by the statement after execution
 */
public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {

  // Values passed to execute() are bound as strings (assuming $this->_db is PDO,
  // as the prepare/execute/rowCount usage suggests), so a boolean false would be
  // sent as '' instead of 0
  if (is_bool($value)) {

    $value = (int) $value;
  }

  $query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

  $query->execute([$value, $hostPageSnapId]);

  return $query->rowCount();
}

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 159 KiB

After

Width:  |  Height:  |  Size: 161 KiB

0
storage/tmp/index.html

Loading…
Cancel
Save