implement unlimited snap storage mirrors, delete megaCMD integration

This commit is contained in:
ghost 2023-07-29 14:37:01 +03:00
parent 9b52e3b7f5
commit 712d67f6bf
11 changed files with 371 additions and 359 deletions

View File

@ -39,7 +39,7 @@ sphinxsearch
* The web root dir is `/public` * The web root dir is `/public`
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder * Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com), [MEGAcmd](https://mega.nz/cmd) (on remote snaps enabled) * Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples presented at `/config` folder * Configuration examples presented at `/config` folder
* Make sure `/storage/cache`, `/storage/tmp`, `/storage/snap` folders are writable * Make sure `/storage/cache`, `/storage/tmp`, `/storage/snap` folders are writable
* Set up the `/crontab` by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt) * Set up the `/crontab` by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
@ -155,10 +155,8 @@ GET m=SphinxQL
* [x] Flexible settings compatible with IPv4/IPv6 networks * [x] Flexible settings compatible with IPv4/IPv6 networks
* [x] Extended search syntax support * [x] Extended search syntax support
* [x] Compressed page history snaps with multi-provider storage sync * [x] Compressed page history snaps with multi-provider storage sync
+ [x] Local + [x] Local (unlimited locations)
+ [x] Remote + [x] Remote FTP (unlimited mirrors)
+ [x] MEGAcmd/FTP
+ [ ] Yggdrasil over NAT
+ [x] Privacy-oriented downloads counting, traffic controls + [x] Privacy-oriented downloads counting, traffic controls
##### UI ##### UI
@ -213,7 +211,7 @@ GET m=SphinxQL
* [x] Deprecated DB items auto deletion / host settings update * [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages + [x] Pages
+ [x] Snaps + [x] Snaps
+ [x] Snap downloads + [ ] Snap downloads
+ [ ] Missed snap file relations + [ ] Missed snap file relations
+ [x] Manifests + [x] Manifests
+ [x] Logs + [x] Logs
@ -232,7 +230,7 @@ GET m=SphinxQL
+ [x] generate + [x] generate
+ [x] truncate + [x] truncate
* [x] hostPageSnap * [x] hostPageSnap
+ [x] truncate + [ ] truncate
* [ ] hostPage * [ ] hostPage
+ [ ] add + [ ] add

View File

@ -176,56 +176,6 @@ switch ($argv[1]) {
} }
break; break;
case 'hostPageSnap':

  // The action argument is mandatory: report it and leave this case instead of
  // switching on an undefined $argv[2] (which raised a warning and printed a
  // second, misleading "undefined action" message)
  if (empty($argv[2])) {

    echo PHP_EOL . _('hostPageSnap method requires action argument') . PHP_EOL;

    break;
  }

  switch ($argv[2]) {

    // Remove every snap file from the storages it is flagged for, then drop
    // the related hostPageSnapDownload / hostPageSnap rows
    case 'truncate':

      foreach ($db->getHosts() as $host) {

        foreach ($db->getHostPages($host->hostId) as $hostPage) {

          // Snap files are sharded by the page id characters: 123 => 1/2/3/
          $snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');

          foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

            $snapFileName = $snapFilePath . $hostPageSnap->timeAdded . '.zip';

            if ($hostPageSnap->storageLocal) {

              $snapFileLocal = __DIR__ . '/../storage/snap/hp/' . $snapFileName;

              // The DB flag may outlive the file; skip missing files silently
              if (is_file($snapFileLocal)) {

                unlink($snapFileLocal);
              }
            }

            if ($hostPageSnap->storageMega) {

              $ftp = new Ftp();

              if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {

                $ftp->delete('hp/' . $snapFileName);

                // A connection is opened per snap, release it right away
                $ftp->close();
              }
            }

            $db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
            $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);

            // @TODO reset primary key indexes
          }
        }
      }

      echo _('hostPageSnap, hostPageSnapDownload tables successfully truncated') . PHP_EOL;

      // Terminate here so the default help message is not printed after success
      exit;

    default:

      echo PHP_EOL . _('undefined action argument') . PHP_EOL;
  }

break;
} }
// Default message // Default message
@ -242,7 +192,6 @@ echo _(' crawl - execute crawler step in the crontab
echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL; echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL;
echo _(' hostPage rank reindex - generate rank indexes in hostPage table') . PHP_EOL; echo _(' hostPage rank reindex - generate rank indexes in hostPage table') . PHP_EOL;
echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL; echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL;
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL; echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL . PHP_EOL;
echo _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables') . PHP_EOL . PHP_EOL;
echo _('get support: https://github.com/YGGverse/YGGo/issues') . PHP_EOL . PHP_EOL; echo _('get support: https://github.com/YGGverse/YGGo/issues') . PHP_EOL . PHP_EOL;

View File

@ -63,18 +63,6 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
*/ */
define('WEBSITE_IDENTICON_IMAGE_CACHE', true); define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
/*
* Total snap files size allowed to download in bytes in WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET period
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE', 10485760);
/*
* Time offset quota when WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE reached
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET', 60*60);
// Database // Database
define('DB_HOST', '127.0.0.1'); define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306); define('DB_PORT', 3306);
@ -90,19 +78,60 @@ define('SPHINX_PORT', 9306);
define('MEMCACHED_HOST', '127.0.0.1'); define('MEMCACHED_HOST', '127.0.0.1');
define('MEMCACHED_PORT', 11211); define('MEMCACHED_PORT', 11211);
// Third-party connections (optional) // Snaps
/* /*
* Mega.nz remote storage * Storage nodes configuration
* *
* FTP storage integration through MEGAcmd (https://mega.io/cmd) * Supports optional single 'localhost' and multiple 'FTP' servers
* *
* Connect mega-ftp instance on CRAWL_PAGE_MIME_SNAP_MEGA enabled * Comment specified node to disable specified connection
*
* Make empty array to disable snaps or set quota.mime = false or quota.size = 0 to disable specified instance
* *
*/ */
define('MEGA_FTP_HOST', '127.0.0.1'); define('SNAP_STORAGE', json_encode((object)
define('MEGA_FTP_PORT', 4990); [
define('MEGA_FTP_DIRECTORY', ''); 'localhost' => [
[
'directory' => __DIR__ . '/../storage/snap/hp/',
'quota' => [
'mime' => false,
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
]
]
],
// ...
]
],
'ftp' => [
[
'port' => 21,
'host' => '',
'username' => '',
'password' => '',
'directory' => '/snap',
'timeout' => 30,
'passive' => true,
'quota' => [
'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
]
]
],
],
// ...
]
]
));
// Proxy settings // Proxy settings
@ -217,28 +246,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
*/ */
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac'); define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
/*
* Snap pages locally match MIME types
*
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html');
/*
* Snap pages to mega.nz match MIME types
*
* comma separated | false to disable
*
* Requires connection:
*
* MEGA_FTP_HOST
* MEGA_FTP_PORT
* MEGA_FTP_DIRECTORY
*
*/
define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico');
/* /*
* Renew manifests index by timing offset provided * Renew manifests index by timing offset provided
* *
@ -289,9 +296,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* this option disabled requires huge disk storage, * this option disabled requires huge disk storage,
* it's experimental feature, oriented for index operations * it's experimental feature, oriented for index operations
* *
* see CRAWL_PAGE_MIME_SNAP_LOCAL
* to create compressed data snaps
*
*/ */
define('CRAWL_HOST_DEFAULT_META_ONLY', true); define('CRAWL_HOST_DEFAULT_META_ONLY', true);

View File

@ -93,23 +93,43 @@ try {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) { // Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip'); foreach ($storages as $storage) {
}
if ($hostPageSnap->storageMega) { // Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp(); switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { case 'localhost':
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
// Delete host page // Delete host page
@ -139,23 +159,43 @@ try {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) { // Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip'); foreach ($storages as $storage) {
}
if ($hostPageSnap->storageMega) { // Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp(); switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { case 'localhost':
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
// Delete host page // Delete host page
@ -225,39 +265,59 @@ try {
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Clean up banned pages extra data // Clean up banned pages extra data
foreach ($db->getHostPagesBanned() as $hostPageBanned) { foreach ($db->getHostPagesBanned() as $hostPage) {
// Delete host page descriptions // Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page DOMs // Delete host page DOMs
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPageBanned->hostPageId); $hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
// Delete host page refs data // Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps // Delete host page snaps
$snapFilePath = chunk_split($hostPageBanned->hostPageId, 1, '/'); $snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPageBanned->hostPageId) as $hostPageSnap) { foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) { // Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip'); foreach ($storages as $storage) {
}
if ($hostPageSnap->storageMega) { // Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp(); switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { case 'localhost':
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
} }
} }
@ -271,35 +331,8 @@ try {
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET); $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
// Delete failed snaps // Delete failed snap files
foreach ($db->getHosts() as $host) { // @TODO
foreach ($db->getHostPages($host->hostId) as $hostPage) {
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId, false, false, 'AND') as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
if ($hostPageSnap->storageMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
}
// Commit results // Commit results
$db->commit(); $db->commit();

View File

@ -665,50 +665,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
} }
// Begin snaps // Begin snaps
$snapLocal = false; if (SNAP_STORAGE) {
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapLocal = true;
break;
}
}
}
// Snap MEGA enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMega = true;
break;
}
}
}
// At least one snap storage match settings condition
if ($snapLocal || $snapMega) {
$crc32data = crc32($content); $crc32data = crc32($content);
// Create not duplicated data snaps only, even new time // Create not duplicated data snaps only, even newer by time added
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$snapTime = time(); $snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; $snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true); @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container // Create new ZIP container
$zip = new ZipArchive(); $zip = new ZipArchive();
@ -718,10 +686,10 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Insert compressed snap data into the tmp storage // Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) && if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL . sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done // Done
$zip->close(); $zip->close();
@ -730,48 +698,103 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (file_exists($snapTmp)) { if (file_exists($snapTmp)) {
// Register snap in DB // Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, filesize($snapTmp), $snapTime)) {
$hostPagesSnapAdded++; $hostPagesSnapAdded++;
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
@mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true);
if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
}
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
$ftp->close();
}
}
} }
} }
} }
} }
// Remove tmp
@unlink($snapTmp);
} }
// Copy files to each storage
//
// Only runs when this iteration really produced a tmp archive. For a duplicated
// snap, or when the ZIP / DB registration failed, $snapTmp is either unset or
// points to a file that does not exist, and $hostPageSnapId may be stale from a
// previous queue item - copying in that state would bind storages to a wrong snap.
if (isset($snapTmp) && file_exists($snapTmp)) {

  // Storages are registered against the snap id, so the snap must exist in DB
  if (!empty($hostPageSnapId)) {

    // MIME of the crawled document, normalized once for all storages
    $snapMime = Filter::mime($contentType);

    $snapStorageIndex = 0;

    foreach (json_decode(SNAP_STORAGE) as $name => $storages) {

      foreach ($storages as $storage) {

        // The index grows across all storage types and is part of the storage id
        $snapStorageIndex++;

        // Generate storage id
        $crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));

        // Unknown storage types are ignored
        if (!in_array($name, ['localhost', 'ftp'], true)) continue;

        // Validate size quota
        if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue;

        // Validate mime (false disables snaps for this storage)
        if (!$storage->quota->mime) continue;

        $snapMimeValid = false;

        foreach ((array) explode(',', $storage->quota->mime) as $mime) {

          if (false !== stripos($snapMime, Filter::mime($mime))) {

            $snapMimeValid = true;
            break;
          }
        }

        if (!$snapMimeValid) continue;

        // Copy tmp snap file to the permanent storage
        switch ($name) {

          case 'localhost':

            @mkdir($storage->directory . $snapPath, 0755, true);

            if (copy($snapTmp, $storage->directory . $snapPath . $snapTime . '.zip')) {

              // Register storage name
              $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
            }

          break;

          case 'ftp':

            $ftp = new Ftp();

            if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {

              $ftp->mkdir('hp/' . $snapPath, true);

              if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {

                // Register storage name
                $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
              }

              $ftp->close();
            }

          break;
        }
      }
    }
  }

  // Delete tmp snap
  unlink($snapTmp);
}
} }
// Skip page links following with meta robots:nofollow attribute // Skip page links following with meta robots:nofollow attribute

Binary file not shown.

View File

@ -15,18 +15,14 @@ class Ftp {
mixed $login = null, mixed $login = null,
mixed $password = null, mixed $password = null,
string $directory = '/', string $directory = '/',
int $timeout = 90) { int $timeout = 90,
bool $passive = false) {
if (!$this->_connection = ftp_connect($host, $port, $timeout)) { if (!$this->_connection = ftp_connect($host, $port, $timeout)) {
return false; return false;
} }
if (!ftp_pasv($this->_connection, $this->_passive)) {
return false;
}
if (!empty($login) && !empty($password)) { if (!empty($login) && !empty($password)) {
if (!ftp_login($this->_connection, $login, $password)) { if (!ftp_login($this->_connection, $login, $password)) {
@ -35,6 +31,11 @@ class Ftp {
} }
} }
if ($passive && !ftp_pasv($this->_connection, $this->_passive)) {
return false;
}
return ftp_chdir($this->_connection, $directory); return ftp_chdir($this->_connection, $directory);
} }

View File

@ -462,35 +462,18 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) { public function addHostPageSnap(int $hostPageId, string $crc32data, int $size, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`,
`crc32data`, `crc32data`,
`timeAdded`) VALUES (?, ?, ?)'); `size`,
`timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$hostPageId, $crc32data, $timeAdded]); $query->execute([$hostPageId, $crc32data, $size, $timeAdded]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
/**
 * Set the `storageLocal` column of a single `hostPageSnap` row
 *
 * @param int   $hostPageSnapId row to update
 * @param mixed $value          new `storageLocal` value
 * @return int number of affected rows
 */
public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {

  $statement = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

  // Placeholder order: SET value first, then the row id
  $bindings = [$value, $hostPageSnapId];

  $statement->execute($bindings);

  return $statement->rowCount();
}
/**
 * Set the `storageMega` column of a single `hostPageSnap` row
 *
 * @param int   $hostPageSnapId row to update
 * @param mixed $value          new `storageMega` value
 * @return int number of affected rows
 */
public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {

  $statement = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1');

  // Placeholder order: SET value first, then the row id
  $bindings = [$value, $hostPageSnapId];

  $statement->execute($bindings);

  return $statement->rowCount();
}
public function deleteHostPageSnap(int $hostPageSnapId) { public function deleteHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1'); $query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
@ -500,24 +483,37 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getTotalHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true) { public function getTotalHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? OR `storageMega` = ?)'); $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?');
$query->execute([$hostPageId, $storageLocal, $storageMega]); $query->execute([$hostPageId]);
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true, string $condition = 'OR') { public function getHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? ' . ($condition == 'OR' ? 'OR' : 'AND') . ' `storageMega` = ?) ORDER BY `timeAdded` DESC'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
$query->execute([$hostPageId, $storageLocal, $storageMega]); $query->execute([$hostPageId]);
return $query->fetchAll(); return $query->fetchAll();
} }
/**
 * Sum `hostPageSnap`.`size` over the rows joined to `hostPageSnapStorage`
 * with the given storage id
 *
 * NOTE(review): the first argument is named $hostPageId, but the query binds it
 * to `hostPageSnap`.`hostPageSnapId`, so the sum is limited to one snap.
 * If the intent is a per-storage quota, the snap filter probably has to be
 * dropped - confirm against the callers before changing the query.
 *
 * @param int $hostPageId value matched against `hostPageSnap`.`hostPageSnapId`
 * @param int $crc32name  storage id matched against `hostPageSnapStorage`.`crc32name`
 * @return int total size; 0 when no rows match
 */
public function getTotalHostPageSnapSizeByStorage(int $hostPageId, int $crc32name) {

  $query = $this->_db->prepare('SELECT SUM(`hostPageSnap`.`size`) AS `total` FROM `hostPageSnap`
                                JOIN `hostPageSnapStorage` ON (`hostPageSnapStorage`.`hostPageSnapId` = `hostPageSnap`.`hostPageSnapId`)
                                WHERE `hostPageSnap`.`hostPageSnapId` = ?
                                AND `hostPageSnapStorage`.`crc32name` = ?');

  $query->execute([$hostPageId, $crc32name]);

  // SUM() is NULL for an empty result set; always hand an integer to the caller
  return (int) $query->fetch()->total;
}
public function getHostPageSnap(int $hostPageSnapId) { public function getHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
@ -536,44 +532,62 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function addHostPageSnapDownload(int $hostPageSnapId, string $crc32ip, int $timeAdded) { public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapId`, $query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`,
`crc32ip`, `crc32ip`,
`timeAdded`) VALUES (?, ?, ?)'); `timeAdded`) VALUES (?, ?, ?)');
$query->execute([$hostPageSnapId, $crc32ip, $timeAdded]); $query->execute([$hostPageSnapStorageId, $crc32ip, $timeAdded]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
public function updateHostPageSnapDownload(int $hostPageSnapDownloadId, string $storage, int $size, mixed $httpCode = NULL) { public function addHostPageSnapStorage(int $hostPageSnapId, int $crc32name, int $timeAdded) {
$query = $this->_db->prepare('UPDATE `hostPageSnapDownload` SET `storage` = ?, `size` = ?, `httpCode` = ? WHERE `hostPageSnapDownloadId` = ? LIMIT 1'); $query = $this->_db->prepare('INSERT INTO `hostPageSnapStorage` (`hostPageSnapId`,
`crc32name`,
`timeAdded`) VALUES (?, ?, ?)');
$query->execute([$storage, $size, $httpCode, $hostPageSnapDownloadId]); $query->execute([$hostPageSnapId, $crc32name, $timeAdded]);
return $query->rowCount(); return $this->_db->lastInsertId();
} }
public function deleteHostPageSnapDownloads(int $hostPageSnapId) { public function getHostPageSnapStorageByCRC32Name(int $hostPageSnapId, int $crc32name) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapId` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ? AND `crc32name` = ?');
$query->execute([$hostPageSnapId, $crc32name]);
return $query->fetch();
}
/**
 * Fetch every `hostPageSnapStorage` row that belongs to the given snap
 *
 * @param int $hostPageSnapId value matched against `hostPageSnapStorage`.`hostPageSnapId`
 * @return mixed result of fetchAll() on the executed statement
 */
public function getHostPageSnapStorages(int $hostPageSnapId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ?');

  $bindings = [$hostPageSnapId];

  $statement->execute($bindings);

  return $statement->fetchAll();
}
public function deleteHostPageSnapStorages(int $hostPageSnapId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ?');
$query->execute([$hostPageSnapId]); $query->execute([$hostPageSnapId]);
return $query->rowCount(); return $query->rowCount();
} }
public function findHostPageSnapDownloadsTotalSize(int $crc32ip, int $timeOffset) { public function deleteHostPageSnapDownloads(int $hostPageSnapStorageId) {
$query = $this->_db->prepare('SELECT SUM(`size`) AS `size` FROM `hostPageSnapDownload` $query = $this->_db->prepare('DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapStorageId` = ?');
WHERE `crc32ip` = ? AND `timeAdded` < ?'); $query->execute([$hostPageSnapStorageId]);
$query->execute([$crc32ip, $timeOffset]); return $query->rowCount();
return $query->fetch()->size;
} }
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) { public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {

Binary file not shown.

Before

Width:  |  Height:  |  Size: 211 KiB

After

Width:  |  Height:  |  Size: 159 KiB

View File

@ -111,7 +111,6 @@ if (API_ENABLED) {
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET, 'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX, 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

View File

@ -46,102 +46,93 @@ switch ($type) {
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Init request
$crc32ip = crc32(!empty($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '');
// Get snap details from DB // Get snap details from DB
if ($hostPageSnap = $db->getHostPageSnap(!empty($_GET['hps']) ? (int) $_GET['hps'] : 0)) { if ($hostPageSnap = $db->getHostPageSnap(!empty($_GET['hps']) ? (int) $_GET['hps'] : 0)) {
// Init variables // Get snap file
$crc32ip = crc32(!empty($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '');
$time = time();
$hostPageDownloadsTotalSize = $db->findHostPageSnapDownloadsTotalSize($crc32ip, $time - WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET); $snapStorageIndex = 0;
// Check for downloading quotas foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
if ($hostPageDownloadsTotalSize >= WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE) {
header('HTTP/1.0 403 Forbidden'); foreach ($storages as $storage) {
echo _('403 Access forbidden by requests quota'); $snapStorageIndex++;
exit; // Generate storage id
} $crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
// Register snap download switch ($name) {
$hostPageSnapDownloadId = $db->addHostPageSnapDownload($hostPageSnap->hostPageSnapId, $crc32ip, $time);
// Init variables case 'localhost':
$snapSize = 0;
$snapFile = 'hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';
// Download local snap in higher priority if possible if ($hostPageSnapStorage = $db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
if ($hostPageSnap->storageLocal && file_exists(__DIR__ . '/../storage/snap/' . $snapFile) &&
is_readable(__DIR__ . '/../storage/snap/' . $snapFile)) {
$snapSize = (int) @filesize(__DIR__ . '/../storage/snap/' . $snapFile); // Check request quota
//if ()
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'local', $snapSize, 200); // Get file
$snapFile = 'hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';
header('Content-Type: application/zip'); // Download local snap in higher priority if possible
header(sprintf('Content-Length: %s', $snapSize)); if (file_exists($storage->directory . $snapFile) &&
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId, is_readable($storage->directory . $snapFile)) {
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
readfile(__DIR__ . '/../storage/snap/' . $snapFile);
// Then try to download from MEGA storage if exists // Register snap download
} else if ($hostPageSnap->storageMega) { $db->addHostPageSnapDownload($hostPageSnapStorage->hostPageSnapStorageId, $crc32ip, time());
$ftp = new Ftp(); // Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
readfile($storage->directory . $snapFile);
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { exit;
}
}
if ($snapSize = $ftp->size($snapFile)) { break;
case 'ftp':
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 200); if ($hostPageSnapStorage = $db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
header('Content-Type: application/zip'); $ftp = new Ftp();
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
$ftp->get($snapFile, 'php://output'); if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
} else { // Register snap download
$db->addHostPageSnapDownload($hostPageSnapStorage->hostPageSnapStorageId, $crc32ip, time());
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 404); // Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
header('HTTP/1.0 404 Not Found'); $ftp->get($snapFile, 'php://output');
echo _('404 File not found'); exit;
}
}
break;
} }
} else {
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 404);
header('HTTP/1.0 404 Not Found');
echo _('404 File not found');
} }
// Return 404 when file not found
} else {
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'other', $snapSize, 404);
header('HTTP/1.0 404 Not Found');
echo _('404 File not found');
} }
} else {
header('HTTP/1.0 404 Not Found');
echo _('404 Snap not found');
} }
header('HTTP/1.0 404 Not Found');
echo _('404 Snap not found');
break; break;
default: default:
header('HTTP/1.0 404 Not Found'); header('HTTP/1.0 404 Not Found');