Browse Source

implement unlimited snap storage mirrors, delete megaCMD integration

main
ghost 1 year ago
parent
commit
712d67f6bf
  1. 12
      README.md
  2. 53
      cli/yggo.php
  3. 104
      config/app.php.txt
  4. 169
      crontab/cleaner.php
  5. 147
      crontab/crawler.php
  6. BIN
      database/yggo.mwb
  7. 13
      library/ftp.php
  8. 96
      library/mysql.php
  9. BIN
      media/db-prototype.png
  10. 1
      public/api.php
  11. 119
      public/file.php

12
README.md

@ -39,7 +39,7 @@ sphinxsearch @@ -39,7 +39,7 @@ sphinxsearch
* The web root dir is `/public`
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com), [MEGAcmd](https://mega.nz/cmd) (on remote snaps enabled)
* Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples presented at `/config` folder
* Make sure `/storage/cache`, `/storage/tmp`, `/storage/snap` folders are writable
* Set up the `/crontab` by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
@ -155,10 +155,8 @@ GET m=SphinxQL @@ -155,10 +155,8 @@ GET m=SphinxQL
* [x] Flexible settings compatible with IPv4/IPv6 networks
* [x] Extended search syntax support
* [x] Compressed page history snaps with multi-provider storage sync
+ [x] Local
+ [x] Remote
+ [x] MEGAcmd/FTP
+ [ ] Yggdrasil over NAT
+ [x] Local (unlimited locations)
+ [x] Remote FTP (unlimited mirrors)
+ [x] Privacy-oriented downloads counting, traffic controls
##### UI
@ -213,7 +211,7 @@ GET m=SphinxQL @@ -213,7 +211,7 @@ GET m=SphinxQL
* [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages
+ [x] Snaps
+ [x] Snap downloads
+ [ ] Snap downloads
+ [ ] Missed snap file relations
+ [x] Manifests
+ [x] Logs
@ -232,7 +230,7 @@ GET m=SphinxQL @@ -232,7 +230,7 @@ GET m=SphinxQL
+ [x] generate
+ [x] truncate
* [x] hostPageSnap
+ [x] truncate
+ [ ] truncate
* [ ] hostPage
+ [ ] add

53
cli/yggo.php

@ -176,56 +176,6 @@ switch ($argv[1]) { @@ -176,56 +176,6 @@ switch ($argv[1]) {
}
break;
case 'hostPageSnap':
if (empty($argv[2])) {
echo PHP_EOL . _('hostPageSnap method requires action argument') . PHP_EOL;
}
switch ($argv[2]) {
case 'truncate':
foreach ($db->getHosts() as $host) {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
if ($hostPageSnap->storageMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
// @TODO reset primary key indexes
}
}
}
echo _('hostPageSnap, hostPageSnapDownload tables successfully truncated') . PHP_EOL;
exit;
break;
default:
echo PHP_EOL . _('undefined action argument') . PHP_EOL;
}
break;
}
// Default message
@ -242,7 +192,6 @@ echo _(' crawl - execute crawler step in the crontab @@ -242,7 +192,6 @@ echo _(' crawl - execute crawler step in the crontab
echo _(' clean - execute cleaner step in the crontab queue') . PHP_EOL;
echo _(' hostPage rank reindex - generate rank indexes in hostPage table') . PHP_EOL;
echo _(' hostPageDom generate [selectors] - make hostPageDom index based on related hostPage.data field') . PHP_EOL;
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL;
echo _(' hostPageSnap truncate - flush hostPageSnap, hostPageSnapDownload tables') . PHP_EOL . PHP_EOL;
echo _(' hostPageDom truncate - flush hostPageDom table') . PHP_EOL . PHP_EOL;
echo _('get support: https://github.com/YGGverse/YGGo/issues') . PHP_EOL . PHP_EOL;

104
config/app.php.txt

@ -63,18 +63,6 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100); @@ -63,18 +63,6 @@ define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
*/
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
/*
* Total snap files size allowed to download in bytes in WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET period
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE', 10485760);
/*
* Time offset quota when WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE reached
*
*/
define('WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET', 60*60);
// Database
define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306);
@ -90,19 +78,60 @@ define('SPHINX_PORT', 9306); @@ -90,19 +78,60 @@ define('SPHINX_PORT', 9306);
define('MEMCACHED_HOST', '127.0.0.1');
define('MEMCACHED_PORT', 11211);
// Third-party connections (optional)
/*
* Mega.nz remote storage
*
* FTP storage integration through MEGAcmd (https://mega.io/cmd)
*
* Connect mega-ftp instance on CRAWL_PAGE_MIME_SNAP_MEGA enabled
*
*/
define('MEGA_FTP_HOST', '127.0.0.1');
define('MEGA_FTP_PORT', 4990);
define('MEGA_FTP_DIRECTORY', '');
// Snaps
/*
* Storage nodes configuration
*
* Supports optional single 'localhost' and multiple 'FTP' servers
*
* Comment out a node to disable that connection
*
* Use an empty array to disable snaps, or set quota.mime = false or quota.size = 0 to disable a specific instance
*
*/
define('SNAP_STORAGE', json_encode((object)
[
'localhost' => [
[
'directory' => __DIR__ . '/../storage/snap/hp/',
'quota' => [
'mime' => false,
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
]
]
],
// ...
]
],
'ftp' => [
[
'port' => 21,
'host' => '',
'username' => '',
'password' => '',
'directory' => '/snap',
'timeout' => 30,
'passive' => true,
'quota' => [
'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
'size' => 10000000024,
'request' => [
'download' => [
'size' => 10000024,
'seconds' => 60*60
]
]
],
],
// ...
]
]
));
// Proxy settings
@ -217,28 +246,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7); @@ -217,28 +246,6 @@ define('CRAWL_PAGE_HOME_SECONDS_OFFSET', 60*60*24*7);
*/
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/x-mpegurl,audio/flac');
/*
* Snap pages locally match MIME types
*
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html');
/*
* Snap pages to mega.nz match MIME types
*
* comma separated | false to disable
*
* Requires connection:
*
* MEGA_FTP_HOST
* MEGA_FTP_PORT
* MEGA_FTP_DIRECTORY
*
*/
define('CRAWL_PAGE_MIME_SNAP_MEGA', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico');
/*
* Renew manifests index by timing offset provided
*
@ -289,9 +296,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true); @@ -289,9 +296,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* this option disabled requires huge disk storage,
* it's experimental feature, oriented for index operations
*
* see CRAWL_PAGE_MIME_SNAP_LOCAL
* to create compressed data snaps
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', true);

169
crontab/cleaner.php

@ -93,23 +93,43 @@ try { @@ -93,23 +93,43 @@ try {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
foreach ($storages as $storage) {
if ($hostPageSnap->storageMega) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp();
switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
case 'localhost':
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
break;
case 'ftp':
$ftp = new Ftp();
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
}
// Delete host page
@ -139,23 +159,43 @@ try { @@ -139,23 +159,43 @@ try {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
foreach ($storages as $storage) {
if ($hostPageSnap->storageMega) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp();
switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
case 'localhost':
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
}
// Delete host page
@ -225,39 +265,59 @@ try { @@ -225,39 +265,59 @@ try {
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Clean up banned pages extra data
foreach ($db->getHostPagesBanned() as $hostPageBanned) {
foreach ($db->getHostPagesBanned() as $hostPage) {
// Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId);
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page DOMs
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPageBanned->hostPageId);
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
// Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId);
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPageBanned->hostPageId, 1, '/');
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPageBanned->hostPageId) as $hostPageSnap) {
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
foreach ($storages as $storage) {
if ($hostPageSnap->storageMega) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
$ftp = new Ftp();
switch ($name) {
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
case 'localhost':
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
break;
case 'ftp':
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
break;
}
// Clean up DB registry
foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
$db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
}
$db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
}
}
@ -271,35 +331,8 @@ try { @@ -271,35 +331,8 @@ try {
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
// Delete failed snaps
foreach ($db->getHosts() as $host) {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId, false, false, 'AND') as $hostPageSnap) {
if ($hostPageSnap->storageLocal) {
unlink(__DIR__ . '/../storage/snap/hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
if ($hostPageSnap->storageMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
}
}
$db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
}
// Delete failed snap files
// @TODO
// Commit results
$db->commit();

147
crontab/crawler.php

@ -665,50 +665,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -665,50 +665,18 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
}
// Begin snaps
$snapLocal = false;
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapLocal = true;
break;
}
}
}
// Snap MEGA enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMega = true;
break;
}
}
}
// At least one snap storage match settings condition
if ($snapLocal || $snapMega) {
if (SNAP_STORAGE) {
$crc32data = crc32($content);
// Create not duplicated data snaps only, even new time
// Create not duplicated data snaps only, even newer by time added
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
@ -718,10 +686,10 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -718,10 +686,10 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done
$zip->close();
@ -730,48 +698,103 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND @@ -730,48 +698,103 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (file_exists($snapTmp)) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, filesize($snapTmp), $snapTime)) {
$hostPagesSnapAdded++;
}
}
}
}
}
// Copy files to each storage
$snapStorageIndex = 0;
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach ($storages as $storage) {
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
$snapStorageIndex++;
@mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true);
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
switch ($name) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
case 'localhost':
// Validate size quota
if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2;
// Validate mime
if (!$storage->quota->mime) continue 2;
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMimeValid = true;
break;
}
}
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
if (!$snapMimeValid) continue 2;
$ftp = new Ftp();
// Copy tmp snap file to the permanent storage
@mkdir($storage->directory . $snapPath, 0755, true);
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
if (copy($snapTmp, $storage->directory . $snapPath . $snapTime . '.zip')) {
$ftp->mkdir('hp/' . $snapPath, true);
// Register storage name
$db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
}
break;
case 'ftp':
// Validate size quota
if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2;
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Validate mime
if (!$storage->quota->mime) continue 2;
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
$ftp->close();
}
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMimeValid = true;
break;
}
}
}
if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Register storage name
$db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
}
$ftp->close();
}
break;
}
}
// Remove tmp
@unlink($snapTmp);
}
// Delete tmp snap
unlink($snapTmp);
}
// Skip page links following with meta robots:nofollow attribute

BIN
database/yggo.mwb

Binary file not shown.

13
library/ftp.php

@ -15,18 +15,14 @@ class Ftp { @@ -15,18 +15,14 @@ class Ftp {
mixed $login = null,
mixed $password = null,
string $directory = '/',
int $timeout = 90) {
int $timeout = 90,
bool $passive = false) {
if (!$this->_connection = ftp_connect($host, $port, $timeout)) {
return false;
}
if (!ftp_pasv($this->_connection, $this->_passive)) {
return false;
}
if (!empty($login) && !empty($password)) {
if (!ftp_login($this->_connection, $login, $password)) {
@ -35,6 +31,11 @@ class Ftp { @@ -35,6 +31,11 @@ class Ftp {
}
}
if ($passive && !ftp_pasv($this->_connection, $this->_passive)) {
return false;
}
return ftp_chdir($this->_connection, $directory);
}

96
library/mysql.php

@ -462,60 +462,56 @@ class MySQL { @@ -462,60 +462,56 @@ class MySQL {
return $query->fetchAll();
}
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
public function addHostPageSnap(int $hostPageId, string $crc32data, int $size, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`,
`crc32data`,
`timeAdded`) VALUES (?, ?, ?)');
`size`,
`timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$hostPageId, $crc32data, $timeAdded]);
$query->execute([$hostPageId, $crc32data, $size, $timeAdded]);
return $this->_db->lastInsertId();
}
public function updateHostPageSnapStorageLocal(int $hostPageSnapId, mixed $value) {
public function deleteHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageLocal` = ? WHERE `hostPageSnapId` = ? LIMIT 1');
$query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
$query->execute([$value, $hostPageSnapId]);
$query->execute([$hostPageSnapId]);
return $query->rowCount();
}
public function updateHostPageSnapStorageMega(int $hostPageSnapId, mixed $value) {
public function getTotalHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('UPDATE `hostPageSnap` SET `storageMega` = ? WHERE `hostPageSnapId` = ? LIMIT 1');
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?');
$query->execute([$value, $hostPageSnapId]);
$query->execute([$hostPageId]);
return $query->rowCount();
return $query->fetch()->total;
}
public function deleteHostPageSnap(int $hostPageSnapId) {
public function getHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
$query->execute([$hostPageSnapId]);
$query->execute([$hostPageId]);
return $query->rowCount();
return $query->fetchAll();
}
public function getTotalHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true) {
public function getTotalHostPageSnapSizeByStorage(int $hostPageId, int $crc32name) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? OR `storageMega` = ?)');
$query = $this->_db->prepare('SELECT SUM(`hostPageSnap`.`size`) AS `total` FROM `hostPageSnap`
JOIN `hostPageSnapStorage` ON (`hostPageSnapStorage`.`hostPageSnapId` = `hostPageSnap`.`hostPageSnapId`)
$query->execute([$hostPageId, $storageLocal, $storageMega]);
WHERE `hostPageSnap`.`hostPageSnapId` = ?
AND `hostPageSnapStorage`.`crc32name` = ?');
return $query->fetch()->total;
}
$query->execute([$hostPageId, $crc32name]);
public function getHostPageSnaps(int $hostPageId, bool $storageLocal = true, bool $storageMega = true, string $condition = 'OR') {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND (`storageLocal` = ? ' . ($condition == 'OR' ? 'OR' : 'AND') . ' `storageMega` = ?) ORDER BY `timeAdded` DESC');
$query->execute([$hostPageId, $storageLocal, $storageMega]);
return $query->fetchAll();
return $query->fetch()->total;
}
public function getHostPageSnap(int $hostPageSnapId) {
@ -536,44 +532,62 @@ class MySQL { @@ -536,44 +532,62 @@ class MySQL {
return $query->fetch();
}
public function addHostPageSnapDownload(int $hostPageSnapId, string $crc32ip, int $timeAdded) {
public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapId`,
$query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`,
`crc32ip`,
`timeAdded`) VALUES (?, ?, ?)');
$query->execute([$hostPageSnapId, $crc32ip, $timeAdded]);
$query->execute([$hostPageSnapStorageId, $crc32ip, $timeAdded]);
return $this->_db->lastInsertId();
}
public function updateHostPageSnapDownload(int $hostPageSnapDownloadId, string $storage, int $size, mixed $httpCode = NULL) {
public function addHostPageSnapStorage(int $hostPageSnapId, int $crc32name, int $timeAdded) {
$query = $this->_db->prepare('UPDATE `hostPageSnapDownload` SET `storage` = ?, `size` = ?, `httpCode` = ? WHERE `hostPageSnapDownloadId` = ? LIMIT 1');
$query = $this->_db->prepare('INSERT INTO `hostPageSnapStorage` (`hostPageSnapId`,
`crc32name`,
`timeAdded`) VALUES (?, ?, ?)');
$query->execute([$storage, $size, $httpCode, $hostPageSnapDownloadId]);
$query->execute([$hostPageSnapId, $crc32name, $timeAdded]);
return $query->rowCount();
return $this->_db->lastInsertId();
}
/**
 * Look up the `hostPageSnapStorage` registry row that links a snap to one storage node.
 *
 * @param int $hostPageSnapId Snap identifier, matched against `hostPageSnapStorage`.`hostPageSnapId`
 * @param int $crc32name      Storage identifier, matched against `hostPageSnapStorage`.`crc32name`
 *
 * @return mixed Whatever fetch() yields for the executed statement: the first matching row, if any
 *               (row format depends on the connection's fetch mode, which is configured outside this method)
 */
public function getHostPageSnapStorageByCRC32Name(int $hostPageSnapId, int $crc32name) {

    $sql = 'SELECT * FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ? AND `crc32name` = ?';

    // Bind both lookup keys positionally, in the same order as the placeholders above
    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageSnapId, $crc32name]);

    return $statement->fetch();
}
public function deleteHostPageSnapDownloads(int $hostPageSnapId) {
public function getHostPageSnapStorages(int $hostPageSnapId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapId` = ? LIMIT 1');
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ?');
$query->execute([$hostPageSnapId]);
return $query->rowCount();
return $query->fetchAll();
}
public function findHostPageSnapDownloadsTotalSize(int $crc32ip, int $timeOffset) {
public function deleteHostPageSnapStorages(int $hostPageSnapId) {
$query = $this->_db->prepare('SELECT SUM(`size`) AS `size` FROM `hostPageSnapDownload`
$query = $this->_db->prepare('DELETE FROM `hostPageSnapStorage` WHERE `hostPageSnapId` = ?');
WHERE `crc32ip` = ? AND `timeAdded` < ?');
$query->execute([$hostPageSnapId]);
$query->execute([$crc32ip, $timeOffset]);
return $query->rowCount();
}
return $query->fetch()->size;
/**
 * Remove all `hostPageSnapDownload` rows registered for the given snap storage entry.
 *
 * @param int $hostPageSnapStorageId Storage registry identifier, matched against
 *                                   `hostPageSnapDownload`.`hostPageSnapStorageId`
 *
 * @return int Row count reported by the executed DELETE statement
 */
public function deleteHostPageSnapDownloads(int $hostPageSnapStorageId) {

    $sql = 'DELETE FROM `hostPageSnapDownload` WHERE `hostPageSnapStorageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostPageSnapStorageId]);

    // No LIMIT clause: every download record of this storage entry is removed
    return $statement->rowCount();
}
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 211 KiB

After

Width:  |  Height:  |  Size: 159 KiB

1
public/api.php

@ -111,7 +111,6 @@ if (API_ENABLED) { @@ -111,7 +111,6 @@ if (API_ENABLED) {
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageHomeSecondsOffset' => CRAWL_PAGE_HOME_SECONDS_OFFSET,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

119
public/file.php

@ -46,102 +46,93 @@ switch ($type) { @@ -46,102 +46,93 @@ switch ($type) {
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Init request
$crc32ip = crc32(!empty($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '');
// Get snap details from DB
if ($hostPageSnap = $db->getHostPageSnap(!empty($_GET['hps']) ? (int) $_GET['hps'] : 0)) {
// Init variables
$crc32ip = crc32(!empty($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '');
$time = time();
$hostPageDownloadsTotalSize = $db->findHostPageSnapDownloadsTotalSize($crc32ip, $time - WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE_TIME_OFFSET);
// Get snap file
// Check for downloading quotas
if ($hostPageDownloadsTotalSize >= WEBSITE_QUOTA_IP_SNAP_DOWNLOAD_TOTAL_SIZE) {
$snapStorageIndex = 0;
header('HTTP/1.0 403 Forbidden');
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
echo _('403 Access forbidden by requests quota');
foreach ($storages as $storage) {
exit;
}
$snapStorageIndex++;
// Register snap download
$hostPageSnapDownloadId = $db->addHostPageSnapDownload($hostPageSnap->hostPageSnapId, $crc32ip, $time);
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $snapStorageIndex));
// Init variables
$snapSize = 0;
$snapFile = 'hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';
switch ($name) {
// Download local snap in higher priority if possible
if ($hostPageSnap->storageLocal && file_exists(__DIR__ . '/../storage/snap/' . $snapFile) &&
is_readable(__DIR__ . '/../storage/snap/' . $snapFile)) {
case 'localhost':
$snapSize = (int) @filesize(__DIR__ . '/../storage/snap/' . $snapFile);
if ($hostPageSnapStorage = $db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'local', $snapSize, 200);
// Check request quota
//if ()
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
readfile(__DIR__ . '/../storage/snap/' . $snapFile);
// Get file
$snapFile = 'hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';
// Then try to download from MEGA storage if exists
} else if ($hostPageSnap->storageMega) {
// Download local snap in higher priority if possible
if (file_exists($storage->directory . $snapFile) &&
is_readable($storage->directory . $snapFile)) {
$ftp = new Ftp();
// Register snap download
$db->addHostPageSnapDownload($hostPageSnapStorage->hostPageSnapStorageId, $crc32ip, time());
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
// Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
readfile($storage->directory . $snapFile);
if ($snapSize = $ftp->size($snapFile)) {
exit;
}
}
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 200);
break;
case 'ftp':
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
if ($hostPageSnapStorage = $db->getHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
$ftp->get($snapFile, 'php://output');
$ftp = new Ftp();
} else {
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 404);
// Register snap download
$db->addHostPageSnapDownload($hostPageSnapStorage->hostPageSnapStorageId, $crc32ip, time());
header('HTTP/1.0 404 Not Found');
echo _('404 File not found');
}
// Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
$hostPageSnap->hostPageId,
$hostPageSnap->timeAdded));
} else {
$ftp->get($snapFile, 'php://output');
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'mega', $snapSize, 404);
exit;
}
}
header('HTTP/1.0 404 Not Found');
echo _('404 File not found');
break;
}
}
// Return 404 when file not found
} else {
$db->updateHostPageSnapDownload($hostPageSnapDownloadId, 'other', $snapSize, 404);
header('HTTP/1.0 404 Not Found');
echo _('404 File not found');
}
}
} else {
header('HTTP/1.0 404 Not Found');
header('HTTP/1.0 404 Not Found');
echo _('404 Snap not found');
}
echo _('404 Snap not found');
break;
default:
header('HTTP/1.0 404 Not Found');

Loading…
Cancel
Save