mirror of
synced 2025-02-03 10:25:52 +00:00
refactor snap storage
This commit is contained in:
@ -227,10 +227,10 @@ GET m=SphinxQL
* [x] crawl
* [x] clean
* [x] hostPageSnap
+ [x] repair
+ [x] repair (not tested)
+ [x] _sync DB-FS relations_
+ [x] _FTP_
+ [x] _localhost (not tested)_
+ [x] _localhost_
+ [x] _delete FS missed in the DB_
+ [x] _FTP_
+ [ ] _localhost_
@ -97,6 +97,12 @@ switch ($argv[1]) {
case 'repair':
// @TODO
CLI::danger(_('this function upgraded but not tested after snaps refactor.'));
CLI::danger(_('make sure you have backups then remove this alert.'));
// Normalize & cleanup DB
CLI::notice(_('scan database registry for missed snap files...'));
@ -104,29 +110,31 @@ switch ($argv[1]) {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
$snapPath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Define variables
$hostPageSnapStorageFilesExists = false;
// Check file exists
foreach (json_decode(SNAP_STORAGE) as $hostPageSnapStorageName => $storages) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $hostPageSnapStorageName, $i));
$crc32name = crc32(sprintf('%s.%s', $node, $location));
switch ($hostPageSnapStorageName) {
switch ($node) {
case 'localhost':
// @TODO implemented, not tested
$hostPageSnapFilename = $storage->directory . $snapPath . $hostPageSnap->timeAdded . '.zip';
$hostPageSnapFile = $storage->directory . $hostPageSnapFile;
if (file_exists($hostPageSnapFilename)) {
if (file_exists($hostPageSnapFile)) {
$hostPageSnapStorageFilesExists = true;
@ -134,12 +142,12 @@ switch ($argv[1]) {
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s storage: %s index: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
} else {
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s storage: %s index: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
@ -151,9 +159,7 @@ switch ($argv[1]) {
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$hostPageSnapFilename = 'hp/' . $snapPath . $hostPageSnap->timeAdded . '.zip';
if ($ftp->size($hostPageSnapFilename)) {
if ($ftp->size($hostPageSnapFile)) {
$hostPageSnapStorageFilesExists = true;
@ -161,18 +167,18 @@ switch ($argv[1]) {
if ($db->addHostPageSnapStorage($hostPageSnap->hostPageSnapId, $crc32name, $hostPageSnap->timeAdded)) {
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s storage: %s index: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::warning(sprintf(_('add index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
} else {
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s storage: %s index: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::success(sprintf(_('skip related index hostPageSnapId #%s file: %s node: %s location: %s;'), $hostPageSnap->hostPageSnapId, $hostPageSnapFile, $node, $location));
// Prevent snap deletion from the registry when the FTP connection is lost
} else {
CLI::danger(sprintf(_('could not connect to storage %s index %s. operation stopped to prevent the data lose.'), $hostPageSnapStorageName, $i));
// Name the node and location that could not be reached; the loop key is $node
CLI::danger(sprintf(_('could not connect to storage %s location %s. operation stopped to prevent the data lose.'), $node, $location));
@ -218,11 +224,15 @@ switch ($argv[1]) {
// Cleanup FS
CLI::notice(_('scan storage for snap files missed in the DB...'));
foreach (json_decode(SNAP_STORAGE) as $hostPageSnapStorageName => $storages) {
// Check each storage
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
switch ($hostPageSnapStorageName) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $node, $location));
switch ($node) {
case 'localhost':
@ -236,27 +246,26 @@ switch ($argv[1]) {
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
foreach ($ftp->nlistr($storage->directory) as $hostPageSnapFilename) {
foreach ($ftp->nlistr($storage->directory) as $filename) {
if (false !== preg_match(sprintf('!/hp/([\d/]+)/([\d]+)\.zip$!ui', $storage->directory), $hostPageSnapFilename, $matches)) {
if (false !== preg_match(sprintf('!/hps/([\d]+)\.zip$!ui', $storage->directory), $filename, $matches)) {
if (!empty($matches[1]) && // hostPageId
!empty($matches[2])) { // timeAdded
if (!empty($matches[1])) { // hostPageSnapId
if (!$db->findHostPageSnapByTimeAdded($matches[1], $matches[2])) {
if (!$db->getHostPageSnap($matches[1])) {
if ($ftp->delete($hostPageSnapFilename)) {
if ($ftp->delete($filename)) {
CLI::warning(sprintf(_('delete snap file: #%s from storage %s index %s not found in registry;'), $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::warning(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
} else {
CLI::danger(sprintf(_('delete snap file: #%s from storage %s index %s not found in registry;'), $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::danger(sprintf(_('delete snap file: #%s from node %s location %s not found in registry;'), $filename, $node, $location));
} else {
CLI::success(sprintf(_('skip snap file: #%s available in storage %s index %s;'), $hostPageSnapFilename, $hostPageSnapStorageName, $i));
CLI::success(sprintf(_('skip snap file: #%s available in node %s location %s;'), $filename, $node, $location));
@ -93,8 +93,8 @@ define('MEMCACHED_PORT', 11211);
define('SNAP_STORAGE', json_encode((object)
'localhost' => [ // @TODO see https://github.com/YGGverse/YGGo#roadmap
'directory' => __DIR__ . '/../storage/snap/hp/',
'storage-1' => [
'directory' => __DIR__ . '/../storage/snap/hps/',
'quota' => [
'mime' => false,
'size' => 10000000024, // @TODO
@ -104,12 +104,12 @@ define('SNAP_STORAGE', json_encode((object)
'seconds' => 60*60
// ...
// ...
'ftp' => [
'storage-1' => [
'port' => 21,
'host' => '',
'username' => '',
@ -95,23 +95,25 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $i));
switch ($name) {
switch ($node) {
case 'localhost':
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
if (file_exists($storage->directory . $hostPageSnapFile)) {
unlink($storage->directory . $hostPageSnapFile);
case 'ftp':
@ -119,7 +121,8 @@ try {
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@ -161,23 +164,25 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $i));
switch ($name) {
switch ($node) {
case 'localhost':
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
if (file_exists($storage->directory . $hostPageSnapFile)) {
unlink($storage->directory . $hostPageSnapFile);
case 'ftp':
@ -185,7 +190,8 @@ try {
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@ -283,23 +289,25 @@ try {
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
$snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Delete snap files
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $i));
switch ($name) {
switch ($node) {
case 'localhost':
@unlink($storage->directory . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
if (file_exists($storage->directory . $hostPageSnapFile)) {
unlink($storage->directory . $hostPageSnapFile);
case 'ftp':
@ -307,7 +315,8 @@ try {
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->delete('hp/' . $snapFilePath . $hostPageSnap->timeAdded . '.zip');
@ -665,133 +665,131 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Begin snaps
$hostPageSnapTimeAdded = time();
$hostPageSnapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, time())) {
$hostPageSnapFilenameTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip';
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath, 0755, true);
// Default storage success
$snapFilesExists = false;
// Create new ZIP container
$zip = new ZipArchive();
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnapId, -1) . '.zip';
if (true === $zip->open($hostPageSnapFilenameTmp, ZipArchive::CREATE)) {
$hostPageSnapFilenameTmp = __DIR__ . '/../storage/tmp/' . md5($hostPageSnapFile);
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $hostPageSnapTimeAdded) . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL)))) {
// Create ZIP container
$zip = new ZipArchive();
// Done
if (true === $zip->open($hostPageSnapFilenameTmp, ZipArchive::CREATE)) {
// Temporary snap file exists
if (file_exists($hostPageSnapFilenameTmp)) {
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('TIMESTAMP: %s', time()) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url($queueHostPage->hostPageURL)))) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $hostPageSnapTimeAdded)) {
// Default storage success
$snapFilesExists = false;
// Temporary snap file exists
if (file_exists($hostPageSnapFilenameTmp)) {
// Copy files to each storage
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
// Copy files to each storage
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $i));
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $node, $location));
switch ($name) {
switch ($node) {
case 'localhost':
case 'localhost':
// Validate mime
if (!$storage->quota->mime) continue 2;
// Validate mime
if (!$storage->quota->mime) continue 2;
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMimeValid = true;
if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage
@mkdir($storage->directory . $hostPageSnapPath, 0755, true);
if (copy($hostPageSnapFilenameTmp, $storage->directory . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) {
// Register storage name
if ($db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time())) {
$snapFilesExists = true;
case 'ftp':
// Validate mime
if (!$storage->quota->mime) continue 2;
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMimeValid = true;
if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->mkdir('hp/' . $hostPageSnapPath, true);
if ($ftp->copy($hostPageSnapFilenameTmp, 'hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) {
// Register storage name
if ($db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time())) {
$snapFilesExists = true;
$snapMimeValid = true;
// At least one file has been stored
if ($snapFilesExists) {
if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage
@mkdir($storage->directory . $hostPageSnapPath, 0755, true);
} else {
if (copy($hostPageSnapFilenameTmp, $storage->directory . $hostPageSnapFile)) {
// Register storage name
if ($db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time())) {
$snapFilesExists = true;
case 'ftp':
// Validate mime
if (!$storage->quota->mime) continue 2;
$snapMimeValid = false;
foreach ((array) explode(',', $storage->quota->mime) as $mime) {
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMimeValid = true;
if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage
$ftp = new Ftp();
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->mkdir($hostPageSnapPath, true);
if ($ftp->copy($hostPageSnapFilenameTmp, $hostPageSnapFile)) {
// Register storage name
if ($db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time())) {
$snapFilesExists = true;
// Delete tmp snap
// At least one file has been stored
if ($snapFilesExists) {
} else {
// Delete tmp snap
// Skip page links following with meta robots:nofollow attribute
@ -528,15 +528,6 @@ class MySQL {
return $query->fetch();
public function findHostPageSnapByTimeAdded(int $hostPageId, int $timeAdded) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `timeAdded` = ? LIMIT 1');
$query->execute([$hostPageId, $timeAdded]);
return $query->fetch();
public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`,
@ -52,37 +52,36 @@ switch ($type) {
// Get snap details from DB
if ($hostPageSnap = $db->getHostPageSnap(!empty($_GET['hps']) ? (int) $_GET['hps'] : 0)) {
// Get file
$snapFile = 'hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';
// Prepare filenames
$hostPageSnapPath = 'hps/' . substr(trim(chunk_split($hostPageSnap->hostPageSnapId, 1, '/'), '/'), 0, -1);
$hostPageSnapFile = $hostPageSnapPath . substr($hostPageSnap->hostPageSnapId, -1) . '.zip';
// Get snap file
foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
foreach (json_decode(SNAP_STORAGE) as $node => $storages) {
foreach ($storages as $i => $storage) {
foreach ($storages as $location => $storage) {
// Generate storage id
$crc32name = crc32(sprintf('%s.%s', $name, $i));
$crc32name = crc32(sprintf('%s.%s', $node, $location));
if ($hostPageSnapStorage = $db->findHostPageSnapStorageByCRC32Name($hostPageSnap->hostPageSnapId, $crc32name)) {
switch ($name) {
switch ($node) {
case 'localhost':
// Download local snap in higher priority if possible
if (file_exists($storage->directory . $snapFile) &&
is_readable($storage->directory . $snapFile)) {
if (file_exists($storage->directory . $hostPageSnapFile) &&
is_readable($storage->directory . $hostPageSnapFile)) {
// Register snap download
$db->addHostPageSnapDownload($hostPageSnapStorage->hostPageSnapStorageId, $crc32ip, time());
// Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
readfile($storage->directory . $snapFile);
header(sprintf('Content-Length: %s', filesize($storage->directory . $hostPageSnapFile)));
header(sprintf('Content-Disposition: filename="snap.%s.zip"', $hostPageSnap->hostPageSnapId));
readfile($storage->directory . $hostPageSnapFile);
@ -99,12 +98,10 @@ switch ($type) {
// Return snap file
header('Content-Type: application/zip');
header(sprintf('Content-Length: %s', $snapSize));
header(sprintf('Content-Disposition: filename="snap.%s.%s.%s.zip"', $hostPageSnap->hostPageSnapId,
header(sprintf('Content-Length: %s', $ftp->size($hostPageSnapFile)));
header(sprintf('Content-Disposition: filename="snap.%s.zip"', $hostPageSnap->hostPageSnapId));
$ftp->get($snapFile, 'php://output');
$ftp->get($hostPageSnapFile, 'php://output');
Reference in New Issue
Block a user