diff --git a/config/app.php.txt b/config/app.php.txt index 7376e52..3e56d50 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -97,10 +97,10 @@ define('SNAP_STORAGE', json_encode((object) 'directory' => __DIR__ . '/../storage/snap/hp/', 'quota' => [ 'mime' => false, - 'size' => 10000000024, + 'size' => 10000000024, // @TODO 'request' => [ 'download' => [ - 'size' => 10000024, + 'size' => 10000024, // @TODO 'seconds' => 60*60 ] ] @@ -119,10 +119,10 @@ define('SNAP_STORAGE', json_encode((object) 'passive' => true, 'quota' => [ 'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico', - 'size' => 10000000024, + 'size' => 10000000024, // @TODO 'request' => [ 'download' => [ - 'size' => 10000024, + 'size' => 10000024, // @TODO 'seconds' => 60*60 ] ] diff --git a/crontab/crawler.php b/crontab/crawler.php index c0370a0..615a9cc 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -656,133 +656,131 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Begin snaps if (SNAP_STORAGE) { - $crc32data = crc32($content); + $hostPageSnapTimeAdded = time(); + $hostPageSnapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); - $snapTime = time(); - $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); + $hostPageSnapFilenameTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip'; + @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath, 0755, true); - $snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; - @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true); + // Create new ZIP container + $zip = new ZipArchive(); - // Create not duplicated data snaps only, even newer by time added - if ($hostPageSnap = $db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { + if (true === $zip->open($hostPageSnapFilenameTmp, ZipArchive::CREATE)) { - $hostPageSnapId = $hostPageSnap->hostPageSnapId; + // Insert compressed snap data into the tmp storage + if (true === $zip->addFromString('DATA', $content) && + true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $hostPageSnapTimeAdded) . PHP_EOL . + sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . + sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . + sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL)))) { - } else { + // Done + $zip->close(); - // Create new ZIP container - $zip = new ZipArchive(); + // Temporarily snap file exists + if (file_exists($hostPageSnapFilenameTmp)) { - if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { + // Register snap in DB + if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $hostPageSnapTimeAdded)) { - // Insert compressed snap data into the tmp storage - if (true === $zip->addFromString('DATA', $content) && - true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . - sprintf('CRC32: %s', $crc32data . PHP_EOL . - sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . - sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . - sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) { + // Default storage success + $snapFilesExists = false; - // Done - $zip->close(); + // Copy files to each storage + foreach (json_decode(SNAP_STORAGE) as $name => $storages) { - // Temporarily snap file exists - if (file_exists($snapTmp)) { + foreach ($storages as $i => $storage) { - // Register snap in DB - if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, filesize($snapTmp), $snapTime)) { + // Generate storage id + $crc32name = crc32(sprintf('%s.%s', $name, $i)); - $hostPagesSnapAdded++; - } - } - } - } - } + switch ($name) { - // Copy files to each storage - foreach (json_decode(SNAP_STORAGE) as $name => $storages) { + case 'localhost': - foreach ($storages as $i => $storage) { + // Validate mime + if (!$storage->quota->mime) continue 2; - // Generate storage id - $crc32name = crc32(sprintf('%s.%s', $name, $i)); + $snapMimeValid = false; + foreach ((array) explode(',', $storage->quota->mime) as $mime) { - switch ($name) { + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { - case 'localhost': + $snapMimeValid = true; + break; + } + } - // Validate size quota - if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2; + if (!$snapMimeValid) continue 2; - // Validate mime - if (!$storage->quota->mime) continue 2; + // Copy tmp snap file to the permanent storage + @mkdir($storage->directory . $hostPageSnapPath, 0755, true); - $snapMimeValid = false; - foreach ((array) explode(',', $storage->quota->mime) as $mime) { + if (copy($hostPageSnapFilenameTmp, $storage->directory . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) { - if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { + // Register storage name + $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); - $snapMimeValid = true; - break; - } - } + $snapFilesExists = true; + } - if (!$snapMimeValid) continue 2; + break; + case 'ftp': - // Copy tmp snap file to the permanent storage - @mkdir($storage->directory . $snapPath, 0755, true); + // Validate mime + if (!$storage->quota->mime) continue 2; - if (copy($snapTmp, $storage->directory . $snapPath . $snapTime . '.zip')) { + $snapMimeValid = false; + foreach ((array) explode(',', $storage->quota->mime) as $mime) { - // Register storage name - $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); - } + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { - break; - case 'ftp': + $snapMimeValid = true; + break; + } + } - // Validate size quota - if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2; + if (!$snapMimeValid) continue 2; - // Validate mime - if (!$storage->quota->mime) continue 2; + // Copy tmp snap file to the permanent storage + $ftp = new Ftp(); - $snapMimeValid = false; - foreach ((array) explode(',', $storage->quota->mime) as $mime) { + if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { - if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { + $ftp->mkdir('hp/' . $hostPageSnapPath, true); - $snapMimeValid = true; - break; - } - } + if ($ftp->copy($hostPageSnapFilenameTmp, 'hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) { - if (!$snapMimeValid) continue 2; + // Register storage name + $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); - // Copy tmp snap file to the permanent storage - $ftp = new Ftp(); + $snapFilesExists = true; + } - if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { + $ftp->close(); + } - $ftp->mkdir('hp/' . $snapPath, true); + break; + } + } + } - if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { + // At least one file have been stored + if ($snapFilesExists) { - // Register storage name - $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); - } + $hostPagesSnapAdded++; - $ftp->close(); - } + } else { - break; + $db->deleteHostPageSnap($hostPageSnapId); + } + } } } } // Delete tmp snap - unlink($snapTmp); + unlink($hostPageSnapFilenameTmp); } // Skip page links following with meta robots:nofollow attribute diff --git a/database/yggo.mwb b/database/yggo.mwb index 60b72b1..4a90b91 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/mysql.php b/library/mysql.php index 2a0a20d..ef39df1 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -483,14 +483,11 @@ class MySQL { return $query->fetchAll(); } - public function addHostPageSnap(int $hostPageId, string $crc32data, int $size, int $timeAdded) { + public function addHostPageSnap(int $hostPageId, int $timeAdded) { - $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, - `crc32data`, - `size`, - `timeAdded`) VALUES (?, ?, ?, ?)'); + $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)'); - $query->execute([$hostPageId, $crc32data, $size, $timeAdded]); + $query->execute([$hostPageId, $timeAdded]); return $this->_db->lastInsertId(); } @@ -522,19 +519,6 @@ class MySQL { return $query->fetchAll(); } - public function getTotalHostPageSnapSizeByStorage(int $hostPageId, int $crc32name) { - - $query = $this->_db->prepare('SELECT SUM(`hostPageSnap`.`size`) AS `total` FROM `hostPageSnap` - JOIN `hostPageSnapStorage` ON (`hostPageSnapStorage`.`hostPageSnapId` = `hostPageSnap`.`hostPageSnapId`) - - WHERE `hostPageSnap`.`hostPageSnapId` = ? - AND `hostPageSnapStorage`.`crc32name` = ?'); - - $query->execute([$hostPageId, $crc32name]); - - return $query->fetch()->total; - } - public function getHostPageSnap(int $hostPageSnapId) { $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1'); @@ -544,15 +528,6 @@ class MySQL { return $query->fetch(); } - public function findHostPageSnap(int $hostPageId, int $crc32data) { - - $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1'); - - $query->execute([$hostPageId, $crc32data]); - - return $query->fetch(); - } - public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) { $query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`,