Browse Source

optimize snaps, delete unused constructions

main
ghost 1 year ago
parent
commit
3e3b7ee2ef
  1. 8
      config/app.php.txt
  2. 70
      crontab/crawler.php
  3. BIN
      database/yggo.mwb
  4. 31
      library/mysql.php

8
config/app.php.txt

@ -97,10 +97,10 @@ define('SNAP_STORAGE', json_encode((object)
'directory' => __DIR__ . '/../storage/snap/hp/', 'directory' => __DIR__ . '/../storage/snap/hp/',
'quota' => [ 'quota' => [
'mime' => false, 'mime' => false,
'size' => 10000000024, 'size' => 10000000024, // @TODO
'request' => [ 'request' => [
'download' => [ 'download' => [
'size' => 10000024, 'size' => 10000024, // @TODO
'seconds' => 60*60 'seconds' => 60*60
] ]
] ]
@ -119,10 +119,10 @@ define('SNAP_STORAGE', json_encode((object)
'passive' => true, 'passive' => true,
'quota' => [ 'quota' => [
'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico', 'mime' => 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico',
'size' => 10000000024, 'size' => 10000000024, // @TODO
'request' => [ 'request' => [
'download' => [ 'download' => [
'size' => 10000024, 'size' => 10000024, // @TODO
'seconds' => 60*60 'seconds' => 60*60
] ]
] ]

70
crontab/crawler.php

@ -656,49 +656,35 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Begin snaps // Begin snaps
if (SNAP_STORAGE) { if (SNAP_STORAGE) {
$crc32data = crc32($content); $hostPageSnapTimeAdded = time();
$hostPageSnapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTime = time(); $hostPageSnapFilenameTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip';
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $hostPageSnapPath, 0755, true);
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create not duplicated data snaps only, even newer by time added
if ($hostPageSnap = $db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$hostPageSnapId = $hostPageSnap->hostPageSnapId;
} else {
// Create new ZIP container // Create new ZIP container
$zip = new ZipArchive(); $zip = new ZipArchive();
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { if (true === $zip->open($hostPageSnapFilenameTmp, ZipArchive::CREATE)) {
// Insert compressed snap data into the tmp storage // Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) && if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $hostPageSnapTimeAdded) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL))))) { sprintf('TARGET: %s', Filter::url($queueHostPage->hostPageURL)))) {
// Done // Done
$zip->close(); $zip->close();
// Temporary snap file exists // Temporary snap file exists
if (file_exists($snapTmp)) { if (file_exists($hostPageSnapFilenameTmp)) {
// Register snap in DB // Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, filesize($snapTmp), $snapTime)) { if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $hostPageSnapTimeAdded)) {
$hostPagesSnapAdded++; // Default storage success
} $snapFilesExists = false;
}
}
}
}
// Copy files to each storage // Copy files to each storage
foreach (json_decode(SNAP_STORAGE) as $name => $storages) { foreach (json_decode(SNAP_STORAGE) as $name => $storages) {
@ -712,9 +698,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
case 'localhost': case 'localhost':
// Validate size quota
if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2;
// Validate mime // Validate mime
if (!$storage->quota->mime) continue 2; if (!$storage->quota->mime) continue 2;
@ -731,20 +714,19 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if (!$snapMimeValid) continue 2; if (!$snapMimeValid) continue 2;
// Copy tmp snap file to the permanent storage // Copy tmp snap file to the permanent storage
@mkdir($storage->directory . $snapPath, 0755, true); @mkdir($storage->directory . $hostPageSnapPath, 0755, true);
if (copy($snapTmp, $storage->directory . $snapPath . $snapTime . '.zip')) { if (copy($hostPageSnapFilenameTmp, $storage->directory . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) {
// Register storage name // Register storage name
$db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
$snapFilesExists = true;
} }
break; break;
case 'ftp': case 'ftp':
// Validate size quota
if ($db->getTotalHostPageSnapSizeByStorage($hostPageSnapId, $crc32name) >= $storage->quota->size) continue 2;
// Validate mime // Validate mime
if (!$storage->quota->mime) continue 2; if (!$storage->quota->mime) continue 2;
@ -765,12 +747,14 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) { if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
$ftp->mkdir('hp/' . $snapPath, true); $ftp->mkdir('hp/' . $hostPageSnapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { if ($ftp->copy($hostPageSnapFilenameTmp, 'hp/' . $hostPageSnapPath . $hostPageSnapTimeAdded . '.zip')) {
// Register storage name // Register storage name
$db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time()); $db->addHostPageSnapStorage($hostPageSnapId, $crc32name, time());
$snapFilesExists = true;
} }
$ftp->close(); $ftp->close();
@ -781,8 +765,22 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
} }
} }
// At least one file has been stored
if ($snapFilesExists) {
$hostPagesSnapAdded++;
} else {
$db->deleteHostPageSnap($hostPageSnapId);
}
}
}
}
}
// Delete tmp snap // Delete tmp snap
unlink($snapTmp); unlink($hostPageSnapFilenameTmp);
} }
// Skip page links following with meta robots:nofollow attribute // Skip page links following with meta robots:nofollow attribute

BIN
database/yggo.mwb

Binary file not shown.

31
library/mysql.php

@ -483,14 +483,11 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function addHostPageSnap(int $hostPageId, string $crc32data, int $size, int $timeAdded) { public function addHostPageSnap(int $hostPageId, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, $query = $this->_db->prepare('INSERT INTO `hostPageSnap` (`hostPageId`, `timeAdded`) VALUES (?, ?)');
`crc32data`,
`size`,
`timeAdded`) VALUES (?, ?, ?, ?)');
$query->execute([$hostPageId, $crc32data, $size, $timeAdded]); $query->execute([$hostPageId, $timeAdded]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -522,19 +519,6 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function getTotalHostPageSnapSizeByStorage(int $hostPageId, int $crc32name) {
$query = $this->_db->prepare('SELECT SUM(`hostPageSnap`.`size`) AS `total` FROM `hostPageSnap`
JOIN `hostPageSnapStorage` ON (`hostPageSnapStorage`.`hostPageSnapId` = `hostPageSnap`.`hostPageSnapId`)
WHERE `hostPageSnap`.`hostPageSnapId` = ?
AND `hostPageSnapStorage`.`crc32name` = ?');
$query->execute([$hostPageId, $crc32name]);
return $query->fetch()->total;
}
public function getHostPageSnap(int $hostPageSnapId) { public function getHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
@ -544,15 +528,6 @@ class MySQL {
return $query->fetch(); return $query->fetch();
} }
public function findHostPageSnap(int $hostPageId, int $crc32data) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
$query->execute([$hostPageId, $crc32data]);
return $query->fetch();
}
public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) { public function addHostPageSnapDownload(int $hostPageSnapStorageId, string $crc32ip, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`, $query = $this->_db->prepare('INSERT INTO `hostPageSnapDownload` (`hostPageSnapStorageId`,

Loading…
Cancel
Save