Browse Source

make local snap storage optimization

main
ghost 2 years ago
parent
commit
0d19004e86
  1. 6
      README.md
  2. 13
      config/app.php.txt
  3. 47
      crontab/cleaner.php
  4. 28
      crontab/crawler.php
  5. BIN
      database/yggo.mwb
  6. 56
      library/mysql.php
  7. BIN
      media/db-prototype.png
  8. 18
      public/explore.php

6
README.md

@ -148,7 +148,7 @@ GET m=SphinxQL
* [x] Index explorer * [x] Index explorer
* [x] Safe images preview * [x] Safe images preview
* [x] Extended search syntax support * [x] Extended search syntax support
* [ ] Page content snaps history * [ ] Compressed page snaps history
+ [x] Local + [x] Local
+ [ ] Remote + [ ] Remote
@ -157,6 +157,10 @@ GET m=SphinxQL
* [x] CSS only, JS-less interface * [x] CSS only, JS-less interface
* [x] Unique ident icons for sites without favicons * [x] Unique ident icons for sites without favicons
* [x] Content genre tabs (#1) * [x] Content genre tabs (#1)
* [x] Page index exploring feature
+ [x] Meta
+ [x] Snaps
+ [x] Referrers
* [ ] Results with found matches highlight * [ ] Results with found matches highlight
* [ ] The time machine feature by content snaps history * [ ] The time machine feature by content snaps history

13
config/app.php.txt

@ -219,15 +219,20 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
define('CRAWL_HOST_DEFAULT_STATUS', true); define('CRAWL_HOST_DEFAULT_STATUS', true);
/* /*
* Index only meta tags to prevent disk overuse * Index only meta tags
* or false to save meta tags + overall plain text page content * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
* *
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
* *
* This option able to change search results relevance * Warning!
disabling this option requires huge disk storage;
it is an experimental feature, oriented toward index operations
*
* see CRAWL_PAGE_MIME_SNAP_LOCAL
* to create compressed data snaps
* *
*/ */
define('CRAWL_HOST_DEFAULT_META_ONLY', false); define('CRAWL_HOST_DEFAULT_META_ONLY', true);
/* /*
* Not suitable/safe for work status for new host by default * Not suitable/safe for work status for new host by default

47
crontab/cleaner.php

@ -31,7 +31,7 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostPagesDeleted = 0; $hostPagesDeleted = 0;
$hostPagesDescriptionsDeleted = 0; $hostPagesDescriptionsDeleted = 0;
$hostPagesSnapUrlDeleted = 0; $hostPagesSnapDeleted = 0;
$hostPagesToHostPageDeleted = 0; $hostPagesToHostPageDeleted = 0;
$manifestsDeleted = 0; $manifestsDeleted = 0;
$hostPagesBansRemoved = 0; $hostPagesBansRemoved = 0;
@ -75,13 +75,23 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete host page
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') { if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
// Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
// Delete host page
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
} }
} }
} }
@ -91,16 +101,23 @@ try {
foreach ($db->getHostPages($host->hostId) as $hostPage) { foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) { if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {
// Delete host page // Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') { // Delete host page refs data
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
} }
// Delete host page
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
} }
} }
} }
@ -192,7 +209,7 @@ if (CLEAN_LOG_ENABLED) {
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPagesDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted, $hostPagesSnapDeleted,
$hostPagesToHostPageDeleted, $hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$manifestsTotal, $manifestsTotal,
@ -216,7 +233,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL; echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL; echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;

28
crontab/crawler.php

@ -39,7 +39,7 @@ $manifestsAdded = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
$hostPagesBanned = 0; $hostPagesBanned = 0;
$hostPagesSnapUrlAdded = 0; $hostPagesSnapAdded = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -395,31 +395,31 @@ try {
if (false !== stripos(Filter::mime($contentType), $mime)) { if (false !== stripos(Filter::mime($contentType), $mime)) {
$crc32data = crc32($content); $crc32data = crc32($content);
$crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host
// Create not duplicated data snaps only for each storage host // Create not duplicated data snaps only for each storage host
if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) { if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$time = time(); $time = time();
$dir = chunk_split($queueHostPage->hostPageId, 1, '/'); $directory = chunk_split($queueHostPage->hostPageId, 1, '/');
@mkdir('../public/snap/hp/' . $dir, 755, true); @mkdir('../public/snap/hp/' . $directory, 755, true);
$zip = new ZipArchive(); $zip = new ZipArchive();
// Create new container // Create new container
if (true === $zip->open('../public/snap/hp/' . $dir . $time . '.zip', ZipArchive::CREATE)) { if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) {
// Insert compressed snap data // Insert compressed snap data
if (true === $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content)) { if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Update DB registry // Update DB registry
$hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId, $hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time);
$crc32data, // do not create duplicated content snaps
$crc32host, // multi host storage with same timestamp / crc32data
'/snap/hp/' . $dir . $time . '.zip', // public url
$time);
$zip->close(); $zip->close();
@ -748,7 +748,7 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesProcessed, $hostPagesProcessed,
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesSnapUrlAdded, $hostPagesSnapAdded,
$hostPagesBanned, $hostPagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,
@ -765,7 +765,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL; echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

56
library/mysql.php

@ -360,59 +360,49 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function addHostPageSnapURL(int $hostPageId, public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
int $crc32data,
int $crc32host, $query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`,
string $url, `crc32data`,
int $timeAdded) { `timeAdded`) VALUES (?, ?, ?)');
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`, $query->execute([$hostPageId, $crc32data, $timeAdded]);
`crc32data`,
`crc32host`,
`url`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
$query->execute([$hostPageId,
$crc32data,
$crc32host,
$url,
$timeAdded]);
return $query->rowCount(); return $query->rowCount();
} }
public function deleteHostPageSnapURL(int $hostPageId) { public function deleteHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); $query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
$query->execute([$hostPageId]); $query->execute([$hostPageSnapId]);
return $query->rowCount(); return $query->rowCount();
} }
public function getTotalHostPageSnapURLs(int $hostPageId) { public function getTotalHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?');
$query->execute([$hostPageId]); $query->execute([$hostPageId]);
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getHostPageSnapURLs(int $hostPageId) { public function getHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
$query->execute([$hostPageId]); $query->execute([$hostPageId]);
return $query->fetchAll(); return $query->fetchAll();
} }
public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) { public function getHostPageSnap(int $hostPageId, int $crc32data) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `hostPageId` = ? AND `crc32host` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `hostPageId` = ? LIMIT 1');
$query->execute([$hostPageId, $crc32data, $crc32host]); $query->execute([$hostPageId, $crc32data]);
return $query->fetch(); return $query->fetch();
} }
@ -456,7 +446,7 @@ class MySQL {
int $hostsUpdated, int $hostsUpdated,
int $hostPagesDeleted, int $hostPagesDeleted,
int $hostPagesDescriptionsDeleted, int $hostPagesDescriptionsDeleted,
int $hostPagesSnapUrlDeleted, int $hostPagesSnapDeleted,
int $hostPagesToHostPageDeleted, int $hostPagesToHostPageDeleted,
int $hostPagesBansRemoved, int $hostPagesBansRemoved,
int $manifestsTotal, int $manifestsTotal,
@ -474,7 +464,7 @@ class MySQL {
`hostsUpdated`, `hostsUpdated`,
`hostPagesDeleted`, `hostPagesDeleted`,
`hostPagesDescriptionsDeleted`, `hostPagesDescriptionsDeleted`,
`hostPagesSnapUrlDeleted`, `hostPagesSnapDeleted`,
`hostPagesToHostPageDeleted`, `hostPagesToHostPageDeleted`,
`hostPagesBansRemoved`, `hostPagesBansRemoved`,
`manifestsTotal`, `manifestsTotal`,
@ -493,7 +483,7 @@ class MySQL {
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPagesDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted, $hostPagesSnapDeleted,
$hostPagesToHostPageDeleted, $hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$manifestsTotal, $manifestsTotal,
@ -586,7 +576,7 @@ class MySQL {
int $hostPagesProcessed, int $hostPagesProcessed,
int $hostPagesIndexed, int $hostPagesIndexed,
int $hostPagesAdded, int $hostPagesAdded,
int $hostPagesSnapUrlAdded, int $hostPagesSnapAdded,
int $hostPagesBanned, int $hostPagesBanned,
int $manifestsProcessed, int $manifestsProcessed,
int $manifestsAdded, int $manifestsAdded,
@ -601,7 +591,7 @@ class MySQL {
`hostPagesProcessed`, `hostPagesProcessed`,
`hostPagesIndexed`, `hostPagesIndexed`,
`hostPagesAdded`, `hostPagesAdded`,
`hostPagesSnapUrlAdded`, `hostPagesSnapAdded`,
`hostPagesBanned`, `hostPagesBanned`,
`manifestsProcessed`, `manifestsProcessed`,
`manifestsAdded`, `manifestsAdded`,
@ -617,7 +607,7 @@ class MySQL {
$hostPagesProcessed, $hostPagesProcessed,
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesSnapUrlAdded, $hostPagesSnapAdded,
$hostPagesBanned, $hostPagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 176 KiB

After

Width:  |  Height:  |  Size: 159 KiB

18
public/explore.php

@ -234,18 +234,18 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<p><?php echo date('c', $hostPage->timeAdded) ?></p> <p><?php echo date('c', $hostPage->timeAdded) ?></p>
<p><?php echo _('Time updated') ?></p> <p><?php echo _('Time updated') ?></p>
<p><?php echo date('c', $hostPage->timeUpdated) ?></p> <p><?php echo date('c', $hostPage->timeUpdated) ?></p>
<?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?> <?php $totalHostPageSnaps = $db->getTotalHostPageSnaps($hp); ?>
<p> <p>
<?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls), <?php echo Filter::plural($totalHostPageSnaps, [sprintf(_('%s snap'), $totalHostPageSnaps),
sprintf(_('%s snaps'), $totalHostPageSnapUrls), sprintf(_('%s snaps'), $totalHostPageSnaps),
sprintf(_('%s snaps'), $totalHostPageSnapUrls), sprintf(_('%s snaps'), $totalHostPageSnaps),
]) ?> ]) ?>
</p> </p>
<?php if ($totalHostPageSnapUrls) { ?> <?php if ($totalHostPageSnaps) { ?>
<?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?> <?php foreach ($db->getHostPageSnaps($hp) as $hostPageSnap) { ?>
<p> <p>
<a href="<?php echo $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url ?>"> <a href="<?php echo WEBSITE_DOMAIN . '/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip' ?>">
<?php echo date('c', $hostPageSnapUrl->timeAdded) ?> <?php echo date('c', $hostPageSnap->timeAdded) ?>
</a> </a>
</p> </p>
<?php } ?> <?php } ?>

Loading…
Cancel
Save