Browse Source

make local snap storage optimization

main
ghost 2 years ago
parent
commit
0d19004e86
  1. 6
      README.md
  2. 13
      config/app.php.txt
  3. 47
      crontab/cleaner.php
  4. 28
      crontab/crawler.php
  5. BIN
      database/yggo.mwb
  6. 56
      library/mysql.php
  7. BIN
      media/db-prototype.png
  8. 18
      public/explore.php

6
README.md

@ -148,7 +148,7 @@ GET m=SphinxQL @@ -148,7 +148,7 @@ GET m=SphinxQL
* [x] Index explorer
* [x] Safe images preview
* [x] Extended search syntax support
* [ ] Page content snaps history
* [ ] Compressed page snaps history
+ [x] Local
+ [ ] Remote
@ -157,6 +157,10 @@ GET m=SphinxQL @@ -157,6 +157,10 @@ GET m=SphinxQL
* [x] CSS only, JS-less interface
* [x] Unique ident icons for sites without favicons
* [x] Content genre tabs (#1)
* [x] Page index exploring feature
+ [x] Meta
+ [x] Snaps
+ [x] Referrers
* [ ] Results with found matches highlight
* [ ] The time machine feature by content snaps history

13
config/app.php.txt

@ -219,15 +219,20 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000); @@ -219,15 +219,20 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
define('CRAWL_HOST_DEFAULT_STATUS', true);
/*
* Index only meta tags to prevent disk overuse
* or false to save meta tags + overall plain text page content
* Index only meta tags
* or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
*
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* This option can change the relevance of search results
* Warning!
* disabling this option requires a huge amount of disk storage,
* it is an experimental feature, intended for index operations
*
* see CRAWL_PAGE_MIME_SNAP_LOCAL
* to create compressed data snaps
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
define('CRAWL_HOST_DEFAULT_META_ONLY', true);
/*
* Not suitable/safe for work status for new host by default

47
crontab/cleaner.php

@ -31,7 +31,7 @@ $manifestsTotal = $db->getTotalManifests(); @@ -31,7 +31,7 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostPagesDescriptionsDeleted = 0;
$hostPagesSnapUrlDeleted = 0;
$hostPagesSnapDeleted = 0;
$hostPagesToHostPageDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
@ -75,13 +75,23 @@ try { @@ -75,13 +75,23 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete host page
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
// Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
// Delete host page
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
}
@ -91,16 +101,23 @@ try { @@ -91,16 +101,23 @@ try {
foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) {
if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {
// Delete host page
// Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
// Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
// Delete host page snaps
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {
if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) {
$hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
}
}
// Delete host page
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
}
}
@ -192,7 +209,7 @@ if (CLEAN_LOG_ENABLED) { @@ -192,7 +209,7 @@ if (CLEAN_LOG_ENABLED) {
$hostsUpdated,
$hostPagesDeleted,
$hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesSnapDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved,
$manifestsTotal,
@ -216,7 +233,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; @@ -216,7 +233,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;

28
crontab/crawler.php

@ -39,7 +39,7 @@ $manifestsAdded = 0; @@ -39,7 +39,7 @@ $manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
$hostPagesSnapUrlAdded = 0;
$hostPagesSnapAdded = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -395,31 +395,31 @@ try { @@ -395,31 +395,31 @@ try {
if (false !== stripos(Filter::mime($contentType), $mime)) {
$crc32data = crc32($content);
$crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host
// Create not duplicated data snaps only for each storage host
if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {
if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$time = time();
$dir = chunk_split($queueHostPage->hostPageId, 1, '/');
$directory = chunk_split($queueHostPage->hostPageId, 1, '/');
@mkdir('../public/snap/hp/' . $dir, 755, true);
@mkdir('../public/snap/hp/' . $directory, 755, true);
$zip = new ZipArchive();
// Create new container
if (true === $zip->open('../public/snap/hp/' . $dir . $time . '.zip', ZipArchive::CREATE)) {
if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) {
// Insert compressed snap data
if (true === $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content)) {
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Update DB registry
$hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
$crc32data, // do not create duplicated content snaps
$crc32host, // multi host storage with same timestamp / crc32data
'/snap/hp/' . $dir . $time . '.zip', // public url
$time);
$hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time);
$zip->close();
@ -748,7 +748,7 @@ if (CRAWL_LOG_ENABLED) { @@ -748,7 +748,7 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesProcessed,
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesSnapAdded,
$hostPagesBanned,
$manifestsProcessed,
$manifestsAdded,
@ -765,7 +765,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL; @@ -765,7 +765,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

56
library/mysql.php

@ -360,59 +360,49 @@ class MySQL { @@ -360,59 +360,49 @@ class MySQL {
return $query->fetchAll();
}
public function addHostPageSnapURL(int $hostPageId,
int $crc32data,
int $crc32host,
string $url,
int $timeAdded) {
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`,
`crc32data`,
`crc32host`,
`url`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
$query->execute([$hostPageId,
$crc32data,
$crc32host,
$url,
$timeAdded]);
/**
 * Register a page snap in the `hostPageSnap` table.
 *
 * The row is written with INSERT IGNORE, so a row rejected by the table
 * (presumably a duplicate key - verify against the schema) is skipped
 * instead of raising an error.
 *
 * NOTE(review): $crc32data is declared as string here but as int in
 * getHostPageSnap(); confirm which type the column expects.
 *
 * @param int    $hostPageId Page the snap belongs to
 * @param string $crc32data  Checksum of the snapped content
 * @param int    $timeAdded  Value stored in the `timeAdded` column
 *
 * @return int Row count reported by the executed statement
 */
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
    $values = [$hostPageId, $crc32data, $timeAdded];

    $statement = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`,
`crc32data`,
`timeAdded`) VALUES (?, ?, ?)');

    $statement->execute($values);

    return $statement->rowCount();
}
public function deleteHostPageSnapURL(int $hostPageId) {
public function deleteHostPageSnap(int $hostPageSnapId) {
$query = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?');
$query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
$query->execute([$hostPageId]);
$query->execute([$hostPageSnapId]);
return $query->rowCount();
}
public function getTotalHostPageSnapURLs(int $hostPageId) {
public function getTotalHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?');
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?');
$query->execute([$hostPageId]);
return $query->fetch()->total;
}
public function getHostPageSnapURLs(int $hostPageId) {
public function getHostPageSnaps(int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
$query->execute([$hostPageId]);
return $query->fetchAll();
}
/**
 * Find a stored snap of the given page whose content checksum matches.
 *
 * Callers use the result as a duplicate check before writing a new snap.
 *
 * @param int $hostPageId Page the snap belongs to
 * @param int $crc32data  Checksum of the page content to look for
 *
 * @return mixed Result of fetch() on the executed statement; falsy when no
 *               row matched (presumably false, as with PDOStatement::fetch -
 *               confirm the driver behind $this->_db)
 */
public function getHostPageSnap(int $hostPageId, int $crc32data) {
    // The second condition must test `crc32data`. Comparing `hostPageId`
    // against both bound values only matched when the checksum happened to
    // equal the page id, so existing snaps were never detected.
    $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
    $query->execute([$hostPageId, $crc32data]);
    return $query->fetch();
}
@ -456,7 +446,7 @@ class MySQL { @@ -456,7 +446,7 @@ class MySQL {
int $hostsUpdated,
int $hostPagesDeleted,
int $hostPagesDescriptionsDeleted,
int $hostPagesSnapUrlDeleted,
int $hostPagesSnapDeleted,
int $hostPagesToHostPageDeleted,
int $hostPagesBansRemoved,
int $manifestsTotal,
@ -474,7 +464,7 @@ class MySQL { @@ -474,7 +464,7 @@ class MySQL {
`hostsUpdated`,
`hostPagesDeleted`,
`hostPagesDescriptionsDeleted`,
`hostPagesSnapUrlDeleted`,
`hostPagesSnapDeleted`,
`hostPagesToHostPageDeleted`,
`hostPagesBansRemoved`,
`manifestsTotal`,
@ -493,7 +483,7 @@ class MySQL { @@ -493,7 +483,7 @@ class MySQL {
$hostsUpdated,
$hostPagesDeleted,
$hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesSnapDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved,
$manifestsTotal,
@ -586,7 +576,7 @@ class MySQL { @@ -586,7 +576,7 @@ class MySQL {
int $hostPagesProcessed,
int $hostPagesIndexed,
int $hostPagesAdded,
int $hostPagesSnapUrlAdded,
int $hostPagesSnapAdded,
int $hostPagesBanned,
int $manifestsProcessed,
int $manifestsAdded,
@ -601,7 +591,7 @@ class MySQL { @@ -601,7 +591,7 @@ class MySQL {
`hostPagesProcessed`,
`hostPagesIndexed`,
`hostPagesAdded`,
`hostPagesSnapUrlAdded`,
`hostPagesSnapAdded`,
`hostPagesBanned`,
`manifestsProcessed`,
`manifestsAdded`,
@ -617,7 +607,7 @@ class MySQL { @@ -617,7 +607,7 @@ class MySQL {
$hostPagesProcessed,
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesSnapAdded,
$hostPagesBanned,
$manifestsProcessed,
$manifestsAdded,

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 176 KiB

After

Width:  |  Height:  |  Size: 159 KiB

18
public/explore.php

@ -234,18 +234,18 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -234,18 +234,18 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<p><?php echo date('c', $hostPage->timeAdded) ?></p>
<p><?php echo _('Time updated') ?></p>
<p><?php echo date('c', $hostPage->timeUpdated) ?></p>
<?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?>
<?php $totalHostPageSnaps = $db->getTotalHostPageSnaps($hp); ?>
<p>
<?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls),
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
]) ?>
<?php echo Filter::plural($totalHostPageSnaps, [sprintf(_('%s snap'), $totalHostPageSnaps),
sprintf(_('%s snaps'), $totalHostPageSnaps),
sprintf(_('%s snaps'), $totalHostPageSnaps),
]) ?>
</p>
<?php if ($totalHostPageSnapUrls) { ?>
<?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?>
<?php if ($totalHostPageSnaps) { ?>
<?php foreach ($db->getHostPageSnaps($hp) as $hostPageSnap) { ?>
<p>
<a href="<?php echo $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url ?>">
<?php echo date('c', $hostPageSnapUrl->timeAdded) ?>
<a href="<?php echo WEBSITE_DOMAIN . '/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip' ?>">
<?php echo date('c', $hostPageSnap->timeAdded) ?>
</a>
</p>
<?php } ?>

Loading…
Cancel
Save