mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
make local snap storage optimization
This commit is contained in:
parent
8a3b25b31c
commit
0d19004e86
@ -148,7 +148,7 @@ GET m=SphinxQL
|
||||
* [x] Index explorer
|
||||
* [x] Safe images preview
|
||||
* [x] Extended search syntax support
|
||||
* [ ] Page content snaps history
|
||||
* [ ] Compressed page snaps history
|
||||
+ [x] Local
|
||||
+ [ ] Remote
|
||||
|
||||
@ -157,6 +157,10 @@ GET m=SphinxQL
|
||||
* [x] CSS only, JS-less interface
|
||||
* [x] Unique ident icons for sites without favicons
|
||||
* [x] Content genre tabs (#1)
|
||||
* [x] Page index exploring feature
|
||||
+ [x] Meta
|
||||
+ [x] Snaps
|
||||
+ [x] Referrers
|
||||
* [ ] Results with found matches highlight
|
||||
* [ ] The time machine feature by content snaps history
|
||||
|
||||
|
@ -219,15 +219,20 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
|
||||
define('CRAWL_HOST_DEFAULT_STATUS', true);
|
||||
|
||||
/*
|
||||
* Index only meta tags to prevent disk overuse
|
||||
* or false to save meta tags + overall plain text page content
|
||||
* Index only meta tags
|
||||
* or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field
|
||||
*
|
||||
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
|
||||
*
|
||||
* This option can change search results relevance
|
||||
* Warning!
|
||||
* disabling this option requires huge disk storage,
|
||||
* it's an experimental feature, oriented for index operations
|
||||
*
|
||||
* see CRAWL_PAGE_MIME_SNAP_LOCAL
|
||||
* to create compressed data snaps
|
||||
*
|
||||
*/
|
||||
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
||||
define('CRAWL_HOST_DEFAULT_META_ONLY', true);
|
||||
|
||||
/*
|
||||
* Not suitable/safe for work status for new host by default
|
||||
|
@ -31,7 +31,7 @@ $manifestsTotal = $db->getTotalManifests();
|
||||
$hostsUpdated = 0;
|
||||
$hostPagesDeleted = 0;
|
||||
$hostPagesDescriptionsDeleted = 0;
|
||||
$hostPagesSnapUrlDeleted = 0;
|
||||
$hostPagesSnapDeleted = 0;
|
||||
$hostPagesToHostPageDeleted = 0;
|
||||
$manifestsDeleted = 0;
|
||||
$hostPagesBansRemoved = 0;
|
||||
@ -75,13 +75,23 @@ try {
|
||||
|
||||
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
|
||||
|
||||
// Delete host page
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page descriptions
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
|
||||
// Delete host page refs data
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page snaps
|
||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

    // Archive location is derived from the page id (one directory per digit) and the snap timestamp
    $hostPageSnapFile = '../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';

    // Remove the registry row when the archive is deleted now or is already gone;
    // otherwise a missing file would raise a warning and keep an orphaned row on every run
    if (!file_exists($hostPageSnapFile) || true === unlink($hostPageSnapFile)) {

        $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
    }
}
|
||||
|
||||
// Delete host page
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -91,16 +101,23 @@ try {
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
if (!$robots->uriAllowed($hostPage->uri)) {
|
||||
if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {
|
||||
|
||||
// Delete host page descriptions
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
|
||||
// Delete host page refs data
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page snaps
|
||||
foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

    // Archive location is derived from the page id (one directory per digit) and the snap timestamp
    $hostPageSnapFile = '../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip';

    // Remove the registry row when the archive is deleted now or is already gone;
    // otherwise a missing file would raise a warning and keep an orphaned row on every run
    if (!file_exists($hostPageSnapFile) || true === unlink($hostPageSnapFile)) {

        $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
    }
}
|
||||
|
||||
// Delete host page
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -192,7 +209,7 @@ if (CLEAN_LOG_ENABLED) {
|
||||
$hostsUpdated,
|
||||
$hostPagesDeleted,
|
||||
$hostPagesDescriptionsDeleted,
|
||||
$hostPagesSnapUrlDeleted,
|
||||
$hostPagesSnapDeleted,
|
||||
$hostPagesToHostPageDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
@ -216,7 +233,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||
|
||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
|
||||
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL;
|
||||
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
|
||||
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
|
||||
|
||||
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
||||
|
@ -39,7 +39,7 @@ $manifestsAdded = 0;
|
||||
$hostPagesAdded = 0;
|
||||
$hostsAdded = 0;
|
||||
$hostPagesBanned = 0;
|
||||
$hostPagesSnapUrlAdded = 0;
|
||||
$hostPagesSnapAdded = 0;
|
||||
|
||||
// Connect database
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
@ -395,31 +395,31 @@ try {
|
||||
if (false !== stripos(Filter::mime($contentType), $mime)) {
|
||||
|
||||
$crc32data = crc32($content);
|
||||
$crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host
|
||||
|
||||
// Create not duplicated data snaps only for each storage host
|
||||
if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {
|
||||
if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
|
||||
|
||||
$time = time();
|
||||
|
||||
$dir = chunk_split($queueHostPage->hostPageId, 1, '/');
|
||||
$directory = chunk_split($queueHostPage->hostPageId, 1, '/');
|
||||
|
||||
@mkdir('../public/snap/hp/' . $dir, 755, true);
|
||||
// Permissions must be an octal literal: decimal 755 equals 01363 and sets unintended mode bits
@mkdir('../public/snap/hp/' . $directory, 0755, true);
|
||||
|
||||
$zip = new ZipArchive();
|
||||
|
||||
// Create new container
|
||||
if (true === $zip->open('../public/snap/hp/' . $dir . $time . '.zip', ZipArchive::CREATE)) {
|
||||
if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) {
|
||||
|
||||
// Insert compressed snap data
|
||||
if (true === $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content)) {
|
||||
if (true === $zip->addFromString('DATA', $content) &&
|
||||
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL .
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
|
||||
// Update DB registry
|
||||
$hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
|
||||
$crc32data, // do not create duplicated content snaps
|
||||
$crc32host, // multi host storage with same timestamp / crc32data
|
||||
'/snap/hp/' . $dir . $time . '.zip', // public url
|
||||
$time);
|
||||
$hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time);
|
||||
|
||||
$zip->close();
|
||||
|
||||
@ -748,7 +748,7 @@ if (CRAWL_LOG_ENABLED) {
|
||||
$hostPagesProcessed,
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesSnapUrlAdded,
|
||||
$hostPagesSnapAdded,
|
||||
$hostPagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
@ -765,7 +765,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
|
||||
echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL;
|
||||
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
||||
|
||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
||||
|
Binary file not shown.
@ -360,59 +360,49 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
public function addHostPageSnapURL(int $hostPageId,
|
||||
int $crc32data,
|
||||
int $crc32host,
|
||||
string $url,
|
||||
int $timeAdded) {
|
||||
public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`,
|
||||
`crc32data`,
|
||||
`crc32host`,
|
||||
`url`,
|
||||
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
|
||||
$query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`,
|
||||
`crc32data`,
|
||||
`timeAdded`) VALUES (?, ?, ?)');
|
||||
|
||||
$query->execute([$hostPageId,
|
||||
$crc32data,
|
||||
$crc32host,
|
||||
$url,
|
||||
$timeAdded]);
|
||||
$query->execute([$hostPageId, $crc32data, $timeAdded]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
public function deleteHostPageSnapURL(int $hostPageId) {
|
||||
public function deleteHostPageSnap(int $hostPageSnapId) {
|
||||
|
||||
$query = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?');
|
||||
$query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
$query->execute([$hostPageSnapId]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
public function getTotalHostPageSnapURLs(int $hostPageId) {
|
||||
public function getTotalHostPageSnaps(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?');
|
||||
$query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?');
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function getHostPageSnapURLs(int $hostPageId) {
|
||||
public function getHostPageSnaps(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) {
|
||||
public function getHostPageSnap(int $hostPageId, int $crc32data) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `hostPageId` = ? AND `crc32host` = ? LIMIT 1');
|
||||
// Match on the page id AND the content checksum: the second bound value is $crc32data,
// so comparing `hostPageId` twice would never find an existing snap of the same content
$query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');
|
||||
|
||||
$query->execute([$hostPageId, $crc32data, $crc32host]);
|
||||
$query->execute([$hostPageId, $crc32data]);
|
||||
|
||||
return $query->fetch();
|
||||
}
|
||||
@ -456,7 +446,7 @@ class MySQL {
|
||||
int $hostsUpdated,
|
||||
int $hostPagesDeleted,
|
||||
int $hostPagesDescriptionsDeleted,
|
||||
int $hostPagesSnapUrlDeleted,
|
||||
int $hostPagesSnapDeleted,
|
||||
int $hostPagesToHostPageDeleted,
|
||||
int $hostPagesBansRemoved,
|
||||
int $manifestsTotal,
|
||||
@ -474,7 +464,7 @@ class MySQL {
|
||||
`hostsUpdated`,
|
||||
`hostPagesDeleted`,
|
||||
`hostPagesDescriptionsDeleted`,
|
||||
`hostPagesSnapUrlDeleted`,
|
||||
`hostPagesSnapDeleted`,
|
||||
`hostPagesToHostPageDeleted`,
|
||||
`hostPagesBansRemoved`,
|
||||
`manifestsTotal`,
|
||||
@ -493,7 +483,7 @@ class MySQL {
|
||||
$hostsUpdated,
|
||||
$hostPagesDeleted,
|
||||
$hostPagesDescriptionsDeleted,
|
||||
$hostPagesSnapUrlDeleted,
|
||||
$hostPagesSnapDeleted,
|
||||
$hostPagesToHostPageDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
@ -586,7 +576,7 @@ class MySQL {
|
||||
int $hostPagesProcessed,
|
||||
int $hostPagesIndexed,
|
||||
int $hostPagesAdded,
|
||||
int $hostPagesSnapUrlAdded,
|
||||
int $hostPagesSnapAdded,
|
||||
int $hostPagesBanned,
|
||||
int $manifestsProcessed,
|
||||
int $manifestsAdded,
|
||||
@ -601,7 +591,7 @@ class MySQL {
|
||||
`hostPagesProcessed`,
|
||||
`hostPagesIndexed`,
|
||||
`hostPagesAdded`,
|
||||
`hostPagesSnapUrlAdded`,
|
||||
`hostPagesSnapAdded`,
|
||||
`hostPagesBanned`,
|
||||
`manifestsProcessed`,
|
||||
`manifestsAdded`,
|
||||
@ -617,7 +607,7 @@ class MySQL {
|
||||
$hostPagesProcessed,
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesSnapUrlAdded,
|
||||
$hostPagesSnapAdded,
|
||||
$hostPagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 176 KiB After Width: | Height: | Size: 159 KiB |
@ -234,18 +234,18 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<p><?php echo date('c', $hostPage->timeAdded) ?></p>
|
||||
<p><?php echo _('Time updated') ?></p>
|
||||
<p><?php echo date('c', $hostPage->timeUpdated) ?></p>
|
||||
<?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?>
|
||||
<?php $totalHostPageSnaps = $db->getTotalHostPageSnaps($hp); ?>
|
||||
<p>
|
||||
<?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
|
||||
]) ?>
|
||||
<?php echo Filter::plural($totalHostPageSnaps, [sprintf(_('%s snap'), $totalHostPageSnaps),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnaps),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnaps),
|
||||
]) ?>
|
||||
</p>
|
||||
<?php if ($totalHostPageSnapUrls) { ?>
|
||||
<?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?>
|
||||
<?php if ($totalHostPageSnaps) { ?>
|
||||
<?php foreach ($db->getHostPageSnaps($hp) as $hostPageSnap) { ?>
|
||||
<p>
|
||||
<a href="<?php echo $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url ?>">
|
||||
<?php echo date('c', $hostPageSnapUrl->timeAdded) ?>
|
||||
<a href="<?php echo WEBSITE_DOMAIN . '/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip' ?>">
|
||||
<?php echo date('c', $hostPageSnap->timeAdded) ?>
|
||||
</a>
|
||||
</p>
|
||||
<?php } ?>
|
||||
|
Loading…
x
Reference in New Issue
Block a user