Browse Source

implement local snaps

main
ghost 2 years ago
parent
commit
2f7d99079d
  1. 11
      README.md
  2. 12
      config/app.php.txt
  3. 24
      crontab/cleaner.php
  4. 49
      crontab/crawler.php
  5. BIN
      database/yggo.mwb
  6. 76
      library/mysql.php
  7. BIN
      media/db-prototype.png
  8. 5
      public/api.php
  9. 31
      public/explore.php

11
README.md

@ -27,6 +27,7 @@ php-pdo
php-curl php-curl
php-gd php-gd
php-mbstring php-mbstring
php-zip
php-mysql php-mysql
sphinxsearch sphinxsearch
``` ```
@ -37,7 +38,7 @@ sphinxsearch
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder * Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com) * Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples are placed at `/config` folder * Configuration examples are placed at `/config` folder
* Make sure `/storage` folder is writable * Make sure `/storage`, `/public/storage` folders are writable
* Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt) * Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
#### JSON API #### JSON API
@ -147,7 +148,9 @@ GET m=SphinxQL
* [x] Index explorer * [x] Index explorer
* [x] Safe images preview * [x] Safe images preview
* [x] Extended search syntax support * [x] Extended search syntax support
* [ ] Page history snaps * [ ] Page content snaps history
+ [x] Local
+ [ ] Remote
##### UI ##### UI
@ -180,7 +183,9 @@ GET m=SphinxQL
* [x] MIME Content-type settings * [x] MIME Content-type settings
* [x] Ban non-condition links to prevent extra requests * [x] Ban non-condition links to prevent extra requests
* [x] Debug log * [x] Debug log
* [x] History snaps * [ ] Page content snaps generation
+ [x] Local
+ [ ] Remote
* [ ] Indexing new sites homepage in higher priority * [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing * [ ] Redirect codes extended processing
* [ ] Palette image index / filter * [ ] Palette image index / filter

12
config/app.php.txt

@ -168,7 +168,15 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated * comma separated
* *
*/ */
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,/video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac'); define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,/video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
/*
* Snap pages locally match MIME types
*
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/* /*
* Renew manifests index by timing offset provided * Renew manifests index by timing offset provided
@ -266,7 +274,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility * Manifest API version compatibility
* *
*/ */
define('CRAWL_MANIFEST_API_VERSION', 0.8); define('CRAWL_MANIFEST_API_VERSION', 0.9);
/* /*
* Set default auto-crawl status for new manifest added * Set default auto-crawl status for new manifest added

24
crontab/cleaner.php

@ -30,7 +30,9 @@ $hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests(); $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostPagesDeleted = 0; $hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0; $hostPagesDescriptionsDeleted = 0;
$hostPagesSnapUrlDeleted = 0;
$hostPagesToHostPageDeleted = 0;
$manifestsDeleted = 0; $manifestsDeleted = 0;
$hostPagesBansRemoved = 0; $hostPagesBansRemoved = 0;
@ -74,8 +76,9 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') { if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
@ -91,8 +94,9 @@ try {
if (!$robots->uriAllowed($hostPage->uri)) { if (!$robots->uriAllowed($hostPage->uri)) {
// Delete host page // Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') { if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
@ -162,7 +166,7 @@ try {
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET); $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Delete page description history // Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Delete deprecated logs // Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
@ -187,7 +191,9 @@ if (CLEAN_LOG_ENABLED) {
$hostsTotal, $hostsTotal,
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPageDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$manifestsTotal, $manifestsTotal,
$manifestsDeleted, $manifestsDeleted,
@ -209,7 +215,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL;
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL; echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;

49
crontab/crawler.php

@ -39,6 +39,7 @@ $manifestsAdded = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
$hostPagesBanned = 0; $hostPagesBanned = 0;
$hostPagesSnapUrlAdded = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -251,11 +252,11 @@ try {
continue; continue;
} }
// Parse MIME // Parse index MIME
$hostPageIsDom = false; $hostPageIsDom = false;
$hostPageInMime = false; $hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) { foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
$mime = Filter::mime($mime); $mime = Filter::mime($mime);
@ -383,6 +384,48 @@ try {
} }
} }
// Save local snap
//
// For the first configured MIME type that matches the response Content-Type,
// pack the downloaded content into a zip archive under the public storage
// folder and register its public URL in the database. A snap is skipped when
// a record with the same page id / content crc32 / host crc32 already exists.
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {

  foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {

    $mime = Filter::mime($mime);

    // MIME type allowed in settings
    if (false !== stripos(Filter::mime($contentType), $mime)) {

      $crc32data = crc32($content);
      $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host

      // Create not duplicated data snaps only for each storage host
      if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {

        $time = time();

        $snapDirectory = '../public/storage/snap/hp/' . $queueHostPage->hostPageId;

        // Permissions must be given as an octal literal: decimal 755 equals octal 01363,
        // which is not the intended rwxr-xr-x mode.
        // Best-effort: the directory usually exists already, a real failure is caught by open() below.
        @mkdir($snapDirectory, 0755, true);

        $zip = new ZipArchive();

        if (true === $zip->open($snapDirectory . '/' . $time . '.zip', ZipArchive::CREATE)) {

          $snapAdded = $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content);

          // Always close the archive, even when adding the entry failed,
          // and register the snap only after the archive was saved successfully
          $snapSaved = $zip->close();

          if (true === $snapAdded && true === $snapSaved) {

            $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
                                                              $crc32data, // do not create duplicated content snaps
                                                              $crc32host, // multi host storage with same timestamp / crc32data
                                                              '/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . '.zip', // public url
                                                              $time);

            // Snap stored, no need to test the remaining MIME types
            break;
          }
        }
      }
    }
  }
}
// Begin page links collection // Begin page links collection
$links = []; $links = [];
@ -700,6 +743,7 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesProcessed, $hostPagesProcessed,
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesBanned, $hostPagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,
@ -716,6 +760,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

76
library/mysql.php

@ -360,6 +360,63 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
/**
 * Register a snap URL for the given host page.
 *
 * Uses INSERT IGNORE, so the statement does not fail when the row is rejected
 * (presumably by a unique key on the table - confirm against the schema).
 *
 * @param int    $hostPageId Host page the snap belongs to
 * @param int    $crc32data  crc32 checksum of the snapped content
 * @param int    $crc32host  crc32 checksum of the storage host
 * @param string $url        Snap location
 * @param int    $timeAdded  Unix timestamp of the snap
 *
 * @return int Row count reported by the executed statement
 */
public function addHostPageSnapURL(int $hostPageId,
                                   int $crc32data,
                                   int $crc32host,
                                   string $url,
                                   int $timeAdded) {

  $values = [
    $hostPageId,
    $crc32data,
    $crc32host,
    $url,
    $timeAdded,
  ];

  // SQL text is kept exactly as in the original statement
  $statement = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`,
`crc32data`,
`crc32host`,
`url`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');

  $statement->execute($values);

  return $statement->rowCount();
}
/**
 * Remove every snap URL record that belongs to the given host page.
 *
 * Only database rows are removed here; snap files are not touched.
 *
 * @param int $hostPageId Host page whose snap records are removed
 *
 * @return int Row count reported by the executed DELETE statement
 */
public function deleteHostPageSnapURL(int $hostPageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

  $statement->execute(
    [
      $hostPageId,
    ]
  );

  return $statement->rowCount();
}
/**
 * Count the snap URL records stored for the given host page.
 *
 * @param int $hostPageId Host page to count snaps for
 *
 * @return mixed Value of the `total` column of the COUNT(*) result row
 */
public function getTotalHostPageSnapURLs(int $hostPageId) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

  $statement->execute(
    [
      $hostPageId,
    ]
  );

  // The result row is read as an object, its single column holds the counter
  $row = $statement->fetch();

  return $row->total;
}
/**
 * Get all snap URL records of the given host page, newest first.
 *
 * @param int $hostPageId Host page to list snaps for
 *
 * @return mixed Result of fetchAll() for the rows ordered by `timeAdded` descending
 */
public function getHostPageSnapURLs(int $hostPageId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');

  $statement->execute(
    [
      $hostPageId,
    ]
  );

  $rows = $statement->fetchAll();

  return $rows;
}
/**
 * Find a single snap URL record by host page, content checksum and storage host checksum.
 *
 * The crawler calls this before writing a snap to avoid storing the same content twice.
 *
 * @param int $hostPageId Host page the snap belongs to
 * @param int $crc32data  crc32 checksum of the snapped content
 * @param int $crc32host  crc32 checksum of the storage host
 *
 * @return mixed The matching row, or a falsy value when there is no match
 */
public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) {

  // The second condition must test `crc32data`: the original query compared `hostPageId`
  // twice, so the checksum argument was matched against the page id instead of the content
  $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `crc32data` = ? AND `crc32host` = ? LIMIT 1');

  $query->execute([$hostPageId, $crc32data, $crc32host]);

  return $query->fetch();
}
// Cleaner tools // Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) { public function getCleanerQueue(int $limit, int $timeFrom) {
@ -398,7 +455,9 @@ class MySQL {
int $hostsTotal, int $hostsTotal,
int $hostsUpdated, int $hostsUpdated,
int $hostPagesDeleted, int $hostPagesDeleted,
int $hostPageDescriptionsDeleted, int $hostPagesDescriptionsDeleted,
int $hostPagesSnapUrlDeleted,
int $hostPagesToHostPageDeleted,
int $hostPagesBansRemoved, int $hostPagesBansRemoved,
int $manifestsTotal, int $manifestsTotal,
int $manifestsDeleted, int $manifestsDeleted,
@ -414,7 +473,9 @@ class MySQL {
`hostsTotal`, `hostsTotal`,
`hostsUpdated`, `hostsUpdated`,
`hostPagesDeleted`, `hostPagesDeleted`,
`hostPageDescriptionsDeleted`, `hostPagesDescriptionsDeleted`,
`hostPagesSnapUrlDeleted`,
`hostPagesToHostPageDeleted`,
`hostPagesBansRemoved`, `hostPagesBansRemoved`,
`manifestsTotal`, `manifestsTotal`,
`manifestsDeleted`, `manifestsDeleted`,
@ -424,14 +485,16 @@ class MySQL {
`httpRequestsSizeTotal`, `httpRequestsSizeTotal`,
`httpDownloadSizeTotal`, `httpDownloadSizeTotal`,
`httpRequestsTimeTotal`, `httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $query->execute([
$timeAdded, $timeAdded,
$hostsTotal, $hostsTotal,
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPageDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
$manifestsTotal, $manifestsTotal,
$manifestsDeleted, $manifestsDeleted,
@ -523,6 +586,7 @@ class MySQL {
int $hostPagesProcessed, int $hostPagesProcessed,
int $hostPagesIndexed, int $hostPagesIndexed,
int $hostPagesAdded, int $hostPagesAdded,
int $hostPagesSnapUrlAdded,
int $hostPagesBanned, int $hostPagesBanned,
int $manifestsProcessed, int $manifestsProcessed,
int $manifestsAdded, int $manifestsAdded,
@ -537,6 +601,7 @@ class MySQL {
`hostPagesProcessed`, `hostPagesProcessed`,
`hostPagesIndexed`, `hostPagesIndexed`,
`hostPagesAdded`, `hostPagesAdded`,
`hostPagesSnapUrlAdded`,
`hostPagesBanned`, `hostPagesBanned`,
`manifestsProcessed`, `manifestsProcessed`,
`manifestsAdded`, `manifestsAdded`,
@ -544,7 +609,7 @@ class MySQL {
`httpRequestsSizeTotal`, `httpRequestsSizeTotal`,
`httpDownloadSizeTotal`, `httpDownloadSizeTotal`,
`httpRequestsTimeTotal`, `httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $query->execute([
$timeAdded, $timeAdded,
@ -552,6 +617,7 @@ class MySQL {
$hostPagesProcessed, $hostPagesProcessed,
$hostPagesIndexed, $hostPagesIndexed,
$hostPagesAdded, $hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesBanned, $hostPagesBanned,
$manifestsProcessed, $manifestsProcessed,
$manifestsAdded, $manifestsAdded,

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 147 KiB

After

Width:  |  Height:  |  Size: 176 KiB

5
public/api.php

@ -1,7 +1,7 @@
<?php <?php
// Current version // Current version
define('API_VERSION', 0.8); define('API_VERSION', 0.9);
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
@ -109,7 +109,8 @@ if (API_ENABLED) {
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMime' => CRAWL_PAGE_MIME, 'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

31
public/explore.php

@ -234,13 +234,30 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<p><?php echo date('c', $hostPage->timeAdded) ?></p> <p><?php echo date('c', $hostPage->timeAdded) ?></p>
<p><?php echo _('Time updated') ?></p> <p><?php echo _('Time updated') ?></p>
<p><?php echo date('c', $hostPage->timeUpdated) ?></p> <p><?php echo date('c', $hostPage->timeUpdated) ?></p>
<?php if ($totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp)) { ?> <?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?>
<p> <p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources), <?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls),
sprintf(_('%s referrers'), $totalHostPageIdSources), sprintf(_('%s snaps'), $totalHostPageSnapUrls),
sprintf(_('%s referrers'), $totalHostPageIdSources), sprintf(_('%s snaps'), $totalHostPageSnapUrls),
]) ?> ]) ?>
</p> </p>
<?php if ($totalHostPageSnapUrls) { ?>
<?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?>
<p>
<a href="<?php echo $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url ?>">
<?php echo date('c', $hostPageSnapUrl->timeAdded) ?>
</a>
</p>
<?php } ?>
<?php } ?>
<?php $totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp); ?>
<p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
]) ?>
</p>
<?php if ($totalHostPageIdSources) { ?>
<?php foreach ($db->getHostPageIdSourcesByHostPageIdTarget($hp) as $hostPageIdSource) { ?> <?php foreach ($db->getHostPageIdSourcesByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?> <?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<p> <p>

Loading…
Cancel
Save