Browse Source

implement local snaps

main
ghost 2 years ago
parent
commit
2f7d99079d
  1. 11
      README.md
  2. 12
      config/app.php.txt
  3. 24
      crontab/cleaner.php
  4. 49
      crontab/crawler.php
  5. BIN
      database/yggo.mwb
  6. 76
      library/mysql.php
  7. BIN
      media/db-prototype.png
  8. 5
      public/api.php
  9. 19
      public/explore.php

11
README.md

@ -27,6 +27,7 @@ php-pdo @@ -27,6 +27,7 @@ php-pdo
php-curl
php-gd
php-mbstring
php-zip
php-mysql
sphinxsearch
```
@ -37,7 +38,7 @@ sphinxsearch @@ -37,7 +38,7 @@ sphinxsearch
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
* Install [Sphinx Search Server](https://sphinxsearch.com)
* Configuration examples are placed at `/config` folder
* Make sure `/storage` folder is writable
* Make sure the `/storage` and `/public/storage` folders are writable
* Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
#### JSON API
@ -147,7 +148,9 @@ GET m=SphinxQL @@ -147,7 +148,9 @@ GET m=SphinxQL
* [x] Index explorer
* [x] Safe images preview
* [x] Extended search syntax support
* [ ] Page history snaps
* [ ] Page content snaps history
+ [x] Local
+ [ ] Remote
##### UI
@ -180,7 +183,9 @@ GET m=SphinxQL @@ -180,7 +183,9 @@ GET m=SphinxQL
* [x] MIME Content-type settings
* [x] Ban non-condition links to prevent extra requests
* [x] Debug log
* [x] History snaps
* [ ] Page content snaps generation
+ [x] Local
+ [ ] Remote
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter

12
config/app.php.txt

@ -168,7 +168,15 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); @@ -168,7 +168,15 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated
*
*/
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,/video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
// Comma separated list of bare `type/subtype` tokens; entries must not start with a slash
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
/*
* Save local snaps for pages matching these MIME types
*
* comma separated | false to disable
*
*/
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
/*
* Renew manifests index by timing offset provided
@ -266,7 +274,7 @@ define('CRAWL_MANIFEST', true); @@ -266,7 +274,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.8);
define('CRAWL_MANIFEST_API_VERSION', 0.9);
/*
* Set default auto-crawl status for new manifest added

24
crontab/cleaner.php

@ -30,7 +30,9 @@ $hostsTotal = $db->getTotalHosts(); @@ -30,7 +30,9 @@ $hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
$hostPagesDeleted = 0;
$hostPageDescriptionsDeleted = 0;
$hostPagesDescriptionsDeleted = 0;
$hostPagesSnapUrlDeleted = 0;
$hostPagesToHostPageDeleted = 0;
$manifestsDeleted = 0;
$hostPagesBansRemoved = 0;
@ -74,8 +76,9 @@ try { @@ -74,8 +76,9 @@ try {
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
@ -91,8 +94,9 @@ try { @@ -91,8 +94,9 @@ try {
if (!$robots->uriAllowed($hostPage->uri)) {
// Delete host page
$db->deleteHostPageDescriptions($hostPage->hostPageId);
$db->deleteHostPageToHostPage($hostPage->hostPageId);
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
if ($hostPage->uri != '/') {
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
@ -162,7 +166,7 @@ try { @@ -162,7 +166,7 @@ try {
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
// Delete page description history
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
@ -187,7 +191,9 @@ if (CLEAN_LOG_ENABLED) { @@ -187,7 +191,9 @@ if (CLEAN_LOG_ENABLED) {
$hostsTotal,
$hostsUpdated,
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
@ -209,7 +215,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; @@ -209,7 +215,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL;
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;

49
crontab/crawler.php

@ -39,6 +39,7 @@ $manifestsAdded = 0; @@ -39,6 +39,7 @@ $manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesBanned = 0;
$hostPagesSnapUrlAdded = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
@ -251,11 +252,11 @@ try { @@ -251,11 +252,11 @@ try {
continue;
}
// Parse MIME
// Parse index MIME
$hostPageIsDom = false;
$hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
$mime = Filter::mime($mime);
@ -383,6 +384,48 @@ try { @@ -383,6 +384,48 @@ try {
}
}
// Save local snap
//
// When the response content type matches one of the MIME types listed in
// CRAWL_PAGE_MIME_SNAP_LOCAL, pack the downloaded content into a zip archive
// under the public storage folder and register its public URL in the database.
// Nothing is stored when a snap with the same content checksum already exists.
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {

  foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {

    $mime = Filter::mime($mime);

    // MIME type allowed in settings
    if (false !== stripos(Filter::mime($contentType), $mime)) {

      $crc32data = crc32($content);
      $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host

      // Create not duplicated data snaps only for each storage host
      if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {

        $time = time();

        // The mode must be an octal literal: decimal 755 is 01363 in octal,
        // which leaves the directory without owner read permission.
        // Warnings are suppressed as before, because the directory usually exists already.
        @mkdir('../public/storage/snap/hp/' . $queueHostPage->hostPageId, 0755, true);

        $zip = new ZipArchive();

        if (true === $zip->open('../public/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . '.zip', ZipArchive::CREATE)) {

          // Entry name: {hostPageId}.{timestamp}.{MIME subtype}
          $snapAdded = $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content);

          // Close in every case: this releases the archive handle and is the
          // point where the archive is actually written to disk
          $snapSaved = $zip->close();

          // Register the snap only once the archive is confirmed to be saved
          if (true === $snapAdded && true === $snapSaved) {

            $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
                                                              $crc32data, // do not create duplicated content snaps
                                                              $crc32host, // multi host storage with same timestamp / crc32data
                                                              '/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . '.zip', // public url
                                                              $time);

            break;
          }
        }
      }
    }
  }
}
// Begin page links collection
$links = [];
@ -700,6 +743,7 @@ if (CRAWL_LOG_ENABLED) { @@ -700,6 +743,7 @@ if (CRAWL_LOG_ENABLED) {
$hostPagesProcessed,
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesBanned,
$manifestsProcessed,
$manifestsAdded,
@ -716,6 +760,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL; @@ -716,6 +760,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

76
library/mysql.php

@ -360,6 +360,63 @@ class MySQL { @@ -360,6 +360,63 @@ class MySQL {
return $query->fetchAll();
}
/**
 * Insert a snap record for the given host page into `hostPageSnapURL`.
 *
 * The statement uses INSERT IGNORE, so a row rejected by the table
 * constraints is skipped instead of raising an error.
 *
 * @param int    $hostPageId Host page the snap belongs to
 * @param int    $crc32data  Checksum stored in the `crc32data` column
 * @param int    $crc32host  Checksum stored in the `crc32host` column
 * @param string $url        Value stored in the `url` column
 * @param int    $timeAdded  Value stored in the `timeAdded` column
 *
 * @return int Row count reported by the statement
 */
public function addHostPageSnapURL(int $hostPageId,
                                   int $crc32data,
                                   int $crc32host,
                                   string $url,
                                   int $timeAdded) {

  $statement = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`,
`crc32data`,
`crc32host`,
`url`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');

  // Positional parameters, in the same order as the column list above
  $values = [$hostPageId, $crc32data, $crc32host, $url, $timeAdded];

  $statement->execute($values);

  $insertedRows = $statement->rowCount();

  return $insertedRows;
}
/**
 * Delete every `hostPageSnapURL` record that belongs to the given host page.
 *
 * Only database rows are removed here; this method does not touch any files.
 *
 * @param int $hostPageId Host page whose snap records are removed
 *
 * @return int Row count reported by the statement
 */
public function deleteHostPageSnapURL(int $hostPageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

  $values = [$hostPageId];

  $statement->execute($values);

  $deletedRows = $statement->rowCount();

  return $deletedRows;
}
/**
 * Count the `hostPageSnapURL` records stored for the given host page.
 *
 * @param int $hostPageId Host page to count snap records for
 *
 * @return mixed The `total` field of the fetched COUNT(*) row
 */
public function getTotalHostPageSnapURLs(int $hostPageId) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

  $values = [$hostPageId];

  $statement->execute($values);

  // The row is read as an object, so the alias is available as a property
  $row = $statement->fetch();

  return $row->total;
}
/**
 * Get all `hostPageSnapURL` records of the given host page, newest first.
 *
 * @param int $hostPageId Host page to list snap records for
 *
 * @return mixed Result of fetchAll(): rows ordered by `timeAdded` descending
 */
public function getHostPageSnapURLs(int $hostPageId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');

  $values = [$hostPageId];

  $statement->execute($values);

  $rows = $statement->fetchAll();

  return $rows;
}
/**
 * Find a single snap record by host page, content checksum and host checksum.
 *
 * The caller uses this lookup to check whether a snap with the same content
 * is already registered before creating a new one.
 *
 * @param int $hostPageId Host page the snap belongs to
 * @param int $crc32data  Checksum matched against the `crc32data` column
 * @param int $crc32host  Checksum matched against the `crc32host` column
 *
 * @return mixed Result of fetch(): the matching row, or false when none exists
 */
public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) {

  // Each placeholder is bound to its own column: the second condition has to
  // test `crc32data`, otherwise the content checksum is compared with `hostPageId`
  // and the duplicate lookup practically never matches
  $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `crc32data` = ? AND `crc32host` = ? LIMIT 1');

  $query->execute([$hostPageId, $crc32data, $crc32host]);

  return $query->fetch();
}
// Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) {
@ -398,7 +455,9 @@ class MySQL { @@ -398,7 +455,9 @@ class MySQL {
int $hostsTotal,
int $hostsUpdated,
int $hostPagesDeleted,
int $hostPageDescriptionsDeleted,
int $hostPagesDescriptionsDeleted,
int $hostPagesSnapUrlDeleted,
int $hostPagesToHostPageDeleted,
int $hostPagesBansRemoved,
int $manifestsTotal,
int $manifestsDeleted,
@ -414,7 +473,9 @@ class MySQL { @@ -414,7 +473,9 @@ class MySQL {
`hostsTotal`,
`hostsUpdated`,
`hostPagesDeleted`,
`hostPageDescriptionsDeleted`,
`hostPagesDescriptionsDeleted`,
`hostPagesSnapUrlDeleted`,
`hostPagesToHostPageDeleted`,
`hostPagesBansRemoved`,
`manifestsTotal`,
`manifestsDeleted`,
@ -424,14 +485,16 @@ class MySQL { @@ -424,14 +485,16 @@ class MySQL {
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
$hostsTotal,
$hostsUpdated,
$hostPagesDeleted,
$hostPageDescriptionsDeleted,
$hostPagesDescriptionsDeleted,
$hostPagesSnapUrlDeleted,
$hostPagesToHostPageDeleted,
$hostPagesBansRemoved,
$manifestsTotal,
$manifestsDeleted,
@ -523,6 +586,7 @@ class MySQL { @@ -523,6 +586,7 @@ class MySQL {
int $hostPagesProcessed,
int $hostPagesIndexed,
int $hostPagesAdded,
int $hostPagesSnapUrlAdded,
int $hostPagesBanned,
int $manifestsProcessed,
int $manifestsAdded,
@ -537,6 +601,7 @@ class MySQL { @@ -537,6 +601,7 @@ class MySQL {
`hostPagesProcessed`,
`hostPagesIndexed`,
`hostPagesAdded`,
`hostPagesSnapUrlAdded`,
`hostPagesBanned`,
`manifestsProcessed`,
`manifestsAdded`,
@ -544,7 +609,7 @@ class MySQL { @@ -544,7 +609,7 @@ class MySQL {
`httpRequestsSizeTotal`,
`httpDownloadSizeTotal`,
`httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([
$timeAdded,
@ -552,6 +617,7 @@ class MySQL { @@ -552,6 +617,7 @@ class MySQL {
$hostPagesProcessed,
$hostPagesIndexed,
$hostPagesAdded,
$hostPagesSnapUrlAdded,
$hostPagesBanned,
$manifestsProcessed,
$manifestsAdded,

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 147 KiB

After

Width:  |  Height:  |  Size: 176 KiB

5
public/api.php

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
<?php
// Current version
define('API_VERSION', 0.8);
define('API_VERSION', 0.9);
// Load system dependencies
require_once('../config/app.php');
@ -109,7 +109,8 @@ if (API_ENABLED) { @@ -109,7 +109,8 @@ if (API_ENABLED) {
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlHostPageMime' => CRAWL_PAGE_MIME,
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,

19
public/explore.php

@ -234,13 +234,30 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the @@ -234,13 +234,30 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
<p><?php echo date('c', $hostPage->timeAdded) ?></p>
<p><?php echo _('Time updated') ?></p>
<p><?php echo date('c', $hostPage->timeUpdated) ?></p>
<?php if ($totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp)) { ?>
<?php // Snap counter is rendered unconditionally; the dated links follow only when at least one snap exists ?>
<?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?>
<p>
  <?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls),
                                                     sprintf(_('%s snaps'), $totalHostPageSnapUrls),
                                                     sprintf(_('%s snaps'), $totalHostPageSnapUrls),
                                                     ]) ?>
</p>
<?php if ($totalHostPageSnapUrls) { ?>
  <?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?>
    <p>
      <?php // A zero crc32host gets the WEBSITE_DOMAIN prefix; the cast keeps the check working when the driver returns the column as a string ?>
      <?php // The URL is escaped because it is printed inside an HTML attribute ?>
      <a href="<?php echo htmlspecialchars((int) $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?>">
        <?php echo date('c', $hostPageSnapUrl->timeAdded) ?>
      </a>
    </p>
  <?php } ?>
<?php } ?>
<?php $totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp); ?>
<p>
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
sprintf(_('%s referrers'), $totalHostPageIdSources),
]) ?>
</p>
<?php if ($totalHostPageIdSources) { ?>
<?php foreach ($db->getHostPageIdSourcesByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
<p>

Loading…
Cancel
Save