mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
implement local snaps
This commit is contained in:
parent
d98b8f5c94
commit
2f7d99079d
11
README.md
11
README.md
@ -27,6 +27,7 @@ php-pdo
|
||||
php-curl
|
||||
php-gd
|
||||
php-mbstring
|
||||
php-zip
|
||||
php-mysql
|
||||
sphinxsearch
|
||||
```
|
||||
@ -37,7 +38,7 @@ sphinxsearch
|
||||
* Deploy the database using [MySQL Workbench](https://www.mysql.com/products/workbench) project presented in the `/database` folder
|
||||
* Install [Sphinx Search Server](https://sphinxsearch.com)
|
||||
* Configuration examples are placed at `/config` folder
|
||||
* Make sure `/storage` folder is writable
|
||||
* Make sure the `/storage` and `/public/storage` folders are writable
|
||||
* Set up the `/crontab` scripts by following [example](https://github.com/YGGverse/YGGo/blob/main/config/crontab.txt)
|
||||
|
||||
#### JSON API
|
||||
@ -147,7 +148,9 @@ GET m=SphinxQL
|
||||
* [x] Index explorer
|
||||
* [x] Safe images preview
|
||||
* [x] Extended search syntax support
|
||||
* [ ] Page history snaps
|
||||
* [ ] Page content snaps history
|
||||
+ [x] Local
|
||||
+ [ ] Remote
|
||||
|
||||
##### UI
|
||||
|
||||
@ -180,7 +183,9 @@ GET m=SphinxQL
|
||||
* [x] MIME Content-type settings
|
||||
* [x] Ban non-condition links to prevent extra requests
|
||||
* [x] Debug log
|
||||
* [x] History snaps
|
||||
* [ ] Page content snaps generation
|
||||
+ [x] Local
|
||||
+ [ ] Remote
|
||||
* [ ] Indexing new sites homepage in higher priority
|
||||
* [ ] Redirect codes extended processing
|
||||
* [ ] Palette image index / filter
|
||||
|
@ -168,7 +168,15 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
|
||||
* comma separated
|
||||
*
|
||||
*/
|
||||
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,/video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
|
||||
// NOTE: every entry is a bare "type/subtype" pair; the former "/video/webm" entry
// carried a stray leading slash and therefore could not match a "video/webm" content type.
define('CRAWL_PAGE_MIME_INDEX', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
|
||||
|
||||
/*
|
||||
* Snap pages locally match MIME types
|
||||
*
|
||||
* comma separated | false to disable
|
||||
*
|
||||
*/
|
||||
define('CRAWL_PAGE_MIME_SNAP_LOCAL', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
|
||||
|
||||
/*
|
||||
* Renew manifests index by timing offset provided
|
||||
@ -266,7 +274,7 @@ define('CRAWL_MANIFEST', true);
|
||||
* Manifest API version compatibility
|
||||
*
|
||||
*/
|
||||
define('CRAWL_MANIFEST_API_VERSION', 0.8);
|
||||
define('CRAWL_MANIFEST_API_VERSION', 0.9);
|
||||
|
||||
/*
|
||||
* Set default auto-crawl status for new manifest added
|
||||
|
@ -30,7 +30,9 @@ $hostsTotal = $db->getTotalHosts();
|
||||
$manifestsTotal = $db->getTotalManifests();
|
||||
$hostsUpdated = 0;
|
||||
$hostPagesDeleted = 0;
|
||||
$hostPageDescriptionsDeleted = 0;
|
||||
$hostPagesDescriptionsDeleted = 0;
|
||||
$hostPagesSnapUrlDeleted = 0;
|
||||
$hostPagesToHostPageDeleted = 0;
|
||||
$manifestsDeleted = 0;
|
||||
$hostPagesBansRemoved = 0;
|
||||
|
||||
@ -74,8 +76,9 @@ try {
|
||||
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
|
||||
|
||||
// Delete host page
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
@ -91,8 +94,9 @@ try {
|
||||
if (!$robots->uriAllowed($hostPage->uri)) {
|
||||
|
||||
// Delete host page
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file
|
||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
@ -162,7 +166,7 @@ try {
|
||||
$hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);
|
||||
|
||||
// Delete page description history
|
||||
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
||||
|
||||
// Delete deprecated logs
|
||||
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
||||
@ -187,7 +191,9 @@ if (CLEAN_LOG_ENABLED) {
|
||||
$hostsTotal,
|
||||
$hostsUpdated,
|
||||
$hostPagesDeleted,
|
||||
$hostPageDescriptionsDeleted,
|
||||
$hostPagesDescriptionsDeleted,
|
||||
$hostPagesSnapUrlDeleted,
|
||||
$hostPagesToHostPageDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
$manifestsDeleted,
|
||||
@ -209,7 +215,9 @@ echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||
|
||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
|
||||
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
|
||||
echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL;
|
||||
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
|
||||
|
||||
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
||||
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
||||
|
@ -39,6 +39,7 @@ $manifestsAdded = 0;
|
||||
$hostPagesAdded = 0;
|
||||
$hostsAdded = 0;
|
||||
$hostPagesBanned = 0;
|
||||
$hostPagesSnapUrlAdded = 0;
|
||||
|
||||
// Connect database
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
@ -251,11 +252,11 @@ try {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse MIME
|
||||
// Parse index MIME
|
||||
$hostPageIsDom = false;
|
||||
$hostPageInMime = false;
|
||||
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
|
||||
|
||||
$mime = Filter::mime($mime);
|
||||
|
||||
@ -383,6 +384,48 @@ try {
|
||||
}
|
||||
}
|
||||
|
||||
// Save local snap
//
// For the first MIME type listed in CRAWL_PAGE_MIME_SNAP_LOCAL that is found in the
// response content type, pack the page content into a zip archive under
// ../public/storage/snap/hp/{hostPageId}/{time}.zip and register its public URL.
// A snap is skipped when the database already reports one for the same
// hostPageId / crc32data / crc32host combination.
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {

    foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {

        $mime = Filter::mime($mime);

        // MIME type not allowed in settings, try the next one
        if (false === stripos(Filter::mime($contentType), $mime)) {

            continue;
        }

        $crc32data = crc32($content);
        $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host

        // Create not duplicated data snaps only for each storage host
        if ($db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {

            continue;
        }

        $time = time();

        $snapDirectory = '../public/storage/snap/hp/' . $queueHostPage->hostPageId;

        // Mode must be an octal literal: decimal 755 equals octal 1363 and produced wrong permissions.
        // Warnings stay suppressed because the directory usually exists after the first snap.
        @mkdir($snapDirectory, 0755, true);

        $zip = new ZipArchive();

        if (true !== $zip->open($snapDirectory . '/' . $time . '.zip', ZipArchive::CREATE)) {

            continue;
        }

        $snapAdded = $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content);

        // Always release the archive handle, also when adding the entry failed.
        // The archive is written to disk on close(), so its result decides whether the snap exists.
        $snapSaved = $zip->close();

        // Register the snap only after the archive has been successfully written
        if (true === $snapAdded && true === $snapSaved) {

            $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
                                                              $crc32data, // do not create duplicated content snaps
                                                              $crc32host, // multi host storage with same timestamp / crc32data
                                                              '/storage/snap/hp/' . $queueHostPage->hostPageId . '/' . $time . '.zip', // public url
                                                              $time);

            break;
        }
    }
}
|
||||
|
||||
// Begin page links collection
|
||||
$links = [];
|
||||
|
||||
@ -700,6 +743,7 @@ if (CRAWL_LOG_ENABLED) {
|
||||
$hostPagesProcessed,
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesSnapUrlAdded,
|
||||
$hostPagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
@ -716,6 +760,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
|
||||
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
||||
|
||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
||||
|
Binary file not shown.
@ -360,6 +360,63 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
/**
 * Insert a row into `hostPageSnapURL` describing one stored page snap.
 *
 * The statement is INSERT IGNORE, so a row rejected by the table constraints
 * does not raise an error (presumably reported as 0 affected rows - verify
 * against the table's unique keys).
 *
 * @param int    $hostPageId Host page the snap belongs to
 * @param int    $crc32data  Value stored in `crc32data`
 * @param int    $crc32host  Value stored in `crc32host`
 * @param string $url        Value stored in `url`
 * @param int    $timeAdded  Value stored in `timeAdded`
 *
 * @return int rowCount() of the executed statement
 */
public function addHostPageSnapURL(int $hostPageId,
                                   int $crc32data,
                                   int $crc32host,
                                   string $url,
                                   int $timeAdded) {

    $statement = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`,
`crc32data`,
`crc32host`,
`url`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');

    // Positional values, same order as the column list above
    $values = [
        $hostPageId,
        $crc32data,
        $crc32host,
        $url,
        $timeAdded
    ];

    $statement->execute($values);

    return $statement->rowCount();
}
|
||||
|
||||
/**
 * Delete all `hostPageSnapURL` rows of the given host page.
 *
 * Only database rows are removed here; nothing in this method touches files.
 *
 * @param int $hostPageId Host page whose snap records get deleted
 *
 * @return int rowCount() of the executed DELETE statement
 */
public function deleteHostPageSnapURL(int $hostPageId) {

    $statement = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

    $values = [$hostPageId];

    $statement->execute($values);

    return $statement->rowCount();
}
|
||||
|
||||
/**
 * Count the `hostPageSnapURL` rows of the given host page.
 *
 * @param int $hostPageId Host page to count snap records for
 *
 * @return mixed The `total` property of the fetched COUNT(*) row
 */
public function getTotalHostPageSnapURLs(int $hostPageId) {

    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?');

    $values = [$hostPageId];

    $statement->execute($values);

    // The row is read as an object, so the connection is expected to fetch objects
    $row = $statement->fetch();

    return $row->total;
}
|
||||
|
||||
/**
 * Get every `hostPageSnapURL` row of the given host page, newest first.
 *
 * @param int $hostPageId Host page to list snap records for
 *
 * @return mixed Result of fetchAll(), ordered by `timeAdded` descending
 */
public function getHostPageSnapURLs(int $hostPageId) {

    $statement = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC');

    $values = [$hostPageId];

    $statement->execute($values);

    return $statement->fetchAll();
}
|
||||
|
||||
/**
 * Find one `hostPageSnapURL` row by host page, content checksum and storage host checksum.
 *
 * The crawler calls this before writing a snap and treats a falsy result
 * as "no such snap stored yet".
 *
 * @param int $hostPageId Host page the snap belongs to
 * @param int $crc32data  Checksum compared with the `crc32data` column
 * @param int $crc32host  Checksum compared with the `crc32host` column
 *
 * @return mixed Result of fetch() for the first matching row
 */
public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) {

    // The second condition must test `crc32data`: it previously repeated `hostPageId`,
    // so the $crc32data value was compared against the wrong column.
    $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `crc32data` = ? AND `crc32host` = ? LIMIT 1');

    $query->execute([$hostPageId, $crc32data, $crc32host]);

    return $query->fetch();
}
|
||||
|
||||
// Cleaner tools
|
||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||
|
||||
@ -398,7 +455,9 @@ class MySQL {
|
||||
int $hostsTotal,
|
||||
int $hostsUpdated,
|
||||
int $hostPagesDeleted,
|
||||
int $hostPageDescriptionsDeleted,
|
||||
int $hostPagesDescriptionsDeleted,
|
||||
int $hostPagesSnapUrlDeleted,
|
||||
int $hostPagesToHostPageDeleted,
|
||||
int $hostPagesBansRemoved,
|
||||
int $manifestsTotal,
|
||||
int $manifestsDeleted,
|
||||
@ -414,7 +473,9 @@ class MySQL {
|
||||
`hostsTotal`,
|
||||
`hostsUpdated`,
|
||||
`hostPagesDeleted`,
|
||||
`hostPageDescriptionsDeleted`,
|
||||
`hostPagesDescriptionsDeleted`,
|
||||
`hostPagesSnapUrlDeleted`,
|
||||
`hostPagesToHostPageDeleted`,
|
||||
`hostPagesBansRemoved`,
|
||||
`manifestsTotal`,
|
||||
`manifestsDeleted`,
|
||||
@ -424,14 +485,16 @@ class MySQL {
|
||||
`httpRequestsSizeTotal`,
|
||||
`httpDownloadSizeTotal`,
|
||||
`httpRequestsTimeTotal`,
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([
|
||||
$timeAdded,
|
||||
$hostsTotal,
|
||||
$hostsUpdated,
|
||||
$hostPagesDeleted,
|
||||
$hostPageDescriptionsDeleted,
|
||||
$hostPagesDescriptionsDeleted,
|
||||
$hostPagesSnapUrlDeleted,
|
||||
$hostPagesToHostPageDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
$manifestsDeleted,
|
||||
@ -523,6 +586,7 @@ class MySQL {
|
||||
int $hostPagesProcessed,
|
||||
int $hostPagesIndexed,
|
||||
int $hostPagesAdded,
|
||||
int $hostPagesSnapUrlAdded,
|
||||
int $hostPagesBanned,
|
||||
int $manifestsProcessed,
|
||||
int $manifestsAdded,
|
||||
@ -537,6 +601,7 @@ class MySQL {
|
||||
`hostPagesProcessed`,
|
||||
`hostPagesIndexed`,
|
||||
`hostPagesAdded`,
|
||||
`hostPagesSnapUrlAdded`,
|
||||
`hostPagesBanned`,
|
||||
`manifestsProcessed`,
|
||||
`manifestsAdded`,
|
||||
@ -544,7 +609,7 @@ class MySQL {
|
||||
`httpRequestsSizeTotal`,
|
||||
`httpDownloadSizeTotal`,
|
||||
`httpRequestsTimeTotal`,
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([
|
||||
$timeAdded,
|
||||
@ -552,6 +617,7 @@ class MySQL {
|
||||
$hostPagesProcessed,
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesSnapUrlAdded,
|
||||
$hostPagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 147 KiB After Width: | Height: | Size: 176 KiB |
@ -1,7 +1,7 @@
|
||||
<?php
|
||||
|
||||
// Current version
|
||||
define('API_VERSION', 0.8);
|
||||
define('API_VERSION', 0.9);
|
||||
|
||||
// Load system dependencies
|
||||
require_once('../config/app.php');
|
||||
@ -109,7 +109,8 @@ if (API_ENABLED) {
|
||||
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
|
||||
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
|
||||
'crawlHostPageMime' => CRAWL_PAGE_MIME,
|
||||
'crawlHostPageMimeIndex' => CRAWL_PAGE_MIME_INDEX,
|
||||
'crawlHostPageMimeSnapLocal' => CRAWL_PAGE_MIME_SNAP_LOCAL,
|
||||
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
|
||||
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
|
||||
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
|
||||
|
@ -234,13 +234,30 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<p><?php echo date('c', $hostPage->timeAdded) ?></p>
|
||||
<p><?php echo _('Time updated') ?></p>
|
||||
<p><?php echo date('c', $hostPage->timeUpdated) ?></p>
|
||||
<?php if ($totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp)) { ?>
|
||||
<p>
|
||||
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
]) ?>
|
||||
</p>
|
||||
<?php $totalHostPageSnapUrls = $db->getTotalHostPageSnapURLs($hp); ?>
|
||||
<p>
|
||||
<?php echo Filter::plural($totalHostPageSnapUrls, [sprintf(_('%s snap'), $totalHostPageSnapUrls),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
|
||||
sprintf(_('%s snaps'), $totalHostPageSnapUrls),
|
||||
]) ?>
|
||||
</p>
|
||||
<?php if ($totalHostPageSnapUrls) { ?>
|
||||
<?php foreach ($db->getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?>
|
||||
<p>
|
||||
<a href="<?php echo $hostPageSnapUrl->crc32host === 0 ? WEBSITE_DOMAIN . $hostPageSnapUrl->url : $hostPageSnapUrl->url ?>">
|
||||
<?php echo date('c', $hostPageSnapUrl->timeAdded) ?>
|
||||
</a>
|
||||
</p>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<?php $totalHostPageIdSources = $db->getTotalHostPageIdSourcesByHostPageIdTarget($hp); ?>
|
||||
<p>
|
||||
<?php echo Filter::plural($totalHostPageIdSources, [sprintf(_('%s referrer'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
sprintf(_('%s referrers'), $totalHostPageIdSources),
|
||||
]) ?>
|
||||
</p>
|
||||
<?php if ($totalHostPageIdSources) { ?>
|
||||
<?php foreach ($db->getHostPageIdSourcesByHostPageIdTarget($hp) as $hostPageIdSource) { ?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPageIdSource->hostPageIdSource)) { ?>
|
||||
<p>
|
||||
|
Loading…
x
Reference in New Issue
Block a user