diff --git a/README.md b/README.md index 481d0f6..59d34d5 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ GET m=SphinxQL * [x] Index explorer * [x] Safe images preview * [x] Extended search syntax support -* [ ] Page content snaps history +* [ ] Compressed page snaps history + [x] Local + [ ] Remote @@ -157,6 +157,10 @@ GET m=SphinxQL * [x] CSS only, JS-less interface * [x] Unique ident icons for sites without favicons * [x] Content genre tabs (#1) +* [x] Page index exploring feature + + [x] Meta + + [x] Snaps + + [x] Referrers * [ ] Results with found matches highlight * [ ] The time machine feature by content snaps history diff --git a/config/app.php.txt b/config/app.php.txt index 9419fb3..f9da00e 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -219,15 +219,20 @@ define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000); define('CRAWL_HOST_DEFAULT_STATUS', true); /* - * Index only meta tags to prevent disk overuse - * or false to save meta tags + overall plain text page content + * Index only meta tags + * or false to save meta tags + base64 encoded page content in the `hostPage`.`data` field * * Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field * - * This option able to change search results relevance + * Warning! 
+ * disabling this option requires huge disk storage, + * it is an experimental feature, oriented for index operations + * + * see CRAWL_PAGE_MIME_SNAP_LOCAL + * to create compressed data snaps * */ -define('CRAWL_HOST_DEFAULT_META_ONLY', false); +define('CRAWL_HOST_DEFAULT_META_ONLY', true); /* * Not suitable/safe for work status for new host by default diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 0b86180..85ba602 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -31,7 +31,7 @@ $manifestsTotal = $db->getTotalManifests(); $hostsUpdated = 0; $hostPagesDeleted = 0; $hostPagesDescriptionsDeleted = 0; -$hostPagesSnapUrlDeleted = 0; +$hostPagesSnapDeleted = 0; $hostPagesToHostPageDeleted = 0; $manifestsDeleted = 0; $hostPagesBansRemoved = 0; @@ -75,13 +75,23 @@ try { foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) { - // Delete host page - $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); - $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file - $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); - if ($hostPage->uri != '/') { - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + + // Delete host page descriptions + $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); + + // Delete host page refs data + $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); + + // Delete host page snaps + foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { + if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . 
'.zip')) { + $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); + } + } + + // Delete host page + $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); } } } @@ -91,16 +101,23 @@ try { foreach ($db->getHostPages($host->hostId) as $hostPage) { - if (!$robots->uriAllowed($hostPage->uri)) { + if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) { - // Delete host page + // Delete host page descriptions $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); - $hostPagesSnapUrlDeleted += $db->deleteHostPageSnapURL($hostPage->hostPageId); // @TODO delete file - $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); - if ($hostPage->uri != '/') { - $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); + // Delete host page refs data + $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); + + // Delete host page snaps + foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) { + if (true === unlink('../public/snap/hp/' . chunk_split($hostPageSnap->hostPageId, 1, '/') . $hostPageSnap->timeAdded . '.zip')) { + $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId); + } } + + // Delete host page + $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); } } } @@ -192,7 +209,7 @@ if (CLEAN_LOG_ENABLED) { $hostsUpdated, $hostPagesDeleted, $hostPagesDescriptionsDeleted, - $hostPagesSnapUrlDeleted, + $hostPagesSnapDeleted, $hostPagesToHostPageDeleted, $hostPagesBansRemoved, $manifestsTotal, @@ -216,7 +233,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL; -echo 'Host page snaps deleted: ' . $hostPagesSnapUrlDeleted . PHP_EOL; +echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . 
PHP_EOL; echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL; echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL; diff --git a/crontab/crawler.php b/crontab/crawler.php index 929b109..89ce823 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -39,7 +39,7 @@ $manifestsAdded = 0; $hostPagesAdded = 0; $hostsAdded = 0; $hostPagesBanned = 0; -$hostPagesSnapUrlAdded = 0; +$hostPagesSnapAdded = 0; // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); @@ -395,31 +395,31 @@ try { if (false !== stripos(Filter::mime($contentType), $mime)) { $crc32data = crc32($content); - $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host // Create not duplicated data snaps only for each storage host - if (!$db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) { + if (!$db->getHostPageSnap($queueHostPage->hostPageId, $crc32data)) { $time = time(); - $dir = chunk_split($queueHostPage->hostPageId, 1, '/'); + $directory = chunk_split($queueHostPage->hostPageId, 1, '/'); - @mkdir('../public/snap/hp/' . $dir, 755, true); + @mkdir('../public/snap/hp/' . $directory, 0755, true); // octal mode literal (rwxr-xr-x); decimal 755 would yield wrong permissions $zip = new ZipArchive(); // Create new container - if (true === $zip->open('../public/snap/hp/' . $dir . $time . '.zip', ZipArchive::CREATE)) { + if (true === $zip->open('../public/snap/hp/' . $directory . $time . '.zip', ZipArchive::CREATE)) { // Insert compressed snap data - if (true === $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', Filter::mime($contentType)), $content)) { + if (true === $zip->addFromString('DATA', $content) && + true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $time) . PHP_EOL . + sprintf('CRC32: %s', $crc32data . PHP_EOL . + sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . + sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/snap/hp/' . $directory . $time . '.zip')) . PHP_EOL . 
+ sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { // Update DB registry - $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId, - $crc32data, // do not create duplicated content snaps - $crc32host, // multi host storage with same timestamp / crc32data - '/snap/hp/' . $dir . $time . '.zip', // public url - $time); + $hostPagesSnapAdded += $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $time); $zip->close(); @@ -748,7 +748,7 @@ if (CRAWL_LOG_ENABLED) { $hostPagesProcessed, $hostPagesIndexed, $hostPagesAdded, - $hostPagesSnapUrlAdded, + $hostPagesSnapAdded, $hostPagesBanned, $manifestsProcessed, $manifestsAdded, @@ -765,7 +765,7 @@ echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; -echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL; +echo 'Pages snaps added: ' . $hostPagesSnapAdded . PHP_EOL; echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL; echo 'Manifests processed: ' . $manifestsProcessed . 
PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index ba5317c..e029b60 100644 Binary files a/database/yggo.mwb and b/database/yggo.mwb differ diff --git a/library/mysql.php b/library/mysql.php index 2fdc331..2aedffd 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -360,59 +360,49 @@ class MySQL { return $query->fetchAll(); } - public function addHostPageSnapURL(int $hostPageId, - int $crc32data, - int $crc32host, - string $url, - int $timeAdded) { - - $query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnapURL` (`hostPageId`, - `crc32data`, - `crc32host`, - `url`, - `timeAdded`) VALUES (?, ?, ?, ?, ?)'); - - $query->execute([$hostPageId, - $crc32data, - $crc32host, - $url, - $timeAdded]); + public function addHostPageSnap(int $hostPageId, string $crc32data, int $timeAdded) { + + $query = $this->_db->prepare('INSERT IGNORE INTO `hostPageSnap` (`hostPageId`, + `crc32data`, + `timeAdded`) VALUES (?, ?, ?)'); + + $query->execute([$hostPageId, $crc32data, $timeAdded]); return $query->rowCount(); } - public function deleteHostPageSnapURL(int $hostPageId) { + public function deleteHostPageSnap(int $hostPageSnapId) { - $query = $this->_db->prepare('DELETE FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); + $query = $this->_db->prepare('DELETE FROM `hostPageSnap` WHERE `hostPageSnapId` = ? 
LIMIT 1'); - $query->execute([$hostPageId]); + $query->execute([$hostPageSnapId]); return $query->rowCount(); } - public function getTotalHostPageSnapURLs(int $hostPageId) { + public function getTotalHostPageSnaps(int $hostPageId) { - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnapURL` WHERE `hostPageId` = ?'); + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPageSnap` WHERE `hostPageId` = ?'); $query->execute([$hostPageId]); return $query->fetch()->total; } - public function getHostPageSnapURLs(int $hostPageId) { + public function getHostPageSnaps(int $hostPageId) { - $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC'); + $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC'); $query->execute([$hostPageId]); return $query->fetchAll(); } - public function getHostPageSnapURL(int $hostPageId, int $crc32data, int $crc32host) { + public function getHostPageSnap(int $hostPageId, int $crc32data) { // finds one snap of the page by content checksum - $query = $this->_db->prepare('SELECT * FROM `hostPageSnapURL` WHERE `hostPageId` = ? AND `hostPageId` = ? AND `crc32host` = ? LIMIT 1'); + $query = $this->_db->prepare('SELECT * FROM `hostPageSnap` WHERE `hostPageId` = ? AND `crc32data` = ? 
LIMIT 1'); - $query->execute([$hostPageId, $crc32data, $crc32host]); + $query->execute([$hostPageId, $crc32data]); return $query->fetch(); } @@ -456,7 +446,7 @@ class MySQL { int $hostsUpdated, int $hostPagesDeleted, int $hostPagesDescriptionsDeleted, - int $hostPagesSnapUrlDeleted, + int $hostPagesSnapDeleted, int $hostPagesToHostPageDeleted, int $hostPagesBansRemoved, int $manifestsTotal, @@ -474,7 +464,7 @@ class MySQL { `hostsUpdated`, `hostPagesDeleted`, `hostPagesDescriptionsDeleted`, - `hostPagesSnapUrlDeleted`, + `hostPagesSnapDeleted`, `hostPagesToHostPageDeleted`, `hostPagesBansRemoved`, `manifestsTotal`, @@ -493,7 +483,7 @@ class MySQL { $hostsUpdated, $hostPagesDeleted, $hostPagesDescriptionsDeleted, - $hostPagesSnapUrlDeleted, + $hostPagesSnapDeleted, $hostPagesToHostPageDeleted, $hostPagesBansRemoved, $manifestsTotal, @@ -586,7 +576,7 @@ class MySQL { int $hostPagesProcessed, int $hostPagesIndexed, int $hostPagesAdded, - int $hostPagesSnapUrlAdded, + int $hostPagesSnapAdded, int $hostPagesBanned, int $manifestsProcessed, int $manifestsAdded, @@ -601,7 +591,7 @@ class MySQL { `hostPagesProcessed`, `hostPagesIndexed`, `hostPagesAdded`, - `hostPagesSnapUrlAdded`, + `hostPagesSnapAdded`, `hostPagesBanned`, `manifestsProcessed`, `manifestsAdded`, @@ -617,7 +607,7 @@ class MySQL { $hostPagesProcessed, $hostPagesIndexed, $hostPagesAdded, - $hostPagesSnapUrlAdded, + $hostPagesSnapAdded, $hostPagesBanned, $manifestsProcessed, $manifestsAdded, diff --git a/media/db-prototype.png b/media/db-prototype.png index 55b62a4..0ea432b 100644 Binary files a/media/db-prototype.png and b/media/db-prototype.png differ diff --git a/public/explore.php b/public/explore.php index d65f308..6560b43 100644 --- a/public/explore.php +++ b/public/explore.php @@ -234,18 +234,18 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
timeAdded) ?>
timeUpdated) ?>
- getTotalHostPageSnapURLs($hp); ?> + getTotalHostPageSnaps($hp); ?>- +
- - getHostPageSnapURLs($hp) as $hostPageSnapUrl) { ?> + + getHostPageSnaps($hp) as $hostPageSnap) { ?>