2023-04-09 00:06:28 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
// Lock multi-thread execution
//
// The semaphore key is derived from a fixed name, so every run of this script
// competes for the same semaphore, which allows a single holder (max_acquire = 1).
$semaphore = sem_get(crc32('crontab.cleaner'), 1);

// sem_get() returns false on failure. Passing that value on to sem_acquire()
// raises a TypeError on PHP 8, so stop here with an explicit message instead.
if (false === $semaphore) {

  echo 'Could not obtain semaphore.' . PHP_EOL;
  exit(1);
}

// Non-blocking acquire (second argument = true): when the semaphore is already
// held the call returns false immediately and this run gives up.
if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another thread.' . PHP_EOL;
  exit;
}
|
|
|
|
|
|
|
|
// Load system dependencies
|
|
|
|
require_once('../config/app.php');
|
|
|
|
require_once('../library/curl.php');
|
|
|
|
require_once('../library/robots.php');
|
|
|
|
require_once('../library/mysql.php');
|
2023-06-04 12:27:20 +03:00
|
|
|
require_once('../library/ftp.php');
|
2023-04-09 00:06:28 +03:00
|
|
|
|
|
|
|
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Debug: reference point for the total execution time reported at the end
$timeStart = microtime(true);

// HTTP statistics, accumulated after every Curl request
$httpRequestsTotal     =
$httpRequestsSizeTotal =
$httpDownloadSizeTotal =
$httpRequestsTimeTotal = 0;

// Totals as reported by the database before any cleanup is performed
$hostsTotal     = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();

// Counters of affected records, incremented by the cleanup steps below
$hostsUpdated                 =
$hostPagesDeleted             =
$hostPagesDescriptionsDeleted =
$hostPagesSnapDeleted         =
$hostPagesToHostPageDeleted   =
$manifestsDeleted             =
$hostPagesBansRemoved         = 0;

// Counters of removed log records
$logsCleanerDeleted =
$logsCrawlerDeleted = 0;
|
2023-05-08 11:04:59 +03:00
|
|
|
|
2023-05-05 05:26:53 +03:00
|
|
|
// Begin update

/*
 * Deletes one host page together with everything that hangs off it and adds
 * the numbers returned by the database layer to the script-wide counters
 * (captured by reference).
 *
 * Order of operations: descriptions, page-to-page references, snaps
 * (local file, remote FTP copy, download records, snap record), then the
 * page record itself.
 *
 * $hostPage - object exposing at least the hostPageId property
 *
 * NOTE(review): snap files are removed while the DB transaction is still open;
 * a later rollBack() does not bring them back.
 * NOTE(review): a new Ftp object is created and connected for every remote
 * snap - consider reusing one connection if the Ftp class allows it.
 */
$deleteHostPage = function ($hostPage) use (
  $db,
  &$hostPagesDescriptionsDeleted,
  &$hostPagesToHostPageDeleted,
  &$hostPagesSnapDeleted,
  &$hostPagesDeleted
) {

  // Delete host page descriptions
  $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);

  // Delete host page refs data
  $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);

  // Delete host page snaps
  // The page id is split into one directory per character, e.g. 123 => 1/2/3/
  $snapFilePath = chunk_split($hostPage->hostPageId, 1, '/');

  foreach ($db->getHostPageSnaps($hostPage->hostPageId) as $hostPageSnap) {

    $snapFileName = $snapFilePath . $hostPageSnap->timeAdded . '.zip';

    // Local copy
    if ($hostPageSnap->storageLocal) {

      unlink('../storage/snap/hp/' . $snapFileName);
    }

    // Remote copy
    if ($hostPageSnap->storageMega) {

      $ftp = new Ftp();

      if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {

        $ftp->delete('hp/' . $snapFileName);
      }
    }

    // Download records first, then the snap record itself
    $db->deleteHostPageSnapDownloads($hostPageSnap->hostPageSnapId);

    $hostPagesSnapDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
  }

  // Delete host page
  $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
};

try {

  $db->beginTransaction();

  // Get cleaner queue
  foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {

    // Parse host info (the port part is appended only when a port is set)
    $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : '');

    // Get robots.txt if exists
    $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;

    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // The response is accepted as robots.txt only on HTTP 200 and when it
    // contains a "user-agent:" directive (case-insensitive); otherwise null is stored
    if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
      $hostRobots = $curl->getContent();
    } else {
      $hostRobots = null;
    }

    // Update host data
    $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());

    // Apply host pages limits
    $totalHostPages = $db->getTotalHostPages($host->hostId);

    if ($totalHostPages > $host->crawlPageLimit) {

      // Request as many pages as exceed the limit; the root page is always kept
      foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {

        if ($hostPage->uri != '/') {

          $deleteHostPage($hostPage);
        }
      }
    }

    // Apply new robots.txt rules
    // Host rules fall back to the configured defaults, host postfix rules to the configured postfix
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    foreach ($db->getHostPages($host->hostId) as $hostPage) {

      // Pages no longer allowed by the rules are removed; the root page is always kept
      if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {

        $deleteHostPage($hostPage);
      }
    }
  }

  // Clean up deprecated manifests
  foreach ($db->getManifests() as $manifest) {

    $curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;

    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

      continue; // Wait for reconnect
    }

    // Decode the response only when there is something to decode
    $remoteManifest = $curl->getContent();

    if ($remoteManifest) {

      $remoteManifest = json_decode($remoteManifest);
    }

    // The manifest is deleted when any of the following holds. The conditions
    // short-circuit, so properties are never read from a missing or invalid
    // document:
    //  - no data returned or JSON could not be decoded
    //  - required fields missed
    //  - API version not compatible
    //  - crawlUrlRegexp does not match the CRAWL_URL_REGEXP condition
    $delete = !$remoteManifest
           || empty($remoteManifest->status)
           || empty($remoteManifest->result->config->crawlUrlRegexp)
           || empty($remoteManifest->result->api->version)
           || $remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION
           || $remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP;

    if ($delete) {

      $manifestsDeleted += $db->deleteManifest($manifest->manifestId);
    }
  }

  // Reset banned pages
  $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);

  // Delete page description history
  $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);

  // Delete deprecated logs
  $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
  $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);

  // Commit results
  $db->commit();

} catch (Exception $e) {

  // Undo all database changes of this run and dump the exception for debugging
  $db->rollBack();

  var_dump($e);
}
|
|
|
|
|
|
|
|
// Optimize tables
//
// Optional step, executed only when CLEAN_DB_TABLES_OPTIMIZATION is truthy.
// It runs in a transaction of its own, separate from the cleanup above.
if (CLEAN_DB_TABLES_OPTIMIZATION) {

  try {

    $db->beginTransaction();
    $db->optimize();
    $db->commit();

  } catch (Exception $e) {

    // Roll back and dump the exception; the script continues with the report
    $db->rollBack();

    var_dump($e);
  }
}
|
|
|
|
|
|
|
|
// Debug

// Wall-clock duration of the whole run
$executionTimeTotal = microtime(true) - $timeStart;

// Scale the accumulated request time down by 10^6
// NOTE(review): presumably Curl::getTotalTime() reports microseconds - confirm
$httpRequestsTimeTotal /= 1000000;

// Persist the run statistics when cleaner logging is enabled
if (CLEAN_LOG_ENABLED) {

  $db->addCleanerLog(
    time(),
    $hostsTotal,
    $hostsUpdated,
    $hostPagesDeleted,
    $hostPagesDescriptionsDeleted,
    $hostPagesSnapDeleted,
    $hostPagesToHostPageDeleted,
    $hostPagesBansRemoved,
    $manifestsTotal,
    $manifestsDeleted,
    $logsCleanerDeleted,
    $logsCrawlerDeleted,
    $httpRequestsTotal,
    $httpRequestsSizeTotal,
    $httpDownloadSizeTotal,
    $httpRequestsTimeTotal,
    $executionTimeTotal
  );
}

// Console report: hosts
echo 'Hosts total: ', $hostsTotal, PHP_EOL;
echo 'Hosts updated: ', $hostsUpdated, PHP_EOL;
echo 'Hosts pages deleted: ', $hostPagesDeleted, PHP_EOL;

// Console report: manifests
echo 'Manifests total: ', $manifestsTotal, PHP_EOL;
echo 'Manifests deleted: ', $manifestsDeleted, PHP_EOL;

// Console report: host pages
echo 'Host page bans removed: ', $hostPagesBansRemoved, PHP_EOL;
echo 'Host page descriptions deleted: ', $hostPagesDescriptionsDeleted, PHP_EOL;
echo 'Host page snaps deleted: ', $hostPagesSnapDeleted, PHP_EOL;
echo 'Host page to host page deleted: ', $hostPagesToHostPageDeleted, PHP_EOL;

// Console report: logs
echo 'Cleaner logs deleted: ', $logsCleanerDeleted, PHP_EOL;
echo 'Crawler logs deleted: ', $logsCrawlerDeleted, PHP_EOL;

// Console report: HTTP
echo 'HTTP Requests total: ', $httpRequestsTotal, PHP_EOL;
echo 'HTTP Requests total size: ', $httpRequestsSizeTotal, PHP_EOL;
echo 'HTTP Download total size: ', $httpDownloadSizeTotal, PHP_EOL;
echo 'HTTP Requests total time: ', $httpRequestsTimeTotal, PHP_EOL;

// Console report: execution time, followed by an empty line
echo 'Total time: ', $executionTimeTotal, PHP_EOL, PHP_EOL;
|