YGGo/crontab/cleaner.php

405 lines
13 KiB
PHP
Raw Normal View History

2023-04-08 21:06:28 +00:00
<?php
// Single-instance guard: the semaphore key is derived from the script name,
// allows one holder at a time (max_acquire = 1) and is requested in
// non-blocking mode, so a second run stops right away instead of queueing.
$semaphore = sem_get(crc32('crontab.cleaner'), 1);

if (!sem_acquire($semaphore, true)) {

    // exit() with a string argument prints it and terminates the script
    exit('Process locked by another thread.' . PHP_EOL);
}
// Load system dependencies
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/ftp.php');
// Open the database connection shared by the whole run
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Wall-clock start of the run, used for the total execution time report
$timeStart = microtime(true);

// HTTP statistics accumulated from every Curl request made below
$httpRequestsTotal = $httpRequestsSizeTotal = $httpDownloadSizeTotal = $httpRequestsTimeTotal = 0;

// Registry totals, read before the clean up starts
$hostsTotal     = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();

// Host / host page counters
$hostsUpdated = $hostPagesDeleted = 0;

$hostPagesDescriptionsDeleted = $hostPagesDomsDeleted = 0;

$hostPagesSnapDeleted = $hostPagesToHostPageDeleted = 0;

// Manifest, ban and log counters
$manifestsDeleted = $hostPagesBansRemoved = 0;

$logsCleanerDeleted = $logsCrawlerDeleted = 0;
// Decode the snap storage map once instead of once per snap: SNAP_STORAGE is
// a constant, so the result never changes during the run. A value that does
// not decode (null) is treated as "no storages configured".
$snapStorages = (array) json_decode(SNAP_STORAGE);

/*
 * Delete every snap of a host page: first the archive file on each
 * configured storage, then the snap's DB registry rows.
 *
 * The DB registry clean up runs exactly once per snap and does not depend on
 * how many storages are configured, so snap rows are removed even when the
 * storage map is empty.
 *
 * @param int|string $hostPageId host page whose snaps are removed
 *
 * @return int sum of the values returned by MySQL::deleteHostPageSnap()
 *             (presumably the number of deleted rows - confirm in library/mysql.php)
 */
$deleteHostPageSnaps = static function ($hostPageId) use ($db, $snapStorages) {

    $snapsDeleted = 0;

    // Directory part of the archive path: one path level per character of the id, e.g. 123 => 1/2/3/
    $snapFilePath = chunk_split((string) $hostPageId, 1, '/');

    foreach ($db->getHostPageSnaps($hostPageId) as $hostPageSnap) {

        $snapFileName = $snapFilePath . $hostPageSnap->timeAdded . '.zip';

        // Delete snap files
        foreach ($snapStorages as $name => $storages) {

            foreach ($storages as $storage) {

                switch ($name) {

                    case 'localhost':

                        // Best effort: a file that is already gone is not an error
                        @unlink($storage->directory . $snapFileName);

                        break;

                    case 'ftp':

                        $ftp = new Ftp();

                        if ($ftp->connect($storage->host, $storage->port, $storage->username, $storage->password, $storage->directory, $storage->timeout, $storage->passive)) {
                            $ftp->delete('hp/' . $snapFileName);
                        }

                        break;
                }
            }
        }

        // Clean up DB registry: downloads of every snap storage record, then
        // the storage records, then the snap itself
        foreach ($db->getHostPageSnapStorages($hostPageSnap->hostPageSnapId) as $hostPageSnapStorage) {
            $db->deleteHostPageSnapDownloads($hostPageSnapStorage->hostPageSnapStorageId);
        }

        $db->deleteHostPageSnapStorages($hostPageSnap->hostPageSnapId);

        $snapsDeleted += $db->deleteHostPageSnap($hostPageSnap->hostPageSnapId);
    }

    return $snapsDeleted;
};

/*
 * Delete everything attached to a host page (descriptions, DOMs, page to page
 * references, snaps) and add the results to the run counters.
 * The host page record itself is NOT deleted here.
 *
 * @param int|string $hostPageId host page to clean up
 */
$deleteHostPageData = static function ($hostPageId) use (
    $db,
    $deleteHostPageSnaps,
    &$hostPagesDescriptionsDeleted,
    &$hostPagesDomsDeleted,
    &$hostPagesToHostPageDeleted,
    &$hostPagesSnapDeleted
) {
    // Delete host page descriptions
    $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageId);

    // Delete host page DOMs
    $hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPageId);

    // Delete host page refs data
    $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageId);

    // Delete host page snaps
    $hostPagesSnapDeleted += $deleteHostPageSnaps($hostPageId);
};

// Begin update
// NOTE: only database changes are covered by the transaction; snap files
// already removed from a storage are not restored by rollBack().
$db->beginTransaction();

try {

    // Get cleaner queue
    foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {

        // Get robots.txt if exists
        $curl = new Curl($host->hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Accept the response only when it looks like a real robots.txt
        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
        } else {
            $hostRobots = null;
        }

        // Update host data
        $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());

        // Apply host pages limits
        $totalHostPages = $db->getTotalHostPages($host->hostId);

        if ($totalHostPages > $host->crawlPageLimit) {

            foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {

                // The index page is always kept
                if ($hostPage->uri != '/') {

                    $deleteHostPageData($hostPage->hostPageId);

                    // Delete host page
                    $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
                }
            }
        }

        // Apply new robots.txt rules
        $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

        foreach ($db->getHostPages($host->hostId) as $hostPage) {

            // The index page is always kept, even when the rules disallow it
            if ($hostPage->uri != '/' && !$robots->uriAllowed($hostPage->uri)) {

                $deleteHostPageData($hostPage->hostPageId);

                // Delete host page
                $hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
            }
        }
    }

    // Clean up deprecated manifests
    foreach ($db->getManifests() as $manifest) {

        $curl = new Curl($manifest->url, CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Skip processing non 200 code
        if (200 != $curl->getCode()) {
            continue; // Wait for reconnect
        }

        // json_decode() returns null for an empty or malformed body, no error suppression needed
        $remoteManifest = json_decode((string) $curl->getContent());

        // The manifest is deprecated when any of the following holds. The
        // checks short-circuit, so no property is read from a value that
        // failed an earlier check:
        //  - the body is empty or is not a JSON object
        //  - a required field is missing
        //  - the API version is not compatible
        //  - crawlUrlRegexp does not match the CRAWL_URL_REGEXP condition
        $delete = !is_object($remoteManifest)
               || empty($remoteManifest->status)
               || empty($remoteManifest->result->config->crawlUrlRegexp)
               || empty($remoteManifest->result->api->version)
               || $remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION
               || $remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP;

        if ($delete) {
            $manifestsDeleted += $db->deleteManifest($manifest->manifestId);
        }
    }

    // Reset banned pages
    $hostPagesBansRemoved += $db->resetBannedHostPages(time() - CLEAN_PAGE_BAN_SECONDS_OFFSET);

    // Clean up banned pages extra data (the page records themselves are kept)
    foreach ($db->getHostPagesBanned() as $hostPage) {
        $deleteHostPageData($hostPage->hostPageId);
    }

    // Delete page description history
    $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);

    // Delete page dom history
    $hostPagesDomsDeleted += $db->deleteHostPageDomsByTimeAdded(time() - CLEAN_PAGE_DOM_OFFSET);

    // Delete deprecated logs
    $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
    $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);

    // Delete failed snap files
    // @TODO

    // Commit results
    $db->commit();

} catch (Exception $e) {

    // Undo all database changes of this run and dump the failure for the cron output
    $db->rollBack();

    var_dump($e);
}
// Optional table optimization, enabled by the CLEAN_DB_TABLES_OPTIMIZATION setting.
// A failure here is dumped to the output and does not stop the report below.
if (CLEAN_DB_TABLES_OPTIMIZATION) {

    try {

        $db->optimize();

    } catch (Exception $optimizationError) {

        var_dump($optimizationError);
    }
}
// Final statistics
$executionTimeTotal = microtime(true) - $timeStart;

// NOTE(review): presumably converts microseconds to seconds - confirm against Curl::getTotalTime()
$httpRequestsTimeTotal /= 1000000;

// Store the run in the cleaner log when logging is enabled
if (CLEAN_LOG_ENABLED) {

    $db->addCleanerLog(
        time(),
        $hostsTotal,
        $hostsUpdated,
        $hostPagesDeleted,
        $hostPagesDescriptionsDeleted,
        $hostPagesDomsDeleted,
        $hostPagesSnapDeleted,
        $hostPagesToHostPageDeleted,
        $hostPagesBansRemoved,
        $manifestsTotal,
        $manifestsDeleted,
        $logsCleanerDeleted,
        $logsCrawlerDeleted,
        $httpRequestsTotal,
        $httpRequestsSizeTotal,
        $httpDownloadSizeTotal,
        $httpRequestsTimeTotal,
        $executionTimeTotal
    );
}

// Console report: one "label value" line per counter, in a fixed order
$report = [
    'Hosts total: '                    => $hostsTotal,
    'Hosts updated: '                  => $hostsUpdated,
    'Hosts pages deleted: '            => $hostPagesDeleted,
    'Manifests total: '                => $manifestsTotal,
    'Manifests deleted: '              => $manifestsDeleted,
    'Host page bans removed: '         => $hostPagesBansRemoved,
    'Host page descriptions deleted: ' => $hostPagesDescriptionsDeleted,
    'Host page doms deleted: '         => $hostPagesDomsDeleted,
    'Host page snaps deleted: '        => $hostPagesSnapDeleted,
    'Host page to host page deleted: ' => $hostPagesToHostPageDeleted,
    'Cleaner logs deleted: '           => $logsCleanerDeleted,
    'Crawler logs deleted: '           => $logsCrawlerDeleted,
    'HTTP Requests total: '            => $httpRequestsTotal,
    'HTTP Requests total size: '       => $httpRequestsSizeTotal,
    'HTTP Download total size: '       => $httpDownloadSizeTotal,
    'HTTP Requests total time: '       => $httpRequestsTimeTotal,
];

foreach ($report as $label => $value) {
    echo $label . $value . PHP_EOL;
}

// The last line is followed by an extra blank line
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;