add curl requests debug

This commit is contained in:
ghost 2023-05-08 08:27:21 +03:00
parent 1aba060d34
commit ea04220de3
3 changed files with 97 additions and 1 deletions

View File

@ -21,6 +21,11 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug: counters and timers reported in the summary printed by this script
$timeStart = microtime(true); // wall-clock start of the run, float seconds
$requestsTotal = 0; // number of Curl requests issued; incremented after each `new Curl(...)`
$requestSizeTotal = 0; // running sum of Curl::getSizeRequest()
$downloadSizeTotal = 0; // running sum of Curl::getSizeDownload()
$requestsTotalTime = 0; // running sum of Curl::getTotalTime(); divided by 1000000 when printed
$hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0;
@ -44,6 +49,12 @@ try {
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
@ -131,6 +142,12 @@ try {
$curl = new Curl($manifest->url);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
// Skip processing non 200 code
if (200 != $curl->getCode()) {
@ -195,8 +212,16 @@ echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
echo 'Requests total: ' . $requestsTotal . PHP_EOL;
echo 'Requests total size: ' . $requestSizeTotal . PHP_EOL;
echo 'Download total size: ' . $downloadSizeTotal . PHP_EOL;
echo 'Requests total time: ' . $requestsTotalTime / 1000000 . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;

View File

@ -27,6 +27,11 @@ if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
// Debug: counters and timers reported in the summary printed by this script
$timeStart = microtime(true); // wall-clock start of the run, float seconds
$requestsTotal = 0; // number of Curl requests issued; incremented after each `new Curl(...)`
$requestSizeTotal = 0; // running sum of Curl::getSizeRequest()
$downloadSizeTotal = 0; // running sum of Curl::getSizeDownload()
$requestsTotalTime = 0; // running sum of Curl::getTotalTime(); divided by 1000000 when printed
$hostPagesProcessed = 0;
$hostImagesProcessed = 0;
$manifestsProcessed = 0;
@ -51,6 +56,12 @@ try {
$curl = new Curl($queueManifest->url);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
// Update manifest index anyway, with the current time and http code
$manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
@ -108,6 +119,12 @@ try {
// Begin hosts collection
$curl = new Curl($remoteManifest->result->api->hosts);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
// Skip processing non 200 code
if (200 != $curl->getCode()) {
@ -166,6 +183,12 @@ try {
// Get robots.txt if exists
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
@ -230,6 +253,12 @@ try {
// Init image request
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
// Update image index anyway, with the current time and http code
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
@ -304,6 +333,12 @@ try {
// Init page request
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
// Update page index anyway, with the current time and http code
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
@ -468,6 +503,12 @@ try {
// Get robots.txt if exists
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
@ -624,6 +665,12 @@ try {
// Get robots.txt if exists
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
// Update curl stats
$requestsTotal++;
$requestSizeTotal += $curl->getSizeRequest();
$downloadSizeTotal += $curl->getSizeDownload();
$requestsTotalTime += $curl->getTotalTime();
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
@ -701,12 +748,21 @@ try {
// Debug summary: entity counters first, then cURL traffic statistics.
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Hosts pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Hosts images banned: ' . $hostImagesBanned . PHP_EOL;
echo 'Requests total: ' . $requestsTotal . PHP_EOL;
echo 'Requests total size: ' . $requestSizeTotal . PHP_EOL;
echo 'Download total size: ' . $downloadSizeTotal . PHP_EOL;
// $requestsTotalTime is accumulated from Curl::getTotalTime(); scale it down by 1000000 for display
echo 'Requests total time: ' . ($requestsTotalTime / 1000000) . PHP_EOL;
// The subtraction is parenthesized on purpose: before PHP 8 "." and "-" had the
// same precedence (left-associative), so the unparenthesized form concatenated
// the label first and then subtracted from the resulting string.
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL;

View File

@ -47,6 +47,21 @@ class Curl {
return curl_getinfo($this->_connection, CURLINFO_CONTENT_TYPE);
}
/**
 * Download size of the transfer held by this connection.
 *
 * @return mixed whatever curl_getinfo() reports for CURLINFO_SIZE_DOWNLOAD
 */
public function getSizeDownload() {
    $downloadSize = curl_getinfo($this->_connection, CURLINFO_SIZE_DOWNLOAD);

    return $downloadSize;
}
/**
 * Size of the request issued over this connection.
 *
 * @return mixed whatever curl_getinfo() reports for CURLINFO_REQUEST_SIZE
 */
public function getSizeRequest() {
    $requestSize = curl_getinfo($this->_connection, CURLINFO_REQUEST_SIZE);

    return $requestSize;
}
/**
 * Total transfer time of the request made over this connection.
 *
 * Reads CURLINFO_TOTAL_TIME_T (the "_T" variant of the total-time counter).
 *
 * @return mixed whatever curl_getinfo() reports for CURLINFO_TOTAL_TIME_T
 */
public function getTotalTime() {
    $totalTime = curl_getinfo($this->_connection, CURLINFO_TOTAL_TIME_T);

    return $totalTime;
}
public function getContent() {
return $this->_response;