From 4cb27f563f10f41710665c4bc1e3d8f1bfb693cd Mon Sep 17 00:00:00 2001 From: ghost Date: Wed, 12 Jul 2023 12:27:30 +0300 Subject: [PATCH] fix meta index/nofollow processing --- crontab/crawler.php | 242 +++++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 118 deletions(-) diff --git a/crontab/crawler.php b/crontab/crawler.php index 890311e..cda0dce 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -508,7 +508,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND continue; } - // Skip index page links without titles + // Skip index page links without title tag $title = @$dom->getElementsByTagName('title'); if ($title->length == 0) { @@ -539,19 +539,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $metaRobots = @$meta->getAttribute('content'); - // Ban page with meta robots:noindex value + // Ban page with meta robots:noindex attribute if (false !== stripos($metaRobots, 'noindex')) { $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); continue; } - - // Skip page with meta robots:nofollow attribute - if (false !== stripos($metaRobots, 'nofollow')) { - - continue; - } } // Grab meta yggo:manifest link when available @@ -594,6 +588,128 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND } } + // Begin snaps + $snapLocal = false; + $snapMega = false; + + // Snap local enabled and MIME in white list + if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { + + foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { + + // MIME type allowed in settings + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { + + $snapLocal = true; + break; + } + } + } + + // Snap MEGA enabled and MIME in white list + if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) { + + foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { + + // MIME type allowed in settings + if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { + + $snapMega = true; + break; + } + } + } + + // At least one snap storage match settings condition + if ($snapLocal || $snapMega) { + + $crc32data = crc32($content); + + // Create not duplicated data snaps only, even new time + if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { + + $snapTime = time(); + $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); + + $snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; + @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true); + + // Create new ZIP container + $zip = new ZipArchive(); + + if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { + + // Insert compressed snap data into the tmp storage + if (true === $zip->addFromString('DATA', $content) && + true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . + sprintf('CRC32: %s', $crc32data . PHP_EOL . + sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . + sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . + sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { + + // Done + $zip->close(); + + // Temporarily snap file exists + if (file_exists($snapTmp)) { + + // Register snap in DB + if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { + + $hostPagesSnapAdded++; + + // Copy tmp snap to the permanent local storage + if ($snapLocal) { + + @mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true); + + if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) { + + // Update snap location info + $db->updateHostPageSnapStorageLocal($hostPageSnapId, true); + } + } + + // Copy tmp snap to the permanent MEGA storage + if ($snapMega) { + + $ftp = new Ftp(); + + if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { + + $ftp->mkdir('hp/' . $snapPath, true); + + if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { + + // Update snap location info + $db->updateHostPageSnapStorageMega($hostPageSnapId, true); + } + + $ftp->close(); + } + } + } + } + } + } + + // Remove tmp + @unlink($snapTmp); + } + } + + // Skip page links following with meta robots:nofollow attribute + foreach (@$dom->getElementsByTagName('meta') as $meta) { + + if (@$meta->getAttribute('name') == 'robots') { + + if (false !== stripos($metaRobots, 'nofollow')) { + + continue; + } + } + } + // Update manifest registry if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { @@ -910,116 +1026,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND } } - // Begin snaps - $snapLocal = false; - $snapMega = false; - - // Snap local enabled and MIME in white list - if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { - - foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { - - // MIME type allowed in settings - if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { - - $snapLocal = true; - break; - } - } - } - - // Snap MEGA enabled and MIME in white list - if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) { - - foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { - - // MIME type allowed in settings - if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { - - $snapMega = true; - break; - } - } - } - - // At least one snap storage match settings condition - if ($snapLocal || $snapMega) { - - $crc32data = crc32($content); - - // Create not duplicated data snaps only, even new time - if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { - - $snapTime = time(); - $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); - - $snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; - @mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true); - - // Create new ZIP container - $zip = new ZipArchive(); - - if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { - - // Insert compressed snap data into the tmp storage - if (true === $zip->addFromString('DATA', $content) && - true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . - sprintf('CRC32: %s', $crc32data . PHP_EOL . - sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . - sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . - sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { - - // Done - $zip->close(); - - // Temporarily snap file exists - if (file_exists($snapTmp)) { - - // Register snap in DB - if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { - - $hostPagesSnapAdded++; - - // Copy tmp snap to the permanent local storage - if ($snapLocal) { - - @mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true); - - if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) { - - // Update snap location info - $db->updateHostPageSnapStorageLocal($hostPageSnapId, true); - } - } - - // Copy tmp snap to the permanent MEGA storage - if ($snapMega) { - - $ftp = new Ftp(); - - if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { - - $ftp->mkdir('hp/' . $snapPath, true); - - if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { - - // Update snap location info - $db->updateHostPageSnapStorageMega($hostPageSnapId, true); - } - - $ftp->close(); - } - } - } - } - } - } - - // Remove tmp - @unlink($snapTmp); - } - } - // Apply changes $db->commit();