|
|
@ -449,21 +449,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Parse MIME |
|
|
|
// Check for MIME |
|
|
|
$hostPageIsHtml = false; |
|
|
|
|
|
|
|
$hostPageInMime = false; |
|
|
|
$hostPageInMime = false; |
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { |
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
// Ban page on MIME type not allowed in settings |
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
|
|
// Check for HTML page |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPageIsHtml = true; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPageInMime = true; |
|
|
|
$hostPageInMime = true; |
|
|
|
break; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
@ -498,8 +490,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
$robots = null; |
|
|
|
$robots = null; |
|
|
|
$yggoManifest = null; |
|
|
|
$yggoManifest = null; |
|
|
|
|
|
|
|
|
|
|
|
// Is DOM content |
|
|
|
// Is HTML document |
|
|
|
if ($hostPageIsHtml) { |
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) { |
|
|
|
|
|
|
|
|
|
|
|
// Parse content |
|
|
|
// Parse content |
|
|
|
$dom = new DomDocument(); |
|
|
|
$dom = new DomDocument(); |
|
|
@ -557,143 +549,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
$yggoManifest = Filter::url(@$meta->getAttribute('content')); |
|
|
|
$yggoManifest = Filter::url(@$meta->getAttribute('content')); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add queued page description if not exists |
|
|
|
|
|
|
|
if ($title || $description || $keywords) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$db->addHostPageDescription($queueHostPage->hostPageId, |
|
|
|
|
|
|
|
$title ? Filter::pageTitle($title) : null, |
|
|
|
|
|
|
|
$description ? Filter::pageDescription($description) : null, |
|
|
|
|
|
|
|
$keywords ? Filter::pageKeywords($keywords) : null, |
|
|
|
|
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update manifest registry |
|
|
|
|
|
|
|
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$yggoManifestCRC32 = crc32($yggoManifest); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!$db->getManifest($yggoManifestCRC32)) { |
|
|
|
|
|
|
|
$db->addManifest($yggoManifestCRC32, |
|
|
|
|
|
|
|
$yggoManifest, |
|
|
|
|
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS, |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$manifestsAdded++; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Begin snaps |
|
|
|
|
|
|
|
$snapLocal = false; |
|
|
|
|
|
|
|
$snapMega = false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Snap local enabled and MIME in white list |
|
|
|
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapLocal = true; |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Snap MEGA enabled and MIME in white list |
|
|
|
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapMega = true; |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// At least one snap storage match settings condition |
|
|
|
|
|
|
|
if ($snapLocal || $snapMega) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$crc32data = crc32($content); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Create not duplicated data snaps only, even new time |
|
|
|
|
|
|
|
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapTime = time(); |
|
|
|
|
|
|
|
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; |
|
|
|
|
|
|
|
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Create new ZIP container |
|
|
|
|
|
|
|
$zip = new ZipArchive(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Insert compressed snap data into the tmp storage |
|
|
|
|
|
|
|
if (true === $zip->addFromString('DATA', $content) && |
|
|
|
|
|
|
|
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('CRC32: %s', $crc32data . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Done |
|
|
|
|
|
|
|
$zip->close(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Temporarily snap file exists |
|
|
|
|
|
|
|
if (file_exists($snapTmp)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Register snap in DB |
|
|
|
|
|
|
|
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesSnapAdded++; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent local storage |
|
|
|
|
|
|
|
if ($snapLocal) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@mkdir('../storage/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (copy($snapTmp, '../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update snap location info |
|
|
|
|
|
|
|
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent MEGA storage |
|
|
|
|
|
|
|
if ($snapMega) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp = new Ftp(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp->mkdir('hp/' . $snapPath, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update snap location info |
|
|
|
|
|
|
|
$db->updateHostPageSnapStorageMega($hostPageSnapId, true); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp->close(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Remove tmp |
|
|
|
|
|
|
|
@unlink($snapTmp); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Begin page links collection |
|
|
|
// Begin page links collection |
|
|
|
$links = []; |
|
|
|
$links = []; |
|
|
@ -994,6 +849,143 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Add queued page description if not exists |
|
|
|
|
|
|
|
if ($title || $description || $keywords) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$db->addHostPageDescription($queueHostPage->hostPageId, |
|
|
|
|
|
|
|
$title ? Filter::pageTitle($title) : null, |
|
|
|
|
|
|
|
$description ? Filter::pageDescription($description) : null, |
|
|
|
|
|
|
|
$keywords ? Filter::pageKeywords($keywords) : null, |
|
|
|
|
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null, |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update manifest registry |
|
|
|
|
|
|
|
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$yggoManifestCRC32 = crc32($yggoManifest); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!$db->getManifest($yggoManifestCRC32)) { |
|
|
|
|
|
|
|
$db->addManifest($yggoManifestCRC32, |
|
|
|
|
|
|
|
$yggoManifest, |
|
|
|
|
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS, |
|
|
|
|
|
|
|
time()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$manifestsAdded++; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Begin snaps |
|
|
|
|
|
|
|
$snapLocal = false; |
|
|
|
|
|
|
|
$snapMega = false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Snap local enabled and MIME in white list |
|
|
|
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapLocal = true; |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Snap MEGA enabled and MIME in white list |
|
|
|
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// MIME type allowed in settings |
|
|
|
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapMega = true; |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// At least one snap storage match settings condition |
|
|
|
|
|
|
|
if ($snapLocal || $snapMega) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$crc32data = crc32($content); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Create not duplicated data snaps only, even new time |
|
|
|
|
|
|
|
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapTime = time(); |
|
|
|
|
|
|
|
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; |
|
|
|
|
|
|
|
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Create new ZIP container |
|
|
|
|
|
|
|
$zip = new ZipArchive(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Insert compressed snap data into the tmp storage |
|
|
|
|
|
|
|
if (true === $zip->addFromString('DATA', $content) && |
|
|
|
|
|
|
|
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('CRC32: %s', $crc32data . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . |
|
|
|
|
|
|
|
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Done |
|
|
|
|
|
|
|
$zip->close(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Temporarily snap file exists |
|
|
|
|
|
|
|
if (file_exists($snapTmp)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Register snap in DB |
|
|
|
|
|
|
|
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$hostPagesSnapAdded++; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent local storage |
|
|
|
|
|
|
|
if ($snapLocal) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@mkdir('../storage/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (copy($snapTmp, '../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update snap location info |
|
|
|
|
|
|
|
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent MEGA storage |
|
|
|
|
|
|
|
if ($snapMega) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp = new Ftp(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp->mkdir('hp/' . $snapPath, true); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update snap location info |
|
|
|
|
|
|
|
$db->updateHostPageSnapStorageMega($hostPageSnapId, true); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ftp->close(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Remove tmp |
|
|
|
|
|
|
|
@unlink($snapTmp); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Apply changes |
|
|
|
// Apply changes |
|
|
|
$db->commit(); |
|
|
|
$db->commit(); |
|
|
|