Browse Source

fix HTML/multimedia content detection

main
ghost 2 years ago
parent
commit
8a747de341
  1. 288
      crontab/crawler.php

288
crontab/crawler.php

@@ -449,21 +449,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
continue;
}
// Parse MIME
$hostPageIsHtml = false;
// Check for MIME
$hostPageInMime = false;
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
// Ban page on MIME type not allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
// Check for HTML page
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
$hostPageIsHtml = true;
}
$hostPageInMime = true;
break;
}
@@ -498,8 +490,8 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots = null;
$yggoManifest = null;
// Is DOM content
if ($hostPageIsHtml) {
// Is HTML document
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
// Parse content
$dom = new DomDocument();
@@ -557,143 +549,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$yggoManifest = Filter::url(@$meta->getAttribute('content'));
}
}
}
// Add queued page description if not exists
if ($title || $description || $keywords) {
$db->addHostPageDescription($queueHostPage->hostPageId,
$title ? Filter::pageTitle($title) : null,
$description ? Filter::pageDescription($description) : null,
$keywords ? Filter::pageKeywords($keywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time());
}
// Update manifest registry
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
$yggoManifestCRC32 = crc32($yggoManifest);
if (!$db->getManifest($yggoManifestCRC32)) {
$db->addManifest($yggoManifestCRC32,
$yggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
$manifestsAdded++;
}
}
// Begin snaps
$snapLocal = false;
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapLocal = true;
break;
}
}
}
// Snap MEGA enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMega = true;
break;
}
}
}
// At least one snap storage match settings condition
if ($snapLocal || $snapMega) {
$crc32data = crc32($content);
// Create not duplicated data snaps only, even new time
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done
$zip->close();
// Temporarily snap file exists
if (file_exists($snapTmp)) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
$hostPagesSnapAdded++;
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
@mkdir('../storage/snap/hp/' . $snapPath, 0755, true);
if (copy($snapTmp, '../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
}
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
$ftp->close();
}
}
}
}
}
}
// Remove tmp
@unlink($snapTmp);
}
}
// Begin page links collection
$links = [];
@@ -994,6 +849,143 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
}
}
}
}
// Add queued page description if not exists
if ($title || $description || $keywords) {
$db->addHostPageDescription($queueHostPage->hostPageId,
$title ? Filter::pageTitle($title) : null,
$description ? Filter::pageDescription($description) : null,
$keywords ? Filter::pageKeywords($keywords) : null,
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
time());
}
// Update manifest registry
if (CRAWL_MANIFEST && !empty($yggoManifest) && filter_var($yggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {
$yggoManifestCRC32 = crc32($yggoManifest);
if (!$db->getManifest($yggoManifestCRC32)) {
$db->addManifest($yggoManifestCRC32,
$yggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
$manifestsAdded++;
}
}
// Begin snaps
$snapLocal = false;
$snapMega = false;
// Snap local enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapLocal = true;
break;
}
}
}
// Snap MEGA enabled and MIME in white list
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
// MIME type allowed in settings
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
$snapMega = true;
break;
}
}
}
// At least one snap storage match settings condition
if ($snapLocal || $snapMega) {
$crc32data = crc32($content);
// Create not duplicated data snaps only, even new time
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
$snapTime = time();
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done
$zip->close();
// Temporarily snap file exists
if (file_exists($snapTmp)) {
// Register snap in DB
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
$hostPagesSnapAdded++;
// Copy tmp snap to the permanent local storage
if ($snapLocal) {
@mkdir('../storage/snap/hp/' . $snapPath, 0755, true);
if (copy($snapTmp, '../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
}
}
// Copy tmp snap to the permanent MEGA storage
if ($snapMega) {
$ftp = new Ftp();
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
$ftp->mkdir('hp/' . $snapPath, true);
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
// Update snap location info
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
}
$ftp->close();
}
}
}
}
}
}
// Remove tmp
@unlink($snapTmp);
}
}
// Apply changes
$db->commit();

Loading…
Cancel
Save