mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
fix meta index/nofollow processing
This commit is contained in:
parent
b7c415a8b0
commit
4cb27f563f
@ -508,7 +508,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip index page links without titles
|
||||
// Skip index page links without title tag
|
||||
$title = @$dom->getElementsByTagName('title');
|
||||
|
||||
if ($title->length == 0) {
|
||||
@ -539,19 +539,13 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
|
||||
$metaRobots = @$meta->getAttribute('content');
|
||||
|
||||
// Ban page with meta robots:noindex value
|
||||
// Ban page with meta robots:noindex attribute
|
||||
if (false !== stripos($metaRobots, 'noindex')) {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip page with meta robots:nofollow attribute
|
||||
if (false !== stripos($metaRobots, 'nofollow')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Grab meta yggo:manifest link when available
|
||||
@ -594,6 +588,128 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
}
|
||||
}
|
||||
|
||||
// Begin snaps
|
||||
$snapLocal = false;
|
||||
$snapMega = false;
|
||||
|
||||
// Snap local enabled and MIME in white list
|
||||
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
|
||||
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
|
||||
|
||||
// MIME type allowed in settings
|
||||
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
||||
|
||||
$snapLocal = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Snap MEGA enabled and MIME in white list
|
||||
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
|
||||
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
|
||||
|
||||
// MIME type allowed in settings
|
||||
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
||||
|
||||
$snapMega = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// At least one snap storage match settings condition
|
||||
if ($snapLocal || $snapMega) {
|
||||
|
||||
$crc32data = crc32($content);
|
||||
|
||||
// Create not duplicated data snaps only, even new time
|
||||
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
|
||||
|
||||
$snapTime = time();
|
||||
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
|
||||
|
||||
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
|
||||
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
|
||||
|
||||
// Create new ZIP container
|
||||
$zip = new ZipArchive();
|
||||
|
||||
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
|
||||
|
||||
// Insert compressed snap data into the tmp storage
|
||||
if (true === $zip->addFromString('DATA', $content) &&
|
||||
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
|
||||
// Done
|
||||
$zip->close();
|
||||
|
||||
// Temporarily snap file exists
|
||||
if (file_exists($snapTmp)) {
|
||||
|
||||
// Register snap in DB
|
||||
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
|
||||
|
||||
$hostPagesSnapAdded++;
|
||||
|
||||
// Copy tmp snap to the permanent local storage
|
||||
if ($snapLocal) {
|
||||
|
||||
@mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true);
|
||||
|
||||
if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
|
||||
|
||||
// Update snap location info
|
||||
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy tmp snap to the permanent MEGA storage
|
||||
if ($snapMega) {
|
||||
|
||||
$ftp = new Ftp();
|
||||
|
||||
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
|
||||
|
||||
$ftp->mkdir('hp/' . $snapPath, true);
|
||||
|
||||
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
|
||||
|
||||
// Update snap location info
|
||||
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove tmp
|
||||
@unlink($snapTmp);
|
||||
}
|
||||
}
|
||||
|
||||
// Skip page links following with meta robots:nofollow attribute
|
||||
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
||||
|
||||
if (@$meta->getAttribute('name') == 'robots') {
|
||||
|
||||
if (false !== stripos($metaRobots, 'nofollow')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update manifest registry
|
||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||
|
||||
@ -910,116 +1026,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
||||
}
|
||||
}
|
||||
|
||||
// Begin snaps
|
||||
$snapLocal = false;
|
||||
$snapMega = false;
|
||||
|
||||
// Snap local enabled and MIME in white list
|
||||
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
|
||||
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
|
||||
|
||||
// MIME type allowed in settings
|
||||
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
||||
|
||||
$snapLocal = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Snap MEGA enabled and MIME in white list
|
||||
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
|
||||
|
||||
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
|
||||
|
||||
// MIME type allowed in settings
|
||||
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
||||
|
||||
$snapMega = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// At least one snap storage match settings condition
|
||||
if ($snapLocal || $snapMega) {
|
||||
|
||||
$crc32data = crc32($content);
|
||||
|
||||
// Create not duplicated data snaps only, even new time
|
||||
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
|
||||
|
||||
$snapTime = time();
|
||||
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
|
||||
|
||||
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
|
||||
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
|
||||
|
||||
// Create new ZIP container
|
||||
$zip = new ZipArchive();
|
||||
|
||||
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
|
||||
|
||||
// Insert compressed snap data into the tmp storage
|
||||
if (true === $zip->addFromString('DATA', $content) &&
|
||||
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
|
||||
// Done
|
||||
$zip->close();
|
||||
|
||||
// Temporarily snap file exists
|
||||
if (file_exists($snapTmp)) {
|
||||
|
||||
// Register snap in DB
|
||||
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
|
||||
|
||||
$hostPagesSnapAdded++;
|
||||
|
||||
// Copy tmp snap to the permanent local storage
|
||||
if ($snapLocal) {
|
||||
|
||||
@mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true);
|
||||
|
||||
if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
|
||||
|
||||
// Update snap location info
|
||||
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy tmp snap to the permanent MEGA storage
|
||||
if ($snapMega) {
|
||||
|
||||
$ftp = new Ftp();
|
||||
|
||||
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
|
||||
|
||||
$ftp->mkdir('hp/' . $snapPath, true);
|
||||
|
||||
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
|
||||
|
||||
// Update snap location info
|
||||
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
|
||||
}
|
||||
|
||||
$ftp->close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove tmp
|
||||
@unlink($snapTmp);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user