196962ff0 Add AcceleratedCRC32C to port_win.h
1bdf1c34c Merge upstream LevelDB v1.20
d31721eb0 Merge #17: Fixed file sharing errors
fecd44902 Fixed file sharing error in Win32Env::GetFileSize(), Win32SequentialFile::_Init(), Win32RandomAccessFile::_Init(); fixed error checking in Win32SequentialFile::_Init()
5b7510f1b Merge #14: Merge upstream LevelDB 1.19
0d969fd57 Merge #16: [LevelDB] Do not crash if filesystem can't fsync
c8c029b5b [LevelDB] Do not crash if filesystem can't fsync
a53934a3a Increase leveldb version to 1.20.
f3f139737 Separate Env tests from PosixEnv tests.
eb4f0972f leveldb: Fix compilation warnings in port_posix_sse.cc on x86 (32-bit).
d0883b600 Fixed path to doc file: index.md.
7fa20948d Convert documentation to markdown.
ea175e28f Implement support for Intel crc32 instruction (SSE 4.2)
95cd743e5 Including <limits> for std::numeric_limits.
646c3588d Limit the number of read-only files the POSIX Env will have open.
d40bc3fa5 Merge #13: Typo
ebbd772d3 Typo
a2fb086d0 Add option for max file size. The current hard-coded value of 2MB is inefficient in colossus.

git-subtree-dir: src/leveldb
git-subtree-split: 196962ff01c39b4705d8117df5c3f8c205349950
Pieter Wuille
8 years ago
31 changed files with 1414 additions and 1151 deletions

@ -0,0 +1,170 @@

## Files

The implementation of leveldb is similar in spirit to the representation of a
single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
However the organization of the files that make up the representation is
somewhat different and is explained below.

Each database is represented by a set of files stored in a directory. There are
several different types of files as documented below:

### Log files

A log file (*.log) stores a sequence of recent updates. Each update is appended
to the current log file. When the log file reaches a pre-determined size
(approximately 4MB by default), it is converted to a sorted table (see below)
and a new log file is created for future updates.

A copy of the current log file is kept in an in-memory structure (the
`memtable`). This copy is consulted on every read so that read operations
reflect all logged updates.

### Sorted tables

A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
either a value for the key, or a deletion marker for the key. (Deletion markers
are kept around to hide obsolete values present in older sorted tables.)

The set of sorted tables is organized into a sequence of levels. The sorted
table generated from a log file is placed in a special **young** level (also
called level-0). When the number of young files exceeds a certain threshold
(currently four), all of the young files are merged together with all of the
overlapping level-1 files to produce a sequence of new level-1 files (we create
a new level-1 file for every 2MB of data).

Files in the young level may contain overlapping keys. However, files in other
levels have distinct non-overlapping key ranges. Consider level number L where
L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
for level-1, 100MB for level-2, ...), one file in level-L and all of the
overlapping files in level-(L+1) are merged to form a set of new files for
level-(L+1). These merges have the effect of gradually migrating new updates
from the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).
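
The 10^L rule is simple enough to state directly in code. A minimal sketch,
assuming the 10MB level-1 base described above (the helper name is
illustrative, not necessarily the implementation's):

```c++
#include <cassert>

// Size limit for level-L (L >= 1): roughly (10^L) MB, i.e. 10MB for level-1,
// 100MB for level-2, and so on.
static double MaxBytesForLevel(int level) {
  assert(level >= 1);
  double result = 10.0 * 1048576.0;  // 10MB for level-1
  while (level > 1) {
    result *= 10;
    level--;
  }
  return result;
}
```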

### Manifest

A MANIFEST file lists the set of sorted tables that make up each level, the
corresponding key ranges, and other important metadata. A new MANIFEST file
(with a new number embedded in the file name) is created whenever the database
is reopened. The MANIFEST file is formatted as a log, and changes made to the
serving state (as files are added or removed) are appended to this log.

### Current

CURRENT is a simple text file that contains the name of the latest MANIFEST
file.

### Info logs

Informational messages are printed to files named LOG and LOG.old.

### Others

Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).

## Level 0

When the log file grows above a certain size (1MB by default):

1. Create a brand new memtable and log file and direct future updates here.
2. In the background:
   1. Write the contents of the previous memtable to an sstable.
   2. Discard the memtable.
   3. Delete the old log file and the old memtable.
   4. Add the new sstable to the young (level-0) level.

## Compactions

When the size of level L exceeds its limit, we compact it in a background
thread. The compaction picks a file from level L and all overlapping files from
the next level L+1. Note that if a level-L file overlaps only part of a
level-(L+1) file, the entire file at level-(L+1) is used as an input to the
compaction and will be discarded after the compaction. Aside: because level-0
is special (files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than one
level-0 file in case some of these files overlap each other.

A compaction merges the contents of the picked files to produce a sequence of
level-(L+1) files. We switch to producing a new level-(L+1) file after the
current output file has reached the target file size (2MB). We also switch to a
new output file when the key range of the current output file has grown enough
to overlap more than ten level-(L+2) files. This last rule ensures that a later
compaction of a level-(L+1) file will not pick up too much data from
level-(L+2).

The old files are discarded and the new files are added to the serving state.

Compactions for a particular level rotate through the key space. In more detail,
for each level L, we remember the ending key of the last compaction at level L.
The next compaction for level L will pick the first file that starts after this
key (wrapping around to the beginning of the key space if there is no such
file).
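
A minimal sketch of that rotation, assuming files ordered by key range, a
bytewise-comparable key type, and a remembered `compact_pointer` per level
(illustrative names, simplified from the real bookkeeping):

```c++
#include <string>
#include <vector>

struct FileMeta {
  std::string smallest;  // smallest key in the file
  std::string largest;   // largest key in the file
};

// Pick the next file to compact at a level: the first file whose key range
// extends past the ending key of the previous compaction, wrapping to the
// beginning of the key space if no such file exists.
const FileMeta* PickFileToCompact(const std::vector<FileMeta>& files,
                                  const std::string& compact_pointer) {
  for (const FileMeta& f : files) {
    if (f.largest > compact_pointer) return &f;
  }
  return files.empty() ? nullptr : &files[0];  // wrap around
}
```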

Compactions drop overwritten values. They also drop deletion markers if there
are no higher numbered levels that contain a file whose range overlaps the
current key.

### Timing

Level-0 compactions will read up to four 1MB files from level-0, and at worst
all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.

Other than the special level-0 compactions, we will pick one 2MB file from level
L. In the worst case, this will overlap ~12 files from level L+1 (10 because
level-(L+1) is ten times the size of level-L, and another two at the boundaries
since the file ranges at level-L will usually not be aligned with the file
ranges at level-(L+1)). The compaction will therefore read 26MB and write 26MB.
Assuming a disk IO rate of 100MB/s (a ballpark figure for modern drives), the
worst compaction cost will be approximately 0.5 second.
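
The 0.5 second figure is just the read plus write volume divided by the assumed
disk rate; a quick back-of-the-envelope check using only the numbers above
(nothing here is leveldb API):

```c++
#include <cstdio>

int main() {
  const double input_mb = 2.0;        // the one 2MB file picked from level-L
  const double overlap_files = 12.0;  // ~10 + 2 boundary files at level-(L+1)
  const double read_mb = input_mb + overlap_files * input_mb;  // 26MB read
  const double write_mb = read_mb;                             // 26MB written
  const double disk_mb_per_s = 100.0;                          // assumed rate
  std::printf("worst case: %.0fMB read + %.0fMB written => ~%.2f seconds\n",
              read_mb, write_mb, (read_mb + write_mb) / disk_mb_per_s);
  return 0;
}
```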

If we throttle the background writing to something small, say 10% of the full
100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
may significantly increase the cost of reads due to the overhead of merging more
files together on every read.

Solution 1: To reduce this problem, we might want to increase the log switching
threshold when the number of level-0 files is large. The downside is that the
larger this threshold, the more memory we will need to hold the corresponding
memtable.

Solution 2: We might want to decrease the write rate artificially when the
number of level-0 files goes up.

Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
the level-0 files will have their blocks sitting uncompressed in the cache and
we will only need to worry about the O(N) complexity in the merging iterator.

### Number of files

Instead of always making 2MB files, we could make larger files for larger levels
to reduce the total file count, though at the expense of more bursty
compactions. Alternatively, we could shard the set of files into multiple
directories.

An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
to do 100K file opens in directories with varying numbers of files:

| Files in directory | Microseconds to open a file |
|-------------------:|----------------------------:|
|               1000 |                           9 |
|              10000 |                          10 |
|             100000 |                          16 |

So maybe even the sharding is not necessary on modern filesystems?

## Recovery

* Read CURRENT to find the name of the latest committed MANIFEST
* Read the named MANIFEST file
* Clean up stale files
* We could open all sstables here, but it is probably better to be lazy...
* Convert log chunk to a new level-0 sstable
* Start directing new writes to a new log file with the recovered sequence#

## Garbage collection of files

`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
of recovery. It finds the names of all files in the database. It deletes all log
files that are not the current log file. It deletes all table files that are not
referenced from some level and are not the output of an active compaction.
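
A hedged sketch of those deletion rules (illustrative types and names, not the
actual `DeleteObsoleteFiles()` code; it assumes the *.log and *.ldb suffixes
described earlier):

```c++
#include <set>
#include <string>
#include <vector>

struct DbState {
  std::vector<std::string> all_files;     // every file found in the db directory
  std::string current_log;                // the live *.log file
  std::set<std::string> live_tables;      // tables referenced from some level
  std::set<std::string> pending_outputs;  // outputs of active compactions
};

static bool HasSuffix(const std::string& f, const std::string& suffix) {
  return f.size() >= suffix.size() &&
         f.compare(f.size() - suffix.size(), suffix.size(), suffix) == 0;
}

std::vector<std::string> ObsoleteFiles(const DbState& db) {
  std::vector<std::string> doomed;
  for (const std::string& f : db.all_files) {
    if (HasSuffix(f, ".log") && f != db.current_log) {
      doomed.push_back(f);  // any log file other than the current one
    } else if (HasSuffix(f, ".ldb") && db.live_tables.count(f) == 0 &&
               db.pending_outputs.count(f) == 0) {
      doomed.push_back(f);  // table unreferenced and not being written
    }
  }
  return doomed;
}
```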

@ -0,0 +1,523 @@

leveldb
=======

_Jeff Dean, Sanjay Ghemawat_

The leveldb library provides a persistent key value store. Keys and values are
arbitrary byte arrays. The keys are ordered within the key value store
according to a user-specified comparator function.

## Opening A Database

A leveldb database has a name which corresponds to a file system directory. All
of the contents of the database are stored in this directory. The following
example shows how to open a database, creating it if necessary:

```c++
#include <cassert>
#include "leveldb/db.h"

leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
assert(status.ok());
...
```

If you want to raise an error if the database already exists, add the following
line before the `leveldb::DB::Open` call:

```c++
options.error_if_exists = true;
```

## Status

You may have noticed the `leveldb::Status` type above. Values of this type are
returned by most functions in leveldb that may encounter an error. You can check
if such a result is ok, and also print an associated error message:

```c++
leveldb::Status s = ...;
if (!s.ok()) cerr << s.ToString() << endl;
```

## Closing A Database

When you are done with a database, just delete the database object. Example:

```c++
... open the db as described above ...
... do something with db ...
delete db;
```

## Reads And Writes

The database provides Put, Delete, and Get methods to modify/query the database.
For example, the following code moves the value stored under key1 to key2.

```c++
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
```

## Atomic Updates

Note that if the process dies after the Put of key2 but before the delete of
key1, the same value may be left stored under multiple keys. Such problems can
be avoided by using the `WriteBatch` class to atomically apply a set of updates:

```c++
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) {
  leveldb::WriteBatch batch;
  batch.Delete(key1);
  batch.Put(key2, value);
  s = db->Write(leveldb::WriteOptions(), &batch);
}
```

The `WriteBatch` holds a sequence of edits to be made to the database, and these
edits within the batch are applied in order. Note that we called Delete before
Put so that if key1 is identical to key2, we do not end up erroneously dropping
the value entirely.

Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
bulk updates by placing lots of individual mutations into the same batch.

## Synchronous Writes

By default, each write to leveldb is asynchronous: it returns after pushing the
write from the process into the operating system. The transfer from operating
system memory to the underlying persistent storage happens asynchronously. The
sync flag can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling either
`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
operation returns.)

```c++
leveldb::WriteOptions write_options;
write_options.sync = true;
db->Put(write_options, ...);
```

Asynchronous writes are often more than a thousand times as fast as synchronous
writes. The downside of asynchronous writes is that a crash of the machine may
cause the last few updates to be lost. Note that a crash of just the writing
process (i.e., not a reboot) will not cause any loss since even when sync is
false, an update is pushed from the process memory into the operating system
before it is considered done.

Asynchronous writes can often be used safely. For example, when loading a large
amount of data into the database you can handle lost updates by restarting the
bulk load after a crash. A hybrid scheme is also possible where every Nth write
is synchronous, and in the event of a crash, the bulk load is restarted just
after the last synchronous write finished by the previous run. (The synchronous
write can update a marker that describes where to restart on a crash.)
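
A minimal sketch of that hybrid scheme, assuming the data arrives as an
in-memory `records` array and using a hypothetical `"progress"` key as the
restart marker (none of these names come from the leveldb API):

```c++
// Hedged sketch: every Nth write is synchronous and carries a restart marker.
// `records`, `start`, and the "progress" key are illustrative assumptions.
leveldb::WriteOptions async_options;          // sync is false by default
leveldb::WriteOptions sync_options;
sync_options.sync = true;

for (size_t i = start; i < records.size(); i++) {
  const bool checkpoint = (i % 1000 == 999);  // the "Nth write", with N = 1000
  leveldb::WriteBatch batch;
  batch.Put(records[i].key, records[i].value);
  if (checkpoint) {
    batch.Put("progress", std::to_string(i)); // where to restart after a crash
  }
  leveldb::Status s =
      db->Write(checkpoint ? sync_options : async_options, &batch);
  if (!s.ok()) break;
}
```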

`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
may be placed in the same `WriteBatch` and applied together using a synchronous
write (i.e., `write_options.sync` is set to true). The extra cost of the
synchronous write will be amortized across all of the writes in the batch.
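
For example, reusing only calls shown earlier (key1/key2 and the values are
placeholders):

```c++
leveldb::WriteBatch batch;
batch.Put(key1, value1);
batch.Put(key2, value2);

leveldb::WriteOptions write_options;
write_options.sync = true;  // one synchronous commit covers the whole batch
leveldb::Status s = db->Write(write_options, &batch);
```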

## Concurrency

A database may only be opened by one process at a time. The leveldb
implementation acquires a lock from the operating system to prevent misuse.
Within a single process, the same `leveldb::DB` object may be safely shared by
multiple concurrent threads. I.e., different threads may write into or fetch
iterators or call Get on the same database without any external synchronization
(the leveldb implementation will automatically do the required synchronization).
However, other objects (like Iterator and `WriteBatch`) may require external
synchronization. If two threads share such an object, they must protect access
to it using their own locking protocol. More details are available in the public
header files.
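
As a concrete illustration of these rules, a sketch in which the shared
`leveldb::DB*` needs no extra locking while each thread keeps its own private
`WriteBatch` (the thread bodies are our example, not code from the headers):

```c++
#include <string>
#include <thread>

#include "leveldb/db.h"
#include "leveldb/write_batch.h"

void Writer(leveldb::DB* db, int id) {
  leveldb::WriteBatch batch;  // thread-private, so no locking protocol needed
  for (int i = 0; i < 100; i++) {
    batch.Put("thread" + std::to_string(id) + "-key" + std::to_string(i),
              "value");
  }
  // Safe: the DB object performs its own internal synchronization.
  db->Write(leveldb::WriteOptions(), &batch);
}

void ConcurrentWrites(leveldb::DB* db) {
  std::thread t1(Writer, db, 1);
  std::thread t2(Writer, db, 2);
  t1.join();
  t2.join();
}
```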

## Iteration

The following example demonstrates how to print all key,value pairs in a
database.

```c++
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  std::cout << it->key().ToString() << ": " << it->value().ToString()
            << std::endl;
}
assert(it->status().ok());  // Check for any errors found during the scan
delete it;
```

The following variation shows how to process just the keys in the range
[start,limit):

```c++
for (it->Seek(start);
     it->Valid() && it->key().ToString() < limit;
     it->Next()) {
  ...
}
```

You can also process entries in reverse order. (Caveat: reverse iteration may be
somewhat slower than forward iteration.)

```c++
for (it->SeekToLast(); it->Valid(); it->Prev()) {
  ...
}
```

## Snapshots

Snapshots provide consistent read-only views over the entire state of the
key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a
read should operate on a particular version of the DB state. If
`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
of the current state.

Snapshots are created by the `DB::GetSnapshot()` method:

```c++
leveldb::ReadOptions options;
options.snapshot = db->GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db->NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db->ReleaseSnapshot(options.snapshot);
```

Note that when a snapshot is no longer needed, it should be released using the
`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
state that was being maintained just to support reading as of that snapshot.

## Slice

The return values of the `it->key()` and `it->value()` calls above are instances
of the `leveldb::Slice` type. Slice is a simple structure that contains a length
and a pointer to an external byte array. Returning a Slice is a cheaper
alternative to returning a `std::string` since we do not need to copy
potentially large keys and values. In addition, leveldb methods do not return
null-terminated C-style strings since leveldb keys and values are allowed to
contain `'\0'` bytes.

C++ strings and null-terminated C-style strings can be easily converted to a
Slice:

```c++
leveldb::Slice s1 = "hello";

std::string str("world");
leveldb::Slice s2 = str;
```

A Slice can be easily converted back to a C++ string:

```c++
std::string str = s1.ToString();
assert(str == std::string("hello"));
```

Be careful when using Slices since it is up to the caller to ensure that the
external byte array into which the Slice points remains live while the Slice is
in use. For example, the following is buggy:

```c++
leveldb::Slice slice;
if (...) {
  std::string str = ...;
  slice = str;
}
Use(slice);
```

When control leaves the if statement's scope, str will be destroyed and the
backing storage for slice will disappear.
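
One sketch of a fix is to let the owning `std::string` outlive every use of
the Slice (the `...` placeholders are carried over from the buggy example):

```c++
std::string str;        // owner now outlives every use of the slice
leveldb::Slice slice;
if (...) {
  str = ...;
  slice = str;          // points into str, which is still alive below
}
Use(slice);
```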

## Comparators

The preceding examples used the default ordering function for keys, which orders
bytes lexicographically. You can however supply a custom comparator when opening
a database. For example, suppose each database key consists of two numbers and
we should sort by the first number, breaking ties by the second number. First,
define a proper subclass of `leveldb::Comparator` that expresses these rules:

```c++
class TwoPartComparator : public leveldb::Comparator {
 public:
  // Three-way comparison function:
  //   if a < b: negative result
  //   if a > b: positive result
  //   else: zero result
  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
    int a1, a2, b1, b2;
    ParseKey(a, &a1, &a2);
    ParseKey(b, &b1, &b2);
    if (a1 < b1) return -1;
    if (a1 > b1) return +1;
    if (a2 < b2) return -1;
    if (a2 > b2) return +1;
    return 0;
  }

  // Ignore the following methods for now:
  const char* Name() const { return "TwoPartComparator"; }
  void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
  void FindShortSuccessor(std::string*) const {}
};
```

Now create a database using this custom comparator:

```c++
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
...
```

### Backwards compatibility

The result of the comparator's Name method is attached to the database when it
is created, and is checked on every subsequent database open. If the name
changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
and only if the new key format and comparison function are incompatible with
existing databases, and it is ok to discard the contents of all existing
databases.

You can however still gradually evolve your key format over time with a little
bit of pre-planning. For example, you could store a version number at the end of
each key (one byte should suffice for most uses). When you wish to switch to a
new key format (e.g., adding an optional third part to the keys processed by
`TwoPartComparator`), (a) keep the same comparator name, (b) increment the
version number for new keys, and (c) change the comparator function so it uses
the version numbers found in the keys to decide how to interpret them.
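
A sketch of step (c) under these assumptions: the trailing byte of every key
is its format version, and the parse helpers are hypothetical stand-ins for
your own key codecs:

```c++
struct ParsedKey { int first, second, third; };

// Hypothetical decoders for the two layouts; ParseV1Key sets third to 0 so
// old two-part keys order consistently against new three-part keys.
ParsedKey ParseV1Key(const leveldb::Slice& s);
ParsedKey ParseV2Key(const leveldb::Slice& s);

int VersionAwareCompare(const leveldb::Slice& a, const leveldb::Slice& b) {
  // Assumed convention: the last byte of every key is its format version.
  ParsedKey ka = (a[a.size() - 1] == 1) ? ParseV1Key(a) : ParseV2Key(a);
  ParsedKey kb = (b[b.size() - 1] == 1) ? ParseV1Key(b) : ParseV2Key(b);
  if (ka.first != kb.first) return ka.first < kb.first ? -1 : +1;
  if (ka.second != kb.second) return ka.second < kb.second ? -1 : +1;
  if (ka.third != kb.third) return ka.third < kb.third ? -1 : +1;
  return 0;
}
```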

## Performance

Performance can be tuned by changing the default values of the types defined in
`include/leveldb/options.h`.

### Block size

leveldb groups adjacent keys together into the same block and such a block is
the unit of transfer to and from persistent storage. The default block size is
approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
over the contents of the database may wish to increase this size. Applications
that do a lot of point reads of small values may wish to switch to a smaller
block size if performance measurements indicate an improvement. There isn't much
benefit in using blocks smaller than one kilobyte, or larger than a few
megabytes. Also note that compression will be more effective with larger block
sizes.
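
For example, a scan-heavy application might pick a larger block size before
opening the database (the 64KB figure and database name are illustrative
assumptions):

```c++
leveldb::Options options;
options.block_size = 64 * 1024;  // 64KB blocks instead of the ~4KB default
leveldb::DB* db;
leveldb::Status s = leveldb::DB::Open(options, "/tmp/scandb", &db);
```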

### Compression

Each block is individually compressed before being written to persistent
storage. Compression is on by default since the default compression method is
very fast, and is automatically disabled for uncompressible data. In rare cases,
applications may want to disable compression entirely, but should only do so if
benchmarks show a performance improvement:

```c++
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
```

### Cache

The contents of the database are stored in a set of files in the filesystem and
each file stores a sequence of compressed blocks. If options.cache is non-NULL,
it is used to cache frequently used uncompressed block contents.

```c++
#include "leveldb/cache.h"

leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db;
delete options.cache;
```

Note that the cache holds uncompressed data, and therefore it should be sized
according to application level data sizes, without any reduction from
compression. (Caching of compressed blocks is left to the operating system
buffer cache, or any custom Env implementation provided by the client.)

When performing a bulk read, the application may wish to disable caching so that
the data processed by the bulk read does not end up displacing most of the
cached contents. A per-iterator option can be used to achieve this:

```c++
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db->NewIterator(options);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  ...
}
```

### Key Layout

Note that the unit of disk transfer and caching is a block. Adjacent keys
(according to the database sort order) will usually be placed in the same block.
Therefore the application can improve its performance by placing keys that are
accessed together near each other and placing infrequently used keys in a
separate region of the key space.

For example, suppose we are implementing a simple file system on top of leveldb.
The types of entries we might wish to store are:

    filename -> permission-bits, length, list of file_block_ids
    file_block_id -> data

We might want to prefix filename keys with one letter (say '/') and the
`file_block_id` keys with a different letter (say '0') so that scans over just
the metadata do not force us to fetch and cache bulky file contents.
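
A sketch of that key scheme (the helper names and the fixed-width block-id
encoding are our assumptions, not part of leveldb):

```c++
#include <stdint.h>
#include <string>

// '/'-prefixed keys sort into one contiguous region and '0'-prefixed keys
// into another ('/' is 0x2F, '0' is 0x30), so a scan over the metadata never
// touches file contents.
std::string MetadataKey(const std::string& filename) {
  return "/" + filename;
}

std::string BlockKey(uint64_t file_block_id) {
  char buf[9];
  buf[0] = '0';
  // Big-endian encoding keeps numeric order consistent with byte order.
  for (int i = 0; i < 8; i++) {
    buf[1 + i] = static_cast<char>((file_block_id >> (56 - 8 * i)) & 0xff);
  }
  return std::string(buf, sizeof(buf));
}
```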

### Filters

Because of the way leveldb data is organized on disk, a single `Get()` call may
involve multiple reads from disk. The optional FilterPolicy mechanism can be
used to reduce the number of disk reads substantially.

```c++
leveldb::Options options;
options.filter_policy = NewBloomFilterPolicy(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &db);
... use the database ...
delete db;
delete options.filter_policy;
```

The preceding code associates a Bloom filter based filtering policy with the
database. Bloom filter based filtering relies on keeping some number of bits of
data in memory per key (in this case 10 bits per key since that is the argument
we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
unnecessary disk reads needed for Get() calls by a factor of approximately
100. Increasing the bits per key will lead to a larger reduction at the cost
of more memory usage. We recommend that applications whose working set does not
fit in memory and that do a lot of random reads set a filter policy.

If you are using a custom comparator, you should ensure that the filter policy
you are using is compatible with your comparator. For example, consider a
comparator that ignores trailing spaces when comparing keys.
`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
application should provide a custom filter policy that also ignores trailing
spaces. For example:

```c++
class CustomFilterPolicy : public leveldb::FilterPolicy {
 private:
  FilterPolicy* builtin_policy_;

 public:
  CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
  ~CustomFilterPolicy() { delete builtin_policy_; }

  const char* Name() const { return "IgnoreTrailingSpacesFilter"; }

  void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    // Use builtin bloom filter code after removing trailing spaces
    std::vector<Slice> trimmed(n);
    for (int i = 0; i < n; i++) {
      trimmed[i] = RemoveTrailingSpaces(keys[i]);
    }
    builtin_policy_->CreateFilter(&trimmed[0], n, dst);
  }

  bool KeyMayMatch(const Slice& key, const Slice& filter) const {
    // Trim the probe key the same way the stored keys were trimmed.
    return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
  }
};
```

Advanced applications may provide a filter policy that does not use a bloom
filter but uses some other mechanism for summarizing a set of keys. See
`leveldb/filter_policy.h` for details.

## Checksums

leveldb associates checksums with all data it stores in the file system. There
are two separate controls provided over how aggressively these checksums are
verified:

`ReadOptions::verify_checksums` may be set to true to force checksum
verification of all data that is read from the file system on behalf of a
particular read. By default, no such verification is done.

`Options::paranoid_checks` may be set to true before opening a database to make
the database implementation raise an error as soon as it detects an internal
corruption. Depending on which portion of the database has been corrupted, the
error may be raised when the database is opened, or later by another database
operation. By default, paranoid checking is off so that the database can be used
even if parts of its persistent storage have been corrupted.

If a database is corrupted (perhaps it cannot be opened when paranoid checking
is turned on), the `leveldb::RepairDB` function may be used to recover as much
of the data as possible.
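
A short sketch wiring up both controls (the database name is a placeholder):

```c++
leveldb::Options options;
options.paranoid_checks = true;  // surface internal corruption early
leveldb::DB* db;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
if (!status.ok()) {
  // Possibly corrupted; try to salvage what we can, then reopen.
  leveldb::RepairDB("/tmp/testdb", options);
  status = leveldb::DB::Open(options, "/tmp/testdb", &db);
}

leveldb::ReadOptions read_options;
read_options.verify_checksums = true;  // applies to this read only
std::string value;
status = db->Get(read_options, "some-key", &value);
```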

## Approximate Sizes

The `GetApproximateSizes` method can be used to get the approximate number of
bytes of file system space used by one or more key ranges.

```c++
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
db->GetApproximateSizes(ranges, 2, sizes);
```

The preceding call will set `sizes[0]` to the approximate number of bytes of
file system space used by the key range `[a..c)` and `sizes[1]` to the
approximate number of bytes used by the key range `[x..z)`.

## Environment

All file operations (and other operating system calls) issued by the leveldb
implementation are routed through a `leveldb::Env` object. Sophisticated clients
may wish to provide their own Env implementation to get better control.
For example, an application may introduce artificial delays in the file IO
paths to limit the impact of leveldb on other activities in the system.

```c++
class SlowEnv : public leveldb::Env {
  ... implementation of the Env interface ...
};

SlowEnv env;
leveldb::Options options;
options.env = &env;
Status s = leveldb::DB::Open(options, ...);
```

## Porting

leveldb may be ported to a new platform by providing platform-specific
implementations of the types/methods/functions exported by
`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.

In addition, the new platform may need a new default `leveldb::Env`
implementation. See `leveldb/util/env_posix.h` for an example.

## Other Information

Details about the leveldb implementation may be found in the following
documents:

1. [Implementation notes](impl.md)
2. [Format of an immutable Table file](table_format.md)
3. [Format of a log file](log_format.md)

@ -0,0 +1,75 @@

leveldb Log format
==================

The log file contents are a sequence of 32KB blocks. The only exception is that
the tail of the file may contain a partial block.

Each block consists of a sequence of records:

    block := record* trailer?
    record :=
      checksum: uint32     // crc32c of type and data[] ; little-endian
      length: uint16       // little-endian
      type: uint8          // One of FULL, FIRST, MIDDLE, LAST
      data: uint8[length]

A record never starts within the last six bytes of a block (since it won't fit).
Any leftover bytes here form the trailer, which must consist entirely of zero
bytes and must be skipped by readers.

Aside: if exactly seven bytes are left in the current block, and a new non-zero
length record is added, the writer must emit a FIRST record (which contains zero
bytes of user data) to fill up the trailing seven bytes of the block and then
emit all of the user data in subsequent blocks.
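
For illustration, a sketch of how a writer could lay down the seven-byte
header described above; this mirrors the format only and is not the actual
code in `db/log_writer.cc`:

```c++
#include <stdint.h>
#include <string.h>

// Writes the 7-byte header (checksum, length, type) followed by the payload
// into dst, which must have room for 7 + length bytes.
void EmitRecord(uint8_t* dst, uint32_t crc, uint16_t length, uint8_t type,
                const uint8_t* data) {
  // All fixed-width fields are little-endian on disk.
  dst[0] = crc & 0xff;
  dst[1] = (crc >> 8) & 0xff;
  dst[2] = (crc >> 16) & 0xff;
  dst[3] = (crc >> 24) & 0xff;
  dst[4] = length & 0xff;
  dst[5] = (length >> 8) & 0xff;
  dst[6] = type;  // FULL, FIRST, MIDDLE, or LAST
  memcpy(dst + 7, data, length);
}
```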

More types may be added in the future. Some Readers may skip record types they
do not understand, others may report that some data was skipped.

    FULL == 1
    FIRST == 2
    MIDDLE == 3
    LAST == 4

The FULL record contains the contents of an entire user record.

FIRST, MIDDLE, LAST are types used for user records that have been split into
multiple fragments (typically because of block boundaries). FIRST is the type
of the first fragment of a user record, LAST is the type of the last fragment of
a user record, and MIDDLE is the type of all interior fragments of a user
record.

Example: consider a sequence of user records:

    A: length 1000
    B: length 97270
    C: length 8000

**A** will be stored as a FULL record in the first block.

**B** will be split into three fragments: first fragment occupies the rest of
the first block, second fragment occupies the entirety of the second block, and
the third fragment occupies a prefix of the third block. This will leave six
bytes free in the third block, which will be left empty as the trailer.

**C** will be stored as a FULL record in the fourth block.
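
To make the fragment sizes concrete, here is the arithmetic behind that
layout, assuming 32KB blocks of 32768 bytes and a seven-byte record header:

    block 1: 7 + 1000 (A, FULL) + 7 + 31754 (B, FIRST) = 32768 bytes
    block 2: 7 + 32761 (B, MIDDLE)                     = 32768 bytes
    block 3: 7 + 32755 (B, LAST)                       = 32762 bytes, 6 left

    B total: 31754 + 32761 + 32755 = 97270 bytes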

----

## Some benefits over the recordio format:

1. We do not need any heuristics for resyncing - just go to next block boundary
   and scan. If there is a corruption, skip to the next block. As a
   side-benefit, we do not get confused when part of the contents of one log
   file are embedded as a record inside another log file.

2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
   next block boundary and skip records until we hit a FULL or FIRST record.

3. We do not need extra buffering for large records.

## Some downsides compared to recordio format:

1. No packing of tiny records. This could be fixed by adding a new record type,
   so it is a shortcoming of the current implementation, not necessarily the
   format.

2. No compression. Again, this could be fixed by adding new record types.

@ -0,0 +1,107 @@

leveldb File format
===================

    <beginning_of_file>
    [data block 1]
    [data block 2]
    ...
    [data block N]
    [meta block 1]
    ...
    [meta block K]
    [metaindex block]
    [index block]
    [Footer]        (fixed size; starts at file_size - sizeof(Footer))
    <end_of_file>

The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:

    offset:   varint64
    size:     varint64

See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
for an explanation of varint64 format.
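
For illustration, a sketch of a varint64 decoder in the spirit of leveldb's
`util/coding.cc` (simplified, not the exact implementation):

```c++
#include <stdint.h>

// Decodes a varint64 from [p, limit); returns the byte past the value, or
// nullptr if the input is truncated or malformed.
const char* GetVarint64(const char* p, const char* limit, uint64_t* value) {
  uint64_t result = 0;
  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
    uint64_t byte = static_cast<uint8_t>(*p++);
    result |= (byte & 0x7f) << shift;  // low seven bits carry payload
    if ((byte & 0x80) == 0) {          // high bit clear marks the last byte
      *value = result;
      return p;
    }
  }
  return nullptr;
}
```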

1. The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in `block_builder.cc`, and then
optionally compressed.

2. After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
`block_builder.cc` and then optionally compressed.

3. A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.

4. An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.

5. At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.

        metaindex_handle: char[p];     // Block handle for metaindex
        index_handle:     char[q];     // Block handle for index
        padding:          char[40-p-q];// zeroed bytes to make fixed length
                                       // (40==2*BlockHandle::kMaxEncodedLength)
        magic:            fixed64;     // == 0xdb4775248b80fb57 (little-endian)

## "filter" Meta Block

If a `FilterPolicy` was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from `filter.<N>` to the BlockHandle for the filter
block where `<N>` is the string returned by the filter policy's
`Name()` method.

The filter block stores a sequence of filters, where filter i contains
the output of `FilterPolicy::CreateFilter()` on all keys that are stored
in a block whose file offset falls within the range

    [ i*base ... (i+1)*base-1 ]

Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
resulting filter will be stored as the first filter in the filter
block.

The filter block is formatted as follows:

    [filter 0]
    [filter 1]
    [filter 2]
    ...
    [filter N-1]

    [offset of filter 0]                  : 4 bytes
    [offset of filter 1]                  : 4 bytes
    [offset of filter 2]                  : 4 bytes
    ...
    [offset of filter N-1]                : 4 bytes

    [offset of beginning of offset array] : 4 bytes
    lg(base)                              : 1 byte

The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
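
A sketch of that mapping, assuming the offset array and the trailing
`lg(base)` byte have already been located, and using `DecodeFixed32` from
leveldb's `util/coding.h`:

```c++
// Returns the byte range [*start, *limit) of the filter covering a data
// block that begins at block_offset within the file.
void FilterRange(const char* offset_array, uint8_t base_lg,
                 uint64_t block_offset, uint32_t* start, uint32_t* limit) {
  uint64_t filter_index = block_offset >> base_lg;  // i.e., offset / base
  *start = DecodeFixed32(offset_array + 4 * filter_index);
  *limit = DecodeFixed32(offset_array + 4 * (filter_index + 1));
}
```

For the last filter, the "limit" entry is the offset of the offset array
itself, which equals the end of the filter data.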

## "stats" Meta Block

This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.

TODO(postrelease): record the following stats.

    data size
    index size
    key size (uncompressed)
    value size (uncompressed)
    number of entries
    number of data blocks

@ -0,0 +1,129 @@

// Copyright 2016 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A portable implementation of crc32c, optimized to handle
// four bytes at a time.
//
// In a separate source file to allow this accelerated CRC32C function to be
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
// instructions.

#include <stdint.h>
#include <string.h>

#include "port/port.h"

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) && defined(__SSE4_2__)
#include <nmmintrin.h>
#include <cpuid.h>
#endif

#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)

namespace leveldb {
namespace port {

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
static inline uint32_t LE_LOAD32(const uint8_t *p) {
  // SSE is x86 only, so |p| is always little-endian.
  uint32_t word;
  memcpy(&word, p, sizeof(word));
  return word;
}

#if defined(_M_X64) || defined(__x86_64__)  // LE_LOAD64 is only used on x64.

// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
static inline uint64_t LE_LOAD64(const uint8_t *p) {
  uint64_t dword;
  memcpy(&dword, p, sizeof(dword));
  return dword;
}

#endif  // defined(_M_X64) || defined(__x86_64__)

static inline bool HaveSSE42() {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  return (cpu_info[2] & (1 << 20)) != 0;
#elif defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
  return (ecx & (1 << 20)) != 0;
#else
  return false;
#endif
}

#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)

// For further improvements see Intel publication at:
// http://download.intel.com/design/intarch/papers/323405.pdf
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
  return 0;
#else
  static bool have = HaveSSE42();
  if (!have) {
    return 0;
  }

  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  uint32_t l = crc ^ 0xffffffffu;

#define STEP1 do {                              \
    l = _mm_crc32_u8(l, *p++);                  \
} while (0)
#define STEP4 do {                              \
    l = _mm_crc32_u32(l, LE_LOAD32(p));         \
    p += 4;                                     \
} while (0)
#define STEP8 do {                              \
    l = _mm_crc32_u64(l, LE_LOAD64(p));         \
    p += 8;                                     \
} while (0)

  if (size > 16) {
    // Process unaligned bytes
    for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
      STEP1;
    }

    // _mm_crc32_u64 is only available on x64.
#if defined(_M_X64) || defined(__x86_64__)
    // Process 8 bytes at a time
    while ((e-p) >= 8) {
      STEP8;
    }
    // Process 4 bytes at a time
    if ((e-p) >= 4) {
      STEP4;
    }
#else  // !(defined(_M_X64) || defined(__x86_64__))
    // Process 4 bytes at a time
    while ((e-p) >= 4) {
      STEP4;
    }
#endif  // defined(_M_X64) || defined(__x86_64__)
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP8
#undef STEP4
#undef STEP1
  return l ^ 0xffffffffu;
#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
}

}  // namespace port
}  // namespace leveldb

@ -0,0 +1,66 @@

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "leveldb/env.h"

#include "port/port.h"
#include "util/testharness.h"
#include "util/env_posix_test_helper.h"

namespace leveldb {

static const int kDelayMicros = 100000;
static const int kReadOnlyFileLimit = 4;
static const int kMMapLimit = 4;

class EnvPosixTest {
 public:
  Env* env_;
  EnvPosixTest() : env_(Env::Default()) { }

  static void SetFileLimits(int read_only_file_limit, int mmap_limit) {
    EnvPosixTestHelper::SetReadOnlyFDLimit(read_only_file_limit);
    EnvPosixTestHelper::SetReadOnlyMMapLimit(mmap_limit);
  }
};

TEST(EnvPosixTest, TestOpenOnRead) {
  // Write some test data to a single file that will be opened |n| times.
  std::string test_dir;
  ASSERT_OK(env_->GetTestDirectory(&test_dir));
  std::string test_file = test_dir + "/open_on_read.txt";

  FILE* f = fopen(test_file.c_str(), "w");
  ASSERT_TRUE(f != NULL);
  const char kFileData[] = "abcdefghijklmnopqrstuvwxyz";
  fputs(kFileData, f);
  fclose(f);

  // Open test file some number above the sum of the two limits to force
  // open-on-read behavior of POSIX Env leveldb::RandomAccessFile.
  const int kNumFiles = kReadOnlyFileLimit + kMMapLimit + 5;
  leveldb::RandomAccessFile* files[kNumFiles] = {0};
  for (int i = 0; i < kNumFiles; i++) {
    ASSERT_OK(env_->NewRandomAccessFile(test_file, &files[i]));
  }
  char scratch;
  Slice read_result;
  for (int i = 0; i < kNumFiles; i++) {
    ASSERT_OK(files[i]->Read(i, 1, &read_result, &scratch));
    ASSERT_EQ(kFileData[i], read_result[0]);
  }
  for (int i = 0; i < kNumFiles; i++) {
    delete files[i];
  }
  ASSERT_OK(env_->DeleteFile(test_file));
}

}  // namespace leveldb

int main(int argc, char** argv) {
  // All tests currently run with the same read-only file limits.
  leveldb::EnvPosixTest::SetFileLimits(leveldb::kReadOnlyFileLimit,
                                       leveldb::kMMapLimit);
  return leveldb::test::RunAllTests();
}

@ -0,0 +1,28 @@

// Copyright 2017 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_
#define STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_

namespace leveldb {

class EnvPosixTest;

// A helper for the POSIX Env to facilitate testing.
class EnvPosixTestHelper {
 private:
  friend class EnvPosixTest;

  // Set the maximum number of read-only files that will be opened.
  // Must be called before creating an Env.
  static void SetReadOnlyFDLimit(int limit);

  // Set the maximum number of read-only files that will be mapped via mmap.
  // Must be called before creating an Env.
  static void SetReadOnlyMMapLimit(int limit);
};

}  // namespace leveldb

#endif  // STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_