More changes from upstream.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529
14 years ago · 13b72af77b
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -17,11 +17,14 @@

 // Comma-separated list of operations to run in the specified order
 //   Actual benchmarks:
 //      writeseq    -- write N values in sequential key order
 //      writerandom -- write N values in random key order
 //      writebig    -- write N/1000 100K valuesin random order
 //      readseq     -- read N values sequentially
 //      readrandom  -- read N values in random order
 //      fillseq       -- write N values in sequential key order in async mode
 //      fillrandom    -- write N values in random key order in async mode
 //      overwrite     -- overwrite N values in random key order in async mode
 //      fillsync      -- write N/100 values in random key order in sync mode
 //      fill100K      -- write N/1000 100K values in random order in async mode
 //      readseq       -- read N values sequentially
 //      readreverse   -- read N values in reverse order
 //      readrandom    -- read N values in random order
 //   Meta operations:
 //      compact     -- Compact the entire DB
 //      heapprofile -- Dump a heap profile (if supported by this port)
@ -30,10 +33,10 @@
 //      tenth       -- divide N by 10 (i.e., following benchmarks are smaller)
 //      normal      -- reset N back to its normal value (1000000)
 static const char* FLAGS_benchmarks =
    "writeseq,"
    "writeseq,"
    "writerandom,"
    "sync,tenth,tenth,writerandom,nosync,normal,"
    "fillseq,"
    "fillrandom,"
    "overwrite,"
    "fillsync,"
    "readseq,"
    "readreverse,"
    "readrandom,"
@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks =
    "readseq,"
    "readreverse,"
    "readrandom,"
    "writebig";
    "fill100K";

 // Number of key/values to place in database
 static int FLAGS_num = 1000000;
@ -51,7 +54,7 @@ static int FLAGS_value_size = 100;

 // Arrange to generate values that shrink to this fraction of
 // their original size after compression
 static double FLAGS_compression_ratio = 0.25;
 static double FLAGS_compression_ratio = 0.5;

 // Print histogram of operation timings
 static bool FLAGS_histogram = false;
@ -93,6 +96,19 @@ class RandomGenerator {
    return Slice(data_.data() + pos_ - len, len);
  }
 };

 static Slice TrimSpace(Slice s) {
  int start = 0;
  while (start < s.size() && isspace(s[start])) {
    start++;
  }
  int limit = s.size();
  while (limit > start && isspace(s[limit-1])) {
    limit--;
  }
  return Slice(s.data() + start, limit - start);
 }

 }

 class Benchmark {
@ -100,7 +116,6 @@ class Benchmark {
  Cache* cache_;
  DB* db_;
  int num_;
  bool sync_;
  int heap_counter_;
  double start_;
  double last_op_finish_;
@ -114,6 +129,70 @@ class Benchmark {
  int done_;
  int next_report_;     // When to report next

  void PrintHeader() {
    const int kKeySize = 16;
    PrintEnvironment();
    fprintf(stdout, "Keys:       %d bytes each\n", kKeySize);
    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
            FLAGS_value_size,
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
    fprintf(stdout, "Entries:    %d\n", num_);
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            (((kKeySize + FLAGS_value_size) * num_) / 1048576.0));
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
             / 1048576.0));
    PrintWarnings();
    fprintf(stdout, "------------------------------------------------\n");
  }

  void PrintWarnings() {
 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(stdout,
            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
            );
 #endif
 #ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
 #endif
  }

  void PrintEnvironment() {
    fprintf(stderr, "LevelDB:    version %d.%d\n",
            kMajorVersion, kMinorVersion);

 #if defined(__linux)
    time_t now = time(NULL);
    fprintf(stderr, "Date:       %s", ctime(&now));  // ctime() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != NULL) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != NULL) {
        const char* sep = strchr(line, ':');
        if (sep == NULL) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
 #endif
  }

  void Start() {
    start_ = Env::Default()->NowMicros() * 1e-6;
    bytes_ = 0;
@ -164,9 +243,10 @@ class Benchmark {
      snprintf(rate, sizeof(rate), "%5.1f MB/s",
               (bytes_ / 1048576.0) / (finish - start_));
      if (!message_.empty()) {
        message_.push_back(' ');
        message_  = std::string(rate) + " " + message_;
      } else {
        message_ = rate;
      }
      message_.append(rate);
    }

    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
@ -183,14 +263,16 @@ class Benchmark {
 public:
  enum Order {
    SEQUENTIAL,
    REVERSE,  // Currently only supported for reads
    RANDOM
  };
  enum DBState {
    FRESH,
    EXISTING
  };

  Benchmark() : cache_(NewLRUCache(200<<20)),
                db_(NULL),
                num_(FLAGS_num),
                sync_(false),
                heap_counter_(0),
                bytes_(0),
                rand_(301) {
@ -210,19 +292,8 @@ class Benchmark {
  }

  void Run() {
    Options options;
    options.create_if_missing = true;
    options.max_open_files = 10000;
    options.block_cache = cache_;
    options.write_buffer_size = FLAGS_write_buffer_size;

    Start();
    Status s = DB::Open(options, "/tmp/dbbench", &db_);
    Stop("open");
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    PrintHeader();
    Open();

    const char* benchmarks = FLAGS_benchmarks;
    while (benchmarks != NULL) {
@ -237,30 +308,30 @@ class Benchmark {
      }

      Start();
      if (name == Slice("writeseq")) {
        Write(SEQUENTIAL, num_, FLAGS_value_size);
      } else if (name == Slice("writerandom")) {
        Write(RANDOM, num_, FLAGS_value_size);
      } else if (name == Slice("writebig")) {
        Write(RANDOM, num_ / 1000, 100 * 1000);

      WriteOptions write_options;
      write_options.sync = false;
      if (name == Slice("fillseq")) {
        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size);
      } else if (name == Slice("fillrandom")) {
        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size);
      } else if (name == Slice("overwrite")) {
        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size);
      } else if (name == Slice("fillsync")) {
        write_options.sync = true;
        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size);
      } else if (name == Slice("fill100K")) {
        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000);
      } else if (name == Slice("readseq")) {
        Read(SEQUENTIAL);
        ReadSequential();
      } else if (name == Slice("readreverse")) {
        Read(REVERSE);
        ReadReverse();
      } else if (name == Slice("readrandom")) {
        Read(RANDOM);
        ReadRandom();
      } else if (name == Slice("compact")) {
        Compact();
      } else if (name == Slice("heapprofile")) {
        HeapProfile();
      } else if (name == Slice("sync")) {
        sync_ = true;
      } else if (name == Slice("nosync")) {
        sync_ = false;
      } else if (name == Slice("tenth")) {
        num_ = num_ / 10;
      } else if (name == Slice("normal")) {
        num_ = FLAGS_num;
      } else {
        fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
      }
@ -268,16 +339,44 @@ class Benchmark {
    }
  }

  void Write(Order order, int num_entries, int value_size) {
 private:
  void Open() {
    assert(db_ == NULL);
    Options options;
    options.create_if_missing = true;
    options.max_open_files = 10000;
    options.block_cache = cache_;
    options.write_buffer_size = FLAGS_write_buffer_size;
    Status s = DB::Open(options, "/tmp/dbbench", &db_);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }

  void Write(const WriteOptions& options, Order order, DBState state,
             int num_entries, int value_size) {
    if (state == FRESH) {
      delete db_;
      db_ = NULL;
      DestroyDB("/tmp/dbbench", Options());
      Open();
      Start();  // Do not count time taken to destroy/open
    }

    if (num_entries != num_) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
      message_ = msg;
    }

    WriteBatch batch;
    Status s;
    std::string val;
    WriteOptions options;
    options.sync = sync_;
    for (int i = 0; i < num_entries; i++) {
      const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
      char key[100];
      snprintf(key, sizeof(key), "%012d", k);
      snprintf(key, sizeof(key), "%016d", k);
      batch.Clear();
      batch.Put(key, gen_.Generate(value_size));
      s = db_->Write(options, &batch);
@ -290,42 +389,37 @@ class Benchmark {
    }
  }

  void Read(Order order) {
  void ReadSequential() {
    Iterator* iter = db_->NewIterator(ReadOptions());
    int i = 0;
    for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
      bytes_ += iter->key().size() + iter->value().size();
      FinishedSingleOp();
      ++i;
    }
    delete iter;
  }

  void ReadReverse() {
    Iterator* iter = db_->NewIterator(ReadOptions());
    int i = 0;
    for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
      bytes_ += iter->key().size() + iter->value().size();
      FinishedSingleOp();
      ++i;
    }
    delete iter;
  }

  void ReadRandom() {
    ReadOptions options;
    switch (order) {
      case SEQUENTIAL: {
        Iterator* iter = db_->NewIterator(options);
        int i = 0;
        for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
          bytes_ += iter->key().size() + iter->value().size();
          FinishedSingleOp();
          ++i;
        }
        delete iter;
        break;
      }
      case REVERSE: {
        Iterator* iter = db_->NewIterator(options);
        int i = 0;
        for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
          bytes_ += iter->key().size() + iter->value().size();
          FinishedSingleOp();
          ++i;
        }
        delete iter;
        break;
      }
      case RANDOM: {
        std::string value;
        for (int i = 0; i < num_; i++) {
          char key[100];
          const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
          snprintf(key, sizeof(key), "%012d", k);
          db_->Get(options, key, &value);
          FinishedSingleOp();
        }
        break;
      }
    std::string value;
    for (int i = 0; i < num_; i++) {
      char key[100];
      const int k = rand_.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d", k);
      db_->Get(options, key, &value);
      FinishedSingleOp();
    }
  }

--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() {
  }

  Status status;
  if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) {
  if (c->IsTrivialMove()) {
    // Move file to next level
    assert(c->num_input_files(0) == 1);
    FileMetaData* f = c->input(0, 0);
    c->edit()->DeleteFile(c->level(), f->number);
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
  bool has_current_user_key = false;
  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
    // Handle key/value, add to state, etc.
    Slice key = input->key();
    InternalKey tmp_internal_key;
    tmp_internal_key.DecodeFrom(key);
    if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
        compact->builder != NULL) {
      status = FinishCompactionOutputFile(compact, input);
      if (!status.ok()) {
        break;
      }
    }

    // Handle key/value, add to state, etc.
    bool drop = false;
    if (!ParseInternalKey(key, &ikey)) {
      // Do not hide error keys
@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() {
  return NewInternalIterator(ReadOptions(), &ignored);
 }

 int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() {
  MutexLock l(&mutex_);
  return versions_->MaxNextLevelOverlappingBytes();
 }

 Status DBImpl::Get(const ReadOptions& options,
                   const Slice& key,
                   std::string* value) {
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -55,6 +55,10 @@ class DBImpl : public DB {
  // The returned iterator should be deleted when no longer needed.
  Iterator* TEST_NewInternalIterator();

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64 TEST_MaxNextLevelOverlappingBytes();

 private:
  friend class DB;

--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -72,15 +72,19 @@ class DBTest {
  }

  Status Put(const std::string& k, const std::string& v) {
    WriteOptions options;
    options.sync = false;
    WriteBatch batch;
    batch.Put(k, v);
    return db_->Write(WriteOptions(), &batch);
    return db_->Write(options, &batch);
  }

  Status Delete(const std::string& k) {
    WriteOptions options;
    options.sync = false;
    WriteBatch batch;
    batch.Delete(k);
    return db_->Write(WriteOptions(), &batch);
    return db_->Write(options, &batch);
  }

  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
@ -176,6 +180,35 @@ class DBTest {
    fprintf(stderr, "Found %d live large value files\n", (int)live.size());
    return live;
  }

  void Compact(const Slice& start, const Slice& limit) {
    dbfull()->TEST_CompactMemTable();
    int max_level_with_files = 1;
    for (int level = 1; level < config::kNumLevels; level++) {
      uint64_t v;
      char name[100];
      snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
      if (dbfull()->GetProperty(name, &v) && v > 0) {
        max_level_with_files = level;
      }
    }
    for (int level = 0; level < max_level_with_files; level++) {
      dbfull()->TEST_CompactRange(level, "", "~");
    }
  }

  void DumpFileCounts(const char* label) {
    fprintf(stderr, "---\n%s:\n", label);
    fprintf(stderr, "maxoverlap: %lld\n",
            static_cast<long long>(
                dbfull()->TEST_MaxNextLevelOverlappingBytes()));
    for (int level = 0; level < config::kNumLevels; level++) {
      int num = NumTableFilesAtLevel(level);
      if (num > 0) {
        fprintf(stderr, "  level %3d : %d files\n", level, num);
      }
    }
  }
 };

 TEST(DBTest, Empty) {
@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
  }
 }

 TEST(DBTest, SparseMerge) {
  Options options;
  options.compression = kNoCompression;
  Reopen(&options);

  // Suppose there is:
  //    small amount of data with prefix A
  //    large amount of data with prefix B
  //    small amount of data with prefix C
  // and that recent updates have made small changes to all three prefixes.
  // Check that we do not do a compaction that merges all of B in one shot.
  const std::string value(1000, 'x');
  Put("A", "va");
  // Write approximately 100MB of "B" values
  for (int i = 0; i < 100000; i++) {
    char key[100];
    snprintf(key, sizeof(key), "B%010d", i);
    Put(key, value);
  }
  Put("C", "vc");
  Compact("", "z");

  // Make sparse update
  Put("A",    "va2");
  Put("B100", "bvalue2");
  Put("C",    "vc2");
  dbfull()->TEST_CompactMemTable();

  // Compactions should not cause us to create a situation where
  // a file overlaps too much data at the next level.
  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
  dbfull()->TEST_CompactRange(0, "", "z");
  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
  dbfull()->TEST_CompactRange(1, "", "z");
  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
 }

 static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  bool result = (val >= low) && (val <= high);
  if (!result) {
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -20,6 +20,10 @@

 namespace leveldb {

 // Maximum number of overlaps in grandparent (i.e., level+2) before we
 // stop building a single file in a level->level+1 compaction.
 static const int kMaxGrandParentFiles = 10;

 static double MaxBytesForLevel(int level) {
  if (level == 0) {
    return 4 * 1048576.0;
@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) {
  double best_score = -1;

  Status s;
  for (int level = 0; s.ok() && level < config::kNumLevels; level++) {
  for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
    s = SortLevel(v, level);

    // Compute the ratio of current size to size limit.
@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set* live) {
  }
 }

 int64 VersionSet::MaxNextLevelOverlappingBytes() {
  int64 result = 0;
  std::vector<FileMetaData*> overlaps;
  for (int level = 0; level < config::kNumLevels - 1; level++) {
    for (int i = 0; i < current_->files_[level].size(); i++) {
      const FileMetaData* f = current_->files_[level][i];
      GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
      int64 sum = 0;
      for (int j = 0; j < overlaps.size(); j++) {
        sum += overlaps[j]->file_size;
      }
      if (sum > result) {
        result = sum;
      }
    }
  }
  return result;
 }

 // Store in "*inputs" all files in "level" that overlap [begin,end]
 void VersionSet::GetOverlappingInputs(
    int level,
@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector& inputs,
  }
 }

 // Stores the minimal range that covers all entries in inputs1 and inputs2
 // in *smallest, *largest.
 // REQUIRES: inputs is not empty
 void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
                           const std::vector<FileMetaData*>& inputs2,
                           InternalKey* smallest,
                           InternalKey* largest) {
  std::vector<FileMetaData*> all = inputs1;
  all.insert(all.end(), inputs2.begin(), inputs2.end());
  GetRange(all, smallest, largest);
 }

 Iterator* VersionSet::MakeInputIterator(Compaction* c) {
  ReadOptions options;
  options.verify_checksums = options_->paranoid_checks;
@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() {
  }
  const int level = current_->compaction_level_;
  assert(level >= 0);
  assert(level+1 < config::kNumLevels);

  Compaction* c = new Compaction(level);
  c->input_version_ = current_;
@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() {
    c->inputs_[0].push_back(current_->files_[level][0]);
  }

  // Find the range we are compacting
  InternalKey smallest, largest;
  GetRange(c->inputs_[0], &smallest, &largest);

  // Files in level 0 may overlap each other, so pick up all overlapping ones
  if (level == 0) {
    InternalKey smallest, largest;
    GetRange(c->inputs_[0], &smallest, &largest);
    // Note that the next call will discard the file we placed in
    // c->inputs_[0] earlier and replace it with an overlapping set
    // which will include the picked file.
    GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
    assert(!c->inputs_[0].empty());
    GetRange(c->inputs_[0], &smallest, &largest);
  }

  SetupOtherInputs(c);

  return c;
 }

 void VersionSet::SetupOtherInputs(Compaction* c) {
  const int level = c->level();
  InternalKey smallest, largest;
  GetRange(c->inputs_[0], &smallest, &largest);

  GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);

  // Get entire range covered by compaction
  InternalKey all_start, all_limit;
  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);

  // See if we can grow the number of inputs in "level" without
  // changing the number of "level+1" files we pick up.
  if (!c->inputs_[1].empty()) {
    // Get entire range covered by compaction
    std::vector<FileMetaData*> all = c->inputs_[0];
    all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end());
    InternalKey all_start, all_limit;
    GetRange(all, &all_start, &all_limit);

    std::vector<FileMetaData*> expanded0;
    GetOverlappingInputs(level, all_start, all_limit, &expanded0);
    if (expanded0.size() > c->inputs_[0].size()) {
@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() {
        largest = new_limit;
        c->inputs_[0] = expanded0;
        c->inputs_[1] = expanded1;
        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
      }
    }
  }

  // Compute the set of grandparent files that overlap this compaction
  // (parent == level+1; grandparent == level+2)
  if (level + 2 < config::kNumLevels) {
    GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
  }

  if (false) {
    Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
        level,
@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() {
  // key range next time.
  compact_pointer_[level] = largest.Encode().ToString();
  c->edit_.SetCompactPointer(level, largest);

  return c;
 }

 Compaction* VersionSet::CompactRange(
@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange(
  c->input_version_ = current_;
  c->input_version_->Ref();
  c->inputs_[0] = inputs;

  // Find the range we are compacting
  InternalKey smallest, largest;
  GetRange(c->inputs_[0], &smallest, &largest);

  GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
  if (false) {
    Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
        level,
        EscapeString(smallest.Encode()).c_str(),
        EscapeString(largest.Encode()).c_str());
  }
  SetupOtherInputs(c);
  return c;
 }

 Compaction::Compaction(int level)
    : level_(level),
      max_output_file_size_(MaxFileSizeForLevel(level)),
      input_version_(NULL) {
      input_version_(NULL),
      grandparent_index_(0),
      output_start_(-1) {
  for (int i = 0; i < config::kNumLevels; i++) {
    level_ptrs_[i] = 0;
  }
@ -964,6 +1001,15 @@ Compaction::~Compaction() {
  }
 }

 bool Compaction::IsTrivialMove() const {
  // Avoid a move if there are lots of overlapping grandparent files.
  // Otherwise, the move could create a parent file that will require
  // a very expensive merge later on.
  return (num_input_files(0) == 1
          && num_input_files(1) == 0
          && grandparents_.size() <= kMaxGrandParentFiles);
 }

 void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int which = 0; which < 2; which++) {
    for (int i = 0; i < inputs_[which].size(); i++) {
@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
  return true;
 }

 bool Compaction::ShouldStopBefore(const InternalKey& key) {
  // Scan to find earliest grandparent file that contains key.
  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
  while (grandparent_index_ < grandparents_.size() &&
      icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) {
    grandparent_index_++;
  }

  // First call?
  if (output_start_ < 0) {
    output_start_ = grandparent_index_;
  }

  if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) {
    // Too many overlaps for current output; start new output
    output_start_ = grandparent_index_;
    return true;
  } else {
    return false;
  }
 }

 void Compaction::ReleaseInputs() {
  if (input_version_ != NULL) {
    input_version_->Unref();
--- a/db/version_set.h
+++ b/db/version_set.h
@ -139,6 +139,10 @@ class VersionSet {
      const InternalKey& begin,
      const InternalKey& end);

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64 MaxNextLevelOverlappingBytes();

  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);
@ -195,6 +199,13 @@ class VersionSet {
                InternalKey* smallest,
                InternalKey* largest);

  void GetRange2(const std::vector<FileMetaData*>& inputs1,
                 const std::vector<FileMetaData*>& inputs2,
                 InternalKey* smallest,
                 InternalKey* largest);

  void SetupOtherInputs(Compaction* c);

  Env* const env_;
  const std::string dbname_;
  const Options* const options_;
@ -250,6 +261,10 @@ class Compaction {
  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

  // Is this a trivial compaction that can be implemented by just
  // moving a single input file to the next level (no merging or splitting)
  bool IsTrivialMove() const;

  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

@ -258,6 +273,10 @@ class Compaction {
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

  // Returns true iff we should stop building the current output
  // before processing "key".
  bool ShouldStopBefore(const InternalKey& key);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();
@ -276,6 +295,12 @@ class Compaction {
  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs

  // State used to check for number of of overlapping grandparent files
  // (parent == level_ + 1, grandparent == level_ + 2)
  std::vector<FileMetaData*> grandparents_;
  int grandparent_index_;   // Index in grandparent_starts_
  int output_start_;        // Index in grandparent_starts_ where output started

  // State for implementing IsBaseLevelForKey

  // level_ptrs_ holds indices into input_version_->levels_: our state
--- a/doc/impl.html
+++ b/doc/impl.html
@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other.
 A compaction merges the contents of the picked files to produce a
 sequence of level-(L+1) files.  We switch to producing a new
 level-(L+1) file after the current output file has reached the target
 file size (2MB).  The old files are discarded and the new files are
 added to the serving state.
 file size (2MB).  We also switch to a new output file when the key
 range of the current output file has grown enough to overlap more then
 ten level-(L+2) files.  This last rule ensures that a later compaction
 of a level-(L+1) file will not pick up too much data from level-(L+2).

 <p>
 The old files are discarded and the new files are added to the serving
 state.

 <p>
 Compactions for a particular level rotate through the key space.  In
--- a/include/db.h
+++ b/include/db.h
@ -12,6 +12,9 @@

 namespace leveldb {

 static const int kMajorVersion = 1;
 static const int kMinorVersion = 0;

 struct Options;
 struct ReadOptions;
 struct WriteOptions;
--- a/leveldb.gyp
+++ b/leveldb.gyp
@ -4,7 +4,7 @@

 {
  'variables': {
    'use_snappy%': 1,
    'use_snappy%': 0,
  },
  'target_defaults': {
    'defines': [