From 13b72af77b2b7fed156c26107476f84d35e4481d Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Tue, 22 Mar 2011 18:32:49 +0000 Subject: [PATCH] More changes from upstream. git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_bench.cc | 264 ++++++++++++++++++++++++++++++++++++------------------ db/db_impl.cc | 20 ++++- db/db_impl.h | 4 + db/db_test.cc | 74 ++++++++++++++- db/version_set.cc | 122 +++++++++++++++++++------ db/version_set.h | 25 ++++++ doc/impl.html | 10 ++- include/db.h | 3 + leveldb.gyp | 2 +- 9 files changed, 405 insertions(+), 119 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index db8deea..72e0699 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -17,11 +17,14 @@ // Comma-separated list of operations to run in the specified order // Actual benchmarks: -// writeseq -- write N values in sequential key order -// writerandom -- write N values in random key order -// writebig -- write N/1000 100K valuesin random order -// readseq -- read N values sequentially -// readrandom -- read N values in random order +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillsync -- write N/100 values in random key order in sync mode +// fill100K -- write N/1000 100K values in random order in async mode +// readseq -- read N values sequentially +// readreverse -- read N values in reverse order +// readrandom -- read N values in random order // Meta operations: // compact -- Compact the entire DB // heapprofile -- Dump a heap profile (if supported by this port) @@ -30,10 +33,10 @@ // tenth -- divide N by 10 (i.e., following benchmarks are smaller) // normal -- reset N back to its normal value (1000000) static const char* FLAGS_benchmarks = - "writeseq," - "writeseq," - "writerandom," - "sync,tenth,tenth,writerandom,nosync,normal," + 
"fillseq," + "fillrandom," + "overwrite," + "fillsync," "readseq," "readreverse," "readrandom," @@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks = "readseq," "readreverse," "readrandom," - "writebig"; + "fill100K"; // Number of key/values to place in database static int FLAGS_num = 1000000; @@ -51,7 +54,7 @@ static int FLAGS_value_size = 100; // Arrange to generate values that shrink to this fraction of // their original size after compression -static double FLAGS_compression_ratio = 0.25; +static double FLAGS_compression_ratio = 0.5; // Print histogram of operation timings static bool FLAGS_histogram = false; @@ -93,6 +96,19 @@ class RandomGenerator { return Slice(data_.data() + pos_ - len, len); } }; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + } class Benchmark { @@ -100,7 +116,6 @@ class Benchmark { Cache* cache_; DB* db_; int num_; - bool sync_; int heap_counter_; double start_; double last_op_finish_; @@ -114,6 +129,70 @@ class Benchmark { int done_; int next_report_; // When to report next + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size) * num_) / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + 
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + void Start() { start_ = Env::Default()->NowMicros() * 1e-6; bytes_ = 0; @@ -164,9 +243,10 @@ class Benchmark { snprintf(rate, sizeof(rate), "%5.1f MB/s", (bytes_ / 1048576.0) / (finish - start_)); if (!message_.empty()) { - message_.push_back(' '); + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; } - message_.append(rate); } fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", @@ -183,14 +263,16 @@ class Benchmark { public: enum Order { SEQUENTIAL, - REVERSE, // Currently only supported for reads RANDOM }; + enum DBState { + FRESH, + EXISTING + }; Benchmark() : cache_(NewLRUCache(200<<20)), db_(NULL), num_(FLAGS_num), - sync_(false), heap_counter_(0), bytes_(0), rand_(301) { @@ -210,19 +292,8 @@ class Benchmark { } void Run() { - Options options; - options.create_if_missing = true; - options.max_open_files = 10000; - options.block_cache 
= cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - - Start(); - Status s = DB::Open(options, "/tmp/dbbench", &db_); - Stop("open"); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } + PrintHeader(); + Open(); const char* benchmarks = FLAGS_benchmarks; while (benchmarks != NULL) { @@ -237,30 +308,30 @@ class Benchmark { } Start(); - if (name == Slice("writeseq")) { - Write(SEQUENTIAL, num_, FLAGS_value_size); - } else if (name == Slice("writerandom")) { - Write(RANDOM, num_, FLAGS_value_size); - } else if (name == Slice("writebig")) { - Write(RANDOM, num_ / 1000, 100 * 1000); + + WriteOptions write_options; + write_options.sync = false; + if (name == Slice("fillseq")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("fillrandom")) { + Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("overwrite")) { + Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size); + } else if (name == Slice("fillsync")) { + write_options.sync = true; + Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size); + } else if (name == Slice("fill100K")) { + Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000); } else if (name == Slice("readseq")) { - Read(SEQUENTIAL); + ReadSequential(); } else if (name == Slice("readreverse")) { - Read(REVERSE); + ReadReverse(); } else if (name == Slice("readrandom")) { - Read(RANDOM); + ReadRandom(); } else if (name == Slice("compact")) { Compact(); } else if (name == Slice("heapprofile")) { HeapProfile(); - } else if (name == Slice("sync")) { - sync_ = true; - } else if (name == Slice("nosync")) { - sync_ = false; - } else if (name == Slice("tenth")) { - num_ = num_ / 10; - } else if (name == Slice("normal")) { - num_ = FLAGS_num; } else { fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); } @@ -268,16 +339,44 @@ class Benchmark { } } - void Write(Order order, int 
num_entries, int value_size) { + private: + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.max_open_files = 10000; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + WriteBatch batch; Status s; std::string val; - WriteOptions options; - options.sync = sync_; for (int i = 0; i < num_entries; i++) { const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); char key[100]; - snprintf(key, sizeof(key), "%012d", k); + snprintf(key, sizeof(key), "%016d", k); batch.Clear(); batch.Put(key, gen_.Generate(value_size)); s = db_->Write(options, &batch); @@ -290,42 +389,37 @@ class Benchmark { } } - void Read(Order order) { + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { ReadOptions options; - switch (order) { - case SEQUENTIAL: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToFirst(); i < num_ && 
iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case REVERSE: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case RANDOM: { - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); - snprintf(key, sizeof(key), "%012d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - break; - } + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 5008af0..4d66044 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() { } Status status; - if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) { + if (c->IsTrivialMove()) { // Move file to next level + assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); c->edit()->DeleteFile(c->level(), f->number); c->edit()->AddFile(c->level() + 1, f->number, f->file_size, @@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { bool has_current_user_key = false; SequenceNumber last_sequence_for_key = kMaxSequenceNumber; for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Handle key/value, add to state, etc. Slice key = input->key(); + InternalKey tmp_internal_key; + tmp_internal_key.DecodeFrom(key); + if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. 
bool drop = false; if (!ParseInternalKey(key, &ikey)) { // Do not hide error keys @@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() { return NewInternalIterator(ReadOptions(), &ignored); } +int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->MaxNextLevelOverlappingBytes(); +} + Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { diff --git a/db/db_impl.h b/db/db_impl.h index fc3d3f2..980d512 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -55,6 +55,10 @@ class DBImpl : public DB { // The returned iterator should be deleted when no longer needed. Iterator* TEST_NewInternalIterator(); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64 TEST_MaxNextLevelOverlappingBytes(); + private: friend class DB; diff --git a/db/db_test.cc b/db/db_test.cc index 895a5e1..f8accf6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -72,15 +72,19 @@ class DBTest { } Status Put(const std::string& k, const std::string& v) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Put(k, v); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } Status Delete(const std::string& k) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Delete(k); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { @@ -176,6 +180,35 @@ class DBTest { fprintf(stderr, "Found %d live large value files\n", (int)live.size()); return live; } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + uint64_t v; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (dbfull()->GetProperty(name, &v) && v > 0) { + max_level_with_files 
= level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } }; TEST(DBTest, Empty) { @@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } } +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. 
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { diff --git a/db/version_set.cc b/db/version_set.cc index 2435fa2..46333f4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -20,6 +20,10 @@ namespace leveldb { +// Maximum number of overlaps in grandparent (i.e., level+2) before we +// stop building a single file in a level->level+1 compaction. +static const int kMaxGrandParentFiles = 10; + static double MaxBytesForLevel(int level) { if (level == 0) { return 4 * 1048576.0; @@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) { double best_score = -1; Status s; - for (int level = 0; s.ok() && level < config::kNumLevels; level++) { + for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { s = SortLevel(v, level); // Compute the ratio of current size to size limit. 
@@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set* live) { } } +int64 VersionSet::MaxNextLevelOverlappingBytes() { + int64 result = 0; + std::vector overlaps; + for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int i = 0; i < current_->files_[level].size(); i++) { + const FileMetaData* f = current_->files_[level][i]; + GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + int64 sum = 0; + for (int j = 0; j < overlaps.size(); j++) { + sum += overlaps[j]->file_size; + } + if (sum > result) { + result = sum; + } + } + } + return result; +} + // Store in "*inputs" all files in "level" that overlap [begin,end] void VersionSet::GetOverlappingInputs( int level, @@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector& inputs, } } +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + Iterator* VersionSet::MakeInputIterator(Compaction* c) { ReadOptions options; options.verify_checksums = options_->paranoid_checks; @@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() { } const int level = current_->compaction_level_; assert(level >= 0); + assert(level+1 < config::kNumLevels); Compaction* c = new Compaction(level); c->input_version_ = current_; @@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() { c->inputs_[0].push_back(current_->files_[level][0]); } - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); // Note that the next 
call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); - GetRange(c->inputs_[0], &smallest, &largest); } + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + // See if we can grow the number of inputs in "level" without // changing the number of "level+1" files we pick up. if (!c->inputs_[1].empty()) { - // Get entire range covered by compaction - std::vector all = c->inputs_[0]; - all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end()); - InternalKey all_start, all_limit; - GetRange(all, &all_start, &all_limit); - std::vector expanded0; GetOverlappingInputs(level, all_start, all_limit, &expanded0); if (expanded0.size() > c->inputs_[0].size()) { @@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() { largest = new_limit; c->inputs_[0] = expanded0; c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); } } } + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + if (false) { Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", level, @@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() { // key range next time. 
compact_pointer_[level] = largest.Encode().ToString(); c->edit_.SetCompactPointer(level, largest); - - return c; } Compaction* VersionSet::CompactRange( @@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange( c->input_version_ = current_; c->input_version_->Ref(); c->inputs_[0] = inputs; - - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } + SetupOtherInputs(c); return c; } Compaction::Compaction(int level) : level_(level), max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL) { + input_version_(NULL), + grandparent_index_(0), + output_start_(-1) { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -964,6 +1001,15 @@ Compaction::~Compaction() { } } +bool Compaction::IsTrivialMove() const { + // Avoid a move if there are lots of overlapping grandparent files. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + return (num_input_files(0) == 1 + && num_input_files(1) == 0 + && grandparents_.size() <= kMaxGrandParentFiles); +} + void Compaction::AddInputDeletions(VersionEdit* edit) { for (int which = 0; which < 2; which++) { for (int i = 0; i < inputs_[which].size(); i++) { @@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { return true; } +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + grandparent_index_++; + } + + // First call? 
+ if (output_start_ < 0) { + output_start_ = grandparent_index_; + } + + if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) { + // Too many overlaps for current output; start new output + output_start_ = grandparent_index_; + return true; + } else { + return false; + } +} + void Compaction::ReleaseInputs() { if (input_version_ != NULL) { input_version_->Unref(); diff --git a/db/version_set.h b/db/version_set.h index b8eee3d..0ea763d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -139,6 +139,10 @@ class VersionSet { const InternalKey& begin, const InternalKey& end); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64 MaxNextLevelOverlappingBytes(); + // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); @@ -195,6 +199,13 @@ class VersionSet { InternalKey* smallest, InternalKey* largest); + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -250,6 +261,10 @@ class Compaction { // Maximum size of files to build during this compaction. uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + // Add all inputs to this compaction as delete operations to *edit. void AddInputDeletions(VersionEdit* edit); @@ -258,6 +273,10 @@ class Compaction { // in levels greater than "level+1". bool IsBaseLevelForKey(const Slice& user_key); + // Returns true iff we should stop building the current output + // before processing "key". 
+ bool ShouldStopBefore(const InternalKey& key); + // Release the input version for the compaction, once the compaction // is successful. void ReleaseInputs(); @@ -276,6 +295,12 @@ class Compaction { // Each compaction reads inputs from "level_" and "level_+1" std::vector inputs_[2]; // The two sets of inputs + // State used to check for number of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + int grandparent_index_; // Index in grandparents_ + int output_start_; // Index in grandparents_ where output started + // State for implementing IsBaseLevelForKey // level_ptrs_ holds indices into input_version_->levels_: our state diff --git a/doc/impl.html b/doc/impl.html index 2f2c809..b190d2c 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other. A compaction merges the contents of the picked files to produce a sequence of level-(L+1) files. We switch to producing a new level-(L+1) file after the current output file has reached the target -file size (2MB). The old files are discarded and the new files are -added to the serving state. +file size (2MB). We also switch to a new output file when the key +range of the current output file has grown enough to overlap more than +ten level-(L+2) files. This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). + +

+The old files are discarded and the new files are added to the serving +state.

Compactions for a particular level rotate through the key space. In diff --git a/include/db.h b/include/db.h index c4d152d..75be1ca 100644 --- a/include/db.h +++ b/include/db.h @@ -12,6 +12,9 @@ namespace leveldb { +static const int kMajorVersion = 1; +static const int kMinorVersion = 0; + struct Options; struct ReadOptions; struct WriteOptions; diff --git a/leveldb.gyp b/leveldb.gyp index 934f2d0..eb809f3 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -4,7 +4,7 @@ { 'variables': { - 'use_snappy%': 1, + 'use_snappy%': 0, }, 'target_defaults': { 'defines': [