From 13b72af77b2b7fed156c26107476f84d35e4481d Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Tue, 22 Mar 2011 18:32:49 +0000 Subject: [PATCH] More changes from upstream. git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_bench.cc | 264 ++++++++++++++++++++++++++++++++++++------------------ db/db_impl.cc | 20 ++++- db/db_impl.h | 4 + db/db_test.cc | 74 ++++++++++++++- db/version_set.cc | 122 +++++++++++++++++++------ db/version_set.h | 25 ++++++ doc/impl.html | 10 ++- include/db.h | 3 + leveldb.gyp | 2 +- 9 files changed, 405 insertions(+), 119 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index db8deea..72e0699 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -17,11 +17,14 @@ // Comma-separated list of operations to run in the specified order // Actual benchmarks: -// writeseq -- write N values in sequential key order -// writerandom -- write N values in random key order -// writebig -- write N/1000 100K valuesin random order -// readseq -- read N values sequentially -// readrandom -- read N values in random order +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillsync -- write N/100 values in random key order in sync mode +// fill100K -- write N/1000 100K values in random order in async mode +// readseq -- read N values sequentially +// readreverse -- read N values in reverse order +// readrandom -- read N values in random order // Meta operations: // compact -- Compact the entire DB // heapprofile -- Dump a heap profile (if supported by this port) @@ -30,10 +33,10 @@ // tenth -- divide N by 10 (i.e., following benchmarks are smaller) // normal -- reset N back to its normal value (1000000) static const char* FLAGS_benchmarks = - "writeseq," - "writeseq," - "writerandom," - "sync,tenth,tenth,writerandom,nosync,normal," + 
"fillseq," + "fillrandom," + "overwrite," + "fillsync," "readseq," "readreverse," "readrandom," @@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks = "readseq," "readreverse," "readrandom," - "writebig"; + "fill100K"; // Number of key/values to place in database static int FLAGS_num = 1000000; @@ -51,7 +54,7 @@ static int FLAGS_value_size = 100; // Arrange to generate values that shrink to this fraction of // their original size after compression -static double FLAGS_compression_ratio = 0.25; +static double FLAGS_compression_ratio = 0.5; // Print histogram of operation timings static bool FLAGS_histogram = false; @@ -93,6 +96,19 @@ class RandomGenerator { return Slice(data_.data() + pos_ - len, len); } }; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + } class Benchmark { @@ -100,7 +116,6 @@ class Benchmark { Cache* cache_; DB* db_; int num_; - bool sync_; int heap_counter_; double start_; double last_op_finish_; @@ -114,6 +129,70 @@ class Benchmark { int done_; int next_report_; // When to report next + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size) * num_) / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + 
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + void Start() { start_ = Env::Default()->NowMicros() * 1e-6; bytes_ = 0; @@ -164,9 +243,10 @@ class Benchmark { snprintf(rate, sizeof(rate), "%5.1f MB/s", (bytes_ / 1048576.0) / (finish - start_)); if (!message_.empty()) { - message_.push_back(' '); + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; } - message_.append(rate); } fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", @@ -183,14 +263,16 @@ class Benchmark { public: enum Order { SEQUENTIAL, - REVERSE, // Currently only supported for reads RANDOM }; + enum DBState { + FRESH, + EXISTING + }; Benchmark() : cache_(NewLRUCache(200<<20)), db_(NULL), num_(FLAGS_num), - sync_(false), heap_counter_(0), bytes_(0), rand_(301) { @@ -210,19 +292,8 @@ class Benchmark { } void Run() { - Options options; - options.create_if_missing = true; - options.max_open_files = 10000; - options.block_cache 
= cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - - Start(); - Status s = DB::Open(options, "/tmp/dbbench", &db_); - Stop("open"); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } + PrintHeader(); + Open(); const char* benchmarks = FLAGS_benchmarks; while (benchmarks != NULL) { @@ -237,30 +308,30 @@ class Benchmark { } Start(); - if (name == Slice("writeseq")) { - Write(SEQUENTIAL, num_, FLAGS_value_size); - } else if (name == Slice("writerandom")) { - Write(RANDOM, num_, FLAGS_value_size); - } else if (name == Slice("writebig")) { - Write(RANDOM, num_ / 1000, 100 * 1000); + + WriteOptions write_options; + write_options.sync = false; + if (name == Slice("fillseq")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("fillrandom")) { + Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("overwrite")) { + Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size); + } else if (name == Slice("fillsync")) { + write_options.sync = true; + Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size); + } else if (name == Slice("fill100K")) { + Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000); } else if (name == Slice("readseq")) { - Read(SEQUENTIAL); + ReadSequential(); } else if (name == Slice("readreverse")) { - Read(REVERSE); + ReadReverse(); } else if (name == Slice("readrandom")) { - Read(RANDOM); + ReadRandom(); } else if (name == Slice("compact")) { Compact(); } else if (name == Slice("heapprofile")) { HeapProfile(); - } else if (name == Slice("sync")) { - sync_ = true; - } else if (name == Slice("nosync")) { - sync_ = false; - } else if (name == Slice("tenth")) { - num_ = num_ / 10; - } else if (name == Slice("normal")) { - num_ = FLAGS_num; } else { fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); } @@ -268,16 +339,44 @@ class Benchmark { } } - void Write(Order order, int 
num_entries, int value_size) { + private: + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.max_open_files = 10000; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + WriteBatch batch; Status s; std::string val; - WriteOptions options; - options.sync = sync_; for (int i = 0; i < num_entries; i++) { const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); char key[100]; - snprintf(key, sizeof(key), "%012d", k); + snprintf(key, sizeof(key), "%016d", k); batch.Clear(); batch.Put(key, gen_.Generate(value_size)); s = db_->Write(options, &batch); @@ -290,42 +389,37 @@ class Benchmark { } } - void Read(Order order) { + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { ReadOptions options; - switch (order) { - case SEQUENTIAL: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToFirst(); i < num_ && 
iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case REVERSE: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case RANDOM: { - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); - snprintf(key, sizeof(key), "%012d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - break; - } + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 5008af0..4d66044 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() { } Status status; - if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) { + if (c->IsTrivialMove()) { // Move file to next level + assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); c->edit()->DeleteFile(c->level(), f->number); c->edit()->AddFile(c->level() + 1, f->number, f->file_size, @@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { bool has_current_user_key = false; SequenceNumber last_sequence_for_key = kMaxSequenceNumber; for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Handle key/value, add to state, etc. Slice key = input->key(); + InternalKey tmp_internal_key; + tmp_internal_key.DecodeFrom(key); + if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. 
bool drop = false; if (!ParseInternalKey(key, &ikey)) { // Do not hide error keys @@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() { return NewInternalIterator(ReadOptions(), &ignored); } +int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->MaxNextLevelOverlappingBytes(); +} + Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { diff --git a/db/db_impl.h b/db/db_impl.h index fc3d3f2..980d512 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -55,6 +55,10 @@ class DBImpl : public DB { // The returned iterator should be deleted when no longer needed. Iterator* TEST_NewInternalIterator(); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64 TEST_MaxNextLevelOverlappingBytes(); + private: friend class DB; diff --git a/db/db_test.cc b/db/db_test.cc index 895a5e1..f8accf6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -72,15 +72,19 @@ class DBTest { } Status Put(const std::string& k, const std::string& v) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Put(k, v); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } Status Delete(const std::string& k) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Delete(k); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { @@ -176,6 +180,35 @@ class DBTest { fprintf(stderr, "Found %d live large value files\n", (int)live.size()); return live; } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + uint64_t v; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (dbfull()->GetProperty(name, &v) && v > 0) { + max_level_with_files 
= level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } }; TEST(DBTest, Empty) { @@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } } +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. 
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { diff --git a/db/version_set.cc b/db/version_set.cc index 2435fa2..46333f4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -20,6 +20,10 @@ namespace leveldb { +// Maximum number of overlaps in grandparent (i.e., level+2) before we +// stop building a single file in a level->level+1 compaction. +static const int kMaxGrandParentFiles = 10; + static double MaxBytesForLevel(int level) { if (level == 0) { return 4 * 1048576.0; @@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) { double best_score = -1; Status s; - for (int level = 0; s.ok() && level < config::kNumLevels; level++) { + for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { s = SortLevel(v, level); // Compute the ratio of current size to size limit. 
@@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set* live) { } } +int64 VersionSet::MaxNextLevelOverlappingBytes() { + int64 result = 0; + std::vector overlaps; + for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int i = 0; i < current_->files_[level].size(); i++) { + const FileMetaData* f = current_->files_[level][i]; + GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + int64 sum = 0; + for (int j = 0; j < overlaps.size(); j++) { + sum += overlaps[j]->file_size; + } + if (sum > result) { + result = sum; + } + } + } + return result; +} + // Store in "*inputs" all files in "level" that overlap [begin,end] void VersionSet::GetOverlappingInputs( int level, @@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector& inputs, } } +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + Iterator* VersionSet::MakeInputIterator(Compaction* c) { ReadOptions options; options.verify_checksums = options_->paranoid_checks; @@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() { } const int level = current_->compaction_level_; assert(level >= 0); + assert(level+1 < config::kNumLevels); Compaction* c = new Compaction(level); c->input_version_ = current_; @@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() { c->inputs_[0].push_back(current_->files_[level][0]); } - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); // Note that the next 
call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); - GetRange(c->inputs_[0], &smallest, &largest); } + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + // See if we can grow the number of inputs in "level" without // changing the number of "level+1" files we pick up. if (!c->inputs_[1].empty()) { - // Get entire range covered by compaction - std::vector all = c->inputs_[0]; - all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end()); - InternalKey all_start, all_limit; - GetRange(all, &all_start, &all_limit); - std::vector expanded0; GetOverlappingInputs(level, all_start, all_limit, &expanded0); if (expanded0.size() > c->inputs_[0].size()) { @@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() { largest = new_limit; c->inputs_[0] = expanded0; c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); } } } + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + if (false) { Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", level, @@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() { // key range next time. 
compact_pointer_[level] = largest.Encode().ToString(); c->edit_.SetCompactPointer(level, largest); - - return c; } Compaction* VersionSet::CompactRange( @@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange( c->input_version_ = current_; c->input_version_->Ref(); c->inputs_[0] = inputs; - - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } + SetupOtherInputs(c); return c; } Compaction::Compaction(int level) : level_(level), max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL) { + input_version_(NULL), + grandparent_index_(0), + output_start_(-1) { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -964,6 +1001,15 @@ Compaction::~Compaction() { } } +bool Compaction::IsTrivialMove() const { + // Avoid a move if there are lots of overlapping grandparent files. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + return (num_input_files(0) == 1 + && num_input_files(1) == 0 + && grandparents_.size() <= kMaxGrandParentFiles); +} + void Compaction::AddInputDeletions(VersionEdit* edit) { for (int which = 0; which < 2; which++) { for (int i = 0; i < inputs_[which].size(); i++) { @@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { return true; } +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + grandparent_index_++; + } + + // First call? 
+ if (output_start_ < 0) { + output_start_ = grandparent_index_; + } + + if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) { + // Too many overlaps for current output; start new output + output_start_ = grandparent_index_; + return true; + } else { + return false; + } +} + void Compaction::ReleaseInputs() { if (input_version_ != NULL) { input_version_->Unref(); diff --git a/db/version_set.h b/db/version_set.h index b8eee3d..0ea763d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -139,6 +139,10 @@ class VersionSet { const InternalKey& begin, const InternalKey& end); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64 MaxNextLevelOverlappingBytes(); + // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); @@ -195,6 +199,13 @@ class VersionSet { InternalKey* smallest, InternalKey* largest); + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -250,6 +261,10 @@ class Compaction { // Maximum size of files to build during this compaction. uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + // Add all inputs to this compaction as delete operations to *edit. void AddInputDeletions(VersionEdit* edit); @@ -258,6 +273,10 @@ class Compaction { // in levels greater than "level+1". bool IsBaseLevelForKey(const Slice& user_key); + // Returns true iff we should stop building the current output + // before processing "key". 
+ bool ShouldStopBefore(const InternalKey& key); + // Release the input version for the compaction, once the compaction // is successful. void ReleaseInputs(); @@ -276,6 +295,12 @@ class Compaction { // Each compaction reads inputs from "level_" and "level_+1" std::vector inputs_[2]; // The two sets of inputs + // State used to check for number of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + int grandparent_index_; // Index in grandparents_ + int output_start_; // Index in grandparents_ where output started + // State for implementing IsBaseLevelForKey // level_ptrs_ holds indices into input_version_->levels_: our state diff --git a/doc/impl.html b/doc/impl.html index 2f2c809..b190d2c 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other. A compaction merges the contents of the picked files to produce a sequence of level-(L+1) files. We switch to producing a new level-(L+1) file after the current output file has reached the target -file size (2MB). The old files are discarded and the new files are -added to the serving state. +file size (2MB). We also switch to a new output file when the key +range of the current output file has grown enough to overlap more than +ten level-(L+2) files. This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). + +

+The old files are discarded and the new files are added to the serving +state.

Compactions for a particular level rotate through the key space. In diff --git a/include/db.h b/include/db.h index c4d152d..75be1ca 100644 --- a/include/db.h +++ b/include/db.h @@ -12,6 +12,9 @@ namespace leveldb { +static const int kMajorVersion = 1; +static const int kMinorVersion = 0; + struct Options; struct ReadOptions; struct WriteOptions; diff --git a/leveldb.gyp b/leveldb.gyp index 934f2d0..eb809f3 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -4,7 +4,7 @@ { 'variables': { - 'use_snappy%': 1, + 'use_snappy%': 0, }, 'target_defaults': { 'defines': [