diff --git a/build_detect_platform b/build_detect_platform index 7f0df31..d1804e0 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -35,11 +35,6 @@ case `uname -s` in echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk ;; - GNU/kFreeBSD) - PLATFORM=OS_FREEBSD - echo "PLATFORM_CFLAGS=-pthread -DOS_FREEBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk - ;; *) echo "Unknown platform!" exit 1 diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 69fa03a..1edcd84 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -229,8 +229,8 @@ TEST(CorruptionTest, TableFile) { Build(100); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); + dbi->TEST_CompactRange(0, NULL, NULL); + dbi->TEST_CompactRange(1, NULL, NULL); Corrupt(kTableFile, 100, 1); Check(99, 99); @@ -278,7 +278,7 @@ TEST(CorruptionTest, CorruptedDescriptor) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(0, NULL, NULL); Corrupt(kDescriptorFile, 0, 1000); Status s = TryReopen(); diff --git a/db/db_bench.cc b/db/db_bench.cc index bb63e59..cf9bb65 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -796,20 +796,7 @@ class Benchmark { } void Compact(ThreadState* thread) { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - std::string property; - char name[100]; - snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); - if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbi->TEST_CompactRange(level, "", "~"); 
- } + db_->CompactRange(NULL, NULL); } void PrintStats() { diff --git a/db/db_impl.cc b/db/db_impl.cc index 0ca6386..56182a0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -454,13 +454,8 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, if (s.ok() && meta.file_size > 0) { const Slice min_user_key = meta.smallest.user_key(); const Slice max_user_key = meta.largest.user_key(); - if (base != NULL && !base->OverlapInLevel(0, min_user_key, max_user_key)) { - // Push the new sstable to a higher level if possible to reduce - // expensive manifest file ops. - while (level < config::kMaxMemCompactLevel && - !base->OverlapInLevel(level + 1, min_user_key, max_user_key)) { - level++; - } + if (base != NULL) { + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.number, meta.file_size, meta.smallest, meta.largest); @@ -506,25 +501,55 @@ Status DBImpl::CompactMemTable() { return s; } -void DBImpl::TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end) { +void DBImpl::CompactRange(const Slice* begin, const Slice* end) { + int max_level_with_files = 1; + { + MutexLock l(&mutex_); + Version* base = versions_->current(); + for (int level = 1; level < config::kNumLevels; level++) { + if (base->OverlapInLevel(level, begin, end)) { + max_level_with_files = level; + } + } + } + TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap + for (int level = 0; level < max_level_with_files; level++) { + TEST_CompactRange(level, begin, end); + } +} + +void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { assert(level >= 0); assert(level + 1 < config::kNumLevels); - MutexLock l(&mutex_); - while (manual_compaction_ != NULL) { - bg_cv_.Wait(); - } + InternalKey begin_storage, end_storage; + ManualCompaction manual; manual.level = level; - manual.begin = begin; - manual.end = end; - manual_compaction_ = &manual; - MaybeScheduleCompaction(); - while 
(manual_compaction_ == &manual) { - bg_cv_.Wait(); + manual.done = false; + if (begin == NULL) { + manual.begin = NULL; + } else { + begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); + manual.begin = &begin_storage; + } + if (end == NULL) { + manual.end = NULL; + } else { + end_storage = InternalKey(*end, 0, static_cast(0)); + manual.end = &end_storage; + } + + MutexLock l(&mutex_); + while (!manual.done) { + while (manual_compaction_ != NULL) { + bg_cv_.Wait(); + } + manual_compaction_ = &manual; + MaybeScheduleCompaction(); + while (manual_compaction_ == &manual) { + bg_cv_.Wait(); + } } } @@ -590,12 +615,20 @@ void DBImpl::BackgroundCompaction() { Compaction* c; bool is_manual = (manual_compaction_ != NULL); + InternalKey manual_end; if (is_manual) { - const ManualCompaction* m = manual_compaction_; - c = versions_->CompactRange( + ManualCompaction* m = manual_compaction_; + c = versions_->CompactRange(m->level, m->begin, m->end); + m->done = (c == NULL); + if (c != NULL) { + manual_end = c->input(0, c->num_input_files(0) - 1)->largest; + } + Log(options_.info_log, + "Manual compaction at level-%d from %s .. %s; will stop at %s\n", m->level, - InternalKey(m->begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(m->end, 0, static_cast(0))); + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + (m->done ? "(end)" : manual_end.DebugString().c_str())); } else { c = versions_->PickCompaction(); } @@ -638,7 +671,13 @@ void DBImpl::BackgroundCompaction() { } if (is_manual) { - // Mark it as done + ManualCompaction* m = manual_compaction_; + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. 
+ m->tmp_storage = manual_end; + m->begin = &m->tmp_storage; + } manual_compaction_ = NULL; } } @@ -1109,10 +1148,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { versions_->SetLastSequence(last_sequence); } - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? snapshots_.New(last_sequence) : NULL; - } ReleaseLoggingResponsibility(&self); return status; } @@ -1225,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } } return true; + } else if (in == "sstables") { + *value = versions_->current()->DebugString(); + return true; } return false; diff --git a/db/db_impl.h b/db/db_impl.h index 5268137..ab03181 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -38,14 +38,12 @@ class DBImpl : public DB { virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual bool GetProperty(const Slice& property, std::string* value); virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); + virtual void CompactRange(const Slice* begin, const Slice* end); // Extra methods (for testing) that are not in the public DB interface - // Compact any files in the named level that overlap [begin,end] - void TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end); + // Compact any files in the named level that overlap [*begin,*end] + void TEST_CompactRange(int level, const Slice* begin, const Slice* end); // Force current memtable contents to be compacted. 
Status TEST_CompactMemTable(); @@ -145,8 +143,10 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { int level; - std::string begin; - std::string end; + bool done; + const InternalKey* begin; // NULL means beginning of key range + const InternalKey* end; // NULL means end of key range + InternalKey tmp_storage; // Used to keep track of compaction progress }; ManualCompaction* manual_compaction_; diff --git a/db/db_test.cc b/db/db_test.cc index daa9c03..ab71c51 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -195,6 +195,23 @@ class DBTest { return result; } + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < config::kNumLevels; level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + uint64_t Size(const Slice& start, const Slice& limit) { Range r(start, limit); uint64_t size; @@ -203,26 +220,23 @@ class DBTest { } void Compact(const Slice& start, const Slice& limit) { - dbfull()->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - if (NumTableFilesAtLevel(level) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbfull()->TEST_CompactRange(level, "", "~"); + db_->CompactRange(&start, &limit); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
+ void MakeTables(int n, const std::string& small, const std::string& large) { + for (int i = 0; i < n; i++) { + Put(small, "begin"); + Put(large, "end"); + dbfull()->TEST_CompactMemTable(); } } // Prevent pushing of new sstables into deeper levels by adding // tables that cover a specified range to all levels. void FillLevels(const std::string& smallest, const std::string& largest) { - for (int level = 0; level < config::kNumLevels; level++) { - Put(smallest, "begin"); - Put(largest, "end"); - dbfull()->TEST_CompactMemTable(); - } + MakeTables(config::kNumLevels, smallest, largest); } void DumpFileCounts(const char* label) { @@ -238,6 +252,12 @@ class DBTest { } } + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("leveldb.sstables", &property); + return property; + } + std::string IterStatus(Iterator* iter) { std::string result; if (iter->Valid()) { @@ -367,7 +387,7 @@ TEST(DBTest, GetEncountersEmptyLevel) { } // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, "a", "z"); + dbfull()->TEST_CompactRange(1, NULL, NULL); ASSERT_EQ(NumTableFilesAtLevel(0), 1); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 1); @@ -693,7 +713,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { // Reopening moves updates to level-0 Reopen(&options); - dbfull()->TEST_CompactRange(0, "", Key(100000)); + dbfull()->TEST_CompactRange(0, NULL, NULL); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(1), 1); @@ -744,7 +764,7 @@ TEST(DBTest, SparseMerge) { } Put("C", "vc"); dbfull()->TEST_CompactMemTable(); - dbfull()->TEST_CompactRange(0, "A", "Z"); + dbfull()->TEST_CompactRange(0, NULL, NULL); // Make sparse update Put("A", "va2"); @@ -755,9 +775,9 @@ TEST(DBTest, SparseMerge) { // Compactions should not cause us to create a situation where // a file overlaps too much data at the next level. 
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(0, NULL, NULL); ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, "", "z"); + dbfull()->TEST_CompactRange(1, NULL, NULL); ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); } @@ -808,9 +828,11 @@ TEST(DBTest, ApproximateSizes) { ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend); } ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -850,7 +872,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + dbfull()->TEST_CompactRange(0, NULL, NULL); } } @@ -921,11 +943,12 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - dbfull()->TEST_CompactRange(0, "", "x"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, NULL, &x); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, "", "x"); + dbfull()->TEST_CompactRange(1, NULL, &x); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); @@ -949,11 +972,12 @@ TEST(DBTest, DeletionMarkers1) { ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - 
dbfull()->TEST_CompactRange(last-2, "", "z"); + Slice z("z"); + dbfull()->TEST_CompactRange(last-2, NULL, &z); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last-1, "", "z"); + dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); @@ -976,15 +1000,54 @@ TEST(DBTest, DeletionMarkers2) { ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-2, "", "z"); + dbfull()->TEST_CompactRange(last-2, NULL, NULL); // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-1, "", "z"); + dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); } +TEST(DBTest, OverlapInLevel0) { + ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; + + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. + ASSERT_OK(Put("100", "v100")); + ASSERT_OK(Put("999", "v999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Delete("100")); + ASSERT_OK(Delete("999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("0,1,1", FilesPerLevel()); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
+ ASSERT_OK(Put("300", "v300")); + ASSERT_OK(Put("500", "v500")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Put("200", "v200")); + ASSERT_OK(Put("600", "v600")); + ASSERT_OK(Put("900", "v900")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("2,1,1", FilesPerLevel()); + + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, NULL, NULL); + dbfull()->TEST_CompactRange(2, NULL, NULL); + ASSERT_EQ("2", FilesPerLevel()); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete("600")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("NOT_FOUND", Get("600")); +} + TEST(DBTest, ComparatorCheck) { class NewComparator : public Comparator { public: @@ -1008,6 +1071,40 @@ TEST(DBTest, ComparatorCheck) { << s.ToString(); } +TEST(DBTest, ManualCompaction) { + ASSERT_EQ(config::kMaxMemCompactLevel, 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); + + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(NULL, NULL); + ASSERT_EQ("0,0,1", FilesPerLevel()); +} + TEST(DBTest, DBOpen_Options) { std::string dbname = test::TmpDir() + "/db_options_test"; DestroyDB(dbname, Options()); @@ -1187,7 +1284,6 @@ class ModelDB: public DB { delete 
reinterpret_cast(snapshot); } virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - assert(options.post_write_snapshot == NULL); // Not supported class Handler : public WriteBatch::Handler { public: KVMap* map_; @@ -1211,6 +1307,9 @@ class ModelDB: public DB { sizes[i] = 0; } } + virtual void CompactRange(const Slice* start, const Slice* end) { + } + private: class ModelIter: public Iterator { public: diff --git a/db/dbformat.cc b/db/dbformat.cc index af2e077..4fb3531 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -31,6 +31,18 @@ std::string ParsedInternalKey::DebugString() const { return result; } +std::string InternalKey::DebugString() const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + const char* InternalKeyComparator::Name() const { return "leveldb.InternalKeyComparator"; } diff --git a/db/dbformat.h b/db/dbformat.h index 7344cbf..d046990 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -149,6 +149,8 @@ class InternalKey { } void Clear() { rep_.clear(); } + + std::string DebugString() const; }; inline int InternalKeyComparator::Compare( diff --git a/db/version_edit.cc b/db/version_edit.cc index f6b9e9c..9891c32 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -235,9 +235,8 @@ std::string VersionEdit::DebugString() const { for (size_t i = 0; i < compact_pointers_.size(); i++) { r.append("\n CompactPointer: "); AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" '"); - AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); - r.append("'"); + r.append(" "); + r.append(compact_pointers_[i].second.DebugString()); } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); @@ -255,11 +254,10 @@ std::string VersionEdit::DebugString() const { AppendNumberTo(&r, f.number); r.append(" "); AppendNumberTo(&r, 
f.file_size); - r.append(" '"); - AppendEscapedStringTo(&r, f.smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, f.largest.Encode()); - r.append("'"); + r.append(" "); + r.append(f.smallest.DebugString()); + r.append(" .. "); + r.append(f.largest.DebugString()); } r.append("\n}\n"); return r; diff --git a/db/version_set.cc b/db/version_set.cc index d75b347..8b96af0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -41,6 +41,14 @@ static uint64_t MaxFileSizeForLevel(int level) { return kTargetFileSize; // We could vary per level to reduce number of files? } +static int64_t TotalFileSize(const std::vector& files) { + int64_t sum = 0; + for (size_t i = 0; i < files.size(); i++) { + sum += files[i]->file_size; + } + return sum; +} + namespace { std::string IntSetToString(const std::set& s) { std::string result = "{"; @@ -96,17 +104,55 @@ int FindFile(const InternalKeyComparator& icmp, return right; } +static bool AfterFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // NULL user_key occurs before all keys and is therefore never after *f + return (user_key != NULL && + ucmp->Compare(*user_key, f->largest.user_key()) > 0); +} + +static bool BeforeFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // NULL user_key occurs after all keys and is therefore never before *f + return (user_key != NULL && + ucmp->Compare(*user_key, f->smallest.user_key()) < 0); +} + bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + bool disjoint_sorted_files, const std::vector& files, - const Slice& smallest_user_key, - const Slice& largest_user_key) { - // Find the earliest possible internal key for smallest_user_key - InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); - const uint32_t index = FindFile(icmp, files, small.Encode()); - return ((index < files.size()) && - icmp.user_comparator()->Compare( - largest_user_key, files[index]->smallest.user_key()) >= 0); + 
const Slice* smallest_user_key, + const Slice* largest_user_key) { + const Comparator* ucmp = icmp.user_comparator(); + if (!disjoint_sorted_files) { + // Need to check against all files + for (int i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + if (AfterFile(ucmp, smallest_user_key, f) || + BeforeFile(ucmp, largest_user_key, f)) { + // No overlap + } else { + return true; // Overlap + } + } + return false; + } + + // Binary search over file list + uint32_t index = 0; + if (smallest_user_key != NULL) { + // Find the earliest possible internal key for smallest_user_key + InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); + index = FindFile(icmp, files, small.Encode()); + } + + if (index >= files.size()) { + // beginning of range is after all files, so no overlap. + return false; + } + + return !BeforeFile(ucmp, largest_user_key, files[index]); } // An internal iterator. For a given version/level pair, yields @@ -358,11 +404,64 @@ void Version::Unref() { } bool Version::OverlapInLevel(int level, - const Slice& smallest_user_key, - const Slice& largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, files_[level], - smallest_user_key, - largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], + smallest_user_key, largest_user_key); +} + +int Version::PickLevelForMemTableOutput( + const Slice& smallest_user_key, + const Slice& largest_user_key) { + int level = 0; + if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { + // Push to next level if there is no overlap in next level, + // and the #bytes overlapping in the level after that are limited. 
+ InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, static_cast(0)); + std::vector overlaps; + while (level < config::kMaxMemCompactLevel) { + if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { + break; + } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > kMaxGrandParentOverlapBytes) { + break; + } + level++; + } + } + return level; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +void Version::GetOverlappingInputs( + int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs) { + inputs->clear(); + Slice user_begin, user_end; + if (begin != NULL) { + user_begin = begin->user_key(); + } + if (end != NULL) { + user_end = end->user_key(); + } + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + for (size_t i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + if (begin != NULL && + user_cmp->Compare(f->largest.user_key(), user_begin) < 0) { + // "f" is completely before specified range; skip it + } else if (end != NULL && + user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { + // "f" is completely after specified range; skip it + } else { + inputs->push_back(f); + } + } } std::string Version::DebugString() const { @@ -381,11 +480,11 @@ std::string Version::DebugString() const { AppendNumberTo(&r, files[i]->number); r.push_back(':'); AppendNumberTo(&r, files[i]->file_size); - r.append("['"); - AppendEscapedStringTo(&r, files[i]->smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']\n"); + r.append("["); + r.append(files[i]->smallest.DebugString()); + r.append(" .. 
"); + r.append(files[i]->largest.DebugString()); + r.append("]\n"); } } return r; @@ -540,8 +639,8 @@ class VersionSet::Builder { const InternalKey& this_begin = v->files_[level][i]->smallest; if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", - EscapeString(prev_end.Encode()).c_str(), - EscapeString(this_begin.Encode()).c_str()); + prev_end.DebugString().c_str(), + this_begin.DebugString().c_str()); abort(); } } @@ -814,14 +913,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -static int64_t TotalFileSize(const std::vector& files) { - int64_t sum = 0; - for (size_t i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction int best_level = -1; @@ -967,7 +1058,8 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() { for (int level = 1; level < config::kNumLevels - 1; level++) { for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; - GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, + &overlaps); const int64_t sum = TotalFileSize(overlaps); if (sum > result) { result = sum; @@ -977,27 +1069,6 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() { return result; } -// Store in "*inputs" all files in "level" that overlap [begin,end] -void VersionSet::GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs) { - inputs->clear(); - Slice user_begin = begin.user_key(); - Slice user_end = end.user_key(); - const Comparator* user_cmp = icmp_.user_comparator(); - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || - user_cmp->Compare(f->smallest.user_key(), 
user_end) > 0) { - // Either completely before or after range; skip it - } else { - inputs->push_back(f); - } - } -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. // REQUIRES: inputs is not empty @@ -1113,7 +1184,7 @@ Compaction* VersionSet::PickCompaction() { // Note that the next call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. - GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); } @@ -1127,7 +1198,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { InternalKey smallest, largest; GetRange(c->inputs_[0], &smallest, &largest); - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -1137,12 +1208,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // changing the number of "level+1" files we pick up. 
if (!c->inputs_[1].empty()) { std::vector expanded0; - GetOverlappingInputs(level, all_start, all_limit, &expanded0); + current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); if (expanded0.size() > c->inputs_[0].size()) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + current_->GetOverlappingInputs(level+1, &new_start, &new_limit, + &expanded1); if (expanded1.size() == c->inputs_[1].size()) { Log(options_->info_log, "Expanding@%d %d+%d to %d+%d\n", @@ -1163,14 +1235,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < config::kNumLevels) { - GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { Log(options_->info_log, "Compacting %d '%s' .. '%s'", level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); + smallest.DebugString().c_str(), + largest.DebugString().c_str()); } // Update the place where we will do the next compaction for this level. @@ -1183,14 +1256,26 @@ void VersionSet::SetupOtherInputs(Compaction* c) { Compaction* VersionSet::CompactRange( int level, - const InternalKey& begin, - const InternalKey& end) { + const InternalKey* begin, + const InternalKey* end) { std::vector inputs; - GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(level, begin, end, &inputs); if (inputs.empty()) { return NULL; } + // Avoid compacting too much in one shot in case the range is large. 
+ const uint64_t limit = MaxFileSizeForLevel(level); + uint64_t total = 0; + for (int i = 0; i < inputs.size(); i++) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + inputs.resize(i + 1); + break; + } + } + Compaction* c = new Compaction(level); c->input_version_ = current_; c->input_version_->Ref(); diff --git a/db/version_set.h b/db/version_set.h index 2dbd948..b866b2a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -43,12 +43,17 @@ extern int FindFile(const InternalKeyComparator& icmp, const Slice& key); // Returns true iff some file in "files" overlaps the user key range -// [smallest,largest]. +// [*smallest,*largest]. +// smallest==NULL represents a key smaller than all keys in the DB. +// largest==NULL represents a key larger than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges +// in sorted order. extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + bool disjoint_sorted_files, const std::vector& files, - const Slice& smallest_user_key, - const Slice& largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key); class Version { public: @@ -77,11 +82,24 @@ class Version { void Ref(); void Unref(); + void GetOverlappingInputs( + int level, + const InternalKey* begin, // NULL means before all keys + const InternalKey* end, // NULL means after all keys + std::vector* inputs); + // Returns true iff some file in the specified level overlaps - // some part of [smallest_user_key,largest_user_key]. + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key larger than all keys in the DB. 
bool OverlapInLevel(int level, - const Slice& smallest_user_key, - const Slice& largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Return the level at which we should place a new memtable compaction + // result that covers the range [smallest_user_key,largest_user_key]. + int PickLevelForMemTableOutput(const Slice& smallest_user_key, + const Slice& largest_user_key); int NumFiles(int level) const { return files_[level].size(); } @@ -192,8 +210,8 @@ class VersionSet { // the result. Compaction* CompactRange( int level, - const InternalKey& begin, - const InternalKey& end); + const InternalKey* begin, + const InternalKey* end); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -232,12 +250,6 @@ class VersionSet { void Finalize(Version* v); - void GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs); - void GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest); diff --git a/db/version_set_test.cc b/db/version_set_test.cc index ecfd62b..06f8bbd 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -12,6 +12,9 @@ namespace leveldb { class FindFileTest { public: std::vector files_; + bool disjoint_sorted_files_; + + FindFileTest() : disjoint_sorted_files_(true) { } ~FindFileTest() { for (int i = 0; i < files_.size(); i++) { @@ -37,13 +40,20 @@ class FindFileTest { bool Overlaps(const char* smallest, const char* largest) { InternalKeyComparator cmp(BytewiseComparator()); - return SomeFileOverlapsRange(cmp, files_, smallest, largest); + Slice s(smallest != NULL ? smallest : ""); + Slice l(largest != NULL ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + (smallest != NULL ? &s : NULL), + (largest != NULL ? &l : NULL)); } }; TEST(FindFileTest, Empty) { ASSERT_EQ(0, Find("foo")); ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! 
Overlaps(NULL, "z")); + ASSERT_TRUE(! Overlaps("a", NULL)); + ASSERT_TRUE(! Overlaps(NULL, NULL)); } TEST(FindFileTest, Single) { @@ -67,6 +77,13 @@ TEST(FindFileTest, Single) { ASSERT_TRUE(Overlaps("p1", "z")); ASSERT_TRUE(Overlaps("q", "q")); ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(NULL, "j")); + ASSERT_TRUE(! Overlaps("r", NULL)); + ASSERT_TRUE(Overlaps(NULL, "p")); + ASSERT_TRUE(Overlaps(NULL, "p1")); + ASSERT_TRUE(Overlaps("q", NULL)); + ASSERT_TRUE(Overlaps(NULL, NULL)); } @@ -108,6 +125,26 @@ TEST(FindFileTest, Multiple) { ASSERT_TRUE(Overlaps("450", "500")); } +TEST(FindFileTest, MultipleNullBoundaries) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(NULL, "149")); + ASSERT_TRUE(! Overlaps("451", NULL)); + ASSERT_TRUE(Overlaps(NULL, NULL)); + ASSERT_TRUE(Overlaps(NULL, "150")); + ASSERT_TRUE(Overlaps(NULL, "199")); + ASSERT_TRUE(Overlaps(NULL, "200")); + ASSERT_TRUE(Overlaps(NULL, "201")); + ASSERT_TRUE(Overlaps(NULL, "400")); + ASSERT_TRUE(Overlaps(NULL, "800")); + ASSERT_TRUE(Overlaps("100", NULL)); + ASSERT_TRUE(Overlaps("200", NULL)); + ASSERT_TRUE(Overlaps("449", NULL)); + ASSERT_TRUE(Overlaps("450", NULL)); +} + TEST(FindFileTest, OverlapSequenceChecks) { Add("200", "200", 5000, 3000); ASSERT_TRUE(! Overlaps("199", "199")); @@ -117,6 +154,24 @@ TEST(FindFileTest, OverlapSequenceChecks) { ASSERT_TRUE(Overlaps("200", "210")); } +TEST(FindFileTest, OverlappingFiles) { + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! 
Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + } int main(int argc, char** argv) { diff --git a/doc/index.html b/doc/index.html index 8d03c45..472f7cd 100644 --- a/doc/index.html +++ b/doc/index.html @@ -193,7 +193,7 @@ that a read should operate on a particular version of the DB state. If ReadOptions::snapshot is NULL, the read will operate on an implicit snapshot of the current state.

-Snapshots typically are created by the DB::GetSnapshot() method: +Snapshots are created by the DB::GetSnapshot() method:

   leveldb::ReadOptions options;
@@ -208,26 +208,6 @@ Note that when a snapshot is no longer needed, it should be released
 using the DB::ReleaseSnapshot interface.  This allows the
 implementation to get rid of state that was being maintained just to
 support reading as of that snapshot.
-

-A Write operation can also return a snapshot that -represents the state of the database just after applying a particular -set of updates: -

-

-  leveldb::Snapshot* snapshot;
-  leveldb::WriteOptions write_options;
-  write_options.post_write_snapshot = &snapshot;
-  leveldb::Status status = db->Write(write_options, ...);
-  ... perform other mutations to db ...
-
-  leveldb::ReadOptions read_options;
-  read_options.snapshot = snapshot;
-  leveldb::Iterator* iter = db->NewIterator(read_options);
-  ... read as of the state just after the Write call returned ...
-  delete iter;
-
-  db->ReleaseSnapshot(snapshot);
-

Slice

The return value of the it->key() and it->value() calls above diff --git a/include/leveldb/db.h b/include/leveldb/db.h index f945dd7..7fb2965 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -112,6 +112,8 @@ class DB { // where is an ASCII representation of a level number (e.g. "0"). // "leveldb.stats" - returns a multi-line string that describes statistics // about the internal operation of the DB. + // "leveldb.sstables" - returns a multi-line string that describes all + // of the sstables that make up the db contents. virtual bool GetProperty(const Slice& property, std::string* value) = 0; // For each i in [0,n-1], store in "sizes[i]", the approximate @@ -125,8 +127,17 @@ class DB { virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) = 0; - // Possible extensions: - // (1) Add a method to compact a range of keys + // Compact the underlying storage for the key range [*begin,*end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==NULL is treated as a key before all keys in the database. + // end==NULL is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(NULL, NULL); + virtual void CompactRange(const Slice* begin, const Slice* end) = 0; private: // No copying allowed diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 1a8ff6b..a39d66f 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -160,6 +160,8 @@ class SequentialFile { // Read up to "n" bytes from the file. "scratch[0..n-1]" may be // written by this routine. Sets "*result" to the data that was // read (including if fewer than "n" bytes were successfully read). 
+ // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization @@ -184,8 +186,10 @@ class RandomAccessFile { // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" // to the data that was read (including if fewer than "n" bytes were - // successfully read). If an error was encountered, returns a - // non-OK status. + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. // // Safe for concurrent use by multiple threads. virtual Status Read(uint64_t offset, size_t n, Slice* result, diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 381f228..84ac7fc 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -177,21 +177,8 @@ struct WriteOptions { // Default: false bool sync; - // If "post_write_snapshot" is non-NULL, and the write succeeds, - // *post_write_snapshot will be modified to point to a snapshot of - // the DB state immediately after this write. The caller must call - // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the - // snapshot is no longer needed. - // - // If "post_write_snapshot" is non-NULL, and the write fails, - // *post_write_snapshot will be set to NULL. 
- // - // Default: NULL - const Snapshot** post_write_snapshot; - WriteOptions() - : sync(false), - post_write_snapshot(NULL) { + : sync(false) { } }; diff --git a/util/coding.h b/util/coding.h index 8755968..c47b9d8 100644 --- a/util/coding.h +++ b/util/coding.h @@ -62,10 +62,10 @@ inline uint32_t DecodeFixed32(const char* ptr) { memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load return result; } else { - return ((static_cast(ptr[0])) - | (static_cast(ptr[1]) << 8) - | (static_cast(ptr[2]) << 16) - | (static_cast(ptr[3]) << 24)); + return ((static_cast(static_cast(ptr[0]))) + | (static_cast(static_cast(ptr[1])) << 8) + | (static_cast(static_cast(ptr[2])) << 16) + | (static_cast(static_cast(ptr[3])) << 24)); } } diff --git a/util/posix_logger.h b/util/posix_logger.h index 0dbdeaa..55428e5 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -1,5 +1,6 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: sanjay@google.com (Sanjay Ghemawat) +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. // // Logger implementation that can be shared by all environments // where enough posix functionality is available.