From 0cfb990d58ffba9b56df6e9829ddb9d220824066 Mon Sep 17 00:00:00 2001 From: David Grogan Date: Tue, 10 Dec 2013 10:36:31 -0800 Subject: [PATCH] Release LevelDB 1.15 - switched from mmap based writing to simpler stdio based writing. Has a minor impact (0.5 microseconds) on microbenchmarks for asynchronous writes. Synchronous writes speed up from 30ms to 10ms on linux/ext4. Should be much more reliable on diverse platforms. - compaction errors now immediately put the database into a read-only mode (until it is re-opened). As a downside, a disk going out of space and then space being created will require a re-open to recover from, whereas previously that would happen automatically. On the plus side, many corruption possibilities go away. - force the DB to enter an error-state so that all future writes fail when a synchronous log write succeeds but the sync fails. - repair now regenerates sstables that exhibit problems - fix issue 218 - Use native memory barriers on OSX - fix issue 212 - QNX build is broken - fix build on iOS with xcode 5 - make tests compile and pass on windows --- Makefile | 2 +- build_detect_platform | 10 +++ db/corruption_test.cc | 26 ++++++- db/db_bench.cc | 8 +-- db/db_impl.cc | 99 ++++++++++++++----------- db/db_impl.h | 9 +-- db/db_test.cc | 108 ++++++++++++++-------------- db/repair.cc | 163 ++++++++++++++++++++++++++++------------- db/version_set.cc | 33 --------- db/version_set.h | 2 - include/leveldb/db.h | 2 +- include/leveldb/env.h | 2 +- port/atomic_pointer.h | 14 ++-- table/filter_block_test.cc | 2 +- util/arena.h | 2 +- util/arena_test.cc | 6 +- util/bloom_test.cc | 3 +- util/coding_test.cc | 8 +-- util/env_posix.cc | 176 +++++++-------------------------------------- util/testharness.cc | 2 +- util/testutil.cc | 2 +- util/testutil.h | 2 +- 22 files changed, 320 insertions(+), 361 deletions(-) diff --git a/Makefile b/Makefile index 26de8c2..344ff29 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. SHARED_MAJOR = 1 -SHARED_MINOR = 14 +SHARED_MINOR = 15 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) diff --git a/build_detect_platform b/build_detect_platform index d50869d..6e59c6f 100755 --- a/build_detect_platform +++ b/build_detect_platform @@ -131,6 +131,16 @@ case "$TARGET_OS" in # man ld: +h internal_name PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl," ;; + IOS) + PLATFORM=IOS + COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" + [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` + PORT_FILE=port/port_posix.cc + PLATFORM_SHARED_EXT= + PLATFORM_SHARED_LDFLAGS= + PLATFORM_SHARED_CFLAGS= + PLATFORM_SHARED_VERSIONED= + ;; *) echo "Unknown platform!" >&2 exit 1 diff --git a/db/corruption_test.cc b/db/corruption_test.cc index b37ffdf..96afc68 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -75,7 +75,13 @@ class CorruptionTest { Slice key = Key(i, &key_space); batch.Clear(); batch.Put(key, Value(i, &value_space)); - ASSERT_OK(db_->Write(WriteOptions(), &batch)); + WriteOptions options; + // Corrupt() doesn't work without this sync on windows; stat reports 0 for + // the file size. + if (i == n - 1) { + options.sync = true; + } + ASSERT_OK(db_->Write(options, &batch)); } } @@ -125,7 +131,7 @@ class CorruptionTest { FileType type; std::string fname; int picked_number = -1; - for (int i = 0; i < filenames.size(); i++) { + for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == filetype && int(number) > picked_number) { // Pick latest file @@ -238,6 +244,22 @@ TEST(CorruptionTest, TableFile) { Check(90, 99); } +TEST(CorruptionTest, TableFileRepair) { + options_.block_size = 2 * kValueSize; // Limit scope of corruption + options_.paranoid_checks = true; + Reopen(); + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, NULL, NULL); + dbi->TEST_CompactRange(1, NULL, NULL); + + Corrupt(kTableFile, 100, 1); + RepairDB(); + Reopen(); + Check(95, 99); +} + TEST(CorruptionTest, TableFileIndexData) { Build(10000); // Enough to build multiple Tables DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/db_bench.cc b/db/db_bench.cc index 7abdf87..fc46d89 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -128,7 +128,7 @@ class RandomGenerator { pos_ = 0; } - Slice Generate(int len) { + Slice Generate(size_t len) { if (pos_ + len > data_.size()) { pos_ = 0; assert(len < data_.size()); @@ -139,11 +139,11 @@ class RandomGenerator { }; static Slice TrimSpace(Slice s) { - int start = 0; + size_t start = 0; while (start < s.size() && isspace(s[start])) { start++; } - int limit = s.size(); + size_t limit = s.size(); while (limit > start && isspace(s[limit-1])) { limit--; } @@ -399,7 +399,7 @@ class Benchmark { heap_counter_(0) { std::vector files; Env::Default()->GetChildren(FLAGS_db, &files); - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("heap-")) { Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); } diff --git a/db/db_impl.cc b/db/db_impl.cc index fa13510..faf5e7d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -133,8 +133,7 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) seed_(0), tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), - manual_compaction_(NULL), - consecutive_compaction_errors_(0) { + manual_compaction_(NULL) { mem_->Ref(); has_imm_.Release_Store(NULL); @@ -217,6 +216,12 @@ void DBImpl::MaybeIgnoreError(Status* s) const { } void DBImpl::DeleteObsoleteFiles() { + if (!bg_error_.ok()) { + // After a background error, we don't know whether a new version may + // or may not have been committed, so we cannot safely garbage collect. + return; + } + // Make a set of all of the live files std::set live = pending_outputs_; versions_->AddLiveFiles(&live); @@ -495,7 +500,7 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, return s; } -Status DBImpl::CompactMemTable() { +void DBImpl::CompactMemTable() { mutex_.AssertHeld(); assert(imm_ != NULL); @@ -523,9 +528,9 @@ Status DBImpl::CompactMemTable() { imm_ = NULL; has_imm_.Release_Store(NULL); DeleteObsoleteFiles(); + } else { + RecordBackgroundError(s); } - - return s; } void DBImpl::CompactRange(const Slice* begin, const Slice* end) { @@ -568,16 +573,18 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { } MutexLock l(&mutex_); - while (!manual.done) { - while (manual_compaction_ != NULL) { - bg_cv_.Wait(); - } - manual_compaction_ = &manual; - MaybeScheduleCompaction(); - while (manual_compaction_ == &manual) { + while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + if (manual_compaction_ == NULL) { // Idle + manual_compaction_ = &manual; + MaybeScheduleCompaction(); + } else { // Running either my compaction or another compaction. bg_cv_.Wait(); } } + if (manual_compaction_ == &manual) { + // Cancel my manual compaction since we aborted early for some reason. + manual_compaction_ = NULL; + } } Status DBImpl::TEST_CompactMemTable() { @@ -596,12 +603,22 @@ Status DBImpl::TEST_CompactMemTable() { return s; } +void DBImpl::RecordBackgroundError(const Status& s) { + mutex_.AssertHeld(); + if (bg_error_.ok()) { + bg_error_ = s; + bg_cv_.SignalAll(); + } +} + void DBImpl::MaybeScheduleCompaction() { mutex_.AssertHeld(); if (bg_compaction_scheduled_) { // Already scheduled } else if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions + } else if (!bg_error_.ok()) { + // Already got an error; no more changes } else if (imm_ == NULL && manual_compaction_ == NULL && !versions_->NeedsCompaction()) { @@ -619,30 +636,12 @@ void DBImpl::BGWork(void* db) { void DBImpl::BackgroundCall() { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load()) { - Status s = BackgroundCompaction(); - if (s.ok()) { - // Success - consecutive_compaction_errors_ = 0; - } else if (shutting_down_.Acquire_Load()) { - // Error most likely due to shutdown; do not wait - } else { - // Wait a little bit before retrying background compaction in - // case this is an environmental problem and we do not want to - // chew up resources for failed compactions for the duration of - // the problem. - bg_cv_.SignalAll(); // In case a waiter can proceed despite the error - Log(options_.info_log, "Waiting after background compaction error: %s", - s.ToString().c_str()); - mutex_.Unlock(); - ++consecutive_compaction_errors_; - int seconds_to_sleep = 1; - for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) { - seconds_to_sleep *= 2; - } - env_->SleepForMicroseconds(seconds_to_sleep * 1000000); - mutex_.Lock(); - } + if (shutting_down_.Acquire_Load()) { + // No more background work when shutting down. + } else if (!bg_error_.ok()) { + // No more background work after a background error. + } else { + BackgroundCompaction(); } bg_compaction_scheduled_ = false; @@ -653,11 +652,12 @@ void DBImpl::BackgroundCall() { bg_cv_.SignalAll(); } -Status DBImpl::BackgroundCompaction() { +void DBImpl::BackgroundCompaction() { mutex_.AssertHeld(); if (imm_ != NULL) { - return CompactMemTable(); + CompactMemTable(); + return; } Compaction* c; @@ -691,6 +691,9 @@ Status DBImpl::BackgroundCompaction() { c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest, f->largest); status = versions_->LogAndApply(c->edit(), &mutex_); + if (!status.ok()) { + RecordBackgroundError(status); + } VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), @@ -701,6 +704,9 @@ Status DBImpl::BackgroundCompaction() { } else { CompactionState* compact = new CompactionState(c); status = DoCompactionWork(compact); + if (!status.ok()) { + RecordBackgroundError(status); + } CleanupCompaction(compact); c->ReleaseInputs(); DeleteObsoleteFiles(); @@ -714,9 +720,6 @@ Status DBImpl::BackgroundCompaction() { } else { Log(options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } } if (is_manual) { @@ -732,7 +735,6 @@ Status DBImpl::BackgroundCompaction() { } manual_compaction_ = NULL; } - return status; } void DBImpl::CleanupCompaction(CompactionState* compact) { @@ -1002,6 +1004,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { if (status.ok()) { status = InstallCompactionResults(compact); } + if (!status.ok()) { + RecordBackgroundError(status); + } VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp)); @@ -1185,13 +1190,23 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { { mutex_.Unlock(); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); + bool sync_error = false; if (status.ok() && options.sync) { status = logfile_->Sync(); + if (!status.ok()) { + sync_error = true; + } } if (status.ok()) { status = WriteBatchInternal::InsertInto(updates, mem_); } mutex_.Lock(); + if (sync_error) { + // The state of the log file is indeterminate: the log record we + // just added may or may not show up when the DB is re-opened. + // So we force the DB into a mode where all future writes fail. + RecordBackgroundError(status); + } } if (updates == tmp_batch_) tmp_batch_->Clear(); diff --git a/db/db_impl.h b/db/db_impl.h index 75fd30a..cfc9981 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -87,8 +87,8 @@ class DBImpl : public DB { // Compact the in-memory write buffer to disk. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable() - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Errors are recorded in bg_error_. + void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status RecoverLogFile(uint64_t log_number, VersionEdit* edit, @@ -102,10 +102,12 @@ class DBImpl : public DB { EXCLUSIVE_LOCKS_REQUIRED(mutex_); WriteBatch* BuildBatchGroup(Writer** last_writer); + void RecordBackgroundError(const Status& s); + void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); static void BGWork(void* db); void BackgroundCall(); - Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); void CleanupCompaction(CompactionState* compact) EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status DoCompactionWork(CompactionState* compact) @@ -170,7 +172,6 @@ class DBImpl : public DB { // Have we encountered a background error in paranoid mode? Status bg_error_; - int consecutive_compaction_errors_; // Per level compaction stats. stats_[level] stores the stats for // compactions that produced data for the specified "level". diff --git a/db/db_test.cc b/db/db_test.cc index 848a038..280b01c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -57,8 +57,11 @@ void DelayMilliseconds(int millis) { // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: - // sstable Sync() calls are blocked while this pointer is non-NULL. - port::AtomicPointer delay_sstable_sync_; + // sstable/log Sync() calls are blocked while this pointer is non-NULL. + port::AtomicPointer delay_data_sync_; + + // sstable/log Sync() calls return an error. + port::AtomicPointer data_sync_error_; // Simulate no-space errors while this pointer is non-NULL. port::AtomicPointer no_space_; @@ -75,11 +78,9 @@ class SpecialEnv : public EnvWrapper { bool count_random_reads_; AtomicCounter random_read_counter_; - AtomicCounter sleep_counter_; - AtomicCounter sleep_time_counter_; - explicit SpecialEnv(Env* base) : EnvWrapper(base) { - delay_sstable_sync_.Release_Store(NULL); + delay_data_sync_.Release_Store(NULL); + data_sync_error_.Release_Store(NULL); no_space_.Release_Store(NULL); non_writable_.Release_Store(NULL); count_random_reads_ = false; @@ -88,17 +89,17 @@ class SpecialEnv : public EnvWrapper { } Status NewWritableFile(const std::string& f, WritableFile** r) { - class SSTableFile : public WritableFile { + class DataFile : public WritableFile { private: SpecialEnv* env_; WritableFile* base_; public: - SSTableFile(SpecialEnv* env, WritableFile* base) + DataFile(SpecialEnv* env, WritableFile* base) : env_(env), base_(base) { } - ~SSTableFile() { delete base_; } + ~DataFile() { delete base_; } Status Append(const Slice& data) { if (env_->no_space_.Acquire_Load() != NULL) { // Drop writes on the floor @@ -110,7 +111,10 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - while (env_->delay_sstable_sync_.Acquire_Load() != NULL) { + if (env_->data_sync_error_.Acquire_Load() != NULL) { + return Status::IOError("simulated data sync error"); + } + while (env_->delay_data_sync_.Acquire_Load() != NULL) { DelayMilliseconds(100); } return base_->Sync(); @@ -147,8 +151,9 @@ class SpecialEnv : public EnvWrapper { Status s = target()->NewWritableFile(f, r); if (s.ok()) { - if (strstr(f.c_str(), ".ldb") != NULL) { - *r = new SSTableFile(this, *r); + if (strstr(f.c_str(), ".ldb") != NULL || + strstr(f.c_str(), ".log") != NULL) { + *r = new DataFile(this, *r); } else if (strstr(f.c_str(), "MANIFEST") != NULL) { *r = new ManifestFile(this, *r); } @@ -179,12 +184,6 @@ class SpecialEnv : public EnvWrapper { } return s; } - - virtual void SleepForMicroseconds(int micros) { - sleep_counter_.Increment(); - sleep_time_counter_.IncrementBy(micros); - } - }; class DBTest { @@ -322,7 +321,7 @@ class DBTest { } // Check reverse iteration results are the reverse of forward results - int matched = 0; + size_t matched = 0; for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { ASSERT_LT(matched, forward.size()); ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); @@ -543,11 +542,11 @@ TEST(DBTest, GetFromImmutableLayer) { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + env_->delay_data_sync_.Release_Store(env_); // Block sync calls Put("k1", std::string(100000, 'x')); // Fill memtable Put("k2", std::string(100000, 'y')); // Trigger compaction ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls + env_->delay_data_sync_.Release_Store(NULL); // Release sync calls } while (ChangeOptions()); } @@ -1534,41 +1533,13 @@ TEST(DBTest, NoSpace) { Compact("a", "z"); const int num_files = CountFiles(); env_->no_space_.Release_Store(env_); // Force out-of-space errors - env_->sleep_counter_.Reset(); - for (int i = 0; i < 5; i++) { + for (int i = 0; i < 10; i++) { for (int level = 0; level < config::kNumLevels-1; level++) { dbfull()->TEST_CompactRange(level, NULL, NULL); } } env_->no_space_.Release_Store(NULL); ASSERT_LT(CountFiles(), num_files + 3); - - // Check that compaction attempts slept after errors - ASSERT_GE(env_->sleep_counter_.Read(), 5); -} - -TEST(DBTest, ExponentialBackoff) { - Options options = CurrentOptions(); - options.env = env_; - Reopen(&options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - Compact("a", "z"); - env_->non_writable_.Release_Store(env_); // Force errors for new files - env_->sleep_counter_.Reset(); - env_->sleep_time_counter_.Reset(); - for (int i = 0; i < 5; i++) { - dbfull()->TEST_CompactRange(2, NULL, NULL); - } - env_->non_writable_.Release_Store(NULL); - - // Wait for compaction to finish - DelayMilliseconds(1000); - - ASSERT_GE(env_->sleep_counter_.Read(), 5); - ASSERT_LT(env_->sleep_counter_.Read(), 10); - ASSERT_GE(env_->sleep_time_counter_.Read(), 10e6); } TEST(DBTest, NonWritableFileSystem) { @@ -1591,6 +1562,37 @@ TEST(DBTest, NonWritableFileSystem) { env_->non_writable_.Release_Store(NULL); } +TEST(DBTest, WriteSyncError) { + // Check that log sync errors cause the DB to disallow future writes. + + // (a) Cause log sync calls to fail + Options options = CurrentOptions(); + options.env = env_; + Reopen(&options); + env_->data_sync_error_.Release_Store(env_); + + // (b) Normal write should succeed + WriteOptions w; + ASSERT_OK(db_->Put(w, "k1", "v1")); + ASSERT_EQ("v1", Get("k1")); + + // (c) Do a sync write; should fail + w.sync = true; + ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok()); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("NOT_FOUND", Get("k2")); + + // (d) make sync behave normally + env_->data_sync_error_.Release_Store(NULL); + + // (e) Do a non-sync write; should fail + w.sync = false; + ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok()); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("NOT_FOUND", Get("k2")); + ASSERT_EQ("NOT_FOUND", Get("k3")); +} + TEST(DBTest, ManifestWriteError) { // Test for the following problem: // (a) Compaction produces file F @@ -1697,7 +1699,7 @@ TEST(DBTest, BloomFilter) { dbfull()->TEST_CompactMemTable(); // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.Release_Store(env_); + env_->delay_data_sync_.Release_Store(env_); // Lookup present keys. Should rarely read from small sstable. env_->random_read_counter_.Reset(); @@ -1718,7 +1720,7 @@ TEST(DBTest, BloomFilter) { fprintf(stderr, "%d missing => %d reads\n", N, reads); ASSERT_LE(reads, 3*N/100); - env_->delay_sstable_sync_.Release_Store(NULL); + env_->delay_data_sync_.Release_Store(NULL); Close(); delete options.block_cache; delete options.filter_policy; @@ -1778,7 +1780,7 @@ static void MTThreadBody(void* arg) { ASSERT_EQ(k, key); ASSERT_GE(w, 0); ASSERT_LT(w, kNumThreads); - ASSERT_LE(c, reinterpret_cast( + ASSERT_LE(static_cast(c), reinterpret_cast( t->state->counter[w].Acquire_Load())); } } diff --git a/db/repair.cc b/db/repair.cc index dc93fb8..96c9b37 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -244,68 +244,133 @@ class Repairer { void ExtractMetaData() { std::vector kept; for (size_t i = 0; i < table_numbers_.size(); i++) { - TableInfo t; - t.meta.number = table_numbers_[i]; - Status status = ScanTable(&t); - if (!status.ok()) { - std::string fname = TableFileName(dbname_, table_numbers_[i]); - Log(options_.info_log, "Table #%llu: ignoring %s", - (unsigned long long) table_numbers_[i], - status.ToString().c_str()); - ArchiveFile(fname); - } else { - tables_.push_back(t); - } + ScanTable(table_numbers_[i]); } } - Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(dbname_, t->meta.number); - int counter = 0; - Status status = env_->GetFileSize(fname, &t->meta.file_size); + Iterator* NewTableIterator(const FileMetaData& meta) { + // Same as compaction iterators: if paranoid_checks are on, turn + // on checksum verification. + ReadOptions r; + r.verify_checksums = options_.paranoid_checks; + return table_cache_->NewIterator(r, meta.number, meta.file_size); + } + + void ScanTable(uint64_t number) { + TableInfo t; + t.meta.number = number; + std::string fname = TableFileName(dbname_, number); + Status status = env_->GetFileSize(fname, &t.meta.file_size); if (!status.ok()) { - fname = SSTTableFileName(dbname_, t->meta.number); - Status s2 = env_->GetFileSize(fname, &t->meta.file_size); - if (s2.ok()) + // Try alternate file name. + fname = SSTTableFileName(dbname_, number); + Status s2 = env_->GetFileSize(fname, &t.meta.file_size); + if (s2.ok()) { status = Status::OK(); + } + } + if (!status.ok()) { + ArchiveFile(TableFileName(dbname_, number)); + ArchiveFile(SSTTableFileName(dbname_, number)); + Log(options_.info_log, "Table #%llu: dropped: %s", + (unsigned long long) t.meta.number, + status.ToString().c_str()); + return; } - if (status.ok()) { - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), t->meta.number, t->meta.file_size); - bool empty = true; - ParsedInternalKey parsed; - t->max_sequence = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { - Log(options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t->meta.number, - EscapeString(key).c_str()); - continue; - } - counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } + // Extract metadata by scanning through table. + int counter = 0; + Iterator* iter = NewTableIterator(t.meta); + bool empty = true; + ParsedInternalKey parsed; + t.max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t.meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t.meta.smallest.DecodeFrom(key); } - if (!iter->status().ok()) { - status = iter->status(); + t.meta.largest.DecodeFrom(key); + if (parsed.sequence > t.max_sequence) { + t.max_sequence = parsed.sequence; } - delete iter; } - // If there was trouble opening an .sst file this will report that the .ldb - // file was not found, which is kind of lame but shouldn't happen often. + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; Log(options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, + (unsigned long long) t.meta.number, counter, status.ToString().c_str()); - return status; + + if (status.ok()) { + tables_.push_back(t); + } else { + RepairTable(fname, t); // RepairTable archives input file. + } + } + + void RepairTable(const std::string& src, TableInfo t) { + // We will copy src contents to a new table and then rename the + // new table over the source. + + // Create builder. + std::string copy = TableFileName(dbname_, next_file_number_++); + WritableFile* file; + Status s = env_->NewWritableFile(copy, &file); + if (!s.ok()) { + return; + } + TableBuilder* builder = new TableBuilder(options_, file); + + // Copy data. + Iterator* iter = NewTableIterator(t.meta); + int counter = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + builder->Add(iter->key(), iter->value()); + counter++; + } + delete iter; + + ArchiveFile(src); + if (counter == 0) { + builder->Abandon(); // Nothing to save + } else { + s = builder->Finish(); + if (s.ok()) { + t.meta.file_size = builder->FileSize(); + } + } + delete builder; + builder = NULL; + + if (s.ok()) { + s = file->Close(); + } + delete file; + file = NULL; + + if (counter > 0 && s.ok()) { + std::string orig = TableFileName(dbname_, t.meta.number); + s = env_->RenameFile(copy, orig); + if (s.ok()) { + Log(options_.info_log, "Table #%llu: %d entries repaired", + (unsigned long long) t.meta.number, counter); + tables_.push_back(t); + } + } + if (!s.ok()) { + env_->DeleteFile(copy); + } } Status WriteDescriptor() { diff --git a/db/version_set.cc b/db/version_set.cc index 66d73be..517edd3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -876,12 +876,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { } if (!s.ok()) { Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); - if (ManifestContains(record)) { - Log(options_->info_log, - "MANIFEST contains log record despite error; advancing to new " - "version to prevent mismatch between in-memory and logged state"); - s = Status::OK(); - } } } @@ -889,8 +883,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { // new CURRENT file that points to it. if (s.ok() && !new_manifest_file.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); - // No need to double-check MANIFEST in case of error since it - // will be discarded below. } mu->Lock(); @@ -1124,31 +1116,6 @@ const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } -// Return true iff the manifest contains the specified record. -bool VersionSet::ManifestContains(const std::string& record) const { - std::string fname = DescriptorFileName(dbname_, manifest_file_number_); - Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); - SequentialFile* file = NULL; - Status s = env_->NewSequentialFile(fname, &file); - if (!s.ok()) { - Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - return false; - } - log::Reader reader(file, NULL, true/*checksum*/, 0); - Slice r; - std::string scratch; - bool result = false; - while (reader.ReadRecord(&r, &scratch)) { - if (r == Slice(record)) { - result = true; - break; - } - } - delete file; - Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); - return result; -} - uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; for (int level = 0; level < config::kNumLevels; level++) { diff --git a/db/version_set.h b/db/version_set.h index 20de0e2..8dc14b8 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -292,8 +292,6 @@ class VersionSet { void AppendVersion(Version* v); - bool ManifestContains(const std::string& record) const; - Env* const env_; const std::string dbname_; const Options* const options_; diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 259a81f..5ffb29d 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 14; +static const int kMinorVersion = 15; struct Options; struct ReadOptions; diff --git a/include/leveldb/env.h b/include/leveldb/env.h index fa32289..b2072d0 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -13,9 +13,9 @@ #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_ -#include #include #include +#include #include #include "leveldb/status.h" diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h index e17bf43..a9866b2 100644 --- a/port/atomic_pointer.h +++ b/port/atomic_pointer.h @@ -50,6 +50,13 @@ namespace port { // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx #define LEVELDB_HAVE_MEMORY_BARRIER +// Mac OS +#elif defined(OS_MACOSX) +inline void MemoryBarrier() { + OSMemoryBarrier(); +} +#define LEVELDB_HAVE_MEMORY_BARRIER + // Gcc on x86 #elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) inline void MemoryBarrier() { @@ -68,13 +75,6 @@ inline void MemoryBarrier() { } #define LEVELDB_HAVE_MEMORY_BARRIER -// Mac OS -#elif defined(OS_MACOSX) -inline void MemoryBarrier() { - OSMemoryBarrier(); -} -#define LEVELDB_HAVE_MEMORY_BARRIER - // ARM Linux #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) typedef void (*LinuxKernelMemoryBarrierFunc)(void); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index 3a2a07c..8c4a474 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -29,7 +29,7 @@ class TestHashFilter : public FilterPolicy { virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { uint32_t h = Hash(key.data(), key.size(), 1); - for (int i = 0; i + 4 <= filter.size(); i += 4) { + for (size_t i = 0; i + 4 <= filter.size(); i += 4) { if (h == DecodeFixed32(filter.data() + i)) { return true; } diff --git a/util/arena.h b/util/arena.h index 8f7dde2..73bbf1c 100644 --- a/util/arena.h +++ b/util/arena.h @@ -5,9 +5,9 @@ #ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ #define STORAGE_LEVELDB_UTIL_ARENA_H_ -#include #include #include +#include #include namespace leveldb { diff --git a/util/arena_test.cc b/util/arena_test.cc index 63d1778..58e870e 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -40,7 +40,7 @@ TEST(ArenaTest, Simple) { r = arena.Allocate(s); } - for (int b = 0; b < s; b++) { + for (size_t b = 0; b < s; b++) { // Fill the "i"th allocation with a known bit pattern r[b] = i % 256; } @@ -51,10 +51,10 @@ TEST(ArenaTest, Simple) { ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); } } - for (int i = 0; i < allocated.size(); i++) { + for (size_t i = 0; i < allocated.size(); i++) { size_t num_bytes = allocated[i].first; const char* p = allocated[i].second; - for (int b = 0; b < num_bytes; b++) { + for (size_t b = 0; b < num_bytes; b++) { // Check the "i"th allocation for the known bit pattern ASSERT_EQ(int(p[b]) & 0xff, i % 256); } diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 0bf8e8d..77fb1b3 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -126,7 +126,8 @@ TEST(BloomTest, VaryingLengths) { } Build(); - ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length; + ASSERT_LE(FilterSize(), static_cast((length * 10 / 8) + 40)) + << length; // All added keys must match for (int i = 0; i < length; i++) { diff --git a/util/coding_test.cc b/util/coding_test.cc index fb5726e..521541e 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -112,13 +112,13 @@ TEST(Coding, Varint64) { } std::string s; - for (int i = 0; i < values.size(); i++) { + for (size_t i = 0; i < values.size(); i++) { PutVarint64(&s, values[i]); } const char* p = s.data(); const char* limit = p + s.size(); - for (int i = 0; i < values.size(); i++) { + for (size_t i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); uint64_t actual; const char* start = p; @@ -143,7 +143,7 @@ TEST(Coding, Varint32Truncation) { std::string s; PutVarint32(&s, large_value); uint32_t result; - for (int len = 0; len < s.size() - 1; len++) { + for (size_t len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); } ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); @@ -162,7 +162,7 @@ TEST(Coding, Varint64Truncation) { std::string s; PutVarint64(&s, large_value); uint64_t result; - for (int len = 0; len < s.size() - 1; len++) { + for (size_t len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); } ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL); diff --git a/util/env_posix.cc b/util/env_posix.cc index 3e2925d..e1cbebd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -175,147 +175,43 @@ class PosixMmapReadableFile: public RandomAccessFile { } }; -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. -class PosixMmapFile : public WritableFile { +class PosixWritableFile : public WritableFile { private: std::string filename_; - int fd_; - size_t page_size_; - size_t map_size_; // How much extra memory to map at a time - char* base_; // The mapped region - char* limit_; // Limit of the mapped region - char* dst_; // Where to write next (in range [base_,limit_]) - char* last_sync_; // Where have we synced up to - uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? - bool pending_sync_; - - // Roundup x to a multiple of y - static size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; - } - - size_t TruncateToPageBoundary(size_t s) { - s -= (s & (page_size_ - 1)); - assert((s % page_size_) == 0); - return s; - } - - bool UnmapCurrentRegion() { - bool result = true; - if (base_ != NULL) { - if (last_sync_ < limit_) { - // Defer syncing this data until next Sync() call, if any - pending_sync_ = true; - } - if (munmap(base_, limit_ - base_) != 0) { - result = false; - } - file_offset_ += limit_ - base_; - base_ = NULL; - limit_ = NULL; - last_sync_ = NULL; - dst_ = NULL; - - // Increase the amount we map the next time, but capped at 1MB - if (map_size_ < (1<<20)) { - map_size_ *= 2; - } - } - return result; - } - - bool MapNewRegion() { - assert(base_ == NULL); - if (ftruncate(fd_, file_offset_ + map_size_) < 0) { - return false; - } - void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_, file_offset_); - if (ptr == MAP_FAILED) { - return false; - } - base_ = reinterpret_cast(ptr); - limit_ = base_ + map_size_; - dst_ = base_; - last_sync_ = base_; - return true; - } + FILE* file_; public: - PosixMmapFile(const std::string& fname, int fd, size_t page_size) - : filename_(fname), - fd_(fd), - page_size_(page_size), - map_size_(Roundup(65536, page_size)), - base_(NULL), - limit_(NULL), - dst_(NULL), - last_sync_(NULL), - file_offset_(0), - pending_sync_(false) { - assert((page_size & (page_size - 1)) == 0); - } - - - ~PosixMmapFile() { - if (fd_ >= 0) { - PosixMmapFile::Close(); + PosixWritableFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + + ~PosixWritableFile() { + if (file_ != NULL) { + // Ignoring any potential errors + fclose(file_); } } virtual Status Append(const Slice& data) { - const char* src = data.data(); - size_t left = data.size(); - while (left > 0) { - assert(base_ <= dst_); - assert(dst_ <= limit_); - size_t avail = limit_ - dst_; - if (avail == 0) { - if (!UnmapCurrentRegion() || - !MapNewRegion()) { - return IOError(filename_, errno); - } - } - - size_t n = (left <= avail) ? left : avail; - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; + size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); + if (r != data.size()) { + return IOError(filename_, errno); } return Status::OK(); } virtual Status Close() { - Status s; - size_t unused = limit_ - dst_; - if (!UnmapCurrentRegion()) { - s = IOError(filename_, errno); - } else if (unused > 0) { - // Trim the extra space at the end of the file - if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = IOError(filename_, errno); - } - } - - if (close(fd_) < 0) { - if (s.ok()) { - s = IOError(filename_, errno); - } + Status result; + if (fclose(file_) != 0) { + result = IOError(filename_, errno); } - - fd_ = -1; - base_ = NULL; - limit_ = NULL; - return s; + file_ = NULL; + return result; } virtual Status Flush() { + if (fflush_unlocked(file_) != 0) { + return IOError(filename_, errno); + } return Status::OK(); } @@ -352,26 +248,10 @@ class PosixMmapFile : public WritableFile { if (!s.ok()) { return s; } - - if (pending_sync_) { - // Some unmapped data was not synced - pending_sync_ = false; - if (fdatasync(fd_) < 0) { - s = IOError(filename_, errno); - } + if (fflush_unlocked(file_) != 0 || + fdatasync(fileno(file_)) != 0) { + s = Status::IOError(filename_, strerror(errno)); } - - if (dst_ > last_sync_) { - // Find the beginnings of the pages that contain the first and last - // bytes to be synced. - size_t p1 = TruncateToPageBoundary(last_sync_ - base_); - size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); - last_sync_ = dst_; - if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - s = IOError(filename_, errno); - } - } - return s; } }; @@ -462,12 +342,12 @@ class PosixEnv : public Env { virtual Status NewWritableFile(const std::string& fname, WritableFile** result) { Status s; - const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == NULL) { *result = NULL; s = IOError(fname, errno); } else { - *result = new PosixMmapFile(fname, fd, page_size_); + *result = new PosixWritableFile(fname, f); } return s; } @@ -630,7 +510,6 @@ class PosixEnv : public Env { return NULL; } - size_t page_size_; pthread_mutex_t mu_; pthread_cond_t bgsignal_; pthread_t bgthread_; @@ -645,8 +524,7 @@ class PosixEnv : public Env { MmapLimiter mmap_limit_; }; -PosixEnv::PosixEnv() : page_size_(getpagesize()), - started_bgthread_(false) { +PosixEnv::PosixEnv() : started_bgthread_(false) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); } diff --git a/util/testharness.cc b/util/testharness.cc index eb1bdd5..402fab3 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -38,7 +38,7 @@ int RunAllTests() { int num = 0; if (tests != NULL) { - for (int i = 0; i < tests->size(); i++) { + for (size_t i = 0; i < tests->size(); i++) { const Test& t = (*tests)[i]; if (matcher != NULL) { std::string name = t.base; diff --git a/util/testutil.cc b/util/testutil.cc index 538d095..bee56bf 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -32,7 +32,7 @@ std::string RandomKey(Random* rnd, int len) { extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst) { + size_t len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; std::string raw_data; diff --git a/util/testutil.h b/util/testutil.h index 824e655..adad3fc 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -24,7 +24,7 @@ extern std::string RandomKey(Random* rnd, int len); // "N*compressed_fraction" bytes and return a Slice that references // the generated data. extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst); + size_t len, std::string* dst); // A wrapper that allows injection of errors. class ErrorEnv : public EnvWrapper {