git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529
@ -0,0 +1,8 @@ | |||
# Names should be added to this file like so: | |||
# Name or Organization <email address> | |||
Google Inc. | |||
# Initial version authors: | |||
Jeffrey Dean <jeff@google.com> | |||
Sanjay Ghemawat <sanjay@google.com> |
@ -0,0 +1,64 @@ | |||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
# Use of this source code is governed by a BSD-style license that can be | |||
# found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
# INSTRUCTIONS | |||
# After you've downloaded and installed the Android NDK from: | |||
# http://developer.android.com/sdk/ndk/index.html | |||
# 1. In the same directory as this file, Android.mk, type: | |||
# $ ln -s leveldb ../jni | |||
# (The Android NDK will only build native projects in | |||
# subdirectories named "jni".) | |||
# 2. $ cd .. | |||
# 3. Execute ndk-build: | |||
# $ $(ANDROID_NDK_DIR)/ndk-build | |||
LOCAL_PATH := $(call my-dir)

include $(CLEAR_VARS)

LOCAL_MODULE := leveldb

# Build flags:
# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h
LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x

# Headers are included as "db/...", "table/...", etc., relative to the
# project root two directories up from this file.
LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../

LOCAL_CPP_EXTENSION := .cc

# NOTE(review): db_bench.cc defines main(); confirm it is intentional to
# compile a benchmark driver into the shared library.
LOCAL_SRC_FILES := ./db/builder.cc \
./db/db_bench.cc \
./db/db_impl.cc \
./db/db_iter.cc \
./db/filename.cc \
./db/dbformat.cc \
./db/log_reader.cc \
./db/log_writer.cc \
./db/memtable.cc \
./db/repair.cc \
./db/table_cache.cc \
./db/version_edit.cc \
./db/version_set.cc \
./db/write_batch.cc \
./port/port_android.cc \
./table/block.cc \
./table/block_builder.cc \
./table/format.cc \
./table/iterator.cc \
./table/merger.cc \
./table/table.cc \
./table/table_builder.cc \
./table/two_level_iterator.cc \
./util/arena.cc \
./util/cache.cc \
./util/coding.cc \
./util/comparator.cc \
./util/crc32c.cc \
./util/env.cc \
./util/env_posix.cc \
./util/hash.cc \
./util/histogram.cc \
./util/logging.cc \
./util/options.cc \
./util/status.cc \
./util/testharness.cc \
./util/testutil.cc

include $(BUILD_SHARED_LIBRARY)
@ -0,0 +1,6 @@ | |||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
# Use of this source code is governed by a BSD-style license that can be | |||
# found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
# Target ARMv7-A devices and link the GNU STL statically so the shared
# library carries no dependency on a system C++ runtime.
APP_ABI := armeabi-v7a
APP_STL := gnustl_static
@ -0,0 +1,27 @@ | |||
Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
* Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
* Redistributions in binary form must reproduce the above | |||
copyright notice, this list of conditions and the following disclaimer | |||
in the documentation and/or other materials provided with the | |||
distribution. | |||
* Neither the name of Google Inc. nor the names of its | |||
contributors may be used to endorse or promote products derived from | |||
this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,134 @@ | |||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
# Use of this source code is governed by a BSD-style license that can be | |||
# found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
CC = g++

# Uncomment one of the following to switch between debug and opt mode
#OPT = -O2 -DNDEBUG
OPT = -g2

CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -std=c++0x $(OPT)
LDFLAGS=-lpthread

# Object files for the core library.
# Fixed: was ./db/format.o, but no db/format.cc exists; the source file is
# db/dbformat.cc (see dbformat_test below and Android.mk's source list).
LIBOBJECTS = \
	./db/builder.o \
	./db/db_impl.o \
	./db/db_iter.o \
	./db/filename.o \
	./db/dbformat.o \
	./db/log_reader.o \
	./db/log_writer.o \
	./db/memtable.o \
	./db/repair.o \
	./db/table_cache.o \
	./db/version_edit.o \
	./db/version_set.o \
	./db/write_batch.o \
	./port/port_posix.o \
	./port/sha1_portable.o \
	./table/block.o \
	./table/block_builder.o \
	./table/format.o \
	./table/iterator.o \
	./table/merger.o \
	./table/table.o \
	./table/table_builder.o \
	./table/two_level_iterator.o \
	./util/arena.o \
	./util/cache.o \
	./util/coding.o \
	./util/comparator.o \
	./util/crc32c.o \
	./util/env.o \
	./util/env_posix.o \
	./util/hash.o \
	./util/histogram.o \
	./util/logging.o \
	./util/options.o \
	./util/status.o

TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)

TESTS = \
	arena_test \
	cache_test \
	coding_test \
	corruption_test \
	crc32c_test \
	db_test \
	dbformat_test \
	env_test \
	filename_test \
	log_test \
	sha1_test \
	skiplist_test \
	table_test \
	version_edit_test \
	write_batch_test

PROGRAMS = db_bench $(TESTS)

# These targets name actions, not files on disk.
.PHONY: all check clean

all: $(PROGRAMS)

# Run every test binary, aborting on the first failure.
check: $(TESTS)
	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done

clean:
	rm -f $(PROGRAMS) */*.o

db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
	$(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@

arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@

# Implicit rule: compile any .cc into its .o.
.cc.o:
	$(CC) $(CFLAGS) $< -o $@

# TODO(gabor): dependencies for .o files
# TODO(gabor): Build library
@ -0,0 +1,51 @@ | |||
leveldb: A key-value store | |||
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) | |||
The code under this directory implements a system for maintaining a | |||
persistent key/value store. | |||
See doc/index.html for more explanation. | |||
See doc/db_layout.txt for a brief overview of the implementation. | |||
The public interface is in include/*.h. Callers should not include or | |||
rely on the details of any other header files in this package. Those | |||
internal APIs may be changed without warning. | |||
Guide to header files: | |||
include/db.h | |||
Main interface to the DB: Start here | |||
include/options.h | |||
Control over the behavior of an entire database, and also | |||
control over the behavior of individual reads and writes. | |||
include/comparator.h | |||
Abstraction for user-specified comparison function. If you want | |||
just bytewise comparison of keys, you can use the default comparator, | |||
but clients can write their own comparator implementations if they | |||
want custom ordering (e.g. to handle different character | |||
encodings, etc.) | |||
include/iterator.h | |||
Interface for iterating over data. You can get an iterator | |||
from a DB object. | |||
include/write_batch.h | |||
Interface for atomically applying multiple updates to a database. | |||
include/slice.h | |||
A simple module for maintaining a pointer and a length into some | |||
other byte array. | |||
include/status.h | |||
Status is returned from many of the public interfaces and is used | |||
to report success and various kinds of errors. | |||
include/env.h | |||
Abstraction of the OS environment. A posix implementation of | |||
this interface is in util/env_posix.cc | |||
include/table.h | |||
include/table_builder.h | |||
Lower-level modules that most clients probably won't use directly |
@ -0,0 +1,23 @@ | |||
Before adding to chrome | |||
----------------------- | |||
- multi-threaded test/benchmark | |||
- Allow missing crc32c in Table format? | |||
Maybe afterwards | |||
---------------- | |||
ss | |||
- Stats | |||
- Speed up backwards scan (avoid three passes over data) | |||
db | |||
- Maybe implement DB::BulkDeleteForRange(start_key, end_key) | |||
that would blow away files whose ranges are entirely contained | |||
within [start_key..end_key]? For Chrome, deletion of obsolete | |||
object stores, etc. can be done in the background anyway, so | |||
probably not that important. | |||
api changes? | |||
- Efficient large value reading and writing | |||
Faster Get implementation |
@ -0,0 +1,97 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/builder.h" | |||
#include "db/filename.h" | |||
#include "db/dbformat.h" | |||
#include "db/table_cache.h" | |||
#include "db/version_edit.h" | |||
#include "include/db.h" | |||
#include "include/env.h" | |||
#include "include/iterator.h" | |||
namespace leveldb { | |||
// Dump the contents of *iter into a new Table file named after
// meta->number. Fills in *meta, records large-value references and the
// new file in *edit, and deletes the file again on failure or when the
// iterator was empty (meta->file_size stays 0 in that case).
Status BuildTable(const std::string& dbname,
                  Env* env,
                  const Options& options,
                  TableCache* table_cache,
                  Iterator* iter,
                  FileMetaData* meta,
                  VersionEdit* edit) {
  Status status;
  meta->file_size = 0;
  iter->SeekToFirst();

  const std::string fname = TableFileName(dbname, meta->number);
  if (iter->Valid()) {
    WritableFile* outfile;
    status = env->NewWritableFile(fname, &outfile);
    if (!status.ok()) {
      return status;
    }

    TableBuilder* table_builder = new TableBuilder(options, outfile);
    meta->smallest.DecodeFrom(iter->key());
    while (iter->Valid()) {
      Slice key = iter->key();
      // Keys arrive in sorted order, so the last one seen is the largest.
      meta->largest.DecodeFrom(key);
      if (ExtractValueType(key) == kTypeLargeValueRef) {
        if (iter->value().size() != LargeValueRef::ByteSize()) {
          status = Status::Corruption("invalid indirect reference hash value (L0)");
          break;
        }
        edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
                               meta->number,
                               iter->key());
      }
      table_builder->Add(key, iter->value());
      iter->Next();
    }

    // Finish and check for builder errors
    if (status.ok()) {
      status = table_builder->Finish();
      if (status.ok()) {
        meta->file_size = table_builder->FileSize();
        assert(meta->file_size > 0);
      }
    } else {
      table_builder->Abandon();
    }
    delete table_builder;

    // Finish and check for file errors
    if (status.ok()) {
      status = outfile->Sync();
    }
    if (status.ok()) {
      status = outfile->Close();
    }
    delete outfile;
    outfile = NULL;

    if (status.ok()) {
      // Verify that the table is usable
      Iterator* verify_iter =
          table_cache->NewIterator(ReadOptions(), meta->number);
      status = verify_iter->status();
      delete verify_iter;
    }
  }

  // An error on the input iterator overrides any build status.
  if (!iter->status().ok()) {
    status = iter->status();
  }

  if (status.ok() && meta->file_size > 0) {
    // Keep the new file and register it at level 0.
    edit->AddFile(0, meta->number, meta->file_size,
                  meta->smallest, meta->largest);
  } else {
    env->DeleteFile(fname);
  }
  return status;
}
} |
@ -0,0 +1,36 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
#define STORAGE_LEVELDB_DB_BUILDER_H_

#include "include/status.h"

namespace leveldb {

// Forward declarations keep this header light; callers only pass
// pointers/references to these types.
struct Options;
struct FileMetaData;

class Env;
class Iterator;
class TableCache;
class VersionEdit;

// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table, and
// large value refs and the added file information will be added to
// *edit. If no data is present in *iter, meta->file_size will be set
// to zero, and no Table file will be produced.
// On failure the partially-written file is deleted.
extern Status BuildTable(const std::string& dbname,
                         Env* env,
                         const Options& options,
                         TableCache* table_cache,
                         Iterator* iter,
                         FileMetaData* meta,
                         VersionEdit* edit);

}

#endif  // STORAGE_LEVELDB_DB_BUILDER_H_
@ -0,0 +1,366 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/db.h"

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

#include "include/env.h"
#include "include/table.h"
#include "include/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb { | |||
static const int kValueSize = 1000; | |||
class CorruptionTest { | |||
public: | |||
test::ErrorEnv env_; | |||
Random rnd_; | |||
std::string dbname_; | |||
Options options_; | |||
DB* db_; | |||
CorruptionTest() : rnd_(test::RandomSeed()) { | |||
options_.env = &env_; | |||
dbname_ = test::TmpDir() + "/db_test"; | |||
DestroyDB(dbname_, options_); | |||
db_ = NULL; | |||
options_.create_if_missing = true; | |||
Reopen(); | |||
options_.create_if_missing = false; | |||
} | |||
~CorruptionTest() { | |||
delete db_; | |||
DestroyDB(dbname_, Options()); | |||
} | |||
Status TryReopen(Options* options = NULL) { | |||
delete db_; | |||
db_ = NULL; | |||
Options opt = (options ? *options : options_); | |||
opt.env = &env_; | |||
return DB::Open(opt, dbname_, &db_); | |||
} | |||
void Reopen(Options* options = NULL) { | |||
ASSERT_OK(TryReopen(options)); | |||
} | |||
void RepairDB() { | |||
delete db_; | |||
db_ = NULL; | |||
ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); | |||
} | |||
void Build(int n) { | |||
std::string key_space, value_space; | |||
WriteBatch batch; | |||
for (int i = 0; i < n; i++) { | |||
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); | |||
Slice key = Key(i, &key_space); | |||
batch.Clear(); | |||
batch.Put(key, Value(i, &value_space)); | |||
ASSERT_OK(db_->Write(WriteOptions(), &batch)); | |||
} | |||
} | |||
void Check(int min_expected, int max_expected) { | |||
int next_expected = 0; | |||
int missed = 0; | |||
int bad_keys = 0; | |||
int bad_values = 0; | |||
int correct = 0; | |||
std::string value_space; | |||
Iterator* iter = db_->NewIterator(ReadOptions()); | |||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { | |||
uint64_t key; | |||
Slice in(iter->key()); | |||
if (!ConsumeDecimalNumber(&in, &key) || | |||
!in.empty() || | |||
key < next_expected) { | |||
bad_keys++; | |||
continue; | |||
} | |||
missed += (key - next_expected); | |||
next_expected = key + 1; | |||
if (iter->value() != Value(key, &value_space)) { | |||
bad_values++; | |||
} else { | |||
correct++; | |||
} | |||
} | |||
delete iter; | |||
fprintf(stderr, | |||
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", | |||
min_expected, max_expected, correct, bad_keys, bad_values, missed); | |||
ASSERT_LE(min_expected, correct); | |||
ASSERT_GE(max_expected, correct); | |||
} | |||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { | |||
// Pick file to corrupt | |||
std::vector<std::string> filenames; | |||
ASSERT_OK(env_.GetChildren(dbname_, &filenames)); | |||
uint64_t number; | |||
LargeValueRef large_ref; | |||
FileType type; | |||
std::vector<std::string> candidates; | |||
for (int i = 0; i < filenames.size(); i++) { | |||
if (ParseFileName(filenames[i], &number, &large_ref, &type) && | |||
type == filetype) { | |||
candidates.push_back(dbname_ + "/" + filenames[i]); | |||
} | |||
} | |||
ASSERT_TRUE(!candidates.empty()) << filetype; | |||
std::string fname = candidates[rnd_.Uniform(candidates.size())]; | |||
struct stat sbuf; | |||
if (stat(fname.c_str(), &sbuf) != 0) { | |||
const char* msg = strerror(errno); | |||
ASSERT_TRUE(false) << fname << ": " << msg; | |||
} | |||
if (offset < 0) { | |||
// Relative to end of file; make it absolute | |||
if (-offset > sbuf.st_size) { | |||
offset = 0; | |||
} else { | |||
offset = sbuf.st_size + offset; | |||
} | |||
} | |||
if (offset > sbuf.st_size) { | |||
offset = sbuf.st_size; | |||
} | |||
if (offset + bytes_to_corrupt > sbuf.st_size) { | |||
bytes_to_corrupt = sbuf.st_size - offset; | |||
} | |||
// Do it | |||
std::string contents; | |||
Status s = ReadFileToString(Env::Default(), fname, &contents); | |||
ASSERT_TRUE(s.ok()) << s.ToString(); | |||
for (int i = 0; i < bytes_to_corrupt; i++) { | |||
contents[i + offset] ^= 0x80; | |||
} | |||
s = WriteStringToFile(Env::Default(), contents, fname); | |||
ASSERT_TRUE(s.ok()) << s.ToString(); | |||
} | |||
uint64_t Property(const std::string& name) { | |||
uint64_t result; | |||
if (!db_->GetProperty(name, &result)) { | |||
result = ~static_cast<uint64_t>(0); | |||
} | |||
return result; | |||
} | |||
// Return the ith key | |||
Slice Key(int i, std::string* storage) { | |||
char buf[100]; | |||
snprintf(buf, sizeof(buf), "%016d", i); | |||
storage->assign(buf, strlen(buf)); | |||
return Slice(*storage); | |||
} | |||
// Return the value to associate with the specified key | |||
Slice Value(int k, std::string* storage) { | |||
Random r(k); | |||
return test::RandomString(&r, kValueSize, storage); | |||
} | |||
}; | |||
// Corrupt two spots in the log; recovery should drop exactly the two
// damaged records and keep the rest.
TEST(CorruptionTest, Recovery) {
  Build(10);
  Check(10, 10);
  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
  Corrupt(kLogFile, 2*kValueSize, 1);  // Somewhere in second log record?
  Reopen();
  Check(8, 8);
}

// Opening must fail cleanly when the Env cannot create writable files.
TEST(CorruptionTest, RecoverWriteError) {
  env_.writable_file_error_ = true;
  Status s = TryReopen();
  ASSERT_TRUE(!s.ok());
}

// Write failures during a forced minor compaction must surface as a
// failed Write(), and the DB must be reopenable afterwards.
TEST(CorruptionTest, NewFileErrorDuringWrite) {
  // Do enough writing to force minor compaction
  env_.writable_file_error_ = true;
  const int num = 3 + (Options().write_buffer_size / kValueSize);
  std::string value_storage;
  Status s;
  for (int i = 0; s.ok() && i < num; i++) {
    WriteBatch batch;
    batch.Put("a", Value(100, &value_storage));
    s = db_->Write(WriteOptions(), &batch);
  }
  ASSERT_TRUE(!s.ok());
  ASSERT_GE(env_.num_writable_file_errors_, 1);
  env_.writable_file_error_ = false;
  Reopen();
}

// One corrupted byte in a table file should lose at most one entry.
TEST(CorruptionTest, TableFile) {
  Build(100);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  dbi->TEST_CompactRange(0, "", "~");
  dbi->TEST_CompactRange(1, "", "~");

  Corrupt(kTableFile, 100, 1);
  Check(99, 99);
}
// Corrupting the index region (near end-of-file) of a table may lose a
// whole table's worth of entries, but never everything.
TEST(CorruptionTest, TableFileIndexData) {
  Build(10000);  // Enough to build multiple Tables
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  dbi->TEST_CompactRange(0, "", "~");
  dbi->TEST_CompactRange(1, "", "~");

  Corrupt(kTableFile, -1000, 500);
  Reopen();
  Check(5000, 9999);
}

// RepairDB must be able to reconstruct a usable descriptor from scratch.
TEST(CorruptionTest, MissingDescriptor) {
  Build(1000);
  RepairDB();
  Reopen();
  Check(1000, 1000);
}

// After repair, the last sequence number must be preserved so new writes
// are not hidden behind older versions of the same key.
TEST(CorruptionTest, SequenceNumberRecovery) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
  RepairDB();
  Reopen();
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v5", v);
  // Write something.  If sequence number was not recovered properly,
  // it will be hidden by an earlier write.
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v6", v);
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v6", v);
}

// Values stored indirectly as large-value files must survive RepairDB
// and repeated reopens.
TEST(CorruptionTest, LargeValueRecovery) {
  Options options;
  options.large_value_threshold = 10000;
  Reopen(&options);

  Random rnd(301);
  std::string big;
  ASSERT_OK(db_->Put(WriteOptions(),
                     "foo", test::RandomString(&rnd, 100000, &big)));
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ(big, v);

  RepairDB();
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ(big, v);

  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ(big, v);
}
// A trashed descriptor must make Open fail, and RepairDB must bring the
// data back.
TEST(CorruptionTest, CorruptedDescriptor) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  dbi->TEST_CompactRange(0, "", "~");

  Corrupt(kDescriptorFile, 0, 1000);
  Status s = TryReopen();
  ASSERT_TRUE(!s.ok());

  RepairDB();
  Reopen();
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("hello", v);
}

// A corrupted level-0 input should be dropped by compaction; subsequent
// writes and compactions must still make progress.
TEST(CorruptionTest, CompactionInputError) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Force compactions by writing lots of values
  Build(10000);
  Check(10000, 10000);
  dbi->TEST_CompactRange(0, "", "~");
  ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
}

// With paranoid_checks on, corruption must eventually turn into a hard
// write error rather than being silently dropped.
TEST(CorruptionTest, CompactionInputErrorParanoid) {
  Options options;
  options.paranoid_checks = true;
  Reopen(&options);

  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Write must eventually fail because of corrupted table
  Status s;
  std::string tmp1, tmp2;
  for (int i = 0; i < 10000 && s.ok(); i++) {
    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
  }
  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}

// Corruption in one table must not affect reads/writes of unrelated keys.
TEST(CorruptionTest, UnrelatedKeys) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  Corrupt(kTableFile, 100, 1);

  std::string tmp1, tmp2;
  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
  dbi->TEST_CompactMemTable();
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
} | |||
// Test driver: runs every TEST(CorruptionTest, ...) registered above.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,376 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include <sys/types.h> | |||
#include <stdio.h> | |||
#include <stdlib.h> | |||
#include "db/db_impl.h" | |||
#include "db/version_set.h" | |||
#include "include/cache.h" | |||
#include "include/db.h" | |||
#include "include/env.h" | |||
#include "include/write_batch.h" | |||
#include "util/histogram.h" | |||
#include "util/random.h" | |||
#include "util/testutil.h" | |||
// Comma-separated list of operations to run in the specified order | |||
// Actual benchmarks: | |||
// writeseq -- write N values in sequential key order | |||
// writerandom -- write N values in random key order | |||
//   writebig    -- write N/1000 100K values in random order
// readseq -- read N values sequentially | |||
// readrandom -- read N values in random order | |||
// Meta operations: | |||
// compact -- Compact the entire DB | |||
// heapprofile -- Dump a heap profile (if supported by this port) | |||
// sync -- switch to synchronous writes (not the default) | |||
// nosync -- switch to asynchronous writes (the default) | |||
// tenth -- divide N by 10 (i.e., following benchmarks are smaller) | |||
// normal -- reset N back to its normal value (1000000) | |||
// Default benchmark sequence.
// NOTE(review): "writeseq" appears twice — presumably the second run
// measures overwriting an existing sequential DB; confirm intent.
static const char* FLAGS_benchmarks =
    "writeseq,"
    "writeseq,"
    "writerandom,"
    "sync,tenth,tenth,writerandom,nosync,normal,"
    "readseq,"
    "readrandom,"
    "compact,"
    "readseq,"
    "readrandom,"
    "writebig";

// Number of key/values to place in database
static int FLAGS_num = 1000000;

// Size of each value
static int FLAGS_value_size = 100;

// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.25;

// Print histogram of operation timings
static bool FLAGS_histogram = false;

// Number of bytes to buffer in memtable before compacting
static int FLAGS_write_buffer_size = 1 << 20;
namespace leveldb { | |||
// Helper for quickly generating random data. | |||
namespace { | |||
class RandomGenerator { | |||
private: | |||
std::string data_; | |||
int pos_; | |||
public: | |||
RandomGenerator() { | |||
// We use a limited amount of data over and over again and ensure | |||
// that it is larger than the compression window (32KB), and also | |||
// large enough to serve all typical value sizes we want to write. | |||
Random rnd(301); | |||
std::string piece; | |||
while (data_.size() < 1048576) { | |||
// Add a short fragment that is as compressible as specified | |||
// by FLAGS_compression_ratio. | |||
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); | |||
data_.append(piece); | |||
} | |||
pos_ = 0; | |||
} | |||
Slice Generate(int len) { | |||
if (pos_ + len > data_.size()) { | |||
pos_ = 0; | |||
assert(len < data_.size()); | |||
} | |||
pos_ += len; | |||
return Slice(data_.data() + pos_ - len, len); | |||
} | |||
}; | |||
} | |||
class Benchmark { | |||
 private:
  Cache* cache_;           // Block cache shared across benchmark runs
  DB* db_;
  int num_;                // Op count for the next benchmark
  bool sync_;              // Use synchronous writes?
  int heap_counter_;       // Sequence number for heap-profile dumps
  double start_;           // Wall-clock start of current benchmark (seconds)
  double last_op_finish_;  // Completion time of previous op (seconds)
  int64_t bytes_;          // Bytes processed; drives the MB/s report
  std::string message_;    // Extra text appended to the report line
  Histogram hist_;
  RandomGenerator gen_;
  Random rand_;

  // State kept for progress messages
  int done_;
  int next_report_;     // When to report next
  // Reset all per-benchmark counters and start the clock.
  void Start() {
    start_ = Env::Default()->NowMicros() * 1e-6;
    bytes_ = 0;
    message_.clear();
    last_op_finish_ = start_;
    hist_.Clear();
    done_ = 0;
    next_report_ = 100;
  }

  // Record one completed op: feeds the latency histogram (when enabled)
  // and prints a progress line at exponentially widening intervals.
  void FinishedSingleOp() {
    if (FLAGS_histogram) {
      double now = Env::Default()->NowMicros() * 1e-6;
      double micros = (now - last_op_finish_) * 1e6;
      hist_.Add(micros);
      if (micros > 20000) {
        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_++;
    if (done_ >= next_report_) {
      if      (next_report_ < 1000)   next_report_ += 100;
      else if (next_report_ < 10000)  next_report_ += 1000;
      else if (next_report_ < 100000) next_report_ += 10000;
      else                            next_report_ += 100000;
      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
      fflush(stderr);
    }
  }

  // Stop the clock and print the summary line (micros/op, optional MB/s,
  // optional histogram) for the benchmark named 'name'.
  void Stop(const Slice& name) {
    double finish = Env::Default()->NowMicros() * 1e-6;

    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedSingleOp().
    if (done_ < 1) done_ = 1;

    if (bytes_ > 0) {
      char rate[100];
      snprintf(rate, sizeof(rate), "%5.1f MB/s",
               (bytes_ / 1048576.0) / (finish - start_));
      if (!message_.empty()) {
        message_.push_back(' ');
      }
      message_.append(rate);
    }

    fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n",
            name.ToString().c_str(),
            (finish - start_) * 1e6 / done_,
            (message_.empty() ? "" : " "),
            message_.c_str());
    if (FLAGS_histogram) {
      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
    }
    fflush(stdout);
  }
 public:
  enum Order { SEQUENTIAL, RANDOM };

  // Allocates a 200MB block cache and wipes any previous /tmp/dbbench
  // state, including leftover heap-profile dumps.
  Benchmark() : cache_(NewLRUCache(200<<20)),
                db_(NULL),
                num_(FLAGS_num),
                sync_(false),
                heap_counter_(0),
                bytes_(0),
                rand_(301) {
    std::vector<std::string> files;
    Env::Default()->GetChildren("/tmp/dbbench", &files);
    for (int i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
      }
    }
    DestroyDB("/tmp/dbbench", Options());
  }

  ~Benchmark() {
    delete db_;
    delete cache_;
  }
// Opens the database and executes each comma-separated benchmark named
// in FLAGS_benchmarks, in order.  Some names ("sync", "nosync", "tenth",
// "normal") are modifiers that adjust state for subsequent benchmarks
// rather than benchmarks themselves.  Unknown names print a warning and
// are otherwise ignored.
void Run() {
  Options options;
  options.create_if_missing = true;
  options.max_open_files = 10000;
  options.block_cache = cache_;
  options.write_buffer_size = FLAGS_write_buffer_size;
  Start();
  Status s = DB::Open(options, "/tmp/dbbench", &db_);
  Stop("open");  // the open itself is timed and reported
  if (!s.ok()) {
    fprintf(stderr, "open error: %s\n", s.ToString().c_str());
    exit(1);
  }
  const char* benchmarks = FLAGS_benchmarks;
  while (benchmarks != NULL) {
    // Split off the next comma-separated benchmark name.
    const char* sep = strchr(benchmarks, ',');
    Slice name;
    if (sep == NULL) {
      name = benchmarks;
      benchmarks = NULL;
    } else {
      name = Slice(benchmarks, sep - benchmarks);
      benchmarks = sep + 1;
    }
    Start();
    if (name == Slice("writeseq")) {
      Write(SEQUENTIAL, num_, FLAGS_value_size);
    } else if (name == Slice("writerandom")) {
      Write(RANDOM, num_, FLAGS_value_size);
    } else if (name == Slice("writebig")) {
      Write(RANDOM, num_ / 1000, 100 * 1000);
    } else if (name == Slice("readseq")) {
      Read(SEQUENTIAL);
    } else if (name == Slice("readrandom")) {
      Read(RANDOM);
    } else if (name == Slice("compact")) {
      Compact();
    } else if (name == Slice("heapprofile")) {
      HeapProfile();
    } else if (name == Slice("sync")) {
      sync_ = true;       // modifier: later writes use synchronous mode
    } else if (name == Slice("nosync")) {
      sync_ = false;      // modifier: later writes are asynchronous
    } else if (name == Slice("tenth")) {
      num_ = num_ / 10;   // modifier: shrink op count for slow benchmarks
    } else if (name == Slice("normal")) {
      num_ = FLAGS_num;   // modifier: restore the default op count
    } else {
      fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
    }
    Stop(name);
  }
}
// Writes num_entries values of value_size bytes each, one WriteBatch per
// entry.  SEQUENTIAL uses ascending keys; RANDOM draws keys uniformly
// from [0, FLAGS_num).  Honors the current sync_ mode and exits the
// process on any write error.
void Write(Order order, int num_entries, int value_size) {
  WriteBatch batch;
  Status s;
  WriteOptions options;
  options.sync = sync_;
  for (int i = 0; i < num_entries; i++) {
    const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
    char key[100];
    snprintf(key, sizeof(key), "%012d", k);
    batch.Clear();
    batch.Put(key, gen_.Generate(value_size));
    s = db_->Write(options, &batch);
    bytes_ += value_size + strlen(key);  // feed throughput stats for Stop()
    if (!s.ok()) {
      fprintf(stderr, "put error: %s\n", s.ToString().c_str());
      exit(1);
    }
    FinishedSingleOp();
  }
}
// Reads the database either by scanning the first num_ entries with an
// iterator (SEQUENTIAL) or by issuing num_ point lookups at uniformly
// random keys (RANDOM).  Byte counts are tracked only for the scan.
void Read(Order order) {
  ReadOptions options;
  if (order == SEQUENTIAL) {
    Iterator* iter = db_->NewIterator(options);
    int i = 0;
    for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
      bytes_ += iter->key().size() + iter->value().size();
      FinishedSingleOp();
      ++i;
    }
    delete iter;
  } else {
    std::string value;
    for (int i = 0; i < num_; i++) {
      char key[100];
      // order is always RANDOM in this branch, so pick a random key
      // directly (the original re-tested order == SEQUENTIAL here,
      // which was dead code).
      const int k = rand_.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%012d", k);
      db_->Get(options, key, &value);  // status ignored: best-effort benchmark
      FinishedSingleOp();
    }
  }
}
// Fully compacts the database: flushes the memtable, then compacts each
// level from 0 up to (but excluding) the deepest level that currently
// holds files, pushing all data downward.
void Compact() {
  // Safe: Run() obtained db_ from DB::Open, which produces a DBImpl.
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  int max_level_with_files = 1;
  for (int level = 1; level < config::kNumLevels; level++) {
    uint64_t v;
    char name[100];
    snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
    if (db_->GetProperty(name, &v) && v > 0) {
      max_level_with_files = level;
    }
  }
  for (int level = 0; level < max_level_with_files; level++) {
    // ["", "~"] covers all benchmark keys: Write() emits digit-only keys
    // and '~' sorts after '9' in ASCII.
    dbi->TEST_CompactRange(level, "", "~");
  }
}
// Callback handed to port::GetHeapProfile(): appends n profile bytes to
// the WritableFile smuggled through "arg".  static_cast (not
// reinterpret_cast) is the appropriate named cast to recover a typed
// pointer from void*.
static void WriteToFile(void* arg, const char* buf, int n) {
  static_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
// Dumps a heap profile to /tmp/dbbench/heap-NNNN (NNNN increments per
// call) via port::GetHeapProfile().  On failure, records a message for
// Stop() to print and removes the partial file.
void HeapProfile() {
  char fname[100];
  snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_);
  WritableFile* file;
  Status s = Env::Default()->NewWritableFile(fname, &file);
  if (!s.ok()) {
    message_ = s.ToString();
    return;
  }
  bool ok = port::GetHeapProfile(WriteToFile, file);
  delete file;
  if (!ok) {
    message_ = "not supported";
    Env::Default()->DeleteFile(fname);
  }
}
}; | |||
} | |||
int main(int argc, char** argv) { | |||
for (int i = 1; i < argc; i++) { | |||
double d; | |||
int n; | |||
char junk; | |||
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { | |||
FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); | |||
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { | |||
FLAGS_compression_ratio = d; | |||
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && | |||
(n == 0 || n == 1)) { | |||
FLAGS_histogram = n; | |||
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { | |||
FLAGS_num = n; | |||
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { | |||
FLAGS_value_size = n; | |||
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { | |||
FLAGS_write_buffer_size = n; | |||
} else { | |||
fprintf(stderr, "Invalid flag '%s'\n", argv[i]); | |||
exit(1); | |||
} | |||
} | |||
leveldb::Benchmark benchmark; | |||
benchmark.Run(); | |||
return 0; | |||
} |
@ -0,0 +1,192 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ | |||
#define STORAGE_LEVELDB_DB_DB_IMPL_H_ | |||
#include <set> | |||
#include "db/dbformat.h" | |||
#include "db/log_writer.h" | |||
#include "db/snapshot.h" | |||
#include "include/db.h" | |||
#include "include/env.h" | |||
#include "port/port.h" | |||
namespace leveldb { | |||
class MemTable; | |||
class TableCache; | |||
class Version; | |||
class VersionEdit; | |||
class VersionSet; | |||
// Concrete implementation of the DB interface.  Owns the memtable, log,
// version set, and background-compaction state for one open database.
class DBImpl : public DB {
 public:
  DBImpl(const Options& options, const std::string& dbname);
  virtual ~DBImpl();

  // Implementations of the DB interface
  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
  virtual Status Delete(const WriteOptions&, const Slice& key);
  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value);
  virtual Iterator* NewIterator(const ReadOptions&);
  virtual const Snapshot* GetSnapshot();
  virtual void ReleaseSnapshot(const Snapshot* snapshot);
  virtual bool GetProperty(const Slice& property, uint64_t* value);
  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);

  // Extra methods (for testing) that are not in the public DB interface

  // Compact any files in the named level that overlap [begin,end]
  void TEST_CompactRange(
      int level,
      const std::string& begin,
      const std::string& end);

  // Force current memtable contents to be compacted.
  Status TEST_CompactMemTable();

  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
  Iterator* TEST_NewInternalIterator();

 private:
  friend class DB;  // DB::Open constructs a DBImpl and uses these helpers

  Iterator* NewInternalIterator(const ReadOptions&,
                                SequenceNumber* latest_snapshot);

  // Create the initial (empty) on-disk descriptor for a new database.
  Status NewDB();

  // Recover the descriptor from persistent storage.  May do a significant
  // amount of work to recover recently logged updates.  Any changes to
  // be made to the descriptor are added to *edit.
  Status Recover(VersionEdit* edit);

  // Apply the specified updates and save the resulting descriptor to
  // persistent storage.  If cleanup_mem is non-NULL, arrange to
  // delete it when all existing snapshots have gone away iff Install()
  // returns OK.
  Status Install(VersionEdit* edit,
                 uint64_t new_log_number,
                 MemTable* cleanup_mem);
  void MaybeIgnoreError(Status* s) const;

  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();

  // Called when an iterator over a particular version of the
  // descriptor goes away.
  static void Unref(void* arg1, void* arg2);

  // Compact the in-memory write buffer to disk.  Switches to a new
  // log-file/memtable and writes a new descriptor iff successful.
  Status CompactMemTable();

  // Replay one log file, applying its updates to *edit and tracking the
  // highest sequence number seen.
  Status RecoverLogFile(uint64_t log_number,
                        VersionEdit* edit,
                        SequenceNumber* max_sequence);

  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);

  bool HasLargeValues(const WriteBatch& batch) const;

  // Process data in "*updates" and return a status.  "assigned_seq"
  // is the sequence number assigned to the first mod in "*updates".
  // If no large values are encountered, "*final" is set to "updates".
  // If large values were encountered, registers the references of the
  // large values with the VersionSet, writes the large values to
  // files (if appropriate), and allocates a new WriteBatch with the
  // large values replaced with indirect references and stores a
  // pointer to the new WriteBatch in *final.  If *final != updates on
  // return, then the client should delete *final when no longer
  // needed.  Returns OK on success, and an appropriate error
  // otherwise.
  Status HandleLargeValues(SequenceNumber assigned_seq,
                           WriteBatch* updates,
                           WriteBatch** final);

  // Helper routine for HandleLargeValues
  void MaybeCompressLargeValue(
      const Slice& raw_value,
      Slice* file_bytes,
      std::string* scratch,
      LargeValueRef* ref);

  struct CompactionState;

  // Background compaction scheduling and execution.
  void MaybeScheduleCompaction();
  static void BGWork(void* db);
  void BackgroundCall();
  void BackgroundCompaction();
  void CleanupCompaction(CompactionState* compact);
  Status DoCompactionWork(CompactionState* compact);

  Status OpenCompactionOutputFile(CompactionState* compact);
  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
  Status InstallCompactionResults(CompactionState* compact);

  // Constant after construction
  Env* const env_;
  const InternalKeyComparator internal_comparator_;
  const Options options_;  // options_.comparator == &internal_comparator_
  bool owns_info_log_;
  const std::string dbname_;

  // table_cache_ provides its own synchronization
  TableCache* table_cache_;

  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
  FileLock* db_lock_;

  // State below is protected by mutex_
  port::Mutex mutex_;
  port::AtomicPointer shutting_down_;
  port::CondVar bg_cv_;          // Signalled when !bg_compaction_scheduled_
  port::CondVar compacting_cv_;  // Signalled when !compacting_
  SequenceNumber last_sequence_;
  MemTable* mem_;
  WritableFile* logfile_;
  log::Writer* log_;
  uint64_t log_number_;
  SnapshotList snapshots_;

  // Set of table files to protect from deletion because they are
  // part of ongoing compactions.
  std::set<uint64_t> pending_outputs_;

  // Has a background compaction been scheduled or is running?
  bool bg_compaction_scheduled_;

  // Is there a compaction running?
  bool compacting_;

  VersionSet* versions_;

  // Have we encountered a background error in paranoid mode?
  Status bg_error_;

  // No copying allowed
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
};
// Sanitize db options. The caller should delete result.info_log if | |||
// it is not equal to src.info_log. | |||
extern Options SanitizeOptions(const std::string& db, | |||
const InternalKeyComparator* icmp, | |||
const Options& src); | |||
} | |||
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ |
@ -0,0 +1,412 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/db_iter.h" | |||
#include "db/filename.h" | |||
#include "db/dbformat.h" | |||
#include "include/env.h" | |||
#include "include/iterator.h" | |||
#include "port/port.h" | |||
#include "util/logging.h" | |||
#include "util/mutexlock.h" | |||
namespace leveldb { | |||
#if 0 | |||
// Debugging helper (compiled out via #if 0): prints every entry of an
// internal-key iterator to stderr, flagging unparsable keys.
static void DumpInternalIter(Iterator* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey k;
    if (!ParseInternalKey(iter->key(), &k)) {
      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
    } else {
      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
    }
  }
}
#endif | |||
namespace { | |||
// Memtables and sstables that make the DB representation contain | |||
// (userkey,seq,type) => uservalue entries. DBIter | |||
// combines multiple entries for the same userkey found in the DB | |||
// representation into a single entry while accounting for sequence | |||
// numbers, deletion markers, overwrites, etc. | |||
class DBIter: public Iterator {
 public:
  // Takes ownership of "iter" (an internal-key iterator) and exposes a
  // user-key view of the entries visible at snapshot "s".
  DBIter(const std::string* dbname, Env* env,
         const Comparator* cmp, Iterator* iter, SequenceNumber s)
      : dbname_(dbname),
        env_(env),
        user_comparator_(cmp),
        iter_(iter),
        sequence_(s),
        large_(NULL),
        valid_(false) {
  }
  virtual ~DBIter() {
    delete iter_;
    delete large_;
  }
  virtual bool Valid() const { return valid_; }
  virtual Slice key() const {
    assert(valid_);
    return key_;
  }
  virtual Slice value() const {
    assert(valid_);
    if (large_ == NULL) {
      return value_;
    } else {
      // Large values are fetched lazily from their own file on first
      // access, guarded by large_->mutex so concurrent readers see one
      // read.
      MutexLock l(&large_->mutex);
      if (!large_->produced) {
        ReadIndirectValue();
      }
      return large_->value;
    }
  }

  virtual void Next() {
    assert(valid_);
    // iter_ is already positioned past DBIter::key()
    FindNextUserEntry();
  }

  virtual void Prev() {
    assert(valid_);
    bool ignored;
    ScanUntilBeforeCurrentKey(&ignored);
    FindPrevUserEntry();
  }

  virtual void Seek(const Slice& target) {
    // Build an internal key that sorts before all entries for "target"
    // at or below our snapshot, then scan forward to the first live one.
    ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
    std::string tmp;
    AppendInternalKey(&tmp, ikey);
    iter_->Seek(tmp);
    FindNextUserEntry();
  }
  virtual void SeekToFirst() {
    iter_->SeekToFirst();
    FindNextUserEntry();
  }

  virtual void SeekToLast();

  virtual Status status() const {
    if (status_.ok()) {
      // Surface a deferred large-value read error, if any, ahead of the
      // underlying iterator's status.
      if (large_ != NULL && !large_->status.ok()) return large_->status;
      return iter_->status();
    } else {
      return status_;
    }
  }

 private:
  void FindNextUserEntry();
  void FindPrevUserEntry();
  void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
  void SaveValue(const Slice& v) {
    // Shed excess capacity left over from a previous large value before
    // storing a much smaller one.
    if (value_.capacity() > v.size() + 1048576) {
      std::string empty;
      swap(empty, value_);
    }
    value_.assign(v.data(), v.size());
  }
  bool ParseKey(ParsedInternalKey* key);
  void SkipPast(const Slice& k);
  void ScanUntilBeforeCurrentKey(bool* found_live);

  void ReadIndirectValue() const;

  // Lazily-materialized large value plus the status of the read.
  struct Large {
    port::Mutex mutex;
    std::string value;
    bool produced;
    Status status;
  };

  const std::string* const dbname_;
  Env* const env_;

  const Comparator* const user_comparator_;

  // iter_ is positioned just past current entry for DBIter if valid_
  Iterator* const iter_;

  SequenceNumber const sequence_;
  Status status_;
  std::string key_;    // Always a user key
  std::string value_;
  Large* large_;       // Non-NULL if value is an indirect reference
  bool valid_;

  // No copying allowed
  DBIter(const DBIter&);
  void operator=(const DBIter&);
};
// Decodes the entry currently under iter_ into *ikey.  Returns false and
// records a corruption status when the raw key cannot be parsed.
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
  if (ParseInternalKey(iter_->key(), ikey)) {
    return true;
  }
  status_ = Status::Corruption("corrupted internal key in DBIter");
  return false;
}
// Advances to the next user key (at or after iter_'s position) that has
// a live value visible at sequence_, loading key_/value_ accordingly.
// Sets valid_ = false when the iterator is exhausted.  Any pending
// large-value state from the previous entry is released first.
void DBIter::FindNextUserEntry() {
  if (large_ != NULL) {
    // Propagate an unreported large-value read error before dropping it.
    if (status_.ok() && !large_->status.ok()) {
      status_ = large_->status;
    }
    delete large_;
    large_ = NULL;
  }
  while (iter_->Valid()) {
    ParsedInternalKey ikey;
    if (!ParseKey(&ikey)) {
      // Skip past corrupted entry
      iter_->Next();
      continue;
    }
    if (ikey.sequence > sequence_) {
      // Ignore entries newer than the snapshot
      iter_->Next();
      continue;
    }

    switch (ikey.type) {
      case kTypeDeletion:
        SaveKey(ikey.user_key);  // Make local copy for use by SkipPast()
        iter_->Next();
        SkipPast(key_);
        // Do not return deleted entries.  Instead keep looping.
        break;

      case kTypeValue:
        SaveKey(ikey.user_key);
        SaveValue(iter_->value());
        iter_->Next();
        SkipPast(key_);
        // Yield the value we just found.
        valid_ = true;
        return;

      case kTypeLargeValueRef:
        SaveKey(ikey.user_key);
        // Save the large value ref as value_, and read it lazily on a call
        // to value()
        SaveValue(iter_->value());
        large_ = new Large;
        large_->produced = false;
        iter_->Next();
        SkipPast(key_);
        // Yield the value we just found.
        valid_ = true;
        return;
    }
  }
  valid_ = false;
  key_.clear();
  value_.clear();
  assert(large_ == NULL);
}
void DBIter::SkipPast(const Slice& k) { | |||
while (iter_->Valid()) { | |||
ParsedInternalKey ikey; | |||
// Note that if we cannot parse an internal key, we keep looping | |||
// so that if we have a run like the following: | |||
// <x,100,v> => value100 | |||
// <corrupted entry for user key x> | |||
// <x,50,v> => value50 | |||
// we will skip over the corrupted entry as well as value50. | |||
if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) { | |||
break; | |||
} | |||
iter_->Next(); | |||
} | |||
} | |||
void DBIter::SeekToLast() {
  // Position iter_ at the last uncorrupted user key and then
  // let FindPrevUserEntry() do the heavy lifting to find
  // a user key that is live.
  iter_->SeekToLast();
  ParsedInternalKey current;
  while (iter_->Valid() && !ParseKey(&current)) {
    iter_->Prev();  // step back over trailing corrupted entries
  }
  if (iter_->Valid()) {
    SaveKey(current.user_key);  // seed key_ for the backward scan
  }
  FindPrevUserEntry();
}
// Let X be the user key at which iter_ is currently positioned.
// Adjust DBIter to point at the last entry with a key <= X that
// has a live value.
void DBIter::FindPrevUserEntry() {
  // Consider the following example:
  //
  //     A@540
  //     A@400
  //
  //     B@300
  //     B@200
  //     B@100        <- iter_
  //
  //     C@301
  //     C@201
  //
  // The comments marked "(first iteration)" below relate what happens
  // for the preceding example in the first iteration of the while loop
  // below.  There may be more than one iteration either if there are
  // no live values for B, or if there is a corruption.
  while (iter_->Valid()) {
    std::string saved = key_;  // remember the key we are backing up from
    bool found_live;
    ScanUntilBeforeCurrentKey(&found_live);
    // (first iteration) iter_ at A@400
    if (found_live) {
      // Step forward into range of entries with user key >= saved
      if (!iter_->Valid()) {
        iter_->SeekToFirst();  // we backed off the front of the database
      } else {
        iter_->Next();
      }
      // (first iteration) iter_ at B@300

      FindNextUserEntry();  // Sets key_ to the key of the next value it found
      if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
        // (first iteration) iter_ at C@301
        return;
      }

      // FindNextUserEntry() could not find any entries under the
      // user key "saved".  This is probably a corruption since
      // ScanUntilBefore(saved) found a live value.  So we skip
      // backwards to an earlier key and ignore the corrupted
      // entries for "saved".
      //
      // (first iteration) iter_ at C@301 and saved == "B"
      key_ = saved;
      bool ignored;
      ScanUntilBeforeCurrentKey(&ignored);
      // (first iteration) iter_ at A@400
    }
  }
  // No live entry at or before the starting key: mark exhausted.
  valid_ = false;
  key_.clear();
  value_.clear();
}
// Scans backwards until iter_ is positioned at the newest entry whose
// user key is strictly less than key_ (or until the iterator is
// exhausted).  If iter_ is invalid on entry, the scan starts from the
// last entry.  On return, *found_live reflects whether the most recent
// entry (highest sequence <= sequence_) seen for key_ itself was a
// value rather than a deletion.
void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
  *found_live = false;
  if (!iter_->Valid()) {
    iter_->SeekToLast();
  }
  while (iter_->Valid()) {
    ParsedInternalKey current;
    if (!ParseKey(&current)) {
      iter_->Prev();  // skip corrupted entries going backwards
      continue;
    }

    if (current.sequence > sequence_) {
      // Ignore entries that are serialized after this read
      iter_->Prev();
      continue;
    }

    const int cmp = user_comparator_->Compare(current.user_key, key_);
    if (cmp < 0) {
      SaveKey(current.user_key);  // new anchor key for the caller
      return;
    } else if (cmp == 0) {
      // Walking backwards visits increasing sequence numbers for the
      // same user key, so the last assignment wins.
      switch (current.type) {
        case kTypeDeletion:
          *found_live = false;
          break;

        case kTypeValue:
        case kTypeLargeValueRef:
          *found_live = true;
          break;
      }
    } else {  // cmp > 0
      *found_live = false;
    }

    iter_->Prev();
  }
}
void DBIter::ReadIndirectValue() const { | |||
assert(!large_->produced); | |||
large_->produced = true; | |||
LargeValueRef large_ref; | |||
if (value_.size() != LargeValueRef::ByteSize()) { | |||
large_->status = Status::Corruption("malformed large value reference"); | |||
return; | |||
} | |||
memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize()); | |||
std::string fname = LargeValueFileName(*dbname_, large_ref); | |||
RandomAccessFile* file; | |||
Status s = env_->NewRandomAccessFile(fname, &file); | |||
if (s.ok()) { | |||
uint64_t file_size = file->Size(); | |||
uint64_t value_size = large_ref.ValueSize(); | |||
large_->value.resize(value_size); | |||
Slice result; | |||
s = file->Read(0, file_size, &result, | |||
const_cast<char*>(large_->value.data())); | |||
if (s.ok()) { | |||
if (result.size() == file_size) { | |||
switch (large_ref.compression_type()) { | |||
case kNoCompression: { | |||
if (result.data() != large_->value.data()) { | |||
large_->value.assign(result.data(), result.size()); | |||
} | |||
break; | |||
} | |||
case kLightweightCompression: { | |||
std::string uncompressed; | |||
if (port::Lightweight_Uncompress(result.data(), result.size(), | |||
&uncompressed) && | |||
uncompressed.size() == large_ref.ValueSize()) { | |||
swap(uncompressed, large_->value); | |||
} else { | |||
s = Status::Corruption( | |||
"Unable to read entire compressed large value file"); | |||
} | |||
} | |||
} | |||
} else { | |||
s = Status::Corruption("Unable to read entire large value file"); | |||
} | |||
} | |||
delete file; // Ignore errors on closing | |||
} | |||
if (!s.ok()) { | |||
large_->value.clear(); | |||
large_->status = s; | |||
} | |||
} | |||
} // anonymous namespace | |||
// Factory: wraps an internal-key iterator in a DBIter that presents the
// user-key view visible at "sequence".  Ownership of internal_iter
// passes to the returned iterator.
Iterator* NewDBIterator(
    const std::string* dbname,
    Env* env,
    const Comparator* user_key_comparator,
    Iterator* internal_iter,
    const SequenceNumber& sequence) {
  DBIter* const result =
      new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
  return result;
}
} |
@ -0,0 +1,26 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ | |||
#define STORAGE_LEVELDB_DB_DB_ITER_H_ | |||
#include <stdint.h> | |||
#include "include/db.h" | |||
#include "db/dbformat.h" | |||
namespace leveldb { | |||
// Return a new iterator that converts internal keys (yielded by | |||
// "*internal_iter") that were live at the specified "sequence" number | |||
// into appropriate user keys. | |||
extern Iterator* NewDBIterator( | |||
const std::string* dbname, | |||
Env* env, | |||
const Comparator* user_key_comparator, | |||
Iterator* internal_iter, | |||
const SequenceNumber& sequence); | |||
} | |||
#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ |
@ -0,0 +1,963 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/db.h" | |||
#include "db/db_impl.h" | |||
#include "db/filename.h" | |||
#include "db/version_set.h" | |||
#include "db/write_batch_internal.h" | |||
#include "include/env.h" | |||
#include "include/table.h" | |||
#include "util/logging.h" | |||
#include "util/testharness.h" | |||
#include "util/testutil.h" | |||
namespace leveldb { | |||
// Convenience wrapper: returns a freshly generated random string of the
// requested length.
static std::string RandomString(Random* rnd, int len) {
  std::string result;
  test::RandomString(rnd, len, &result);
  return result;
}
// Test fixture: creates a fresh database under test::TmpDir()/db_test
// and destroys it on teardown.  The helpers wrap common DB operations
// and introspection used by the tests below.
class DBTest {
 public:
  std::string dbname_;
  Env* env_;
  DB* db_;

  Options last_options_;  // options passed to the most recent (Try)Reopen

  DBTest() : env_(Env::Default()) {
    dbname_ = test::TmpDir() + "/db_test";
    DestroyDB(dbname_, Options());
    db_ = NULL;
    Reopen();
  }

  ~DBTest() {
    delete db_;
    DestroyDB(dbname_, Options());
  }

  // Safe: DB::Open returns a DBImpl.
  DBImpl* dbfull() {
    return reinterpret_cast<DBImpl*>(db_);
  }

  void Reopen(Options* options = NULL) {
    ASSERT_OK(TryReopen(options));
  }

  void DestroyAndReopen(Options* options = NULL) {
    delete db_;
    db_ = NULL;
    DestroyDB(dbname_, Options());
    ASSERT_OK(TryReopen(options));
  }

  // Closes the current DB (if any) and reopens it with the given options,
  // or with create_if_missing when options is NULL.
  Status TryReopen(Options* options) {
    delete db_;
    db_ = NULL;
    Options opts;
    if (options != NULL) {
      opts = *options;
    } else {
      opts.create_if_missing = true;
    }
    last_options_ = opts;
    return DB::Open(opts, dbname_, &db_);
  }

  Status Put(const std::string& k, const std::string& v) {
    WriteBatch batch;
    batch.Put(k, v);
    return db_->Write(WriteOptions(), &batch);
  }

  Status Delete(const std::string& k) {
    WriteBatch batch;
    batch.Delete(k);
    return db_->Write(WriteOptions(), &batch);
  }

  // Returns the value for k, "NOT_FOUND" when absent, or the error text
  // for any other failure.
  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
    ReadOptions options;
    options.snapshot = snapshot;
    std::string result;
    Status s = db_->Get(options, k, &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }

  // Renders every internal entry for user_key (newest first) as e.g.
  // "[ v2, DEL, v1 ]", for inspecting compaction/versioning behavior.
  std::string AllEntriesFor(const Slice& user_key) {
    Iterator* iter = dbfull()->TEST_NewInternalIterator();
    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
    iter->Seek(target.Encode());
    std::string result;
    if (!iter->status().ok()) {
      result = iter->status().ToString();
    } else {
      result = "[ ";
      bool first = true;
      while (iter->Valid()) {
        ParsedInternalKey ikey;
        if (!ParseInternalKey(iter->key(), &ikey)) {
          result += "CORRUPTED";
        } else {
          if (last_options_.comparator->Compare(
                  ikey.user_key, user_key) != 0) {
            break;  // moved past all entries for user_key
          }
          if (!first) {
            result += ", ";
          }
          first = false;
          switch (ikey.type) {
            case kTypeValue:
              result += iter->value().ToString();
              break;
            case kTypeLargeValueRef:
              result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
              break;
            case kTypeDeletion:
              result += "DEL";
              break;
          }
        }
        iter->Next();
      }
      if (!first) {
        result += " ";
      }
      result += "]";
    }
    delete iter;
    return result;
  }

  // Reads the per-level file count exposed as a DB property.
  int NumTableFilesAtLevel(int level) {
    uint64_t val;
    ASSERT_TRUE(
        db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
                         &val));
    return val;
  }

  uint64_t Size(const Slice& start, const Slice& limit) {
    Range r(start, limit);
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }

  std::set<LargeValueRef> LargeValueFiles() const {
    // Return the set of large value files that exist in the database
    std::vector<std::string> filenames;
    env_->GetChildren(dbname_, &filenames);  // Ignoring errors on purpose
    uint64_t number;
    LargeValueRef large_ref;
    FileType type;
    std::set<LargeValueRef> live;
    // NOTE(review): "int i" vs filenames.size() is a signed/unsigned
    // mismatch; harmless at test scale but size_t would be cleaner.
    for (int i = 0; i < filenames.size(); i++) {
      if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
          type == kLargeValueFile) {
        fprintf(stderr, "  live: %s\n",
                LargeValueRefToFilenameString(large_ref).c_str());
        live.insert(large_ref);
      }
    }
    fprintf(stderr, "Found %d live large value files\n", (int)live.size());
    return live;
  }
};
// A freshly created database opens successfully and contains no keys.
TEST(DBTest, Empty) {
  ASSERT_TRUE(db_ != NULL);
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}
// Basic put/get, including overwriting an existing key.
TEST(DBTest, ReadWrite) {
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(Put("bar", "v2"));
  ASSERT_OK(Put("foo", "v3"));
  ASSERT_EQ("v3", Get("foo"));
  ASSERT_EQ("v2", Get("bar"));
}
// A deleted key reads back as NOT_FOUND, exercising the direct DB
// Put/Delete entry points rather than the fixture's batch helpers.
TEST(DBTest, PutDeleteGet) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
  ASSERT_EQ("v2", Get("foo"));
  ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}
// Writes survive a close/reopen cycle, including updates layered over
// recovered state.
TEST(DBTest, Recover) {
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_OK(Put("baz", "v5"));

  Reopen();
  ASSERT_EQ("v1", Get("foo"));

  // NOTE(review): duplicate assert below — presumably exercises a repeated
  // read of the same key after recovery; confirm intent.
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_EQ("v5", Get("baz"));
  ASSERT_OK(Put("bar", "v2"));
  ASSERT_OK(Put("foo", "v3"));

  Reopen();
  ASSERT_EQ("v3", Get("foo"));
  ASSERT_OK(Put("foo", "v4"));
  ASSERT_EQ("v4", Get("foo"));
  ASSERT_EQ("v2", Get("bar"));
  ASSERT_EQ("v5", Get("baz"));
}
// Recovery copes with an empty log file: the back-to-back Reopen()
// calls leave a log with no records for the second recovery to replay.
TEST(DBTest, RecoveryWithEmptyLog) {
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_OK(Put("foo", "v2"));
  Reopen();
  Reopen();
  ASSERT_OK(Put("foo", "v3"));
  Reopen();
  ASSERT_EQ("v3", Get("foo"));
}
// Formats i as a fixed-width benchmark key, e.g. Key(7) == "key000007".
static std::string Key(int i) {
  char formatted[100];
  snprintf(formatted, sizeof(formatted), "key%06d", i);
  return formatted;
}
// With a tiny write buffer, enough writes must trigger memtable flushes
// (minor compactions) that create level-0 table files, without losing
// any data — before or after a reopen.
TEST(DBTest, MinorCompactionsHappen) {
  Options options;
  options.write_buffer_size = 10000;
  Reopen(&options);

  const int N = 100;

  int starting_num_tables = NumTableFilesAtLevel(0);
  for (int i = 0; i < N; i++) {
    ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
  }
  int ending_num_tables = NumTableFilesAtLevel(0);
  ASSERT_GT(ending_num_tables, starting_num_tables);

  for (int i = 0; i < N; i++) {
    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
  }

  Reopen();

  for (int i = 0; i < N; i++) {
    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
  }
}
// Recovery of a large log with a small write buffer must flush table
// files mid-replay instead of buffering the whole log in memory.
TEST(DBTest, RecoverWithLargeLog) {
  {
    Options options;
    // Keep the big values inline (below the large-value threshold) so
    // they all land in the log.
    options.large_value_threshold = 1048576;
    Reopen(&options);
    ASSERT_OK(Put("big1", std::string(200000, '1')));
    ASSERT_OK(Put("big2", std::string(200000, '2')));
    ASSERT_OK(Put("small3", std::string(10, '3')));
    ASSERT_OK(Put("small4", std::string(10, '4')));
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  }

  // Make sure that if we re-open with a small write buffer size that
  // we flush table files in the middle of a large log file.
  Options options;
  options.write_buffer_size = 100000;
  options.large_value_threshold = 1048576;
  Reopen(&options);
  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
  ASSERT_EQ(std::string(200000, '1'), Get("big1"));
  ASSERT_EQ(std::string(200000, '2'), Get("big2"));
  ASSERT_EQ(std::string(10, '3'), Get("small3"));
  ASSERT_EQ(std::string(10, '4'), Get("small4"));
  ASSERT_GT(NumTableFilesAtLevel(0), 1);
}
// Compacting ~8MB from level 0 must split the output into more than one
// level-1 file, with every value still readable afterwards.
TEST(DBTest, CompactionsGenerateMultipleFiles) {
  Options options;
  options.write_buffer_size = 100000000;        // Large write buffer
  options.large_value_threshold = 1048576;
  Reopen(&options);
  Random rnd(301);
  // Write 8MB (80 values, each 100K)
  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  std::vector<std::string> values;
  for (int i = 0; i < 80; i++) {
    values.push_back(RandomString(&rnd, 100000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  // Reopening moves updates to level-0
  Reopen(&options);
  dbfull()->TEST_CompactRange(0, "", Key(100000));
  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_GT(NumTableFilesAtLevel(1), 1);
  for (int i = 0; i < 80; i++) {
    ASSERT_EQ(Get(Key(i)), values[i]);
  }
}
// Returns true iff val lies in the inclusive range [low, high];
// otherwise logs the offending value and bounds to stderr.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  if (val >= low && val <= high) {
    return true;
  }
  fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
          (unsigned long long)(val),
          (unsigned long long)(low),
          (unsigned long long)(high));
  return false;
}
// Validates Size() (GetApproximateSizes) estimates — within ~10KB slack
// per 100KB value — as data moves from memtable to table files, across
// reopens and incremental range compactions.  Runs twice to cover both
// a small and a large large_value_threshold.
TEST(DBTest, ApproximateSizes) {
  for (int test = 0; test < 2; test++) {
    // test==0: default large_value_threshold
    // test==1: 1 MB large_value_threshold
    Options options;
    options.large_value_threshold = (test == 0) ? 65536 : 1048576;
    options.write_buffer_size = 100000000;        // Large write buffer
    options.compression = kNoCompression;
    DestroyAndReopen();
    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
    Reopen(&options);
    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
    // Write 8MB (80 values, each 100K)
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
    const int N = 80;
    Random rnd(301);
    for (int i = 0; i < N; i++) {
      ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
    }
    if (test == 1) {
      // 0 because GetApproximateSizes() does not account for memtable space for
      // non-large values
      ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
    } else {
      ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
      ASSERT_TRUE(Between(Size(Key(20), Key(30)),
                          100000*10, 100000*10 + 10000));
    }
    // Check sizes across recovery by reopening a few times
    for (int run = 0; run < 3; run++) {
      Reopen(&options);
      // Estimates must hold at every 10-key boundary, including on keys
      // ("<key>.suffix") that fall between stored keys.
      for (int compact_start = 0; compact_start < N; compact_start += 10) {
        for (int i = 0; i < N; i += 10) {
          ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
          ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
                              100000 * (i+1), 100000 * (i+1) + 10000));
          ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
                              100000 * 10, 100000 * 10 + 10000));
        }
        ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
        ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
        // Compact one 10-key slice per iteration; estimates above must
        // survive partial compaction.
        dbfull()->TEST_CompactRange(0,
                                    Key(compact_start),
                                    Key(compact_start + 9));
      }
      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
      ASSERT_GT(NumTableFilesAtLevel(1), 0);
    }
  }
}
// Size() estimates over a mix of small, large, and duplicated-large
// values must stay within ~1KB slack across reopens and compactions.
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
  Options options;
  options.large_value_threshold = 65536;
  options.compression = kNoCompression;
  // NOTE(review): the options configured above are not passed to this
  // first Reopen() (they are only used inside the loop below) —
  // presumably this was meant to be Reopen(&options); confirm.
  Reopen();
  Random rnd(301);
  std::string big1 = RandomString(&rnd, 100000);
  ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
  ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
  ASSERT_OK(Put(Key(2), big1));
  ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
  // Same large value stored under a second key.
  ASSERT_OK(Put(Key(4), big1));
  ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
  ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
  ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
  // Check sizes across recovery by reopening a few times
  for (int run = 0; run < 3; run++) {
    Reopen(&options);
    // Cumulative prefix sizes up to each key.
    ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
    ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
    ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
    ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
    ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
    ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
    ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
    ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
    ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
    ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
    dbfull()->TEST_CompactRange(0, Key(0), Key(100));
  }
}
// An iterator created before heavy subsequent writes must still observe
// exactly the DB contents from its creation time ("foo" -> "hello" and
// nothing else).
TEST(DBTest, IteratorPinsRef) {
  Put("foo", "hello");
  // Get iterator that will yield the current contents of the DB.
  Iterator* iter = db_->NewIterator(ReadOptions());
  // Write to force compactions
  Put("foo", "newvalue1");
  for (int i = 0; i < 100; i++) {
    ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
  }
  Put("foo", "newvalue2");
  // The iterator must still see only the original single entry.
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("foo", iter->key().ToString());
  ASSERT_EQ("hello", iter->value().ToString());
  iter->Next();
  ASSERT_TRUE(!iter->Valid());
  delete iter;
}
// Each snapshot must keep exposing the value that was current at its
// creation, regardless of later overwrites and of releasing the other
// snapshots (in arbitrary order: s3, then s1, then s2).
TEST(DBTest, Snapshot) {
  Put("foo", "v1");
  const Snapshot* s1 = db_->GetSnapshot();
  Put("foo", "v2");
  const Snapshot* s2 = db_->GetSnapshot();
  Put("foo", "v3");
  const Snapshot* s3 = db_->GetSnapshot();
  Put("foo", "v4");
  ASSERT_EQ("v1", Get("foo", s1));
  ASSERT_EQ("v2", Get("foo", s2));
  ASSERT_EQ("v3", Get("foo", s3));
  ASSERT_EQ("v4", Get("foo"));
  db_->ReleaseSnapshot(s3);
  ASSERT_EQ("v1", Get("foo", s1));
  ASSERT_EQ("v2", Get("foo", s2));
  ASSERT_EQ("v4", Get("foo"));
  db_->ReleaseSnapshot(s1);
  ASSERT_EQ("v2", Get("foo", s2));
  ASSERT_EQ("v4", Get("foo"));
  db_->ReleaseSnapshot(s2);
  ASSERT_EQ("v4", Get("foo"));
}
// An overwritten big value is pinned by a snapshot; once the snapshot
// is released, compactions must drop the hidden value and reclaim its
// space (verified via AllEntriesFor and Size()).
TEST(DBTest, HiddenValuesAreRemoved) {
  Random rnd(301);
  std::string big = RandomString(&rnd, 50000);
  Put("foo", big);
  Put("pastfoo", "v");
  const Snapshot* snapshot = db_->GetSnapshot();
  Put("foo", "tiny");
  Put("pastfoo2", "v2"); // Advance sequence number one more
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_GT(NumTableFilesAtLevel(0), 0);
  // Snapshot still pins the big value; it must be readable and counted.
  ASSERT_EQ(big, Get("foo", snapshot));
  ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
  db_->ReleaseSnapshot(snapshot);
  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
  // Level-0 compaction discards the now-unpinned big value.
  dbfull()->TEST_CompactRange(0, "", "x");
  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_GE(NumTableFilesAtLevel(1), 1);
  dbfull()->TEST_CompactRange(1, "", "x");
  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
  ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
}
// Tracks a deletion marker's lifetime when a newer value hides it: the
// marker may be dropped early, but the obsolete value only disappears
// when compaction reaches the level that holds it.
TEST(DBTest, DeletionMarkers1) {
  Put("foo", "v1");
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  dbfull()->TEST_CompactRange(0, "", "z");
  dbfull()->TEST_CompactRange(1, "", "z");
  ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
  Delete("foo");
  Put("foo", "v2");
  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
  dbfull()->TEST_CompactRange(0, "", "z");
  // DEL eliminated, but v1 remains because we aren't compacting that level
  // (DEL can be eliminated because v2 hides v1).
  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
  dbfull()->TEST_CompactRange(1, "", "z");
  // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
  // (as is v1).
  ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
}
// Unlike DeletionMarkers1, no newer value hides the old one here, so the
// deletion marker must be kept until compaction reaches the level that
// still holds "v1" — at which point both vanish together.
TEST(DBTest, DeletionMarkers2) {
  Put("foo", "v1");
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  dbfull()->TEST_CompactRange(0, "", "z");
  dbfull()->TEST_CompactRange(1, "", "z");
  ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
  Delete("foo");
  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
  dbfull()->TEST_CompactRange(0, "", "z");
  // DEL kept: L2 file overlaps
  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
  dbfull()->TEST_CompactRange(1, "", "z");
  // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
  // (as is v1).
  ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}
// Opening an existing DB with a comparator whose Name() differs from the
// one the DB was created with must fail, and the error must mention
// "comparator" — even though this comparator behaves identically to the
// bytewise one it merely delegates to.
TEST(DBTest, ComparatorCheck) {
  class NewComparator : public Comparator {
   public:
    virtual const char* Name() const { return "leveldb.NewComparator"; }
    virtual int Compare(const Slice& a, const Slice& b) const {
      return BytewiseComparator()->Compare(a, b);
    }
    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
      BytewiseComparator()->FindShortestSeparator(s, l);
    }
    virtual void FindShortSuccessor(std::string* key) const {
      BytewiseComparator()->FindShortSuccessor(key);
    }
  };
  NewComparator cmp;
  Options new_options;
  new_options.comparator = &cmp;
  Status s = TryReopen(&new_options);
  ASSERT_TRUE(!s.ok());
  ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
      << s.ToString();
}
// Returns true iff the set of large-value files currently referenced by
// the database exactly matches "expected"; otherwise logs the first
// discrepancy to stderr and returns false.
static bool LargeValuesOK(DBTest* db,
                          const std::set<LargeValueRef>& expected) {
  const std::set<LargeValueRef> actual = db->LargeValueFiles();
  if (expected.size() != actual.size()) {
    fprintf(stderr, "Sets differ in size: %d vs %d\n",
            (int)actual.size(), (int)expected.size());
    return false;
  }
  // Same size, so equality holds iff every expected ref is present.
  std::set<LargeValueRef>::const_iterator it = expected.begin();
  for (; it != expected.end(); ++it) {
    if (actual.count(*it) != 1) {
      fprintf(stderr, " key '%s' not found in actual set\n",
              LargeValueRefToFilenameString(*it).c_str());
      return false;
    }
  }
  return true;
}
// A deleted large value's file must survive a memtable compaction (which
// does no deletion-marker handling) and only be garbage-collected by a
// level-0 range compaction.
TEST(DBTest, LargeValues1) {
  Options options;
  options.large_value_threshold = 10000;
  Reopen(&options);
  Random rnd(301);
  std::string big1;
  test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
  std::set<LargeValueRef> expected;
  ASSERT_OK(Put("big1", big1));
  expected.insert(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(Delete("big1"));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  // No handling of deletion markers on memtable compactions, so big1 remains
  ASSERT_TRUE(LargeValuesOK(this, expected));
  // The range compaction finally drops the orphaned large-value file.
  dbfull()->TEST_CompactRange(0, "", "z");
  expected.erase(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_TRUE(LargeValuesOK(this, expected));
}
// Exercises the full large-value lifecycle with two values (one
// incompressible, one compressible): insertion, re-insertion, aliasing
// under two keys, deletion, compaction-driven garbage collection, and
// survival of the refs across a Reopen().
TEST(DBTest, LargeValues2) {
  Options options;
  options.large_value_threshold = 10000;
  Reopen(&options);
  Random rnd(301);
  std::string big1, big2;
  test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
  test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
  std::set<LargeValueRef> expected;
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(Put("big1", big1));
  expected.insert(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_EQ(big1, Get("big1"));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(Put("big2", big2));
  ASSERT_EQ(big2, Get("big2"));
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
  // TODO(sanjay) Reenable after compression support is added
  expected.insert(LargeValueRef::Make(big2, kNoCompression));
#else
  expected.insert(LargeValueRef::Make(big2, kLightweightCompression));
#endif
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_TRUE(LargeValuesOK(this, expected));
  dbfull()->TEST_CompactRange(0, "", "z");
  ASSERT_TRUE(LargeValuesOK(this, expected));
  // Re-storing big2 under the same and an additional key must not create
  // new large-value files (same ref).
  ASSERT_OK(Put("big2", big2));
  ASSERT_OK(Put("big2_b", big2));
  ASSERT_EQ(big1, Get("big1"));
  ASSERT_EQ(big2, Get("big2"));
  ASSERT_EQ(big2, Get("big2_b"));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(Delete("big1"));
  ASSERT_EQ("NOT_FOUND", Get("big1"));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_TRUE(LargeValuesOK(this, expected));
  // big1's file is collected only once compaction processes the deletion.
  dbfull()->TEST_CompactRange(0, "", "z");
  expected.erase(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_TRUE(LargeValuesOK(this, expected));
  dbfull()->TEST_CompactRange(1, "", "z");
  // Deleting one of big2's two keys must keep the shared file alive.
  ASSERT_OK(Delete("big2"));
  ASSERT_EQ("NOT_FOUND", Get("big2"));
  ASSERT_EQ(big2, Get("big2_b"));
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  ASSERT_TRUE(LargeValuesOK(this, expected));
  dbfull()->TEST_CompactRange(0, "", "z");
  ASSERT_TRUE(LargeValuesOK(this, expected));
  // Make sure the large value refs survive a reload and compactions after
  // the reload.
  Reopen();
  ASSERT_TRUE(LargeValuesOK(this, expected));
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(dbfull()->TEST_CompactMemTable());
  dbfull()->TEST_CompactRange(0, "", "z");
  ASSERT_TRUE(LargeValuesOK(this, expected));
}
// Make sure we don't compress values if compression has been disabled
// via options (kNoCompression), even when the value is very
// compressible.
TEST(DBTest, LargeValues3) {
  Options options;
  options.large_value_threshold = 10000;
  options.compression = kNoCompression;
  Reopen(&options);
  Random rnd(301);
  std::string big1 = std::string(100000, 'x'); // Very compressible
  std::set<LargeValueRef> expected;
  ASSERT_OK(Put("big1", big1));
  ASSERT_EQ(big1, Get("big1"));
  // The stored ref must record kNoCompression despite compressibility.
  expected.insert(LargeValueRef::Make(big1, kNoCompression));
  ASSERT_TRUE(LargeValuesOK(this, expected));
}
// Exercises all meaningful create_if_missing / error_if_exists
// combinations of DB::Open against a scratch directory.
TEST(DBTest, DBOpen_Options) {
  std::string dbname = test::TmpDir() + "/db_options_test";
  DestroyDB(dbname, Options());
  // Does not exist, and create_if_missing == false: error
  DB* db = NULL;
  Options opts;
  opts.create_if_missing = false;
  Status s = DB::Open(opts, dbname, &db);
  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
  ASSERT_TRUE(db == NULL);
  // Does not exist, and create_if_missing == true: OK
  opts.create_if_missing = true;
  s = DB::Open(opts, dbname, &db);
  ASSERT_OK(s);
  ASSERT_TRUE(db != NULL);
  delete db;
  db = NULL;
  // Does exist, and error_if_exists == true: error
  opts.create_if_missing = false;
  opts.error_if_exists = true;
  s = DB::Open(opts, dbname, &db);
  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
  ASSERT_TRUE(db == NULL);
  // Does exist, and error_if_exists == false: OK
  opts.create_if_missing = true;
  opts.error_if_exists = false;
  s = DB::Open(opts, dbname, &db);
  ASSERT_OK(s);
  ASSERT_TRUE(db != NULL);
  delete db;
  db = NULL;
}
// An in-memory reference implementation of the DB interface backed by a
// std::map.  Used by DBTest.Randomized to cross-check the real DB: both
// receive the same operation stream and their iterators are compared.
// Snapshots are modeled by heap-allocated copies of the whole map whose
// pointer is smuggled through the Snapshot's sequence-number field.
class ModelDB: public DB {
 public:
  explicit ModelDB(const Options& options): options_(options) { }
  ~ModelDB() { }
  // Put/Delete route through the DB base class, which funnels them into
  // Write() below.
  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
    return DB::Put(o, k, v);
  }
  virtual Status Delete(const WriteOptions& o, const Slice& key) {
    return DB::Delete(o, key);
  }
  // Point lookups are unused by the tests; only iteration is compared.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, std::string* value) {
    assert(false); // Not implemented
    return Status::NotFound(key);
  }
  virtual Iterator* NewIterator(const ReadOptions& options) {
    if (options.snapshot == NULL) {
      // Iterate over a private copy so later writes don't disturb it.
      KVMap* saved = new KVMap;
      *saved = map_;
      return new ModelIter(saved, true);
    } else {
      // The snapshot's "number" is really a pointer to a saved KVMap.
      const KVMap* snapshot_state =
          reinterpret_cast<const KVMap*>(options.snapshot->number_);
      return new ModelIter(snapshot_state, false);
    }
  }
  virtual const Snapshot* GetSnapshot() {
    // Snapshot = full copy of the current map, owned until released.
    KVMap* saved = new KVMap;
    *saved = map_;
    return snapshots_.New(
        reinterpret_cast<SequenceNumber>(saved));
  }
  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
    const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
    delete saved;
    snapshots_.Delete(snapshot);
  }
  // Applies each batch entry to the map in order.
  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
    assert(options.post_write_snapshot == NULL); // Not supported
    for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
      switch (it.op()) {
        case kTypeValue:
          map_[it.key().ToString()] = it.value().ToString();
          break;
        case kTypeLargeValueRef:
          assert(false); // Should not occur
          break;
        case kTypeDeletion:
          map_.erase(it.key().ToString());
          break;
      }
    }
    return Status::OK();
  }
  virtual bool GetProperty(const Slice& property, uint64_t* value) {
    return false;
  }
  // The model has no meaningful on-disk size; report zero for all ranges.
  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
    for (int i = 0; i < n; i++) {
      sizes[i] = 0;
    }
  }
 private:
  typedef std::map<std::string, std::string> KVMap;
  // Iterator over a KVMap; optionally owns (and deletes) the map,
  // which is how snapshot-free iteration keeps its private copy alive.
  class ModelIter: public Iterator {
   public:
    ModelIter(const KVMap* map, bool owned)
        : map_(map), owned_(owned), iter_(map_->end()) {
    }
    ~ModelIter() {
      if (owned_) delete map_;
    }
    virtual bool Valid() const { return iter_ != map_->end(); }
    virtual void SeekToFirst() { iter_ = map_->begin(); }
    virtual void SeekToLast() {
      if (map_->empty()) {
        iter_ = map_->end();
      } else {
        // Re-find the last key to get a forward iterator at that position.
        iter_ = map_->find(map_->rbegin()->first);
      }
    }
    virtual void Seek(const Slice& k) {
      iter_ = map_->lower_bound(k.ToString());
    }
    virtual void Next() { ++iter_; }
    virtual void Prev() { --iter_; }
    virtual Slice key() const { return iter_->first; }
    virtual Slice value() const { return iter_->second; }
    virtual Status status() const { return Status::OK(); }
   private:
    const KVMap* const map_;
    const bool owned_; // Do we own map_
    KVMap::const_iterator iter_;
  };
  const Options options_;
  KVMap map_;
  SnapshotList snapshots_;
};
// Produces a random key whose length is skewed toward very short
// strings (to encourage collisions), with occasional longer ones.
static std::string RandomKey(Random* rnd) {
  int len;
  if (rnd->OneIn(3)) {
    len = 1; // Short sometimes to encourage collisions
  } else {
    len = rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10);
  }
  return test::RandomKey(rnd, len);
}
static bool CompareIterators(int step, | |||
DB* model, | |||
DB* db, | |||
const Snapshot* model_snap, | |||
const Snapshot* db_snap) { | |||
ReadOptions options; | |||
options.snapshot = model_snap; | |||
Iterator* miter = model->NewIterator(options); | |||
options.snapshot = db_snap; | |||
Iterator* dbiter = db->NewIterator(options); | |||
bool ok = true; | |||
int count = 0; | |||
for (miter->SeekToFirst(), dbiter->SeekToFirst(); | |||
ok && miter->Valid() && dbiter->Valid(); | |||
miter->Next(), dbiter->Next()) { | |||
count++; | |||
if (miter->key().compare(dbiter->key()) != 0) { | |||
fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", | |||
step, | |||
EscapeString(miter->key()).c_str(), | |||
EscapeString(dbiter->key()).c_str()); | |||
ok = false; | |||
break; | |||
} | |||
if (miter->value().compare(dbiter->value()) != 0) { | |||
fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", | |||
step, | |||
EscapeString(miter->key()).c_str(), | |||
EscapeString(miter->value()).c_str(), | |||
EscapeString(miter->value()).c_str()); | |||
ok = false; | |||
} | |||
} | |||
if (ok) { | |||
if (miter->Valid() != dbiter->Valid()) { | |||
fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", | |||
step, miter->Valid(), dbiter->Valid()); | |||
ok = false; | |||
} | |||
} | |||
fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); | |||
delete miter; | |||
delete dbiter; | |||
return ok; | |||
} | |||
// Differential test: applies a random stream of Put/Delete/WriteBatch
// operations to both the real DB and the in-memory ModelDB, and every
// 100 steps compares full iterations of the two (current state and
// previously-saved snapshots) across a Reopen().
TEST(DBTest, Randomized) {
  Random rnd(test::RandomSeed());
  ModelDB model(last_options_);
  const int N = 10000;
  const Snapshot* model_snap = NULL;
  const Snapshot* db_snap = NULL;
  std::string k, v;
  for (int step = 0; step < N; step++) {
    if (step % 100 == 0) {
      fprintf(stderr, "Step %d of %d\n", step, N);
    }
    // Operation mix: 45% Put, 45% Delete, 10% multi-entry batch.
    int p = rnd.Uniform(100);
    if (p < 45) { // Put
      k = RandomKey(&rnd);
      v = RandomString(&rnd,
                       rnd.OneIn(20)
                       ? 100 + rnd.Uniform(100)
                       : rnd.Uniform(8));
      ASSERT_OK(model.Put(WriteOptions(), k, v));
      ASSERT_OK(db_->Put(WriteOptions(), k, v));
    } else if (p < 90) { // Delete
      k = RandomKey(&rnd);
      ASSERT_OK(model.Delete(WriteOptions(), k));
      ASSERT_OK(db_->Delete(WriteOptions(), k));
    } else { // Multi-element batch
      WriteBatch b;
      const int num = rnd.Uniform(8);
      for (int i = 0; i < num; i++) {
        if (i == 0 || !rnd.OneIn(10)) {
          k = RandomKey(&rnd);
        } else {
          // Periodically re-use the same key from the previous iter, so
          // we have multiple entries in the write batch for the same key
        }
        if (rnd.OneIn(2)) {
          v = RandomString(&rnd, rnd.Uniform(10));
          b.Put(k, v);
        } else {
          b.Delete(k);
        }
      }
      ASSERT_OK(model.Write(WriteOptions(), &b));
      ASSERT_OK(db_->Write(WriteOptions(), &b));
    }
    if ((step % 100) == 0) {
      ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
      ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
      // Save a snapshot from each DB this time that we'll use next
      // time we compare things, to make sure the current state is
      // preserved with the snapshot
      if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
      if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
      Reopen();
      ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
      model_snap = model.GetSnapshot();
      db_snap = db_->GetSnapshot();
    }
  }
  if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
  if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
}
} | |||
// Test entry point; argc/argv are accepted but unused — the harness
// runs every registered TEST.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,152 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include <stdio.h> | |||
#include "db/dbformat.h" | |||
#include "port/port.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Packs a sequence number and a value type into one uint64: the
// sequence occupies the high 56 bits, the type the low 8 bits.
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
  assert(seq <= kMaxSequenceNumber);
  assert(t <= kValueTypeForSeek);
  return (seq << 8) | t;
}
// Serializes "key" onto *result: the raw user-key bytes followed by the
// fixed64 (sequence,type) trailer.
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
  result->append(key.user_key.data(), key.user_key.size());
  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
// Human-readable form: 'user_key' @ sequence : type.
std::string ParsedInternalKey::DebugString() const {
  char suffix[50];
  snprintf(suffix, sizeof(suffix), "' @ %llu : %d",
           (unsigned long long) sequence,
           int(type));
  std::string result("'");
  result.append(user_key.ToString());
  result.append(suffix);
  return result;
}
// Identifies this comparator; DBTest.ComparatorCheck relies on a name
// mismatch being detected when a DB is reopened.
const char* InternalKeyComparator::Name() const {
  return "leveldb.InternalKeyComparator";
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
  // Order by:
  //    increasing user key (according to user-supplied comparator)
  //    decreasing sequence number
  //    decreasing type (though sequence# should be enough to disambiguate)
  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
  if (r == 0) {
    // The trailing fixed64 packs (sequence << 8) | type, so a single
    // descending numeric comparison orders by both at once.
    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
    if (anum > bnum) {
      r = -1;
    } else if (anum < bnum) {
      r = +1;
    }
  }
  return r;
}
// Shortens *start (an internal key) to a shorter internal key in
// [*start, limit), when the user comparator can shorten the user-key
// portion.
void InternalKeyComparator::FindShortestSeparator(
    std::string* start,
    const Slice& limit) const {
  // Attempt to shorten the user portion of the key
  Slice user_start = ExtractUserKey(*start);
  Slice user_limit = ExtractUserKey(limit);
  std::string tmp(user_start.data(), user_start.size());
  user_comparator_->FindShortestSeparator(&tmp, user_limit);
  // NOTE(review): *start here is the full internal key (user key plus
  // 8-byte trailer) yet is passed to the user comparator; this only
  // works for comparators (like bytewise) that tolerate the extra
  // trailer bytes — confirm for custom comparators.
  if (user_comparator_->Compare(*start, tmp) < 0) {
    // User key has become larger.  Tack on the earliest possible
    // number to the shortened user key.
    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
    assert(this->Compare(*start, tmp) < 0);
    assert(this->Compare(tmp, limit) < 0);
    start->swap(tmp);
  }
}
// Replaces *key (an internal key) with a short internal key >= *key,
// when the user comparator can produce a short successor for the
// user-key portion.
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  Slice user_key = ExtractUserKey(*key);
  std::string tmp(user_key.data(), user_key.size());
  user_comparator_->FindShortSuccessor(&tmp);
  if (user_comparator_->Compare(user_key, tmp) < 0) {
    // User key has become larger.  Tack on the earliest possible
    // number (max sequence sorts first for a given user key) so the
    // result is the smallest internal key with that user key.
    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
    assert(this->Compare(*key, tmp) < 0);
    key->swap(tmp);
  }
}
// Builds the ref for "value": bytes 0-19 hold the SHA1 of the contents,
// bytes 20-27 the fixed64 value size, byte 28 the compression type.
LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
  LargeValueRef result;
  port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
  EncodeFixed64(&result.data[20], value.size());
  result.data[28] = static_cast<unsigned char>(ctype);
  return result;
}
// Encodes "h" as "<40-hex-digit sha1>-<value size>-<compression type>",
// the form embedded in large-value file names.  Inverse of
// FilenameStringToLargeValueRef.
std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
  assert(sizeof(h.data) == LargeValueRef::ByteSize());
  assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf
  static const char tohex[] = "0123456789abcdef";
  char buf[20*2];
  // Lowercase hex of the 20-byte hash, two digits per byte.
  for (int i = 0; i < 20; i++) {
    buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
    buf[2*i+1] = tohex[h.data[i] & 0xf];
  }
  std::string result = std::string(buf, sizeof(buf));
  result += "-";
  result += NumberToString(h.ValueSize());
  result += "-";
  result += NumberToString(static_cast<uint64_t>(h.compression_type()));
  return result;
}
// Maps a single hex digit ('0'-'9', 'a'-'f', 'A'-'F') to its numeric
// value 0..15; asserts on any other character.
static uint32_t hexvalue(char c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }
  if ('A' <= c && c <= 'F') {
    return c - 'A' + 10;
  }
  assert('a' <= c && c <= 'f');
  return c - 'a' + 10;
}
// Parses a string of the form produced by LargeValueRefToFilenameString
// ("<40 hex digits>-<value size>-<compression type>") into *h.
// Returns false (possibly leaving *h partially written) if the input is
// malformed.
bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
  Slice in = s;
  // Need at least the 40 hex digits of the hash.
  if (in.size() < 40) {
    return false;
  }
  // Decode the 20-byte hash, two hex digits per byte.
  for (int i = 0; i < 20; i++) {
    if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
      return false;
    }
    unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
    h->data[i] = c;
  }
  in.remove_prefix(40);
  uint64_t value_size, ctype;
  // Remainder must be exactly "-<size>-<ctype>" with a known ctype.
  if (ConsumeChar(&in, '-') &&
      ConsumeDecimalNumber(&in, &value_size) &&
      ConsumeChar(&in, '-') &&
      ConsumeDecimalNumber(&in, &ctype) &&
      in.empty() &&
      (ctype <= kLightweightCompression)) {
    EncodeFixed64(&h->data[20], value_size);
    h->data[28] = static_cast<unsigned char>(ctype);
    return true;
  } else {
    return false;
  }
}
} |
@ -0,0 +1,198 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ | |||
#define STORAGE_LEVELDB_DB_FORMAT_H_ | |||
#include <stdio.h> | |||
#include "include/comparator.h" | |||
#include "include/db.h" | |||
#include "include/slice.h" | |||
#include "include/table_builder.h" | |||
#include "util/coding.h" | |||
#include "util/logging.h" | |||
namespace leveldb { | |||
class InternalKey; | |||
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
  kTypeDeletion = 0x0,  // deletion marker (tombstone)
  kTypeValue = 0x1,  // value stored inline in the entry
  kTypeLargeValueRef = 0x2,  // entry holds a ref to an external large value
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
typedef uint64_t SequenceNumber;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
    ((0x1ull << 56) - 1);
// The three components of an internal key, unpacked for convenient
// inspection.  NOTE: user_key is a Slice and presumably references
// externally-owned bytes — keep the source buffer alive while this
// object is in use (confirm with Slice's contract).
struct ParsedInternalKey {
  Slice user_key;
  SequenceNumber sequence;
  ValueType type;
  ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
      : user_key(u), sequence(seq), type(t) { }
  std::string DebugString() const;
};
// Return the length of the encoding of "key": the user key plus the
// 8-byte packed (sequence,type) trailer.
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
  return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
                              const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key".  On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
                             ParsedInternalKey* result);
// Returns the user key portion of an internal key (everything except
// the trailing 8 bytes).
inline Slice ExtractUserKey(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  return Slice(internal_key.data(), internal_key.size() - 8);
}
// Returns the ValueType packed into the low 8 bits of the internal
// key's trailing fixed64.
inline ValueType ExtractValueType(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const size_t n = internal_key.size();
  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
  unsigned char c = num & 0xff;
  return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
 private:
  const Comparator* user_comparator_;  // not owned
 public:
  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
  virtual const char* Name() const;
  virtual int Compare(const Slice& a, const Slice& b) const;
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const;
  virtual void FindShortSuccessor(std::string* key) const;
  // Accessor for the wrapped user-key comparator.
  const Comparator* user_comparator() const { return user_comparator_; }
  // Convenience overload for wrapped InternalKey objects (defined below).
  int Compare(const InternalKey& a, const InternalKey& b) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
 private:
  std::string rep_;  // serialized form: user key + 8-byte (seq, type) tag

 public:
  InternalKey() { }  // Leave rep_ as empty to indicate it is invalid
  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
  }

  // Replace this key with the already-serialized contents of "s".
  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }

  // Return the serialized form; requires a valid (non-empty) key.
  Slice Encode() const {
    assert(!rep_.empty());
    return rep_;
  }

  Slice user_key() const { return ExtractUserKey(rep_); }

  // Re-serialize this key from the parsed representation "p".
  void SetFrom(const ParsedInternalKey& p) {
    rep_.clear();
    AppendInternalKey(&rep_, p);
  }

  void Clear() { rep_.clear(); }
};
// Compare two wrapped internal keys by delegating to the Slice
// overload on their encoded forms.
inline int InternalKeyComparator::Compare(
    const InternalKey& a, const InternalKey& b) const {
  return Compare(a.Encode(), b.Encode());
}
// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
// uncompressed size, and a 1 byte CompressionType code.  An
// encoded form of it is embedded in the filenames of large value
// files stored in the database, and the raw binary form is stored as
// the iter->value() result for values of type kTypeLargeValueRef in
// the table and log files that make up the database.
struct LargeValueRef {
  // Layout (see the accessors below): bytes [0,20) hash,
  // bytes [20,28) fixed64 uncompressed size, byte 28 compression type.
  char data[29];

  // Initialize a large value ref for the given data
  static LargeValueRef Make(const Slice& data,
                            CompressionType compression_type);

  // Initialize a large value ref from a serialized, 29-byte reference value
  static LargeValueRef FromRef(const Slice& ref) {
    LargeValueRef result;
    assert(ref.size() == sizeof(result.data));
    memcpy(result.data, ref.data(), sizeof(result.data));
    return result;
  }

  // Return the number of bytes in a LargeValueRef (not the
  // number of bytes in the value referenced).
  static size_t ByteSize() { return sizeof(LargeValueRef().data); }

  // Return the number of bytes in the value referenced by "*this".
  uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }

  CompressionType compression_type() const {
    return static_cast<CompressionType>(data[28]);
  }

  // Byte-wise equality/ordering over the full 29-byte encoding.
  bool operator==(const LargeValueRef& b) const {
    return memcmp(data, b.data, sizeof(data)) == 0;
  }
  bool operator<(const LargeValueRef& b) const {
    return memcmp(data, b.data, sizeof(data)) < 0;
  }
};
// Convert the large value ref to a human-readable string suitable
// for embedding in a large value filename (the "<hash>-<size>-<type>"
// form exercised in dbformat_test).
extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);

// Parse the large value filename string in "in" and store it in
// "*ref".  If successful, returns true.  Otherwise returns false.
extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
inline bool ParseInternalKey(const Slice& internal_key, | |||
ParsedInternalKey* result) { | |||
const size_t n = internal_key.size(); | |||
if (n < 8) return false; | |||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8); | |||
unsigned char c = num & 0xff; | |||
result->sequence = num >> 8; | |||
result->type = static_cast<ValueType>(c); | |||
result->user_key = Slice(internal_key.data(), n - 8); | |||
return (c <= static_cast<unsigned char>(kTypeLargeValueRef)); | |||
} | |||
} | |||
#endif // STORAGE_LEVELDB_DB_FORMAT_H_ |
@ -0,0 +1,127 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/dbformat.h" | |||
#include "util/logging.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
static std::string IKey(const std::string& user_key, | |||
uint64_t seq, | |||
ValueType vt) { | |||
std::string encoded; | |||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); | |||
return encoded; | |||
} | |||
static std::string Shorten(const std::string& s, const std::string& l) { | |||
std::string result = s; | |||
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); | |||
return result; | |||
} | |||
static std::string ShortSuccessor(const std::string& s) { | |||
std::string result = s; | |||
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); | |||
return result; | |||
} | |||
// Round-trip (key, seq, vt) through AppendInternalKey and
// ParseInternalKey, checking that every field survives.  Also checks
// that a too-short input ("bar" is under the 8-byte tag minimum)
// fails to parse.
static void TestKey(const std::string& key,
                    uint64_t seq,
                    ValueType vt) {
  std::string encoded = IKey(key, seq, vt);
  Slice in(encoded);
  ParsedInternalKey decoded("", 0, kTypeValue);
  ASSERT_TRUE(ParseInternalKey(in, &decoded));
  ASSERT_EQ(key, decoded.user_key.ToString());
  ASSERT_EQ(seq, decoded.sequence);
  ASSERT_EQ(vt, decoded.type);
  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
// Empty fixture class; needed only so the TEST macro below has a
// test-group name to register against.
class FormatTest { };
// Encode/decode round trip over a cross product of user keys and
// sequence numbers chosen around the 8/16/32-bit boundaries of the
// packed (sequence << 8 | type) tag.
TEST(FormatTest, InternalKey_EncodeDecode) {
  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
  const uint64_t seq[] = {
    1, 2, 3,
    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
  };
  // Use size_t indices: comparing a signed int against sizeof()
  // (a size_t) triggers signed/unsigned comparison warnings.
  for (size_t k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
    for (size_t s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
      TestKey(keys[k], seq[s], kTypeValue);
      // Also exercise a second ValueType on every iteration.
      TestKey("hello", 1, kTypeDeletion);
    }
  }
}
// Pin down FindShortestSeparator: the result must be >= start,
// < limit, and as short as possible.  Note the (kMaxSequenceNumber,
// kValueTypeForSeek) tag on shortened keys such as "g" below.
TEST(FormatTest, InternalKeyShortSeparator) {
  // When user keys are same
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 99, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 101, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeDeletion)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeLargeValueRef)));

  // When user keys are misordered
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("bar", 99, kTypeValue)));

  // When user keys are different, but correctly ordered
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("hello", 200, kTypeValue)));

  // When start user key is prefix of limit user key
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foobar", 200, kTypeValue)));

  // When limit user key is prefix of start user key
  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
            Shorten(IKey("foobar", 100, kTypeValue),
                    IKey("foo", 200, kTypeValue)));
}
// FindShortSuccessor bumps the first non-0xff byte; an all-0xff user
// key has no shorter successor and must come back unchanged.
TEST(FormatTest, InternalKeyShortestSuccessor) {
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            ShortSuccessor(IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
// Check that LargeValueRef::Make computes the standard SHA-1 of the
// value ("aaf4..." is the well-known SHA-1 of "hello").
TEST(FormatTest, SHA1) {
  // Check that we are computing the same value as sha1.
  // Note that the last two numbers are the length of the input and the
  // compression type.
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr
            LargeValueRefToFilenameString(
                LargeValueRef::Make("hello", kNoCompression)));
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr
            LargeValueRefToFilenameString(
                LargeValueRef::Make("hello", kLightweightCompression)));
}
} | |||
// Test driver: runs every test registered above via the TEST macro.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,154 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include <ctype.h> | |||
#include <stdio.h> | |||
#include "db/filename.h" | |||
#include "db/dbformat.h" | |||
#include "include/env.h" | |||
#include "util/logging.h" | |||
namespace leveldb { | |||
// Combine "name", a zero-padded (minimum 6 digit) decimal rendering
// of "number", and "suffix" into "<name>/<number>.<suffix>".
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char formatted[100];
  snprintf(formatted, sizeof(formatted), "/%06llu.%s",
           static_cast<unsigned long long>(number), suffix);
  std::string result(name);
  result += formatted;
  return result;
}
// Return the name of the log file with the specified number in the db
// named by "name"; the result is "name/<number>.log".  Callers never
// pass number 0 (ParseFileName reserves 0 for the fixed-name files).
std::string LogFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "log");
}
// Return the name of the sstable with the specified number in the db
// named by "name"; the result is "name/<number>.sst".
std::string TableFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "sst");
}
std::string LargeValueFileName(const std::string& name, | |||
const LargeValueRef& large_ref) { | |||
std::string result = name + "/"; | |||
result += LargeValueRefToFilenameString(large_ref); | |||
result += ".val"; | |||
return result; | |||
} | |||
// Return "dbname/MANIFEST-<number>" with the number zero-padded to at
// least six digits.
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
  assert(number > 0);
  char suffix[100];
  snprintf(suffix, sizeof(suffix), "/MANIFEST-%06llu",
           static_cast<unsigned long long>(number));
  std::string result(dbname);
  result.append(suffix);
  return result;
}
// Return "dbname/CURRENT" -- the file naming the live manifest.
std::string CurrentFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/CURRENT");
  return result;
}
// Return "dbname/LOCK" -- the db-wide lock file.
std::string LockFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/LOCK");
  return result;
}
// Return "dbname/<number>.dbtmp" -- a scratch file owned by the db.
std::string TempFileName(const std::string& dbname, uint64_t number) {
  assert(number > 0);
  return MakeFileName(dbname, number, "dbtmp");
}
// Return "dbname/LOG" -- the current info log file.
std::string InfoLogFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/LOG");
  return result;
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/LOG.old");
  return result;
}
// Owned filenames have the form:
//    dbname/CURRENT
//    dbname/LOCK
//    dbname/LOG
//    dbname/LOG.old
//    dbname/MANIFEST-[0-9]+
//    dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val
//    dbname/[0-9]+.(log|sst)
// "fname" is the basename only (no directory part); see filename.h
// for the output contract.
bool ParseFileName(const std::string& fname,
                   uint64_t* number,
                   LargeValueRef* large_ref,
                   FileType* type) {
  Slice rest(fname);
  if (rest == "CURRENT") {
    *number = 0;
    *type = kCurrentFile;
  } else if (rest == "LOCK") {
    *number = 0;
    *type = kDBLockFile;
  } else if (rest == "LOG" || rest == "LOG.old") {
    *number = 0;
    *type = kInfoLogFile;
  } else if (rest.size() >= 4 &&
             Slice(rest.data() + rest.size() - 4, 4) == ".val") {
    // Large value file: everything before ".val" must parse as a
    // serialized LargeValueRef.  *number is intentionally left unset.
    LargeValueRef h;
    if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4),
                                       &h)) {
      return false;
    }
    *large_ref = h;
    *type = kLargeValueFile;
  } else if (rest.starts_with("MANIFEST-")) {
    rest.remove_prefix(strlen("MANIFEST-"));
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    if (!rest.empty()) {
      // Trailing garbage after the number (e.g. "MANIFEST-3x").
      return false;
    }
    *type = kDescriptorFile;
    *number = num;
  } else {
    // Avoid strtoull() to keep filename format independent of the
    // current locale
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    Slice suffix = rest;
    if (suffix == Slice(".log")) {
      *type = kLogFile;
    } else if (suffix == Slice(".sst")) {
      *type = kTableFile;
    } else if (suffix == Slice(".dbtmp")) {
      *type = kTempFile;
    } else {
      return false;
    }
    *number = num;
  }
  return true;
}
// Make the CURRENT file point at the descriptor with the given number.
// Writes the manifest basename (plus newline) to a temp file and then
// renames it over CURRENT -- presumably so readers never observe a
// partially written CURRENT (depends on Env::RenameFile being atomic;
// TODO confirm per-Env).
Status SetCurrentFile(Env* env, const std::string& dbname,
                      uint64_t descriptor_number) {
  // Remove leading "dbname/" and add newline to manifest file name
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
  Slice contents = manifest;
  assert(contents.starts_with(dbname + "/"));
  contents.remove_prefix(dbname.size() + 1);
  std::string tmp = TempFileName(dbname, descriptor_number);
  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp);
  if (s.ok()) {
    s = env->RenameFile(tmp, CurrentFileName(dbname));
  }
  if (!s.ok()) {
    // Best effort: do not leave the temp file behind on failure.
    env->DeleteFile(tmp);
  }
  return s;
}
} |
@ -0,0 +1,92 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// File names used by DB code | |||
#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ | |||
#define STORAGE_LEVELDB_DB_FILENAME_H_ | |||
#include <stdint.h> | |||
#include <string> | |||
#include "include/slice.h" | |||
#include "include/status.h" | |||
#include "port/port.h" | |||
namespace leveldb { | |||
class Env;
struct LargeValueRef;

// The kinds of files that live in a leveldb database directory;
// ParseFileName maps a filename to one of these.
enum FileType {
  kLogFile,
  kDBLockFile,
  kTableFile,
  kLargeValueFile,
  kDescriptorFile,
  kCurrentFile,
  kTempFile,
  kInfoLogFile,  // Either the current one, or an old one
};
// Return the name of the log file with the specified number
// in the db named by "dbname".  The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);

// Return the name of the sstable with the specified number
// in the db named by "dbname".  The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);

// Return the name of the large value file with the specified large
// value reference in the db named by "dbname".  The result will be
// prefixed with "dbname".
extern std::string LargeValueFileName(const std::string& dbname,
                                      const LargeValueRef& large_ref);

// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number.  The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
                                      uint64_t number);

// Return the name of the current file.  This file contains the name
// of the current manifest file.  The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);

// Return the name of the lock file for the db named by
// "dbname".  The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);

// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);

// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname);

// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname);

// If filename is a leveldb file, store the type of the file in *type.
// If *type is kLargeValueFile, then the large value reference data
// from the filename is stored in "*large_ref".  For all other types of
// files, the number encoded in the filename is stored in *number.  If
// the filename was successfully parsed, returns true.  Else return
// false.
extern bool ParseFileName(const std::string& filename,
                          uint64_t* number,
                          LargeValueRef* large_ref,
                          FileType* type);

// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
                             uint64_t descriptor_number);
} | |||
#endif // STORAGE_LEVELDB_DB_FILENAME_H_ |
@ -0,0 +1,156 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/filename.h" | |||
#include "db/dbformat.h" | |||
#include "port/port.h" | |||
#include "util/logging.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
// Empty fixture class; needed only so the TEST macro below has a
// test-group name to register against.
class FileNameTest { };
// Exercise ParseFileName on every accepted filename shape and a long
// list of near-miss rejects.
TEST(FileNameTest, Parse) {
  Slice db;
  FileType type;
  uint64_t number;
  LargeValueRef large_ref;

  // Successful parses
  static struct {
    const char* fname;
    uint64_t number;
    const char* large_ref;
    FileType type;
  } cases[] = {
    { "100.log",            100,   "",  kLogFile },
    { "0.log",              0,     "",  kLogFile },
    { "0.sst",              0,     "",  kTableFile },
    { "CURRENT",            0,     "",  kCurrentFile },
    { "LOCK",               0,     "",  kDBLockFile },
    { "MANIFEST-2",         2,     "",  kDescriptorFile },
    { "MANIFEST-7",         7,     "",  kDescriptorFile },
    { "LOG",                0,     "",  kInfoLogFile },
    { "LOG.old",            0,     "",  kInfoLogFile },
    { "18446744073709551615.log", 18446744073709551615ull, "",
      kLogFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
      kLargeValueFile },
  };
  // size_t indices: sizeof() is a size_t, so an int index draws
  // signed/unsigned comparison warnings.
  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
    std::string f = cases[i].fname;
    ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
    ASSERT_EQ(cases[i].type, type) << f;
    if (type == kLargeValueFile) {
      ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
          << f;
    } else {
      ASSERT_EQ(cases[i].number, number) << f;
    }
  }

  // Errors
  static const char* errors[] = {
    "",
    "foo",
    "foo-dx-100.log",
    ".log",
    "",
    "manifest",
    "CURREN",
    "CURRENTX",
    "MANIFES",
    "MANIFEST",
    "MANIFEST-",
    "XMANIFEST-3",
    "MANIFEST-3x",
    "LOC",
    "LOCKx",
    "LO",
    "LOGx",
    "18446744073709551616.log",
    "184467440737095516150.log",
    "100",
    "100.",
    "100.lop",
    "100.val",
    ".val",
    "123456789012345678901234567890123456789-12340.val",
    "1234567890123456789012345678901234567-123-0.val",
    "12345678901234567890123456789012345678902-100-1-.val",
    // Overflow on value size
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
    // '03.val' is a bad compression type
    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
    std::string f = errors[i];
    ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
  }  // (dropped a stray ';' after this block in the original)
}
// Each *FileName() constructor must produce a name that ParseFileName
// maps straight back to the same number/type (round trip).
TEST(FileNameTest, Construction) {
  uint64_t number;
  FileType type;
  LargeValueRef large_ref;
  std::string fname;

  fname = CurrentFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(0, number);
  ASSERT_EQ(kCurrentFile, type);

  fname = LockFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(0, number);
  ASSERT_EQ(kDBLockFile, type);

  fname = LogFileName("foo", 192);
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(192, number);
  ASSERT_EQ(kLogFile, type);

  fname = TableFileName("bar", 200);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(200, number);
  ASSERT_EQ(kTableFile, type);

  fname = DescriptorFileName("bar", 100);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(100, number);
  ASSERT_EQ(kDescriptorFile, type);

  fname = TempFileName("tmp", 999);
  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
  ASSERT_EQ(999, number);
  ASSERT_EQ(kTempFile, type);

  // Large value files round-trip for every compression type.
  for (int i = 0; i <= kLightweightCompression; i++) {
    CompressionType ctype = static_cast<CompressionType>(i);
    std::string value = "abcdef";
    LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
    fname = LargeValueFileName("tmp", real_large_ref);
    ASSERT_EQ("tmp/", std::string(fname.data(), 4));
    ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
    ASSERT_TRUE(real_large_ref == large_ref);
    ASSERT_EQ(kLargeValueFile, type);
    ASSERT_EQ(large_ref.compression_type(), ctype);
  }
}
} | |||
// Test driver: runs every test registered above via the TEST macro.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,35 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Log format information shared by reader and writer. | |||
// See ../doc/log_format.txt for more detail. | |||
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ | |||
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ | |||
namespace leveldb { | |||
namespace log { | |||
// Physical record types stored in the one-byte "type" field of each
// log record header (see kHeaderSize below).
enum RecordType {
  // Zero is reserved for preallocated files
  kZeroType = 0,

  kFullType = 1,

  // For fragments
  kFirstType = 2,
  kMiddleType = 3,
  kLastType = 4,
};
static const int kMaxRecordType = kLastType;

// A log file is read and written in fixed-size blocks of this many bytes.
static const int kBlockSize = 32768;

// Header is checksum (4 bytes), length (2 bytes), type (1 byte) --
// the reader parses the length from bytes [4,6) and the type from
// byte 6 (see Reader::ReadPhysicalRecord).
static const int kHeaderSize = 4 + 1 + 2;
} | |||
} | |||
#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ |
@ -0,0 +1,172 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/log_reader.h" | |||
#include <stdint.h> | |||
#include "include/env.h" | |||
#include "util/coding.h" | |||
#include "util/crc32c.h" | |||
namespace leveldb { | |||
namespace log { | |||
// Out-of-line (empty) definition of the pure-virtual interface's
// destructor.
Reader::Reporter::~Reporter() {
}

// Reader does not take ownership of "*file" or "*reporter"; both must
// outlive it (see log_reader.h).  A kBlockSize scratch buffer is
// allocated once here and reused for every physical read.
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
    : file_(file),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),
      buffer_(),
      eof_(false) {
}

Reader::~Reader() {
  delete[] backing_store_;
}
// Reassemble the next logical record from one or more physical
// fragments.  Returns true and fills *record on success; false at end
// of input.  *record may point into *scratch (fragmented records) or
// into the Reader's internal buffer (full records), so it is valid
// only until the next mutation of either.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  scratch->clear();
  record->clear();
  bool in_fragmented_record = false;
  Slice fragment;
  while (true) {
    switch (ReadPhysicalRecord(&fragment)) {
      case kFullType:
        if (in_fragmented_record) {
          // An earlier kFirstType/kMiddleType run was never terminated.
          ReportDrop(scratch->size(), "partial record without end");
        }
        scratch->clear();
        *record = fragment;
        return true;

      case kFirstType:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "partial record without end");
        }
        scratch->assign(fragment.data(), fragment.size());
        in_fragmented_record = true;
        break;

      case kMiddleType:
        if (!in_fragmented_record) {
          ReportDrop(fragment.size(), "missing start of fragmented record");
        } else {
          scratch->append(fragment.data(), fragment.size());
        }
        break;

      case kLastType:
        if (!in_fragmented_record) {
          ReportDrop(fragment.size(), "missing start of fragmented record");
        } else {
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          return true;
        }
        break;

      case kEof:
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "partial record without end");
          scratch->clear();
        }
        return false;

      case kBadRecord:
        // Corrupt fragment: drop any partial record and keep scanning.
        if (in_fragmented_record) {
          ReportDrop(scratch->size(), "error in middle of record");
          in_fragmented_record = false;
          scratch->clear();
        }
        break;

      default:
        ReportDrop(
            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
            "unknown record type");
        in_fragmented_record = false;
        scratch->clear();
        break;
    }
  }
  return false;  // unreachable; keeps compilers happy
}
void Reader::ReportDrop(size_t bytes, const char* reason) { | |||
if (reporter_ != NULL) { | |||
reporter_->Corruption(bytes, Status::Corruption(reason)); | |||
} | |||
} | |||
// Read the next physical record from buffer_, refilling buffer_ from
// file_ one kBlockSize block at a time.  Returns a RecordType on
// success, or kEof/kBadRecord.
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    if (buffer_.size() <= kHeaderSize) {
      // NOTE(review): "<=" (not "<") means a buffer holding exactly one
      // header is treated as block trailer and refilled unless eof_;
      // only at EOF does the final else-branch below examine a trailing
      // zero-length record -- confirm this is the intended behavior.
      if (!eof_) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        if (!status.ok()) {
          if (reporter_ != NULL) {
            reporter_->Corruption(kBlockSize, status);
          }
          buffer_.clear();
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          // A short read means the file is exhausted.
          eof_ = true;
        }
        continue;
      } else if (buffer_.size() == 0) {
        // End of file
        return kEof;
      } else if (buffer_.size() < kHeaderSize) {
        ReportDrop(buffer_.size(), "truncated record at end of file");
        buffer_.clear();
        return kEof;
      } else {
        // We have a trailing zero-length record.  Fall through and check it.
      }
    }

    // Parse the header: masked crc in bytes [0,4), little-endian
    // length in bytes [4,6), type in byte 6.
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
      ReportDrop(buffer_.size(), "bad record length");
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc
    if (checksum_) {
      if (type == kZeroType && length == 0) {
        // Skip zero length record
        buffer_.remove_prefix(kHeaderSize + length);
        return kBadRecord;
      }
      // The stored crc covers the type byte plus the payload.
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        ReportDrop(length, "checksum mismatch");
        buffer_.remove_prefix(kHeaderSize + length);
        return kBadRecord;
      }
    }

    buffer_.remove_prefix(kHeaderSize + length);
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
} | |||
} |
@ -0,0 +1,75 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ | |||
#define STORAGE_LEVELDB_DB_LOG_READER_H_ | |||
#include "db/log_format.h" | |||
#include "include/slice.h" | |||
#include "include/status.h" | |||
namespace leveldb { | |||
class SequentialFile; | |||
namespace log { | |||
// Sequentially extracts the logical records written by log::Writer
// from a log file.
class Reader {
 public:
  // Interface for reporting errors.
  class Reporter {
   public:
    virtual ~Reporter();

    // Some corruption was detected.  "size" is the approximate number
    // of bytes dropped due to the corruption.
    virtual void Corruption(size_t bytes, const Status& status) = 0;
  };

  // Create a reader that will return log records from "*file".
  // "*file" must remain live while this Reader is in use.
  //
  // If "reporter" is non-NULL, it is notified whenever some data is
  // dropped due to a detected corruption.  "*reporter" must remain
  // live while this Reader is in use.
  //
  // If "checksum" is true, verify checksums if available.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum);

  ~Reader();

  // Read the next record into *record.  Returns true if read
  // successfully, false if we hit end of the input.  May use
  // "*scratch" as temporary storage.  The contents filled in *record
  // will only be valid until the next mutating operation on this
  // reader or the next mutation to *scratch.
  bool ReadRecord(Slice* record, std::string* scratch);

 private:
  SequentialFile* const file_;     // not owned
  Reporter* const reporter_;       // not owned; may be NULL
  bool const checksum_;
  char* const backing_store_;      // kBlockSize scratch buffer, owned
  Slice buffer_;                   // unread portion of the current block
  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize

  // Extend record types with the following special values
  enum {
    kEof = kMaxRecordType + 1,
    kBadRecord = kMaxRecordType + 2
  };

  // Return type, or one of the preceding special values
  unsigned int ReadPhysicalRecord(Slice* result);
  void ReportDrop(size_t bytes, const char* reason);

  // No copying allowed
  Reader(const Reader&);
  void operator=(const Reader&);
};
} | |||
} | |||
#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ |
@ -0,0 +1,361 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/log_reader.h" | |||
#include "db/log_writer.h" | |||
#include "include/env.h" | |||
#include "util/coding.h" | |||
#include "util/crc32c.h" | |||
#include "util/random.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
namespace log { | |||
// Construct a string of the specified length made out of the supplied | |||
// partial string. | |||
// Construct a string of exactly "n" bytes by repeating
// "partial_string" and truncating the final repetition.  Guards
// against an empty pattern, which previously spun forever in the
// while-loop; an empty pattern now yields n NUL bytes.
static std::string BigString(const std::string& partial_string, size_t n) {
  std::string result;
  if (partial_string.empty()) {
    result.resize(n);  // n '\0' bytes instead of an infinite loop
    return result;
  }
  while (result.size() < n) {
    result.append(partial_string);
  }
  result.resize(n);
  return result;
}
// Construct a string from a number | |||
// Render "n" as decimal digits followed by a trailing period.
static std::string NumberString(int n) {
  char tmp[50];
  snprintf(tmp, sizeof(tmp), "%d.", n);
  return tmp;
}
// Return a skewed potentially long string | |||
// Return a string filled with repetitions of NumberString(i), with a
// length drawn from rnd->Skewed(17) -- presumably a skewed
// distribution favoring short lengths; see util/random.h.
static std::string RandomSkewedString(int i, Random* rnd) {
  return BigString(NumberString(i), rnd->Skewed(17));
}
// Test fixture: records are written through log::Writer into an
// in-memory string, then read back through log::Reader.  Helpers allow
// corrupting the stored bytes and inspecting reported corruption.
class LogTest {
 private:
  // WritableFile that simply accumulates everything appended to it.
  class StringDest : public WritableFile {
   public:
    std::string contents_;

    virtual Status Close() { return Status::OK(); }
    virtual Status Flush() { return Status::OK(); }
    virtual Status Sync() { return Status::OK(); }
    virtual Status Append(const Slice& slice) {
      contents_.append(slice.data(), slice.size());
      return Status::OK();
    }
  };

  // SequentialFile over a Slice.  Can be forced to return a one-shot
  // read error, and asserts that nothing is read past eof/error.
  class StringSource : public SequentialFile {
   public:
    Slice contents_;
    bool force_error_;
    bool returned_partial_;
    StringSource() : force_error_(false), returned_partial_(false) { }

    virtual Status Read(size_t n, Slice* result, char* scratch) {
      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
      ASSERT_EQ(kBlockSize, n);  // Reader is expected to read whole blocks.

      if (force_error_) {
        force_error_ = false;
        returned_partial_ = true;
        return Status::Corruption("read error");
      }

      if (contents_.size() < n) {
        n = contents_.size();
        returned_partial_ = true;
      }
      *result = Slice(contents_.data(), n);
      contents_.remove_prefix(n);
      return Status::OK();
    }
  };

  // Reporter that accumulates dropped-byte counts and error messages.
  class ReportCollector : public Reader::Reporter {
   public:
    size_t dropped_bytes_;
    std::string message_;

    ReportCollector() : dropped_bytes_(0) { }
    virtual void Corruption(size_t bytes, const Status& status) {
      dropped_bytes_ += bytes;
      message_.append(status.ToString());
    }
  };

  StringDest dest_;
  StringSource source_;
  ReportCollector report_;
  bool reading_;  // Set on first Read(); Write() is forbidden afterwards.
  Writer writer_;
  Reader reader_;

 public:
  LogTest() : reading_(false),
              writer_(&dest_),
              reader_(&source_, &report_, true/*checksum*/) {
  }

  // Append "msg" as one logical record.  Must precede any Read().
  void Write(const std::string& msg) {
    ASSERT_TRUE(!reading_) << "Write() after starting to read";
    writer_.AddRecord(Slice(msg));
  }

  // Total bytes emitted so far (headers + payloads + padding).
  size_t WrittenBytes() const {
    return dest_.contents_.size();
  }

  // Return the next record's contents, or "EOF".  The first call
  // snapshots the written bytes as the read source.
  std::string Read() {
    if (!reading_) {
      reading_ = true;
      source_.contents_ = Slice(dest_.contents_);
    }
    std::string scratch;
    Slice record;
    if (reader_.ReadRecord(&record, &scratch)) {
      return record.ToString();
    } else {
      return "EOF";
    }
  }

  // The next three helpers mutate the stored bytes to simulate
  // on-disk corruption.
  void IncrementByte(int offset, int delta) {
    dest_.contents_[offset] += delta;
  }

  void SetByte(int offset, char new_byte) {
    dest_.contents_[offset] = new_byte;
  }

  void ShrinkSize(int bytes) {
    dest_.contents_.resize(dest_.contents_.size() - bytes);
  }

  // Recompute the stored crc for the record at "header_offset" with a
  // "len"-byte payload so that deliberate edits elsewhere are not
  // masked by an incidental checksum failure.
  void FixChecksum(int header_offset, int len) {
    // Compute crc of type/len/data
    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
    crc = crc32c::Mask(crc);
    EncodeFixed32(&dest_.contents_[header_offset], crc);
  }

  // Make the next source Read() fail once with a corruption status.
  void ForceError() {
    source_.force_error_ = true;
  }

  size_t DroppedBytes() const {
    return report_.dropped_bytes_;
  }

  // Returns OK iff recorded error message contains "msg"
  std::string MatchError(const std::string& msg) const {
    if (report_.message_.find(msg) == std::string::npos) {
      return report_.message_;
    } else {
      return "OK";
    }
  }
};
// Reading an empty log yields EOF immediately.
TEST(LogTest, Empty) {
  ASSERT_EQ("EOF", Read());
}

// Records (including the empty record) round-trip in order.
TEST(LogTest, ReadWrite) {
  Write("foo");
  Write("bar");
  Write("");
  Write("xxxx");
  ASSERT_EQ("foo", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("xxxx", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
}

// Enough small records to span many blocks still round-trip.
TEST(LogTest, ManyBlocks) {
  for (int i = 0; i < 100000; i++) {
    Write(NumberString(i));
  }
  for (int i = 0; i < 100000; i++) {
    ASSERT_EQ(NumberString(i), Read());
  }
  ASSERT_EQ("EOF", Read());
}

// Records larger than a block are fragmented and reassembled.
TEST(LogTest, Fragmentation) {
  Write("small");
  Write(BigString("medium", 50000));
  Write(BigString("large", 100000));
  ASSERT_EQ("small", Read());
  ASSERT_EQ(BigString("medium", 50000), Read());
  ASSERT_EQ(BigString("large", 100000), Read());
  ASSERT_EQ("EOF", Read());
}

// A block trailer of exactly kHeaderSize bytes must be skipped, not
// misread as an empty record.
TEST(LogTest, MarginalTrailer) {
  // Make a trailer that is exactly the same length as an empty record.
  const int n = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

// A trailer shorter than kHeaderSize is zero-padded and skipped.
TEST(LogTest, ShortTrailer) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

// EOF landing inside the would-be trailer region is still a clean EOF.
TEST(LogTest, AlignedEof) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("EOF", Read());
}

// Many records of random (skewed) sizes round-trip; the identical seed
// regenerates the expected contents on the read side.
TEST(LogTest, RandomRead) {
  const int N = 500;
  Random write_rnd(301);
  for (int i = 0; i < N; i++) {
    Write(RandomSkewedString(i, &write_rnd));
  }
  Random read_rnd(301);
  for (int i = 0; i < N; i++) {
    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
  }
  ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:

// A failed source Read() drops the whole block and reports it.
TEST(LogTest, ReadError) {
  Write("foo");
  ForceError();
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kBlockSize, DroppedBytes());
  ASSERT_EQ("OK", MatchError("read error"));
}

// An out-of-range type byte drops just that record's payload.
TEST(LogTest, BadRecordType) {
  Write("foo");
  // Type is stored in header[6]
  IncrementByte(6, 100);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("unknown record type"));
}

// A record cut off mid-payload at end of file is reported as truncated.
TEST(LogTest, TruncatedTrailingRecord) {
  Write("foo");
  ShrinkSize(4);  // Drop all payload as well as a header byte
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}

// A header whose length field runs past the available data is rejected.
TEST(LogTest, BadLength) {
  Write("foo");
  ShrinkSize(1);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
  ASSERT_EQ("OK", MatchError("bad record length"));
}

// Corrupting the stored crc causes a checksum-mismatch drop.
TEST(LogTest, ChecksumMismatch) {
  Write("foo");
  IncrementByte(0, 10);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("checksum mismatch"));
}

// A kMiddleType fragment with no preceding kFirstType is rejected.
TEST(LogTest, UnexpectedMiddleType) {
  Write("foo");
  SetByte(6, kMiddleType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

// A kLastType fragment with no preceding kFirstType is rejected.
TEST(LogTest, UnexpectedLastType) {
  Write("foo");
  SetByte(6, kLastType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

// A kFullType record interrupting an unfinished fragment sequence
// drops the partial fragment but returns the full record.
TEST(LogTest, UnexpectedFullType) {
  Write("foo");
  Write("bar");
  SetByte(6, kFirstType);
  FixChecksum(0, 3);
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

// A new kFirstType interrupting an unfinished sequence drops the old
// partial record and starts the new one.
TEST(LogTest, UnexpectedFirstType) {
  Write("foo");
  Write(BigString("bar", 100000));
  SetByte(6, kFirstType);
  FixChecksum(0, 3);
  ASSERT_EQ(BigString("bar", 100000), Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

// Corruption between first(R1) and last(R2) must not splice the two
// surviving fragments into one bogus "valid" record.
TEST(LogTest, ErrorJoinsRecords) {
  // Consider two fragmented records:
  //    first(R1) last(R1) first(R2) last(R2)
  // where the middle two fragments disappear.  We do not want
  // first(R1),last(R2) to get joined and returned as a valid record.

  // Write records that span two blocks
  Write(BigString("foo", kBlockSize));
  Write(BigString("bar", kBlockSize));
  Write("correct");

  // Wipe the middle block
  for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
    SetByte(offset, 'x');
  }

  ASSERT_EQ("correct", Read());
  ASSERT_EQ("EOF", Read());
  const int dropped = DroppedBytes();
  ASSERT_LE(dropped, 2*kBlockSize + 100);
  ASSERT_GE(dropped, 2*kBlockSize);
}
} | |||
} | |||
// Test driver: runs every TEST registered via util/testharness.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,101 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/log_writer.h" | |||
#include <stdint.h> | |||
#include "include/env.h" | |||
#include "util/coding.h" | |||
#include "util/crc32c.h" | |||
namespace leveldb { | |||
namespace log { | |||
// Pre-compute the crc32c of each record-type byte; the per-record crc
// in EmitPhysicalRecord() is then crc32c::Extend(type_crc_[type], payload),
// avoiding re-hashing the type byte on every record.
Writer::Writer(WritableFile* dest)
    : dest_(dest),
      block_offset_(0) {
  for (int i = 0; i <= kMaxRecordType; i++) {
    char t = static_cast<char>(i);
    type_crc_[i] = crc32c::Value(&t, 1);
  }
}

// Does not close or delete dest_; the caller retains ownership.
Writer::~Writer() {
}
// Append "slice" as one logical record, splitting it into as many
// physical fragments (kFirst/kMiddle/kLast) as needed to respect block
// boundaries.  An empty slice still emits one zero-length kFullType
// record.  Returns the first non-OK status from the underlying file.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();

  // Fragment the record if necessary and emit it.  Note that if slice
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    if (leftover <= kHeaderSize) {
      // Not enough room for even a header: switch to a new block.
      if (leftover > 0) {
        // Fill the trailer with zero bytes (readers skip such trailers).
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
      }
      block_offset_ = 0;
    }

    // Invariant: we never leave <= kHeaderSize bytes in a block.
    const int avail = kBlockSize - block_offset_ - kHeaderSize;
    assert(avail > 0);

    // Explicit cast avoids a signed/unsigned comparison between size_t
    // and int; avail > 0 is guaranteed by the assertion above.
    const size_t fragment_length =
        (left < static_cast<size_t>(avail)) ? left : static_cast<size_t>(avail);

    RecordType type;
    const bool begin = (ptr == slice.data());
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
  } while (s.ok() && left > 0);
  return s;
}
// Write one physical record.  Header layout (kHeaderSize = 7 bytes):
//   bytes 0..3 : masked crc32c of (type byte + payload)
//   bytes 4..5 : payload length, little-endian uint16
//   byte  6    : record type
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
  assert(n <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + n <= kBlockSize);

  // Format the header
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(n & 0xff);
  buf[5] = static_cast<char>(n >> 8);
  buf[6] = static_cast<char>(t);

  // Compute the crc of the record type and the payload.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
  crc = crc32c::Mask(crc);    // Adjust for storage
  EncodeFixed32(buf, crc);

  // Write the header and the payload
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, n));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }
  // NOTE(review): block_offset_ advances even when an Append failed —
  // presumably callers treat any error here as fatal for this log file;
  // confirm before relying on retry behavior.
  block_offset_ += kHeaderSize + n;
  return s;
}
} | |||
} |
@ -0,0 +1,48 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ | |||
#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ | |||
#include <stdint.h> | |||
#include "db/log_format.h" | |||
#include "include/slice.h" | |||
#include "include/status.h" | |||
namespace leveldb { | |||
class WritableFile; | |||
namespace log { | |||
// Appends records to a log file in the block-oriented physical format
// consumed by log::Reader.
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this Writer is in use.
  explicit Writer(WritableFile* dest);
  ~Writer();

  // Append "slice" as one logical record; it may be split into several
  // physical records that span block boundaries.
  Status AddRecord(const Slice& slice);

 private:
  WritableFile* dest_;   // Not owned.
  int block_offset_;       // Current offset in block

  // crc32c values for all supported record types.  These are
  // pre-computed to reduce the overhead of computing the crc of the
  // record type stored in the header.
  uint32_t type_crc_[kMaxRecordType + 1];

  // Emit a single physical record (header + payload) of the given type.
  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);

  // No copying allowed
  Writer(const Writer&);
  void operator=(const Writer&);
};
} | |||
} | |||
#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ |
@ -0,0 +1,109 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/memtable.h" | |||
#include "db/dbformat.h" | |||
#include "include/comparator.h" | |||
#include "include/env.h" | |||
#include "include/iterator.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Decode an entry stored as [varint32 length][length bytes] and return
// a Slice over the payload bytes.
static Slice GetLengthPrefixedSlice(const char* data) {
  uint32_t len;
  const char* p = data;
  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
  return Slice(p, len);
}
MemTable::MemTable(const InternalKeyComparator& cmp)
    : comparator_(cmp),
      table_(comparator_, &arena_) {
}

// Entries live in arena_ and are released all at once here.
MemTable::~MemTable() {
}

// All entry storage is arena-allocated, so the arena's usage is the
// memtable's usage.
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
// Compare two skiplist entries.  Each entry starts with a varint32
// length prefix; strip the prefixes and compare the internal keys.
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
    const {
  // Internal keys are encoded as length-prefixed strings.
  Slice a = GetLengthPrefixedSlice(aptr);
  Slice b = GetLengthPrefixedSlice(bptr);
  return comparator.Compare(a, b);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.  The encoding matches the length-prefixed
// entry format the skiplist comparator expects.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
  scratch->clear();
  PutVarint32(scratch, target.size());
  scratch->append(target.data(), target.size());
  return scratch->data();
}
class MemTableIterator: public Iterator { | |||
public: | |||
explicit MemTableIterator(MemTable::Table* table) { | |||
iter_ = new MemTable::Table::Iterator(table); | |||
} | |||
virtual ~MemTableIterator() { delete iter_; } | |||
virtual bool Valid() const { return iter_->Valid(); } | |||
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } | |||
virtual void SeekToFirst() { iter_->SeekToFirst(); } | |||
virtual void SeekToLast() { iter_->SeekToLast(); } | |||
virtual void Next() { iter_->Next(); } | |||
virtual void Prev() { iter_->Prev(); } | |||
virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } | |||
virtual Slice value() const { | |||
Slice key_slice = GetLengthPrefixedSlice(iter_->key()); | |||
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); | |||
} | |||
virtual Status status() const { return Status::OK(); } | |||
private: | |||
MemTable::Table::Iterator* iter_; | |||
std::string tmp_; // For passing to EncodeKey | |||
// No copying allowed | |||
MemTableIterator(const MemTableIterator&); | |||
void operator=(const MemTableIterator&); | |||
}; | |||
// Returns a heap-allocated iterator over the memtable; the caller owns
// it and must keep this MemTable alive while using it.
Iterator* MemTable::NewIterator() {
  return new MemTableIterator(&table_);
}
// Insert an entry mapping "key" to "value" at sequence number "s" with
// the given type (value or deletion).  The entry is encoded into a
// single arena-allocated buffer and inserted into the skiplist.
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key,
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
  size_t key_size = key.size();
  size_t val_size = value.size();
  // +8: trailing fixed64 tag holding (sequence << 8) | type.
  size_t internal_key_size = key_size + 8;
  const size_t encoded_len =
      VarintLength(internal_key_size) + internal_key_size +
      VarintLength(val_size) + val_size;
  char* buf = arena_.Allocate(encoded_len);
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  EncodeFixed64(p, (s << 8) | type);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  // Sanity check: we filled exactly the bytes we allocated.
  assert((p + val_size) - buf == encoded_len);
  table_.Insert(buf);
}
} |
@ -0,0 +1,69 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_ | |||
#define STORAGE_LEVELDB_DB_MEMTABLE_H_ | |||
#include <string> | |||
#include "include/db.h" | |||
#include "db/dbformat.h" | |||
#include "db/skiplist.h" | |||
#include "util/arena.h" | |||
namespace leveldb { | |||
class InternalKeyComparator; | |||
class Mutex; | |||
class MemTableIterator; | |||
// In-memory, sorted write buffer backed by a skiplist over
// arena-allocated, length-prefixed entries.
class MemTable {
 public:
  explicit MemTable(const InternalKeyComparator& comparator);
  ~MemTable();

  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  size_t ApproximateMemoryUsage();

  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live.  The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/format.{h,cc} module.
  Iterator* NewIterator();

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  void Add(SequenceNumber seq, ValueType type,
           const Slice& key,
           const Slice& value);

 private:
  // Adapter giving the skiplist an ordering over raw encoded entries.
  struct KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
    int operator()(const char* a, const char* b) const;
  };
  friend class MemTableIterator;
  friend class MemTableBackwardIterator;

  typedef SkipList<const char*, KeyComparator> Table;

  KeyComparator comparator_;
  Arena arena_;    // Owns all entry storage; freed en masse with the table.
  Table table_;

  // No copying allowed
  MemTable(const MemTable&);
  void operator=(const MemTable&);
};
} | |||
#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_ |
@ -0,0 +1,396 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// We recover the contents of the descriptor from the other files we find. | |||
// (1) Any log files are first converted to tables | |||
// (2) We scan every table to compute | |||
// (a) smallest/largest for the table | |||
// (b) large value refs from the table | |||
// (c) largest sequence number in the table | |||
// (3) We generate descriptor contents: | |||
// - log number is set to zero | |||
// - next-file-number is set to 1 + largest file number we found | |||
// - last-sequence-number is set to largest sequence# found across | |||
// all tables (see 2c) | |||
// - compaction pointers are cleared | |||
// - every table file is added at level 0 | |||
// | |||
// Possible optimization 1: | |||
// (a) Compute total size and use to pick appropriate max-level M | |||
// (b) Sort tables by largest sequence# in the table | |||
// (c) For each table: if it overlaps earlier table, place in level-0, | |||
// else place in level-M. | |||
// Possible optimization 2: | |||
// Store per-table metadata (smallest, largest, largest-seq#, | |||
// large-value-refs, ...) in the table's meta section to speed up | |||
// ScanTable. | |||
#include "db/builder.h" | |||
#include "db/db_impl.h" | |||
#include "db/dbformat.h" | |||
#include "db/filename.h" | |||
#include "db/log_reader.h" | |||
#include "db/log_writer.h" | |||
#include "db/memtable.h" | |||
#include "db/table_cache.h" | |||
#include "db/version_edit.h" | |||
#include "db/write_batch_internal.h" | |||
#include "include/comparator.h" | |||
#include "include/db.h" | |||
#include "include/env.h" | |||
namespace leveldb { | |||
namespace { | |||
// Repairs a db by scanning its directory and rebuilding the descriptor.
// See the file comment at the top for the overall algorithm.
class Repairer {
 public:
  Repairer(const std::string& dbname, const Options& options)
      : dbname_(dbname),
        env_(options.env),
        icmp_(options.comparator),
        options_(SanitizeOptions(dbname, &icmp_, options)),
        // SanitizeOptions may substitute its own info_log; remember
        // whether we must delete it.
        owns_info_log_(options_.info_log != options.info_log),
        next_file_number_(1) {
    // TableCache can be small since we expect each table to be opened once.
    table_cache_ = new TableCache(dbname_, &options_, 10);
  }

  ~Repairer() {
    delete table_cache_;
    if (owns_info_log_) {
      delete options_.info_log;
    }
  }

  // Run the repair phases in order; stops at the first failing phase.
  Status Run() {
    Status status = FindFiles();
    if (status.ok()) {
      ConvertLogFilesToTables();
      ExtractMetaData();
      status = WriteDescriptor();
    }
    if (status.ok()) {
      unsigned long long bytes = 0;
      for (int i = 0; i < tables_.size(); i++) {
        bytes += tables_[i].meta.file_size;
      }
      Log(env_, options_.info_log,
          "**** Repaired leveldb %s; "
          "recovered %d files; %llu bytes. "
          "Some data may have been lost. "
          "****",
          dbname_.c_str(),
          static_cast<int>(tables_.size()),
          bytes);
    }
    return status;
  }

 private:
  // Per-table data gathered by ScanTable().
  struct TableInfo {
    FileMetaData meta;
    SequenceNumber max_sequence;
  };

  std::string const dbname_;
  Env* const env_;
  InternalKeyComparator const icmp_;
  Options const options_;
  bool owns_info_log_;
  TableCache* table_cache_;
  VersionEdit edit_;

  std::vector<std::string> manifests_;
  std::vector<uint64_t> table_numbers_;
  std::vector<uint64_t> logs_;
  std::vector<TableInfo> tables_;
  uint64_t next_file_number_;   // 1 + largest file number seen so far

  // Classify every file in the db directory into manifests / logs /
  // tables, tracking the largest file number encountered.
  Status FindFiles() {
    std::vector<std::string> filenames;
    Status status = env_->GetChildren(dbname_, &filenames);
    if (!status.ok()) {
      return status;
    }
    if (filenames.empty()) {
      return Status::IOError(dbname_, "repair found no files");
    }

    uint64_t number;
    LargeValueRef large_ref;
    FileType type;
    for (int i = 0; i < filenames.size(); i++) {
      if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
        if (type == kLargeValueFile) {
          // Will be picked up when we process a Table that points to it
        } else if (type == kDescriptorFile) {
          manifests_.push_back(filenames[i]);
        } else {
          if (number + 1 > next_file_number_) {
            next_file_number_ = number + 1;
          }
          if (type == kLogFile) {
            logs_.push_back(number);
          } else if (type == kTableFile) {
            table_numbers_.push_back(number);
          } else {
            // Ignore other files
          }
        }
      }
    }
    return status;
  }

  // Convert each log file into a table.  Conversion errors are logged
  // but not fatal; the log file is archived either way.
  void ConvertLogFilesToTables() {
    for (int i = 0; i < logs_.size(); i++) {
      std::string logname = LogFileName(dbname_, logs_[i]);
      Status status = ConvertLogToTable(logs_[i]);
      if (!status.ok()) {
        Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
            (unsigned long long) logs_[i],
            status.ToString().c_str());
      }
      ArchiveFile(logname);
    }
  }

  // Replay one log file into a fresh memtable, then dump that memtable
  // as a new table file numbered from next_file_number_.
  Status ConvertLogToTable(uint64_t log) {
    struct LogReporter : public log::Reader::Reporter {
      Env* env;
      WritableFile* info_log;
      uint64_t lognum;
      virtual void Corruption(size_t bytes, const Status& s) {
        // We print error messages for corruption, but continue repairing.
        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
            (unsigned long long) lognum,
            static_cast<int>(bytes),
            s.ToString().c_str());
      }
    };

    // Open the log file
    std::string logname = LogFileName(dbname_, log);
    SequentialFile* lfile;
    Status status = env_->NewSequentialFile(logname, &lfile);
    if (!status.ok()) {
      return status;
    }

    // Create the log reader.
    LogReporter reporter;
    reporter.env = env_;
    reporter.info_log = options_.info_log;
    reporter.lognum = log;
    // We intentionally make log::Reader do checksumming so that
    // corruptions cause entire commits to be skipped instead of
    // propagating bad information (like overly large sequence
    // numbers).
    // NOTE(review): the comment above says checksumming is intended,
    // but the "false" argument disables it — confirm which behavior
    // is actually wanted here.
    log::Reader reader(lfile, &reporter, false/*do not checksum*/);

    // Read all the records and add to a memtable
    std::string scratch;
    Slice record;
    WriteBatch batch;
    MemTable mem(icmp_);
    int counter = 0;
    while (reader.ReadRecord(&record, &scratch)) {
      if (record.size() < 12) {
        // 12 = minimum WriteBatch header; anything smaller is garbage.
        reporter.Corruption(
            record.size(), Status::Corruption("log record too small"));
        continue;
      }
      WriteBatchInternal::SetContents(&batch, record);
      status = WriteBatchInternal::InsertInto(&batch, &mem);
      if (status.ok()) {
        counter += WriteBatchInternal::Count(&batch);
      } else {
        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
            (unsigned long long) log,
            status.ToString().c_str());
        status = Status::OK();  // Keep going with rest of file
      }
    }
    delete lfile;

    // We ignore any version edits generated by the conversion to a Table
    // since ExtractMetaData() will also generate edits.
    VersionEdit skipped;
    FileMetaData meta;
    meta.number = next_file_number_++;
    Iterator* iter = mem.NewIterator();
    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
                        &meta, &skipped);
    delete iter;
    if (status.ok()) {
      if (meta.file_size > 0) {
        table_numbers_.push_back(meta.number);
      }
    }
    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
        (unsigned long long) log,
        counter,
        (unsigned long long) meta.number,
        status.ToString().c_str());
    return status;
  }

  // Scan every candidate table.  Unreadable tables are archived;
  // readable ones are recorded in tables_.
  void ExtractMetaData() {
    // NOTE(review): "kept" is never used — looks like leftover from an
    // earlier filtering scheme; consider removing.
    std::vector<TableInfo> kept;
    for (int i = 0; i < table_numbers_.size(); i++) {
      TableInfo t;
      t.meta.number = table_numbers_[i];
      Status status = ScanTable(&t);
      if (!status.ok()) {
        std::string fname = TableFileName(dbname_, table_numbers_[i]);
        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
            (unsigned long long) table_numbers_[i],
            status.ToString().c_str());
        ArchiveFile(fname);
      } else {
        tables_.push_back(t);
      }
    }
  }

  // Compute smallest/largest keys, the max sequence number, and any
  // large-value refs for one table by iterating all of its entries.
  Status ScanTable(TableInfo* t) {
    std::string fname = TableFileName(dbname_, t->meta.number);
    int counter = 0;
    Status status = env_->GetFileSize(fname, &t->meta.file_size);
    if (status.ok()) {
      Iterator* iter = table_cache_->NewIterator(
          ReadOptions(), t->meta.number);
      bool empty = true;
      ParsedInternalKey parsed;
      t->max_sequence = 0;
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        Slice key = iter->key();
        if (!ParseInternalKey(key, &parsed)) {
          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
              (unsigned long long) t->meta.number,
              EscapeString(key).c_str());
          continue;
        }

        counter++;
        if (empty) {
          empty = false;
          t->meta.smallest.DecodeFrom(key);
        }
        t->meta.largest.DecodeFrom(key);
        if (parsed.sequence > t->max_sequence) {
          t->max_sequence = parsed.sequence;
        }

        if (ExtractValueType(key) == kTypeLargeValueRef) {
          if (iter->value().size() != LargeValueRef::ByteSize()) {
            Log(env_, options_.info_log, "Table #%llu: bad large value ref",
                (unsigned long long) t->meta.number);
          } else {
            edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
                                   t->meta.number,
                                   key);
          }
        }
      }
      if (!iter->status().ok()) {
        status = iter->status();
      }
      delete iter;
    }
    Log(env_, options_.info_log, "Table #%llu: %d entries %s",
        (unsigned long long) t->meta.number,
        counter,
        status.ToString().c_str());
    return status;
  }

  // Write a fresh manifest describing all recovered tables at level 0,
  // archive the old manifests, and point CURRENT at the new one.
  // Writes to a temp file first so a crash cannot corrupt the install.
  Status WriteDescriptor() {
    std::string tmp = TempFileName(dbname_, 1);
    WritableFile* file;
    Status status = env_->NewWritableFile(tmp, &file);
    if (!status.ok()) {
      return status;
    }

    SequenceNumber max_sequence = 0;
    for (int i = 0; i < tables_.size(); i++) {
      if (max_sequence < tables_[i].max_sequence) {
        max_sequence = tables_[i].max_sequence;
      }
    }

    edit_.SetComparatorName(icmp_.user_comparator()->Name());
    edit_.SetLogNumber(0);
    edit_.SetNextFile(next_file_number_);
    edit_.SetLastSequence(max_sequence);

    for (int i = 0; i < tables_.size(); i++) {
      // TODO(opt): separate out into multiple levels
      const TableInfo& t = tables_[i];
      edit_.AddFile(0, t.meta.number, t.meta.file_size,
                    t.meta.smallest, t.meta.largest);
    }

    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
    {
      // Scoped so the log::Writer is destroyed before file->Close().
      log::Writer log(file);
      std::string record;
      edit_.EncodeTo(&record);
      status = log.AddRecord(record);
    }
    if (status.ok()) {
      status = file->Close();
    }
    delete file;
    file = NULL;

    if (!status.ok()) {
      env_->DeleteFile(tmp);
    } else {
      // Discard older manifests
      for (int i = 0; i < manifests_.size(); i++) {
        ArchiveFile(dbname_ + "/" + manifests_[i]);
      }

      // Install new manifest
      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
      if (status.ok()) {
        status = SetCurrentFile(env_, dbname_, 1);
      } else {
        env_->DeleteFile(tmp);
      }
    }

    return status;
  }

  // Move "fname" into a "lost" subdirectory of its parent directory
  // rather than deleting it, so no data is destroyed during repair.
  void ArchiveFile(const std::string& fname) {
    // Move into another directory.  E.g., for
    //    dir/foo
    //  rename to
    //    dir/lost/foo
    const char* slash = strrchr(fname.c_str(), '/');
    std::string new_dir;
    if (slash != NULL) {
      new_dir.assign(fname.data(), slash - fname.data());
    }
    new_dir.append("/lost");
    env_->CreateDir(new_dir);  // Ignore error
    std::string new_file = new_dir;
    new_file.append("/");
    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
    Status s = env_->RenameFile(fname, new_file);
    Log(env_, options_.info_log, "Archiving %s: %s\n",
        fname.c_str(), s.ToString().c_str());
  }
};
} | |||
// Public entry point: attempt to repair a possibly-corrupted db by
// reconstructing its descriptor from the log and table files found in
// "dbname".  Some data may be lost; see the file comment above.
Status RepairDB(const std::string& dbname, const Options& options) {
  Repairer repairer(dbname, options);
  return repairer.Run();
}
} |
@ -0,0 +1,378 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Thread safety | |||
// ------------- | |||
// | |||
// Writes require external synchronization, most likely a mutex. | |||
// Reads require a guarantee that the SkipList will not be destroyed | |||
// while the read is in progress. Apart from that, reads progress | |||
// without any internal locking or synchronization. | |||
// | |||
// Invariants: | |||
// | |||
// (1) Allocated nodes are never deleted until the SkipList is | |||
// destroyed. This is trivially guaranteed by the code since we | |||
// never delete any skip list nodes. | |||
// | |||
// (2) The contents of a Node except for the next/prev pointers are | |||
// immutable after the Node has been linked into the SkipList. | |||
// Only Insert() modifies the list, and it is careful to initialize | |||
// a node and use release-stores to publish the nodes in one or | |||
// more lists. | |||
// | |||
// ... prev vs. next pointer ordering ... | |||
#include <assert.h> | |||
#include <stdlib.h> | |||
#include "port/port.h" | |||
#include "util/arena.h" | |||
#include "util/random.h" | |||
namespace leveldb { | |||
class Arena; | |||
// Skip list keyed by Key and ordered by Comparator.  See the file header
// comment for the threading contract: writes require external
// synchronization; reads only require that the list (and the arena it
// allocates from) outlive them.
template<typename Key, class Comparator>
class SkipList {
 private:
  struct Node;  // Defined after the class; holds the key plus per-level links

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena".  Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
  explicit SkipList(Comparator cmp, Arena* arena);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(const Key& key);

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const Key& key) const;

  // Iteration over the contents of a skip list
  class Iterator {
   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(const SkipList* list);

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const Key& key() const;

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next();

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev();

    // Advance to the first entry with a key >= target
    void Seek(const Key& target);

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst();

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast();

   private:
    const SkipList* list_;
    Node* node_;  // NULL means "not valid"
    // Intentionally copyable
  };

 private:
  enum { kMaxHeight = 12 };  // Upper bound on any node's tower height

  // Immutable after construction
  Comparator const compare_;
  Arena* const arena_;  // Arena used for allocations of nodes
  Node* const head_;    // Dummy head node; its key is never examined

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
  port::AtomicPointer max_height_;  // Height of the entire list

  // The height integer is stored in the pointer's bits (see the
  // NoBarrier_Store in Insert()).
  inline int GetMaxHeight() const {
    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
  }

  // Read/written only by Insert().
  Random rnd_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  // Return true if key is greater than the data stored in "n"
  bool KeyIsAfterNode(const Key& key, Node* n) const;

  // Return the earliest node that comes at or after key.
  // Return NULL if there is no such node.
  //
  // If prev is non-NULL, fills prev[level] with pointer to previous
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;

  // Return the last node in the list.
  // Return head_ if list is empty.
  Node* FindLast() const;

  // No copying allowed
  SkipList(const SkipList&);
  void operator=(const SkipList&);
};
// Implementation details follow | |||
// Node layout: the key followed by a variable-length array of atomic
// next pointers, one per level.  next_ is declared with length 1 and the
// extra slots come from NewNode's over-allocation.
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
  explicit Node(const Key& k) : key(k) { }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next(int n) {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_[n].Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
    next_[n].NoBarrier_Store(x);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
  port::AtomicPointer next_[1];
};
// Allocate a node with room for "height" next pointers.  sizeof(Node)
// already includes one AtomicPointer, hence the (height - 1) extra slots.
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
  char* mem = arena_->AllocateAligned(
      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
  return new (mem) Node(key);  // Placement new; storage is owned by the arena
}
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
  list_ = list;
  node_ = NULL;  // Starts invalid; caller must Seek/SeekToFirst/SeekToLast
}

template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
  return node_ != NULL;
}

template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
  assert(Valid());
  return node_->key;
}

template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
  assert(Valid());
  node_ = node_->Next(0);  // Level-0 links visit every node in order
}

template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
  // Instead of using explicit "prev" links, we just search for the
  // last node that falls before key.
  assert(Valid());
  node_ = list_->FindLessThan(node_->key);
  if (node_ == list_->head_) {
    node_ = NULL;  // Stepped before the first entry: iterator is now invalid
  }
}

template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
  node_ = list_->FindGreaterOrEqual(target, NULL);
}

template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
  node_ = list_->head_->Next(0);
}

template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
  node_ = list_->FindLast();
  if (node_ == list_->head_) {
    node_ = NULL;  // Empty list
  }
}
template<typename Key, class Comparator> | |||
int SkipList<Key,Comparator>::RandomHeight() { | |||
// Increase height with probability 1 in kBranching | |||
static const unsigned int kBranching = 4; | |||
int height = 1; | |||
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { | |||
height++; | |||
} | |||
assert(height > 0); | |||
assert(height <= kMaxHeight); | |||
return height; | |||
} | |||
template<typename Key, class Comparator> | |||
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const { | |||
// NULL n is considered infinite | |||
return (n != NULL) && (compare_(n->key, key) < 0); | |||
} | |||
// Starting from the highest level, walk right while "key" is after the
// next node, dropping one level when it is not.  The level-0 stopping
// point is the first node >= key.  If "prev" is non-NULL, records the
// rightmost node visited at each level (used by Insert for splicing).
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      if (prev != NULL) prev[level] = x;
      if (level == 0) {
        return next;  // First node >= key, or NULL if past the end
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}
// Walk right while next->key < key, dropping one level when the next
// node is NULL or >= key; the level-0 stopping point is the strict
// predecessor of "key" (head_ if there is none).
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    assert(x == head_ || compare_(x->key, key) < 0);  // Loop invariant
    Node* next = x->Next(level);
    if (next == NULL || compare_(next->key, key) >= 0) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
// Like FindLessThan with an infinite key: walk right until NULL at each
// level; the level-0 stopping point is the last node (head_ if empty).
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (next == NULL) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
// Build an empty list: a dummy head of full height whose links are all
// NULL, with the effective list height starting at 1.
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
    : compare_(cmp),
      arena_(arena),
      head_(NewNode(0 /* any key will do */, kMaxHeight)),
      max_height_(reinterpret_cast<void*>(1)),  // Height stored in pointer bits
      rnd_(0xdeadbeef) {                        // Fixed seed: deterministic heights
  for (int i = 0; i < kMaxHeight; i++) {
    head_->SetNext(i, NULL);
  }
}
// Insert "key" into the list.  Requires external synchronization against
// other writers; concurrent readers are safe (see file header).
// REQUIRES: nothing equal to "key" is already present.
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* prev[kMaxHeight];
  Node* x = FindGreaterOrEqual(key, prev);

  // Our data structure does not allow duplicate insertion
  assert(x == NULL || !Equal(key, x->key));

  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    // The new node is taller than the current list: its extra levels
    // splice in directly after the head.
    for (int i = GetMaxHeight(); i < height; i++) {
      prev[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

    // It is ok to mutate max_height_ without any synchronization
    // with concurrent readers.  A concurrent reader that observes
    // the new value of max_height_ will see either the old value of
    // new level pointers from head_ (NULL), or a new value set in
    // the loop below.  In the former case the reader will
    // immediately drop to the next level since NULL sorts after all
    // keys.  In the latter case the reader will use the new node.
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
    prev[i]->SetNext(i, x);
  }
}
template<typename Key, class Comparator> | |||
bool SkipList<Key,Comparator>::Contains(const Key& key) const { | |||
Node* x = FindGreaterOrEqual(key, NULL); | |||
if (x != NULL && Equal(key, x->key)) { | |||
return true; | |||
} else { | |||
return false; | |||
} | |||
} | |||
} |
@ -0,0 +1,378 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/skiplist.h" | |||
#include <set> | |||
#include "include/env.h" | |||
#include "util/arena.h" | |||
#include "util/hash.h" | |||
#include "util/random.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
typedef uint64_t Key;

// Orders Keys numerically: returns a negative, zero, or positive value
// for a < b, a == b, and a > b respectively.
struct Comparator {
  int operator()(const Key& a, const Key& b) const {
    if (a == b) {
      return 0;
    }
    return (a < b) ? -1 : +1;
  }
};
class SkipTest { };  // Empty fixture class used by the TEST macro

// On an empty list, Contains() fails and every seek leaves the iterator
// invalid.
TEST(SkipTest, Empty) {
  Arena arena;
  Comparator cmp;
  SkipList<Key, Comparator> list(cmp, &arena);
  ASSERT_TRUE(!list.Contains(10));

  SkipList<Key, Comparator>::Iterator iter(&list);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToFirst();
  ASSERT_TRUE(!iter.Valid());
  iter.Seek(100);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToLast();
  ASSERT_TRUE(!iter.Valid());
}
// Insert N random keys (deduplicated through a std::set model), then
// verify Contains(), the basic iterator positions, and forward/backward
// iteration against the model's ordering.
TEST(SkipTest, InsertAndLookup) {
  const int N = 2000;
  const int R = 5000;
  Random rnd(1000);
  std::set<Key> keys;  // Model: ground truth for membership and order
  Arena arena;
  Comparator cmp;
  SkipList<Key, Comparator> list(cmp, &arena);
  for (int i = 0; i < N; i++) {
    Key key = rnd.Next() % R;
    if (keys.insert(key).second) {
      list.Insert(key);
    }
  }

  // Membership must match the model for every key in the key space.
  for (int i = 0; i < R; i++) {
    if (list.Contains(i)) {
      ASSERT_EQ(keys.count(i), 1);
    } else {
      ASSERT_EQ(keys.count(i), 0);
    }
  }

  // Simple iterator tests
  {
    SkipList<Key, Comparator>::Iterator iter(&list);
    ASSERT_TRUE(!iter.Valid());

    iter.Seek(0);
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToFirst();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToLast();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.rbegin()), iter.key());
  }

  // Forward iteration test: from every seek target, the next few entries
  // must match the model's lower_bound sequence.
  for (int i = 0; i < R; i++) {
    SkipList<Key, Comparator>::Iterator iter(&list);
    iter.Seek(i);

    // Compare against model iterator
    std::set<Key>::iterator model_iter = keys.lower_bound(i);
    for (int j = 0; j < 3; j++) {
      if (model_iter == keys.end()) {
        ASSERT_TRUE(!iter.Valid());
        break;
      } else {
        ASSERT_TRUE(iter.Valid());
        ASSERT_EQ(*model_iter, iter.key());
        ++model_iter;
        iter.Next();
      }
    }
  }

  // Backward iteration test
  {
    SkipList<Key, Comparator>::Iterator iter(&list);
    iter.SeekToLast();

    // Compare against model iterator
    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
         model_iter != keys.rend();
         ++model_iter) {
      ASSERT_TRUE(iter.Valid());
      ASSERT_EQ(*model_iter, iter.key());
      iter.Prev();
    }
    ASSERT_TRUE(!iter.Valid());
  }
}
// We want to make sure that with a single writer and multiple | |||
// concurrent readers (with no synchronization other than when a | |||
// reader's iterator is created), the reader always observes all the | |||
// data that was present in the skip list when the iterator was
// constructed.  Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was | |||
// constructed, but we should never miss any values that were present | |||
// at iterator construction time. | |||
// | |||
// We generate multi-part keys: | |||
// <key,gen,hash> | |||
// where: | |||
// key is in range [0..K-1] | |||
// gen is a generation number for key | |||
// hash is hash(key,gen) | |||
// | |||
// The insertion code picks a random key, sets gen to be 1 + the last | |||
// generation number inserted for that key, and sets hash to Hash(key,gen). | |||
// | |||
// At the beginning of a read, we snapshot the last inserted | |||
// generation number for each key. We then iterate, including random | |||
// calls to Next() and Seek(). For every key we encounter, we | |||
// check that it is either expected given the initial snapshot or has | |||
// been concurrently added since the iterator started. | |||
class ConcurrentTest {
 private:
  static const uint32_t K = 4;

  // Key layout (packed into a uint64_t): <key:24><gen:32><hash:8>.
  static uint64_t key(Key key) { return (key >> 40); }
  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
  static uint64_t hash(Key key) { return key & 0xff; }

  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
    uint64_t data[2] = { k, g };
    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
  }

  static Key MakeKey(uint64_t k, uint64_t g) {
    assert(sizeof(Key) == sizeof(uint64_t));
    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
    assert(g <= 0xffffffffu);
    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
  }

  // Verify the embedded hash so a torn/corrupted read would be detected.
  static bool IsValidKey(Key k) {
    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
  }

  static Key RandomTarget(Random* rnd) {
    switch (rnd->Next() % 10) {
      case 0:
        // Seek to beginning
        return MakeKey(0, 0);
      case 1:
        // Seek to end
        return MakeKey(K, 0);
      default:
        // Seek to middle
        return MakeKey(rnd->Next() % K, 0);
    }
  }

  // Per-key generation
  struct State {
    port::AtomicPointer generation[K];
    void Set(int k, intptr_t v) {
      generation[k].Release_Store(reinterpret_cast<void*>(v));
    }
    intptr_t Get(int k) {
      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
    }

    State() {
      for (int k = 0; k < K; k++) {
        Set(k, 0);
      }
    }
  };

  // Current state of the test
  State current_;

  Arena arena_;

  // SkipList is not protected by mu_.  We just use a single writer
  // thread to modify it.
  SkipList<Key, Comparator> list_;

 public:
  ConcurrentTest() : list_(Comparator(), &arena_) { }

  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
    const intptr_t g = current_.Get(k) + 1;
    const Key key = MakeKey(k, g);
    list_.Insert(key);
    current_.Set(k, g);  // Release-store after the insert is in the list
  }

  void ReadStep(Random* rnd) {
    // Remember the initial committed state of the skiplist.
    State initial_state;
    for (int k = 0; k < K; k++) {
      initial_state.Set(k, current_.Get(k));
    }

    Key pos = RandomTarget(rnd);
    SkipList<Key, Comparator>::Iterator iter(&list_);
    iter.Seek(pos);
    while (true) {
      Key current;
      if (!iter.Valid()) {
        current = MakeKey(K, 0);  // Sentinel just past the last real key
      } else {
        current = iter.key();
        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
      }
      ASSERT_LE(pos, current) << "should not go backwards";

      // Verify that everything in [pos,current) was not present in
      // initial_state.
      while (pos < current) {
        ASSERT_LT(key(pos), K) << std::hex << pos;

        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0) ||
                    (gen(pos) > initial_state.Get(key(pos)))
                    ) << "key: " << key(pos)
                      << "; gen: " << gen(pos)
                      << "; initgen: "
                      << initial_state.Get(key(pos));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
          pos = MakeKey(key(pos) + 1, 0);
        } else {
          pos = MakeKey(key(pos), gen(pos) + 1);
        }
      }

      if (!iter.Valid()) {
        break;
      }

      if (rnd->Next() % 2) {
        iter.Next();
        pos = MakeKey(key(pos), gen(pos) + 1);
      } else {
        Key new_target = RandomTarget(rnd);
        if (new_target > pos) {
          pos = new_target;
          iter.Seek(new_target);
        }
      }
    }
  }
};
const uint32_t ConcurrentTest::K;  // Out-of-line definition for ODR use
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding (interleaves read and write steps on one thread).
TEST(SkipTest, ConcurrentWithoutThreads) {
  ConcurrentTest test;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 10000; i++) {
    test.ReadStep(&rnd);
    test.WriteStep(&rnd);
  }
}
// State shared between the test driver (writer) and the reader thread.
// quit_flag_ tells the reader to stop; state_/state_cv_ form a simple
// monitor so each side can wait for the other's phase change.
class TestState {
 public:
  ConcurrentTest t_;
  int seed_;
  port::AtomicPointer quit_flag_;  // Non-NULL value means "stop reading"

  enum ReaderState {
    STARTING,
    RUNNING,
    DONE
  };

  explicit TestState(int s)
      : seed_(s),
        quit_flag_(NULL),
        state_(STARTING),
        state_cv_(&mu_) {}

  // Block until the reader state equals "s".
  void Wait(ReaderState s) {
    mu_.Lock();
    while (state_ != s) {
      state_cv_.Wait();
    }
    mu_.Unlock();
  }

  // Set the reader state to "s" and wake any waiter.
  void Change(ReaderState s) {
    mu_.Lock();
    state_ = s;
    state_cv_.Signal();
    mu_.Unlock();
  }

 private:
  port::Mutex mu_;
  ReaderState state_;
  port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) { | |||
TestState* state = reinterpret_cast<TestState*>(arg); | |||
Random rnd(state->seed_); | |||
int64_t reads = 0; | |||
state->Change(TestState::RUNNING); | |||
while (!state->quit_flag_.Acquire_Load()) { | |||
state->t_.ReadStep(&rnd); | |||
++reads; | |||
} | |||
state->Change(TestState::DONE); | |||
} | |||
static void RunConcurrent(int run) { | |||
const int seed = test::RandomSeed() + (run * 100); | |||
Random rnd(seed); | |||
const int N = 1000; | |||
const int kSize = 1000; | |||
for (int i = 0; i < N; i++) { | |||
if ((i % 100) == 0) { | |||
fprintf(stderr, "Run %d of %d\n", i, N); | |||
} | |||
TestState state(seed + 1); | |||
Env::Default()->Schedule(ConcurrentReader, &state); | |||
state.Wait(TestState::RUNNING); | |||
for (int i = 0; i < kSize; i++) { | |||
state.t_.WriteStep(&rnd); | |||
} | |||
state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do | |||
state.Wait(TestState::DONE); | |||
} | |||
} | |||
// The real multi-threaded runs; repeated with different seeds.
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }

}  // namespace leveldb

int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,66 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ | |||
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ | |||
#include "include/db.h" | |||
namespace leveldb { | |||
class SnapshotList; | |||
// Snapshots are kept in a doubly-linked list in the DB.
// Each Snapshot corresponds to a particular sequence number.
class Snapshot {
 public:
  SequenceNumber number_;  // const after creation

 private:
  friend class SnapshotList;  // Only SnapshotList may link/unlink snapshots

  // Snapshot is kept in a doubly-linked circular list
  Snapshot* prev_;
  Snapshot* next_;

  SnapshotList* list_;  // just for sanity checks
};
class SnapshotList { | |||
public: | |||
SnapshotList() { | |||
list_.prev_ = &list_; | |||
list_.next_ = &list_; | |||
} | |||
bool empty() const { return list_.next_ == &list_; } | |||
Snapshot* oldest() const { assert(!empty()); return list_.next_; } | |||
Snapshot* newest() const { assert(!empty()); return list_.prev_; } | |||
const Snapshot* New(SequenceNumber seq) { | |||
Snapshot* s = new Snapshot; | |||
s->number_ = seq; | |||
s->list_ = this; | |||
s->next_ = &list_; | |||
s->prev_ = list_.prev_; | |||
s->prev_->next_ = s; | |||
s->next_->prev_ = s; | |||
return s; | |||
} | |||
void Delete(const Snapshot* s) { | |||
assert(s->list_ == this); | |||
s->prev_->next_ = s->next_; | |||
s->next_->prev_ = s->prev_; | |||
delete s; | |||
} | |||
private: | |||
// Dummy head of doubly-linked list of snapshots | |||
Snapshot list_; | |||
}; | |||
} | |||
#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ |
@ -0,0 +1,94 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/table_cache.h" | |||
#include "db/filename.h" | |||
#include "include/env.h" | |||
#include "include/table.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Value stored in the cache: a parsed Table plus the open file it reads
// from.  Both are owned by the cache entry and freed by DeleteEntry.
struct TableAndFile {
  RandomAccessFile* file;
  Table* table;
};

// Cache deleter, invoked when an entry is evicted: frees the table, then
// the underlying file, then the wrapper struct itself.
static void DeleteEntry(const Slice& key, void* value) {
  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
  delete tf->table;
  delete tf->file;
  delete tf;
}

// Iterator cleanup callback: releases the cache handle that NewIterator
// pinned for the lifetime of the iterator.
static void UnrefEntry(void* arg1, void* arg2) {
  Cache* cache = reinterpret_cast<Cache*>(arg1);
  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
  cache->Release(h);
}
// A TableCache maps file numbers to open Table objects, holding at most
// "entries" tables open at once with LRU eviction.
TableCache::TableCache(const std::string& dbname,
                       const Options* options,
                       int entries)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      cache_(NewLRUCache(entries)) {
}

TableCache::~TableCache() {
  // Deleting the cache evicts all remaining entries, which runs
  // DeleteEntry on each and so closes their tables and files.
  delete cache_;
}
// Return an iterator over the table stored in file "file_number", opening
// and caching the table on a miss.  If "tableptr" is non-NULL it receives
// the underlying Table (owned by the cache; valid while the iterator
// lives).  On open failure, returns an error iterator without caching the
// failure, so transient errors recover on retry.
Iterator* TableCache::NewIterator(const ReadOptions& options,
                                  uint64_t file_number,
                                  Table** tableptr) {
  if (tableptr != NULL) {
    *tableptr = NULL;
  }

  // The cache key is the fixed64 encoding of the file number.
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  Slice key(buf, sizeof(buf));

  Cache::Handle* handle = cache_->Lookup(key);
  if (handle == NULL) {
    // Miss: open the file, parse the table, then insert into the cache.
    std::string fname = TableFileName(dbname_, file_number);
    RandomAccessFile* file = NULL;
    Table* table = NULL;
    Status s = env_->NewRandomAccessFile(fname, &file);
    if (s.ok()) {
      s = Table::Open(*options_, file, &table);
    }

    if (!s.ok()) {
      assert(table == NULL);
      delete file;
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
      return NewErrorIterator(s);
    }

    TableAndFile* tf = new TableAndFile;
    tf->file = file;
    tf->table = table;
    // Ownership of tf (and its table/file) passes to the cache; freed by
    // DeleteEntry on eviction.
    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
  }

  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
  Iterator* result = table->NewIterator(options);
  // Keep the cache entry pinned until the iterator is destroyed.
  result->RegisterCleanup(&UnrefEntry, cache_, handle);
  if (tableptr != NULL) {
    *tableptr = table;
  }
  return result;
}
// Drop any cached entry for "file_number" (its table/file are freed via
// the cache's deleter once unpinned).
void TableCache::Evict(uint64_t file_number) {
  // Cache keys are the fixed64 encoding of the file number.
  char key_buf[sizeof(file_number)];
  EncodeFixed64(key_buf, file_number);
  cache_->Erase(Slice(key_buf, sizeof(key_buf)));
}
} |
@ -0,0 +1,49 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Thread-safe (provides internal synchronization) | |||
#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ | |||
#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ | |||
#include <string> | |||
#include <stdint.h> | |||
#include "db/dbformat.h" | |||
#include "include/cache.h" | |||
#include "include/table.h" | |||
#include "port/port.h" | |||
namespace leveldb { | |||
class Env; | |||
// Cache of open Table objects keyed by file number; thread-safe per the
// file header.  Holds at most "entries" tables open at once.
class TableCache {
 public:
  TableCache(const std::string& dbname, const Options* options, int entries);
  ~TableCache();

  // Get an iterator for the specified file number and return it.  If
  // "tableptr" is non-NULL, also sets "*tableptr" to point to the
  // Table object underlying the returned iterator, or NULL if no
  // Table object underlies the returned iterator.  The returned
  // "*tableptr" object is owned by the cache and should not be
  // deleted, and is valid for as long as the returned iterator is
  // live.
  Iterator* NewIterator(const ReadOptions& options,
                        uint64_t file_number,
                        Table** tableptr = NULL);

  // Evict any entry for the specified file number
  void Evict(uint64_t file_number);

 private:
  Env* const env_;
  const std::string dbname_;
  const Options* options_;
  Cache* cache_;  // LRU cache of open tables, keyed by file number
};
} | |||
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ |
@ -0,0 +1,282 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/version_edit.h" | |||
#include "db/version_set.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.  (New tags may be appended, but
// existing values must keep their meaning so old manifests still parse.)
enum Tag {
  kComparator = 1,
  kLogNumber = 2,
  kNextFileNumber = 3,
  kLastSequence = 4,
  kCompactPointer = 5,
  kDeletedFile = 6,
  kNewFile = 7,
  kLargeValueRef = 8,
};
void VersionEdit::Clear() { | |||
comparator_.clear(); | |||
log_number_ = 0; | |||
last_sequence_ = 0; | |||
next_file_number_ = 0; | |||
has_comparator_ = false; | |||
has_log_number_ = false; | |||
has_next_file_number_ = false; | |||
has_last_sequence_ = false; | |||
deleted_files_.clear(); | |||
new_files_.clear(); | |||
large_refs_added_.clear(); | |||
} | |||
void VersionEdit::EncodeTo(std::string* dst) const { | |||
if (has_comparator_) { | |||
PutVarint32(dst, kComparator); | |||
PutLengthPrefixedSlice(dst, comparator_); | |||
} | |||
if (has_log_number_) { | |||
PutVarint32(dst, kLogNumber); | |||
PutVarint64(dst, log_number_); | |||
} | |||
if (has_next_file_number_) { | |||
PutVarint32(dst, kNextFileNumber); | |||
PutVarint64(dst, next_file_number_); | |||
} | |||
if (has_last_sequence_) { | |||
PutVarint32(dst, kLastSequence); | |||
PutVarint64(dst, last_sequence_); | |||
} | |||
for (int i = 0; i < compact_pointers_.size(); i++) { | |||
PutVarint32(dst, kCompactPointer); | |||
PutVarint32(dst, compact_pointers_[i].first); // level | |||
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); | |||
} | |||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); | |||
iter != deleted_files_.end(); | |||
++iter) { | |||
PutVarint32(dst, kDeletedFile); | |||
PutVarint32(dst, iter->first); // level | |||
PutVarint64(dst, iter->second); // file number | |||
} | |||
for (int i = 0; i < new_files_.size(); i++) { | |||
const FileMetaData& f = new_files_[i].second; | |||
PutVarint32(dst, kNewFile); | |||
PutVarint32(dst, new_files_[i].first); // level | |||
PutVarint64(dst, f.number); | |||
PutVarint64(dst, f.file_size); | |||
PutLengthPrefixedSlice(dst, f.smallest.Encode()); | |||
PutLengthPrefixedSlice(dst, f.largest.Encode()); | |||
} | |||
for (int i = 0; i < large_refs_added_.size(); i++) { | |||
const VersionEdit::Large& l = large_refs_added_[i]; | |||
PutVarint32(dst, kLargeValueRef); | |||
PutLengthPrefixedSlice(dst, | |||
Slice(l.large_ref.data, LargeValueRef::ByteSize())); | |||
PutVarint64(dst, l.fnum); | |||
PutLengthPrefixedSlice(dst, l.internal_key.Encode()); | |||
} | |||
} | |||
// Parse a length-prefixed slice from *input into *dst.  Returns false
// (leaving *dst untouched) if the input is malformed.
static bool GetInternalKey(Slice* input, InternalKey* dst) {
  Slice str;
  if (!GetLengthPrefixedSlice(input, &str)) {
    return false;
  }
  dst->DecodeFrom(str);
  return true;
}
// Parse a varint32 level from *input into *level, accepting only
// values in [0, config::kNumLevels).  Returns false on a parse error
// or an out-of-range level.
static bool GetLevel(Slice* input, int* level) {
  uint32_t parsed;
  if (!GetVarint32(input, &parsed) || parsed >= config::kNumLevels) {
    return false;
  }
  *level = parsed;
  return true;
}
// Parse an edit previously serialized by EncodeTo().  Existing state
// is discarded via Clear() first.  Returns OK on success, or a
// Corruption status naming the first malformed field.
Status VersionEdit::DecodeFrom(const Slice& src) {
  Clear();
  Slice input = src;
  const char* msg = NULL;  // Description of the first error, or NULL.
  uint32_t tag;
  // Temporary storage for parsing.  "f" is reused across kNewFile
  // records; all parsed fields are overwritten before each push_back.
  int level;
  uint64_t number;
  FileMetaData f;
  Slice str;
  Large large;
  InternalKey key;
  // Stop at the first malformed field: once msg is set, the remaining
  // bytes cannot be trusted.
  while (msg == NULL && GetVarint32(&input, &tag)) {
    switch (tag) {
      case kComparator:
        if (GetLengthPrefixedSlice(&input, &str)) {
          comparator_ = str.ToString();
          has_comparator_ = true;
        } else {
          msg = "comparator name";
        }
        break;
      case kLogNumber:
        if (GetVarint64(&input, &log_number_)) {
          has_log_number_ = true;
        } else {
          msg = "log number";
        }
        break;
      case kNextFileNumber:
        if (GetVarint64(&input, &next_file_number_)) {
          has_next_file_number_ = true;
        } else {
          msg = "next file number";
        }
        break;
      case kLastSequence:
        if (GetVarint64(&input, &last_sequence_)) {
          has_last_sequence_ = true;
        } else {
          msg = "last sequence number";
        }
        break;
      case kCompactPointer:
        if (GetLevel(&input, &level) &&
            GetInternalKey(&input, &key)) {
          compact_pointers_.push_back(std::make_pair(level, key));
        } else {
          msg = "compaction pointer";
        }
        break;
      case kDeletedFile:
        if (GetLevel(&input, &level) &&
            GetVarint64(&input, &number)) {
          deleted_files_.insert(std::make_pair(level, number));
        } else {
          msg = "deleted file";
        }
        break;
      case kNewFile:
        if (GetLevel(&input, &level) &&
            GetVarint64(&input, &f.number) &&
            GetVarint64(&input, &f.file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest)) {
          new_files_.push_back(std::make_pair(level, f));
        } else {
          msg = "new-file entry";
        }
        break;
      case kLargeValueRef:
        // The serialized ref must be exactly ByteSize() bytes;
        // anything else is corruption, not a shorter/longer ref.
        if (GetLengthPrefixedSlice(&input, &str) &&
            (str.size() == LargeValueRef::ByteSize()) &&
            GetVarint64(&input, &large.fnum) &&
            GetInternalKey(&input, &large.internal_key)) {
          large.large_ref = LargeValueRef::FromRef(str);
          large_refs_added_.push_back(large);
        } else {
          msg = "large ref";
        }
        break;
      default:
        msg = "unknown tag";
        break;
    }
  }
  // Leftover bytes that did not even parse as a varint32 tag.
  if (msg == NULL && !input.empty()) {
    msg = "invalid tag";
  }
  Status result;
  if (msg != NULL) {
    result = Status::Corruption("VersionEdit", msg);
  }
  return result;
}
std::string VersionEdit::DebugString() const { | |||
std::string r; | |||
r.append("VersionEdit {"); | |||
if (has_comparator_) { | |||
r.append("\n Comparator: "); | |||
r.append(comparator_); | |||
} | |||
if (has_log_number_) { | |||
r.append("\n LogNumber: "); | |||
AppendNumberTo(&r, log_number_); | |||
} | |||
if (has_next_file_number_) { | |||
r.append("\n NextFile: "); | |||
AppendNumberTo(&r, next_file_number_); | |||
} | |||
if (has_last_sequence_) { | |||
r.append("\n LastSeq: "); | |||
AppendNumberTo(&r, last_sequence_); | |||
} | |||
for (int i = 0; i < compact_pointers_.size(); i++) { | |||
r.append("\n CompactPointer: "); | |||
AppendNumberTo(&r, compact_pointers_[i].first); | |||
r.append(" '"); | |||
AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); | |||
r.append("'"); | |||
} | |||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); | |||
iter != deleted_files_.end(); | |||
++iter) { | |||
r.append("\n DeleteFile: "); | |||
AppendNumberTo(&r, iter->first); | |||
r.append(" "); | |||
AppendNumberTo(&r, iter->second); | |||
} | |||
for (int i = 0; i < new_files_.size(); i++) { | |||
const FileMetaData& f = new_files_[i].second; | |||
r.append("\n AddFile: "); | |||
AppendNumberTo(&r, new_files_[i].first); | |||
r.append(" "); | |||
AppendNumberTo(&r, f.number); | |||
r.append(" "); | |||
AppendNumberTo(&r, f.file_size); | |||
r.append(" '"); | |||
AppendEscapedStringTo(&r, f.smallest.Encode()); | |||
r.append("' .. '"); | |||
AppendEscapedStringTo(&r, f.largest.Encode()); | |||
r.append("'"); | |||
} | |||
for (int i = 0; i < large_refs_added_.size(); i++) { | |||
const VersionEdit::Large& l = large_refs_added_[i]; | |||
r.append("\n LargeRef: "); | |||
AppendNumberTo(&r, l.fnum); | |||
r.append(" "); | |||
r.append(LargeValueRefToFilenameString(l.large_ref)); | |||
r.append(" '"); | |||
AppendEscapedStringTo(&r, l.internal_key.Encode()); | |||
r.append("'"); | |||
} | |||
r.append("\n}\n"); | |||
return r; | |||
} | |||
} |
@ -0,0 +1,118 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ | |||
#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ | |||
#include <set> | |||
#include <utility> | |||
#include <vector> | |||
#include "db/dbformat.h" | |||
namespace leveldb { | |||
class VersionSet; | |||
// Metadata describing a single on-disk table file.
struct FileMetaData {
  int refs;                // Reference count; NOTE(review): appears to be
                           // managed by Version/VersionSet — confirm there.
  uint64_t number;         // File number (identifies the file on disk)
  uint64_t file_size;      // File size in bytes
  InternalKey smallest;    // Smallest internal key served by table
  InternalKey largest;     // Largest internal key served by table
  // number/smallest/largest are left default-constructed; callers
  // (e.g. VersionEdit::AddFile) fill them in.
  FileMetaData() : refs(0), file_size(0) { }
};
// A VersionEdit records a delta to be applied to a Version: files
// added/deleted per level, compaction-pointer updates, large-value
// references, and metadata (comparator name, log/file numbers, last
// sequence).  Edits are serialized with EncodeTo() and read back with
// DecodeFrom().
class VersionEdit {
 public:
  VersionEdit() { Clear(); }
  ~VersionEdit() { }

  // Reset every field to the unset/empty state.
  void Clear();

  void SetComparatorName(const Slice& name) {
    has_comparator_ = true;
    comparator_ = name.ToString();
  }
  void SetLogNumber(uint64_t num) {
    has_log_number_ = true;
    log_number_ = num;
  }
  void SetNextFile(uint64_t num) {
    has_next_file_number_ = true;
    next_file_number_ = num;
  }
  void SetLastSequence(SequenceNumber seq) {
    has_last_sequence_ = true;
    last_sequence_ = seq;
  }
  // Record that the next compaction at "level" should start at "key".
  void SetCompactPointer(int level, const InternalKey& key) {
    compact_pointers_.push_back(std::make_pair(level, key));
  }

  // Add the specified file at the specified number.
  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
  void AddFile(int level, uint64_t file,
               uint64_t file_size,
               const InternalKey& smallest,
               const InternalKey& largest) {
    FileMetaData f;
    f.number = file;
    f.file_size = file_size;
    f.smallest = smallest;
    f.largest = largest;
    new_files_.push_back(std::make_pair(level, f));
  }

  // Delete the specified "file" from the specified "level".
  void DeleteFile(int level, uint64_t file) {
    deleted_files_.insert(std::make_pair(level, file));
  }

  // Record that a large value with the specified large_ref was
  // written to the output file numbered "fnum"
  void AddLargeValueRef(const LargeValueRef& large_ref,
                        uint64_t fnum,
                        const Slice& internal_key) {
    // Grow in place and fill the new slot directly, avoiding a copy
    // of the Large entry.
    large_refs_added_.resize(large_refs_added_.size() + 1);
    Large* large = &(large_refs_added_.back());
    large->large_ref = large_ref;
    large->fnum = fnum;
    large->internal_key.DecodeFrom(internal_key);
  }

  // Serialization to/from the persistent descriptor format.
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(const Slice& src);

  // Human-readable dump for debugging.
  std::string DebugString() const;

 private:
  friend class VersionSet;

  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;

  std::string comparator_;
  uint64_t log_number_;
  uint64_t next_file_number_;
  SequenceNumber last_sequence_;
  // has_*_ flags record which of the scalar fields above are set.
  bool has_comparator_;
  bool has_log_number_;
  bool has_next_file_number_;
  bool has_last_sequence_;

  std::vector< std::pair<int, InternalKey> > compact_pointers_;
  DeletedFileSet deleted_files_;
  std::vector< std::pair<int, FileMetaData> > new_files_;
  // One entry per AddLargeValueRef() call.
  struct Large {
    LargeValueRef large_ref;
    uint64_t fnum;
    InternalKey internal_key;
  };
  std::vector<Large> large_refs_added_;
};
} | |||
#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ |
@ -0,0 +1,50 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "db/version_edit.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
// Round-trip check: encoding "edit", decoding it into a fresh
// VersionEdit, and re-encoding the result must reproduce the
// original byte string exactly.
static void TestEncodeDecode(const VersionEdit& edit) {
  std::string encoded, encoded2;
  edit.EncodeTo(&encoded);
  VersionEdit parsed;
  Status s = parsed.DecodeFrom(encoded);
  ASSERT_TRUE(s.ok()) << s.ToString();
  parsed.EncodeTo(&encoded2);
  ASSERT_EQ(encoded, encoded2);
}
// Empty fixture class; the TEST() macro below only needs the name.
class VersionEditTest { };

TEST(VersionEditTest, EncodeDecode) {
  // Large base value so every numeric field exercises multi-byte
  // varint64 encoding.
  static const uint64_t kBig = 1ull << 50;
  VersionEdit edit;
  for (int i = 0; i < 4; i++) {
    // Round-trip the partially-populated edit at each iteration,
    // starting with a completely empty one.
    TestEncodeDecode(edit);
    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
                 InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef),
                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
    edit.DeleteFile(4, kBig + 700 + i);
    edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
                          kBig + 800 + i, "foobar");
    edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression),
                          kBig + 801 + i, "baz");
    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
  }
  edit.SetComparatorName("foo");
  edit.SetLogNumber(kBig + 100);
  edit.SetNextFile(kBig + 200);
  edit.SetLastSequence(kBig + 1000);
  TestEncodeDecode(edit);
}
} | |||
// Test-binary entry point: run every test registered via TEST().
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,290 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// The representation of a DBImpl consists of a set of Versions. The | |||
// newest version is called "current". Older versions may be kept | |||
// around to provide a consistent view to live iterators. | |||
// | |||
// Each Version keeps track of a set of Table files per level. The | |||
// entire set of versions is maintained in a VersionSet. | |||
// | |||
// Version,VersionSet are thread-compatible, but require external | |||
// synchronization on all accesses. | |||
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ | |||
#define STORAGE_LEVELDB_DB_VERSION_SET_H_ | |||
#include <map> | |||
#include <set> | |||
#include <vector> | |||
#include "db/dbformat.h" | |||
#include "db/version_edit.h" | |||
#include "port/port.h" | |||
namespace leveldb { | |||
// Grouping of constants.  We may want to make some of these
// parameters set via options.
namespace config {
// Number of levels in the tree; per-level arrays in Version,
// VersionSet and Compaction are sized by this.
static const int kNumLevels = 7;
}
namespace log { class Writer; } | |||
class Compaction; | |||
class Iterator; | |||
class MemTable; | |||
class TableBuilder; | |||
class TableCache; | |||
class Version; | |||
class VersionSet; | |||
class WritableFile; | |||
// A Version is an immutable snapshot of the set of table files per
// level.  Versions are reference-counted and linked into a list owned
// by a VersionSet (see the file comment above).
class Version {
 public:
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
  void Unref();

  // Return a human readable string that describes this version's contents.
  std::string DebugString() const;

 private:
  friend class Compaction;
  friend class VersionSet;

  class LevelFileNumIterator;
  // Iterator over the concatenation of the files at "level".
  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;

  VersionSet* vset_;       // VersionSet to which this Version belongs
  Version* next_;          // Next version in linked list
  int refs_;               // Number of live refs to this version
  MemTable* cleanup_mem_;  // NULL, or table to delete when version dropped

  // List of files per level
  std::vector<FileMetaData*> files_[config::kNumLevels];

  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed.  These fields
  // are initialized by Finalize().
  double compaction_score_;
  int compaction_level_;

  // Only VersionSet creates Versions; -1 scores mean "not finalized".
  explicit Version(VersionSet* vset)
      : vset_(vset), next_(NULL), refs_(0),
        cleanup_mem_(NULL),
        compaction_score_(-1),
        compaction_level_(-1) {
  }
  ~Version();

  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};
// A VersionSet owns the list of all live Versions, the persistent
// descriptor (MANIFEST) they are logged to, file-number allocation,
// large-value reference tracking, and compaction selection.
class VersionSet {
 public:
  VersionSet(const std::string& dbname,
             const Options* options,
             TableCache* table_cache,
             const InternalKeyComparator*);
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
  // current version.  Iff Apply() returns OK, arrange to delete
  // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
  // by older versions.
  Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);

  // Recover the last saved descriptor from persistent storage.
  Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);

  // Save current contents to *log
  Status WriteSnapshot(log::Writer* log);

  // Return the current version.
  Version* current() const { return current_; }

  // Return the current manifest file number
  uint64_t ManifestFileNumber() const { return manifest_file_number_; }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_++; }

  // Return the number of Table files at the specified level.
  int NumLevelFiles(int level) const;

  // Pick level and inputs for a new compaction.
  // Returns NULL if there is no compaction to be done.
  // Otherwise returns a pointer to a heap-allocated object that
  // describes the compaction.  Caller should delete the result.
  Compaction* PickCompaction();

  // Return a compaction object for compacting the range [begin,end] in
  // the specified level.  Returns NULL if there is nothing in that
  // level that overlaps the specified range.  Caller should delete
  // the result.
  Compaction* CompactRange(
      int level,
      const InternalKey& begin,
      const InternalKey& end);

  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);

  // Returns true iff some level needs a compaction.
  bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }

  // Add all files listed in any live version to *live.
  // May also mutate some internal state.
  void AddLiveFiles(std::set<uint64_t>* live);

  // Return the approximate offset in the database of the data for
  // "key" as of version "v".
  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);

  // Register a reference to a large value with the specified
  // large_ref from the specified file number.  Returns "true" if this
  // is the first recorded reference to the "large_ref" value in the
  // database, and false otherwise.
  bool RegisterLargeValueRef(const LargeValueRef& large_ref,
                             uint64_t filenum,
                             const InternalKey& internal_key);

  // Cleanup the large value reference state by eliminating any
  // references from files that are not included in either "live_tables"
  // or "log_file".
  void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
                             uint64_t log_file_num);

  // Returns true if a large value with the given reference is live.
  bool LargeValueIsLive(const LargeValueRef& large_ref);

 private:
  class Builder;

  friend class Compaction;
  friend class Version;

  // Compute compaction_score_/compaction_level_ for *v.
  Status Finalize(Version* v);

  // Delete any old versions that are no longer needed.
  void MaybeDeleteOldVersions();

  struct BySmallestKey;
  Status SortLevel(Version* v, uint64_t level);

  // Store in *inputs the files at "level" overlapping [begin,end].
  void GetOverlappingInputs(
      int level,
      const InternalKey& begin,
      const InternalKey& end,
      std::vector<FileMetaData*>* inputs);

  // Store in *smallest/*largest the key range spanned by "inputs".
  void GetRange(const std::vector<FileMetaData*>& inputs,
                InternalKey* smallest,
                InternalKey* largest);

  Env* const env_;
  const std::string dbname_;
  const Options* const options_;
  TableCache* const table_cache_;
  const InternalKeyComparator icmp_;
  uint64_t next_file_number_;
  uint64_t manifest_file_number_;

  // Opened lazily
  WritableFile* descriptor_file_;
  log::Writer* descriptor_log_;

  // Versions are kept in a singly linked list that is never empty
  Version* current_;  // Pointer to the last (newest) list entry
  Version* oldest_;   // Pointer to the first (oldest) list entry

  // Map from large value reference to the set of <file numbers,internal_key>
  // values containing references to the value.  We keep the
  // internal key as a std::string rather than as an InternalKey because
  // we want to be able to easily use a set.
  typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
  typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
  LargeValueMap large_value_refs_;

  // Per-level key at which the next compaction at that level should start.
  // Either an empty string, or a valid InternalKey.
  std::string compact_pointer_[config::kNumLevels];

  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);
};
// A Compaction encapsulates information about a compaction: the level
// being compacted, the input files from "level" and "level+1", and
// the VersionEdit that accumulates the resulting descriptor changes.
class Compaction {
 public:
  ~Compaction();

  // Return the level that is being compacted.  Inputs from "level"
  // and "level+1" will be merged to produce a set of "level+1" files.
  int level() const { return level_; }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
  VersionEdit* edit() { return &edit_; }

  // "which" must be either 0 or 1
  int num_input_files(int which) const { return inputs_[which].size(); }

  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }

  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

  // Returns true if the information we have available guarantees that
  // the compaction is producing data in "level+1" for which no data exists
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();

 private:
  friend class Version;
  friend class VersionSet;

  // Constructed only by VersionSet (PickCompaction/CompactRange).
  explicit Compaction(int level);

  int level_;
  uint64_t max_output_file_size_;
  Version* input_version_;
  VersionEdit edit_;

  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];  // The two sets of inputs

  // State for implementing IsBaseLevelForKey

  // level_ptrs_ holds indices into input_version_->levels_: our state
  // is that we are positioned at one of the file ranges for each
  // higher level than the ones involved in this compaction (i.e. for
  // all L >= level_ + 2).
  int level_ptrs_[config::kNumLevels];
};
} | |||
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ |
@ -0,0 +1,164 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// WriteBatch::rep_ := | |||
// sequence: fixed64 | |||
// count: fixed32 | |||
// data: record[count] | |||
// record := | |||
// kTypeValue varstring varstring | | |||
// kTypeLargeValueRef varstring varstring | | |||
// kTypeDeletion varstring | |||
// varstring := | |||
// len: varint32 | |||
// data: uint8[len] | |||
#include "include/write_batch.h" | |||
#include "include/db.h" | |||
#include "db/dbformat.h" | |||
#include "db/memtable.h" | |||
#include "db/write_batch_internal.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// A fresh batch still carries the 12-byte header described at the top
// of this file: an 8-byte sequence number followed by a 4-byte count.
WriteBatch::WriteBatch() {
  Clear();
}

WriteBatch::~WriteBatch() { }

void WriteBatch::Clear() {
  rep_.clear();
  rep_.resize(12);  // Zero-filled header: sequence = 0, count = 0.
}
// Header layout (see the format comment at the top of this file):
//   bytes 0..7  : sequence number (fixed64)
//   bytes 8..11 : record count    (fixed32)
int WriteBatchInternal::Count(const WriteBatch* b) {
  return DecodeFixed32(b->rep_.data() + 8);  // count lives at offset 8
}

void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
  EncodeFixed32(&b->rep_[8], n);
}

SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
  return SequenceNumber(DecodeFixed64(b->rep_.data()));  // offset 0
}

void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
  EncodeFixed64(&b->rep_[0], seq);
}
// Append a kTypeValue record (tag byte followed by length-prefixed
// key and value) and bump the count stored in the header.
void WriteBatch::Put(const Slice& key, const Slice& value) {
  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
  rep_.push_back(static_cast<char>(kTypeValue));
  PutLengthPrefixedSlice(&rep_, key);
  PutLengthPrefixedSlice(&rep_, value);
}
// Append a kTypeLargeValueRef record: like Put(), but the "value"
// stored is the raw bytes of the large-value reference rather than
// the value itself.
void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
                                          const Slice& key,
                                          const LargeValueRef& large_ref) {
  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
  b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
  PutLengthPrefixedSlice(&b->rep_, key);
  PutLengthPrefixedSlice(&b->rep_,
                         Slice(large_ref.data, sizeof(large_ref.data)));
}
// Append a kTypeDeletion record (tag byte plus length-prefixed key;
// deletions carry no value) and bump the count in the header.
void WriteBatch::Delete(const Slice& key) {
  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
  rep_.push_back(static_cast<char>(kTypeDeletion));
  PutLengthPrefixedSlice(&rep_, key);
}
// Replay every record in *b into *memtable.  Each record is tagged
// with a consecutive sequence number starting at the batch's sequence
// (the Iterator increments per record).  Returns the iterator's error
// if it hit corruption, or Corruption if the number of records found
// does not match the count stored in the header.
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      MemTable* memtable) {
  const int count = WriteBatchInternal::Count(b);
  int found = 0;  // Number of records actually replayed.
  Iterator it(*b);
  for (; !it.Done(); it.Next()) {
    switch (it.op()) {
      case kTypeDeletion:
        // Deletions carry no value; store an empty Slice.
        memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
        break;
      case kTypeValue:
        memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
        break;
      case kTypeLargeValueRef:
        // The "value" here is the encoded large-value reference.
        memtable->Add(it.sequence_number(), kTypeLargeValueRef,
                      it.key(), it.value());
        break;
    }
    found++;
  }
  if (!it.status().ok()) {
    return it.status();
  } else if (found != count) {
    return Status::Corruption("wrong count in WriteBatch");
  }
  return Status::OK();
}
// Replace b's contents wholesale with "contents", which must include
// at least the 12-byte header.
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
  assert(contents.size() >= 12);
  b->rep_.assign(contents.data(), contents.size());
}
// Position the iterator at the first record of "batch".  A batch
// smaller than the 12-byte header (8-byte sequence number + 4-byte
// count) is treated as already exhausted.
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
    : input_(WriteBatchInternal::Contents(&batch)),
      done_(false) {
  if (input_.size() < 12) {
    done_ = true;
  } else {
    // Fixed: this statement previously ended with a comma, silently
    // chaining the next line via the comma operator.
    seq_ = WriteBatchInternal::Sequence(&batch);
    input_.remove_prefix(12);  // Skip the header; records follow.
    GetNextEntry();
  }
}
// Advance to the next record.  Each record consumes one sequence
// number, so seq_ increments per step.
void WriteBatchInternal::Iterator::Next() {
  assert(!done_);
  seq_++;
  GetNextEntry();
}
// Decode the record at the front of input_ into op_/key_/value_.
// Sets done_ when input_ is exhausted.  On a malformed record, sets
// status_ to Corruption, sets done_, and discards the remaining input
// so no further records are yielded.
void WriteBatchInternal::Iterator::GetNextEntry() {
  if (input_.empty()) {
    done_ = true;
    return;
  }
  char tag = input_[0];
  input_.remove_prefix(1);
  switch (tag) {
    case kTypeValue:
    case kTypeLargeValueRef:
      // Both record kinds carry a key and a value (for large values
      // the "value" is the encoded reference).
      if (GetLengthPrefixedSlice(&input_, &key_) &&
          GetLengthPrefixedSlice(&input_, &value_)) {
        op_ = static_cast<ValueType>(tag);
      } else {
        status_ = Status::Corruption("bad WriteBatch Put");
        done_ = true;
        input_.clear();
      }
      break;
    case kTypeDeletion:
      // Deletion records carry only a key.
      if (GetLengthPrefixedSlice(&input_, &key_)) {
        op_ = kTypeDeletion;
      } else {
        status_ = Status::Corruption("bad WriteBatch Delete");
        done_ = true;
        input_.clear();
      }
      break;
    default:
      status_ = Status::Corruption("unknown WriteBatch tag");
      done_ = true;
      input_.clear();
      break;
  }
}
} |
@ -0,0 +1,73 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ | |||
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ | |||
#include "include/write_batch.h" | |||
namespace leveldb { | |||
// WriteBatchInternal provides static methods for manipulating a | |||
// WriteBatch that we don't want in the public WriteBatch interface. | |||
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
 public:
  // Append a large-value-reference record to "batch" (the internal
  // counterpart of WriteBatch::Put for indirect values).
  static void PutLargeValueRef(WriteBatch* batch,
                               const Slice& key,
                               const LargeValueRef& large_ref);

  // Return the number of entries in the batch.
  static int Count(const WriteBatch* batch);

  // Set the count for the number of entries in the batch.
  static void SetCount(WriteBatch* batch, int n);

  // Return the sequence number for the start of this batch.
  static SequenceNumber Sequence(const WriteBatch* batch);

  // Store the specified number as the sequence number for the start of
  // this batch.
  static void SetSequence(WriteBatch* batch, SequenceNumber seq);

  // Raw serialized bytes of the batch (header + records).
  static Slice Contents(const WriteBatch* batch) {
    return Slice(batch->rep_);
  }

  static size_t ByteSize(const WriteBatch* batch) {
    return batch->rep_.size();
  }

  // Overwrite the batch with pre-serialized bytes.
  static void SetContents(WriteBatch* batch, const Slice& contents);

  // Replay all records in "batch" into "memtable".
  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);

  // Iterate over the contents of a write batch.  Records are yielded
  // in insertion order; sequence_number() increments per record.
  class Iterator {
   public:
    explicit Iterator(const WriteBatch& batch);
    bool Done() const { return done_; }
    void Next();
    ValueType op() const { return op_; }
    const Slice& key() const { return key_; }
    const Slice& value() const { return value_; }
    SequenceNumber sequence_number() const { return seq_; }
    Status status() const { return status_; }

   private:
    // Decode the record at the front of input_; see write_batch.cc.
    void GetNextEntry();

    Slice input_;       // Unconsumed remainder of the batch.
    bool done_;
    ValueType op_;
    Slice key_;
    Slice value_;
    SequenceNumber seq_;
    Status status_;     // First corruption encountered, if any.
  };
};
} | |||
#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ |
@ -0,0 +1,110 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/db.h" | |||
#include "db/memtable.h" | |||
#include "db/write_batch_internal.h" | |||
#include "include/env.h" | |||
#include "util/logging.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
// Replay *b into a fresh MemTable and render the resulting entries as
// one concatenated string such as "Put(foo, bar)@100", where @N is
// the entry's sequence number.  Appends "ParseError()" if the replay
// reported a failure.  Entries are emitted in the memtable's
// internal-key iteration order, not insertion order.
static std::string PrintContents(WriteBatch* b) {
  InternalKeyComparator cmp(BytewiseComparator());
  MemTable mem(cmp);
  std::string state;
  Status s = WriteBatchInternal::InsertInto(b, &mem);
  Iterator* iter = mem.NewIterator();
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
    switch (ikey.type) {
      case kTypeValue:
        state.append("Put(");
        state.append(ikey.user_key.ToString());
        state.append(", ");
        state.append(iter->value().ToString());
        state.append(")");
        break;
      case kTypeLargeValueRef:
        // The stored "value" is the encoded large-value reference.
        state.append("PutRef(");
        state.append(ikey.user_key.ToString());
        state.append(", ");
        state.append(iter->value().ToString());
        state.append(")");
        break;
      case kTypeDeletion:
        state.append("Delete(");
        state.append(ikey.user_key.ToString());
        state.append(")");
        break;
    }
    state.append("@");
    state.append(NumberToString(ikey.sequence));
  }
  delete iter;
  if (!s.ok()) {
    state.append("ParseError()");
  }
  return state;
}
// Empty fixture class; the TEST() macros below only need the name.
class WriteBatchTest { };

// An empty batch replays to nothing and has a zero count.
TEST(WriteBatchTest, Empty) {
  WriteBatch batch;
  ASSERT_EQ("", PrintContents(&batch));
  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
}

// Mixed Put/Delete records: count, base sequence, and per-record
// sequence numbers (output is in memtable key order, not insert order).
TEST(WriteBatchTest, Multiple) {
  WriteBatch batch;
  batch.Put(Slice("foo"), Slice("bar"));
  batch.Delete(Slice("box"));
  batch.Put(Slice("baz"), Slice("boo"));
  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
  ASSERT_EQ("Put(baz, boo)@102"
            "Delete(box)@101"
            "Put(foo, bar)@100",
            PrintContents(&batch));
}

// Large-value references round-trip through the batch: the replayed
// "value" is the raw reference bytes written below.
TEST(WriteBatchTest, PutIndirect) {
  WriteBatch batch;
  batch.Put(Slice("baz"), Slice("boo"));
  LargeValueRef h;
  for (int i = 0; i < LargeValueRef::ByteSize(); i++) {
    h.data[i] = (i < 20) ? 'a' : 'b';
  }
  WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h);
  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(2, WriteBatchInternal::Count(&batch));
  ASSERT_EQ("Put(baz, boo)@100"
            "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
            PrintContents(&batch));
}

// Truncating the serialized batch yields the parseable prefix plus a
// ParseError marker rather than a crash.
TEST(WriteBatchTest, Corruption) {
  WriteBatch batch;
  batch.Put(Slice("foo"), Slice("bar"));
  batch.Delete(Slice("box"));
  WriteBatchInternal::SetSequence(&batch, 200);
  Slice contents = WriteBatchInternal::Contents(&batch);
  WriteBatchInternal::SetContents(&batch,
                                  Slice(contents.data(),contents.size()-1));
  ASSERT_EQ("Put(foo, bar)@200"
            "ParseError()",
            PrintContents(&batch));
}
} | |||
// Test-binary entry point: run every test registered via TEST().
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,89 @@ | |||
body {
  margin-left: 0.5in;
  margin-right: 0.5in;
  background: white;
  color: black;
}

h1 {
  margin-left: -0.2in;
  font-size: 14pt;
}
h2 {
  margin-left: 0;
  font-size: 12pt;
}
h3 {
  margin-left: 0;
}
h4 {
  margin-left: 0;
}
hr {
  margin-left: 0;
}

/* Definition lists: definition term bold */
dt {
  font-weight: bold;
}

address {
  text-align: center;
}

code,samp,var {
  color: blue;
}
kbd {
  color: #600000;
}

/* Side notes float to the right of the main text. */
div.note p {
  float: right;
  width: 3in;
  margin-right: 0%;
  padding: 1px;
  border: 2px solid #6060a0;
  background-color: #fffff0;
}

ul {
  margin-top: 0;
  margin-bottom: 0;
}
ol {
  margin-top: 0;
  margin-bottom: 0;
}
UL.nobullets {
  list-style-type: none;
  list-style-image: none;
  margin-left: -1em;
}

p {
  margin: 1em 0 1em 0;
  padding: 0 0 0 0;
}

pre {
  line-height: 1.3em;
  padding: 0.4em 0 0.8em 0;
  margin: 0 0 0 0;
  /* "border: 0 0 0 0" is invalid (the border shorthand does not take four
     widths); "none" expresses the intended borderless rendering. */
  border: none;
  color: blue;
}

/* Centered data tables used for measurement results. */
.datatable {
  margin-left: auto;
  margin-right: auto;
  margin-top: 2em;
  margin-bottom: 2em;
  border: 1px solid;
}
/* Scope both cell selectors to .datatable; the original ".datatable td,th"
   accidentally styled every th on the page. */
.datatable td, .datatable th {
  padding: 0 0.5em 0 0.5em;
  text-align: right;
}
@ -0,0 +1,222 @@ | |||
<!DOCTYPE html> | |||
<html> | |||
<head> | |||
<link rel="stylesheet" type="text/css" href="doc.css" /> | |||
<title>Leveldb file layout and compactions</title> | |||
</head> | |||
<body> | |||
<h1>Files</h1> | |||
The implementation of leveldb is similar in spirit to the | |||
representation of a single | |||
<a href="http://labs.google.com/papers/bigtable.html"> | |||
Bigtable tablet (section 5.3)</a>. | |||
However the organization of the files that make up the representation | |||
is somewhat different and is explained below. | |||
<p> | |||
Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below: | |||
<p> | |||
<h2>Log files</h2> | |||
<p> | |||
A log file (*.log) stores a sequence of recent updates. Each update | |||
is appended to the current log file. When the log file reaches a | |||
pre-determined size (approximately 1MB by default), it is converted | |||
to a sorted table (see below) and a new log file is created for future | |||
updates. | |||
<p> | |||
A copy of the current log file is kept in an in-memory structure (the | |||
<code>memtable</code>). This copy is consulted on every read so that read | |||
operations reflect all logged updates. | |||
<p> | |||
<h2>Sorted tables</h2> | |||
<p> | |||
A sorted table (*.sst) stores a sequence of entries sorted by key. | |||
Each entry is either a value for the key, or a deletion marker for the | |||
key. (Deletion markers are kept around to hide obsolete values | |||
present in older sorted tables). | |||
<p> | |||
The set of sorted tables are organized into a sequence of levels. The | |||
sorted table generated from a log file is placed in a special <code>young</code> | |||
level (also called level-0). When the number of young files exceeds a | |||
certain threshold (currently four), all of the young files are merged | |||
together with all of the overlapping level-1 files to produce a | |||
sequence of new level-1 files (we create a new level-1 file for every | |||
2MB of data.) | |||
<p> | |||
Files in the young level may contain overlapping keys. However files | |||
in other levels have distinct non-overlapping key ranges. Consider | |||
level number L where L >= 1. When the combined size of files in | |||
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, | |||
...), one file in level-L, and all of the overlapping files in | |||
level-(L+1) are merged to form a set of new files for level-(L+1). | |||
These merges have the effect of gradually migrating new updates from | |||
the young level to the largest level using only bulk reads and writes | |||
(i.e., minimizing expensive seeks). | |||
<h2>Large value files</h2> | |||
<p> | |||
Each large value (greater than 64KB by default) is placed in a large | |||
value file (*.val) of its own. An entry is maintained in the log | |||
and/or sorted tables that maps from the corresponding key to the | |||
name of this large value file. The name of the large value file | |||
is derived from a SHA1 hash of the value and its length so that | |||
identical values share the same file. | |||
<p> | |||
<h2>Manifest</h2> | |||
<p> | |||
A MANIFEST file lists the set of sorted tables that make up each | |||
level, the corresponding key ranges, and other important metadata. | |||
A new MANIFEST file (with a new number embedded in the file name) | |||
is created whenever the database is reopened. The MANIFEST file is | |||
formatted as a log, and changes made to the serving state (as files | |||
are added or removed) are appended to this log. | |||
<p> | |||
<h2>Current</h2> | |||
<p> | |||
CURRENT is a simple text file that contains the name of the latest | |||
MANIFEST file. | |||
<p> | |||
<h2>Info logs</h2> | |||
<p> | |||
Informational messages are printed to files named LOG and LOG.old. | |||
<p> | |||
<h2>Others</h2> | |||
<p> | |||
Other files used for miscellaneous purposes may also be present | |||
(LOCK, *.dbtmp). | |||
<h1>Level 0</h1> | |||
When the log file grows above a certain size (1MB by default): | |||
<ul> | |||
<li>Write the contents of the current memtable to an sstable | |||
<li>Replace the current memtable by a brand new empty memtable | |||
<li>Switch to a new log file | |||
<li>Delete the old log file and the old memtable | |||
</ul> | |||
Experimental measurements show that generating an sstable from a 1MB | |||
log file takes ~12ms, which seems like an acceptable latency hiccup to | |||
add infrequently to a log write. | |||
<p> | |||
The new sstable is added to a special level-0 level. level-0 contains | |||
a set of files (up to 4 by default). However unlike other levels, | |||
these files do not cover disjoint ranges, but may overlap each other. | |||
<h1>Compactions</h1> | |||
<p> | |||
When the size of level L exceeds its limit, we compact it in a | |||
background thread. The compaction picks a file from level L and all | |||
overlapping files from the next level L+1. Note that if a level-L | |||
file overlaps only part of a level-(L+1) file, the entire file at | |||
level-(L+1) is used as an input to the compaction and will be | |||
discarded after the compaction. Aside: because level-0 is special | |||
(files in it may overlap each other), we treat compactions from | |||
level-0 to level-1 specially: a level-0 compaction may pick more than | |||
one level-0 file in case some of these files overlap each other. | |||
<p> | |||
A compaction merges the contents of the picked files to produce a | |||
sequence of level-(L+1) files. We switch to producing a new | |||
level-(L+1) file after the current output file has reached the target | |||
file size (2MB). The old files are discarded and the new files are | |||
added to the serving state. | |||
<p> | |||
Compactions for a particular level rotate through the key space. In | |||
more detail, for each level L, we remember the ending key of the last | |||
compaction at level L. The next compaction for level L will pick the | |||
first file that starts after this key (wrapping around to the | |||
beginning of the key space if there is no such file). | |||
<p> | |||
Compactions drop overwritten values. They also drop deletion markers | |||
if there are no higher numbered levels that contain a file whose range | |||
overlaps the current key. | |||
<h2>Timing</h2> | |||
Level-0 compactions will read up to four 1MB files from level-0, and | |||
at worst all the level-1 files (10MB). I.e., we will read 14MB and | |||
write 14MB. | |||
<p> | |||
Other than the special level-0 compactions, we will pick one 2MB file | |||
from level L. In the worst case, this will overlap ~ 12 files from | |||
level L+1 (10 because level-(L+1) is ten times the size of level-L, | |||
and another two at the boundaries since the file ranges at level-L | |||
will usually not be aligned with the file ranges at level-L+1). The | |||
compaction will therefore read 26MB and write 26MB. Assuming a disk | |||
IO rate of 100MB/s (ballpark range for modern drives), the worst | |||
compaction cost will be approximately 0.5 second. | |||
<p> | |||
If we throttle the background writing to something small, say 10% of | |||
the full 100MB/s speed, a compaction may take up to 5 seconds. If the | |||
user is writing at 10MB/s, we might build up lots of level-0 files | |||
(~50 to hold the 5*10MB).  This may significantly increase the cost of
reads due to the overhead of merging more files together on every | |||
read. | |||
<p> | |||
Solution 1: To reduce this problem, we might want to increase the log | |||
switching threshold when the number of level-0 files is large. Though | |||
the downside is that the larger this threshold, the larger the delay | |||
that we will add to write latency when a write triggers a log switch. | |||
<p> | |||
Solution 2: We might want to decrease write rate artificially when the | |||
number of level-0 files goes up. | |||
<p> | |||
Solution 3: We work on reducing the cost of very wide merges. | |||
Perhaps most of the level-0 files will have their blocks sitting | |||
uncompressed in the cache and we will only need to worry about the | |||
O(N) complexity in the merging iterator. | |||
<h2>Number of files</h2> | |||
Instead of always making 2MB files, we could make larger files for | |||
larger levels to reduce the total file count, though at the expense of | |||
more bursty compactions. Alternatively, we could shard the set of | |||
files into multiple directories. | |||
<p> | |||
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows | |||
the following timings to do 100K file opens in directories with | |||
varying number of files: | |||
<table class="datatable"> | |||
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr> | |||
<tr><td>1000</td><td>9</td> | |||
<tr><td>10000</td><td>10</td> | |||
<tr><td>100000</td><td>16</td> | |||
</table> | |||
So maybe even the sharding is not necessary on modern filesystems? | |||
<h1>Recovery</h1> | |||
<ul> | |||
<li> Read CURRENT to find name of the latest committed MANIFEST | |||
<li> Read the named MANIFEST file | |||
<li> Clean up stale files | |||
<li> We could open all sstables here, but it is probably better to be lazy... | |||
<li> Convert log chunk to a new level-0 sstable | |||
<li> Start directing new writes to a new log file with recovered sequence# | |||
</ul> | |||
<h1>Garbage collection of files</h1> | |||
<code>DeleteObsoleteFiles()</code> is called at the end of every | |||
compaction and at the end of recovery. It finds the names of all | |||
files in the database. It deletes all log files that are not the | |||
current log file. It deletes all table files that are not referenced | |||
from some level and are not the output of an active compaction. It | |||
deletes all large value files that are not referenced from any live | |||
table or log file. | |||
</body> | |||
</html> |
@ -0,0 +1,508 @@ | |||
<!DOCTYPE html> | |||
<html> | |||
<head> | |||
<link rel="stylesheet" type="text/css" href="doc.css" /> | |||
<title>Leveldb</title> | |||
</head> | |||
<body> | |||
<h1>Leveldb</h1> | |||
<address>Jeff Dean, Sanjay Ghemawat</address> | |||
<p> | |||
The <code>leveldb</code> library provides a persistent key value store. Keys and | |||
values are arbitrary byte arrays. The keys are ordered within the key | |||
value store according to a user-specified comparator function. | |||
<p> | |||
<h1>Opening A Database</h1> | |||
<p> | |||
A <code>leveldb</code> database has a name which corresponds to a file system | |||
directory. All of the contents of database are stored in this | |||
directory. The following example shows how to open a database, | |||
creating it if necessary: | |||
<p> | |||
<pre> | |||
  #include <cassert>
#include "leveldb/include/db.h" | |||
leveldb::DB* db; | |||
leveldb::Options options; | |||
options.create_if_missing = true; | |||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); | |||
assert(status.ok()); | |||
... | |||
</pre> | |||
If you want to raise an error if the database already exists, add | |||
the following line before the <code>leveldb::DB::Open</code> call: | |||
<pre> | |||
options.error_if_exists = true; | |||
</pre> | |||
<h1>Status</h1> | |||
<p> | |||
You may have noticed the <code>leveldb::Status</code> type above. Values of this | |||
type are returned by most functions in <code>leveldb</code> that may encounter an | |||
error. You can check if such a result is ok, and also print an | |||
associated error message: | |||
<p> | |||
<pre> | |||
leveldb::Status s = ...; | |||
if (!s.ok()) cerr << s.ToString() << endl; | |||
</pre> | |||
<h1>Closing A Database</h1> | |||
<p> | |||
When you are done with a database, just delete the database object. | |||
Example: | |||
<p> | |||
<pre> | |||
... open the db as described above ... | |||
... do something with db ... | |||
delete db; | |||
</pre> | |||
<h1>Reads And Writes</h1> | |||
<p> | |||
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to | |||
modify/query the database. For example, the following code | |||
moves the value stored under key1 to key2. | |||
<p> | |||
<pre> | |||
std::string value; | |||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); | |||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value); | |||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1); | |||
</pre> | |||
See <a href="#async">important performance note</a> below for how to | |||
speed up writes significantly. | |||
<h1>Atomic Updates</h1> | |||
<p> | |||
Note that if the process dies after the Put of key2 but before the | |||
delete of key1, the same value may be left stored under multiple keys. | |||
Such problems can be avoided by using the <code>WriteBatch</code> class to | |||
atomically apply a set of updates: | |||
<p> | |||
<pre> | |||
#include "leveldb/include/write_batch.h" | |||
... | |||
std::string value; | |||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); | |||
if (s.ok()) { | |||
leveldb::WriteBatch batch; | |||
batch.Delete(key1); | |||
batch.Put(key2, value); | |||
s = db->Write(leveldb::WriteOptions(), &batch); | |||
} | |||
</pre> | |||
The <code>WriteBatch</code> holds a sequence of edits to be made to the database, | |||
and these edits within the batch are applied in order. Note that we | |||
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>, | |||
we do not end up erroneously dropping the value entirely. | |||
<p> | |||
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to | |||
speed up bulk updates by placing lots of individual mutations into the | |||
same batch. | |||
<p> | |||
<h1>Concurrency</h1> | |||
<p> | |||
A database may only be opened by one process at a time. The <code>leveldb</code> | |||
implementation acquires a lock from the operating system to prevent | |||
misuse. Within a single process, the same <code>leveldb::DB</code> object may | |||
be safely used by multiple concurrent threads. | |||
<p> | |||
<h1>Iteration</h1> | |||
<p> | |||
The following example demonstrates how to print all key,value pairs | |||
in a database. | |||
<p> | |||
<pre> | |||
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); | |||
for (it->SeekToFirst(); it->Valid(); it->Next()) { | |||
cout << it->key().ToString() << ": " << it->value().ToString() << endl; | |||
} | |||
assert(it->status().ok()); // Check for any errors found during the scan | |||
delete it; | |||
</pre> | |||
The following variation shows how to process just the keys in the | |||
range <code>[start,limit)</code>: | |||
<p> | |||
<pre> | |||
for (it->Seek(start); | |||
it->Valid() && it->key().ToString() < limit; | |||
it->Next()) { | |||
... | |||
} | |||
</pre> | |||
You can also process entries in reverse order. (Caveat: reverse | |||
iteration is currently a factor of two or three slower than forward | |||
iteration.) | |||
<p> | |||
<pre> | |||
for (it->SeekToLast(); it->Valid(); it->Prev()) { | |||
... | |||
} | |||
</pre> | |||
<h1>Snapshots</h1> | |||
<p> | |||
Snapshots provide consistent read-only views over the entire state of | |||
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate | |||
that a read should operate on a particular version of the DB state. | |||
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an | |||
implicit snapshot of the current state. | |||
<p> | |||
Snapshots typically are created by the DB::GetSnapshot() method: | |||
<p> | |||
<pre> | |||
leveldb::ReadOptions options; | |||
options.snapshot = db->GetSnapshot(); | |||
... apply some updates to db ... | |||
leveldb::Iterator* iter = db->NewIterator(options); | |||
... read using iter to view the state when the snapshot was created ... | |||
delete iter; | |||
db->ReleaseSnapshot(options.snapshot); | |||
</pre> | |||
Note that when a snapshot is no longer needed, it should be released | |||
using the DB::ReleaseSnapshot interface. This allows the | |||
implementation to get rid of state that was being maintained just to | |||
support reading as of that snapshot. | |||
<p> | |||
A Write operation can also return a snapshot that | |||
represents the state of the database just after applying a particular | |||
set of updates: | |||
<p> | |||
<pre> | |||
leveldb::Snapshot* snapshot; | |||
leveldb::WriteOptions write_options; | |||
write_options.post_write_snapshot = &snapshot; | |||
leveldb::Status status = db->Write(write_options, ...); | |||
... perform other mutations to db ... | |||
leveldb::ReadOptions read_options; | |||
read_options.snapshot = snapshot; | |||
leveldb::Iterator* iter = db->NewIterator(read_options); | |||
... read as of the state just after the Write call returned ... | |||
delete iter; | |||
db->ReleaseSnapshot(snapshot); | |||
</pre> | |||
<h1>Slice</h1> | |||
<p> | |||
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above | |||
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple | |||
structure that contains a length and a pointer to an external byte | |||
array. Returning a <code>Slice</code> is a cheaper alternative to returning a | |||
<code>std::string</code> since we do not need to copy potentially large keys and | |||
values. In addition, <code>leveldb</code> methods do not return null-terminated | |||
C-style strings since <code>leveldb</code> keys and values are allowed to | |||
contain '\0' bytes. | |||
<p> | |||
C++ strings and null-terminated C-style strings can be easily converted | |||
to a Slice: | |||
<p> | |||
<pre> | |||
leveldb::Slice s1 = "hello"; | |||
std::string str("world"); | |||
leveldb::Slice s2 = str; | |||
</pre> | |||
A Slice can be easily converted back to a C++ string: | |||
<pre> | |||
std::string str = s1.ToString(); | |||
assert(str == std::string("hello")); | |||
</pre> | |||
Be careful when using Slices since it is up to the caller to ensure that | |||
the external byte array into which the Slice points remains live while | |||
the Slice is in use. For example, the following is buggy: | |||
<p> | |||
<pre> | |||
leveldb::Slice slice; | |||
if (...) { | |||
std::string str = ...; | |||
slice = str; | |||
} | |||
Use(slice); | |||
</pre> | |||
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the | |||
backing storage for <code>slice</code> will disappear. | |||
<p> | |||
<h1>Comparators</h1> | |||
<p> | |||
The preceding examples used the default ordering function for keys,
which orders bytes lexicographically. You can however supply a custom | |||
comparator when opening a database. For example, suppose each | |||
database key consists of two numbers and we should sort by the first | |||
number, breaking ties by the second number. First, define a proper | |||
subclass of <code>leveldb::Comparator</code> that expresses these rules: | |||
<p> | |||
<pre> | |||
class TwoPartComparator : public leveldb::Comparator { | |||
public: | |||
// Three-way comparison function: | |||
// if a < b: negative result | |||
// if a > b: positive result | |||
// else: zero result | |||
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const { | |||
int a1, a2, b1, b2; | |||
ParseKey(a, &a1, &a2); | |||
ParseKey(b, &b1, &b2); | |||
if (a1 < b1) return -1; | |||
if (a1 > b1) return +1; | |||
if (a2 < b2) return -1; | |||
if (a2 > b2) return +1; | |||
return 0; | |||
} | |||
// Ignore the following methods for now: | |||
const char* Name() { return "TwoPartComparator"; } | |||
void FindShortestSeparator(std::string*, const leveldb::Slice&) const { } | |||
void FindShortSuccessor(std::string*) const { } | |||
}; | |||
</pre> | |||
Now create a database using this custom comparator: | |||
<p> | |||
<pre> | |||
TwoPartComparator cmp; | |||
leveldb::DB* db; | |||
leveldb::Options options; | |||
options.create_if_missing = true; | |||
options.comparator = &cmp; | |||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); | |||
... | |||
</pre> | |||
<h2>Backwards compatibility</h2> | |||
<p> | |||
The result of the comparator's <code>Name</code> method is attached to the | |||
database when it is created, and is checked on every subsequent | |||
database open. If the name changes, the <code>leveldb::DB::Open</code> call will | |||
fail. Therefore, change the name if and only if the new key format | |||
and comparison function are incompatible with existing databases, and | |||
it is ok to discard the contents of all existing databases. | |||
<p> | |||
You can however still gradually evolve your key format over time with | |||
a little bit of pre-planning. For example, you could store a version | |||
number at the end of each key (one byte should suffice for most uses). | |||
When you wish to switch to a new key format (e.g., adding an optional | |||
third part to the keys processed by <code>TwoPartComparator</code>), | |||
(a) keep the same comparator name (b) increment the version number | |||
for new keys (c) change the comparator function so it uses the | |||
version numbers found in the keys to decide how to interpret them. | |||
<p> | |||
<h1>Performance</h1> | |||
<p> | |||
Performance can be tuned by changing the default values of the | |||
types defined in <code>leveldb/include/options.h</code>. | |||
<p> | |||
<h2><a name="async">Asynchronous Writes</a></h2> | |||
By default, each write to <code>leveldb</code> is synchronous: it does | |||
not return until the write has been pushed from memory to persistent | |||
storage. (On Posix systems, this is implemented by calling either | |||
<code>fdatasync(...)</code> or <code>msync(..., MS_SYNC)</code>.) | |||
<strong>Synchronous writes may be very slow and the synchrony can be | |||
optionally disabled</strong>: | |||
<pre> | |||
leveldb::WriteOptions write_options; | |||
write_options.sync = false; | |||
db->Put(write_options, ...); | |||
</pre> | |||
Asynchronous writes are often more than a hundred times as fast as | |||
synchronous writes. The downside of asynchronous writes is that a | |||
crash of the machine may cause the last few updates to be lost. Note | |||
that a crash of just the writing process (i.e., not a reboot) will not | |||
cause any loss since even when <code>sync</code> is false, an update | |||
is pushed from the process memory into the operating system before it | |||
is considered done. | |||
<p> | |||
Asynchronous writes can be particularly beneficial when loading a | |||
large amount of data into the database since you can mitigate the | |||
problem of lost updates by restarting the bulk load. A hybrid scheme | |||
is also possible where every Nth write is synchronous, and in the | |||
event of a crash, the bulk load is restarted just after the last | |||
synchronous write finished by the previous run. | |||
<p> | |||
<code>WriteBatch</code> provides an alternative to asynchronous writes. | |||
Multiple updates may be placed in the same <code>WriteBatch</code> and | |||
applied together using a synchronous write. The extra cost of the | |||
synchronous write will be amortized across all of the writes in the batch. | |||
<p> | |||
<h2>Block size</h2> | |||
<p> | |||
<code>leveldb</code> groups adjacent keys together into the same block and such a | |||
block is the unit of transfer to and from persistent storage. The | |||
default block size is approximately 8192 uncompressed bytes. | |||
Applications that mostly do bulk scans over the contents of the | |||
database may wish to increase this size. Applications that do a lot | |||
of point reads of small values may wish to switch to a smaller block | |||
size if performance measurements indicate an improvement. There isn't | |||
much benefit in using blocks smaller than one kilobyte, or larger than | |||
a few megabytes. Also note that compression will be more effective | |||
with larger block sizes. | |||
<p> | |||
<h2>Compression</h2> | |||
<p> | |||
Each block is individually compressed before being written to | |||
persistent storage. Compression is on by default since the default | |||
compression method is very fast, and is automatically disabled for | |||
uncompressible data. In rare cases, applications may want to disable | |||
compression entirely, but should only do so if benchmarks show a | |||
performance improvement: | |||
<p> | |||
<pre> | |||
leveldb::Options options; | |||
options.compression = leveldb::kNoCompression; | |||
... leveldb::DB::Open(options, name, ...) .... | |||
</pre> | |||
<h2>Cache</h2> | |||
<p> | |||
The contents of the database are stored in a set of files in the | |||
filesystem and each file stores a sequence of compressed blocks. If | |||
<code>options.cache</code> is non-NULL, it is used to cache frequently used | |||
uncompressed block contents. | |||
<p> | |||
<pre> | |||
#include "leveldb/include/cache.h" | |||
leveldb::Options options; | |||
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache | |||
leveldb::DB* db; | |||
leveldb::DB::Open(options, name, &db); | |||
... use the db ... | |||
  delete db;
delete options.cache; | |||
</pre> | |||
Note that the cache holds uncompressed data, and therefore it should | |||
be sized according to application level data sizes, without any | |||
reduction from compression. (Caching of compressed blocks is left to | |||
the operating system buffer cache, or any custom <code>Env</code> | |||
implementation provided by the client.) | |||
<p> | |||
When performing a bulk read, the application may wish to disable | |||
caching so that the data processed by the bulk read does not end up | |||
displacing most of the cached contents. A per-iterator option can be | |||
used to achieve this: | |||
<p> | |||
<pre> | |||
leveldb::ReadOptions options; | |||
options.fill_cache = false; | |||
leveldb::Iterator* it = db->NewIterator(options); | |||
for (it->SeekToFirst(); it->Valid(); it->Next()) { | |||
... | |||
} | |||
</pre> | |||
<h2>Key Layout</h2> | |||
<p> | |||
Note that the unit of disk transfer and caching is a block. Adjacent | |||
keys (according to the database sort order) will usually be placed in | |||
the same block. Therefore the application can improve its performance | |||
by placing keys that are accessed together near each other and placing | |||
infrequently used keys in a separate region of the key space. | |||
<p> | |||
For example, suppose we are implementing a simple file system on top | |||
of <code>leveldb</code>. The types of entries we might wish to store are: | |||
<p> | |||
<pre> | |||
filename -> permission-bits, length, list of file_block_ids | |||
file_block_id -> data | |||
</pre> | |||
We might want to prefix <code>filename</code> keys with one letter (say '/') and the | |||
<code>file_block_id</code> keys with a different letter (say '0') so that scans | |||
over just the metadata do not force us to fetch and cache bulky file | |||
contents. | |||
<p> | |||
<h2>Large Values</h2> | |||
<p> | |||
<code>leveldb</code> has special treatment of large values (by default, a value | |||
of length greater than or equal to 64K is considered large, though a | |||
field in Options can be used to adjust this threshold). Each such | |||
large value is placed in a separate operating system file, and the | |||
normal database blocks just contain pointers to such files. | |||
<p> | |||
Furthermore, if the same large value occurs multiple times in a single | |||
database, it will be stored just once. | |||
<p> | |||
<h1>Checksums</h1> | |||
<p> | |||
<code>leveldb</code> associates checksums with all data it stores in the file system. | |||
There are two separate controls provided over how aggressively these | |||
checksums are verified: | |||
<p> | |||
<ul> | |||
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force | |||
checksum verification of all data that is read from the file system on | |||
behalf of a particular read. By default, no such verification is | |||
done. | |||
<p> | |||
<li> <code>Options::paranoid_checks</code> may be set to true before opening a | |||
database to make the database implementation raise an error as soon as | |||
it detects an internal corruption. Depending on which portion of the | |||
database has been corrupted, the error may be raised when the database | |||
is opened, or later by another database operation. By default, | |||
paranoid checking is off so that the database can be used even if | |||
parts of its persistent storage have been corrupted. | |||
<p> | |||
If a database is corrupted (perhaps it cannot be opened when | |||
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function | |||
may be used to recover as much of the data as possible.
<p> | |||
</ul> | |||
<h1>Approximate Sizes</h1> | |||
<p> | |||
The <code>GetApproximateSizes</code> method can be used to get the approximate
number of bytes of file system space used by one or more key ranges. | |||
<p> | |||
<pre> | |||
leveldb::Range ranges[2]; | |||
ranges[0] = leveldb::Range("a", "c"); | |||
ranges[1] = leveldb::Range("x", "z"); | |||
uint64_t sizes[2]; | |||
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes); | |||
</pre> | |||
The preceding call will set <code>sizes[0]</code> to the approximate number of | |||
bytes of file system space used by the key range <code>[a..c)</code> and | |||
<code>sizes[1]</code> to the approximate number of bytes used by the key range | |||
<code>[x..z)</code>. | |||
<p> | |||
<h1>Environment</h1> | |||
<p> | |||
All file operations (and other operating system calls) issued by the | |||
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object. | |||
Sophisticated clients may wish to provide their own <code>Env</code> | |||
implementation to get better control. For example, an application may | |||
introduce artificial delays in the file IO paths to limit the impact | |||
of <code>leveldb</code> on other activities in the system. | |||
<p> | |||
<pre> | |||
class SlowEnv : public leveldb::Env { | |||
.. implementation of the Env interface ... | |||
}; | |||
SlowEnv env; | |||
leveldb::Options options; | |||
options.env = &env; | |||
Status s = leveldb::DB::Open(options, ...); | |||
</pre> | |||
<h1>Porting</h1> | |||
<p> | |||
<code>leveldb</code> may be ported to a new platform by providing platform | |||
specific implementations of the types/methods/functions exported by | |||
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more | |||
details. | |||
<p> | |||
In addition, the new platform may need a new default <code>leveldb::Env</code> | |||
implementation. See <code>leveldb/util/env_posix.h</code> for an example. | |||
<h1>Other Information</h1> | |||
<p> | |||
Details about the <code>leveldb</code> implementation may be found in | |||
the following documents: | |||
<ul> | |||
<li> <a href="impl.html">Implementation notes</a> | |||
<li> <a href="table_format.txt">Format of an immutable Table file</a> | |||
<li> <a href="log_format.txt">Format of a log file</a> | |||
</ul> | |||
</body> | |||
</html> |
@ -0,0 +1,72 @@ | |||
The log file contents are a sequence of 32KB blocks. The only | |||
exception is that the tail of the file may contain a partial block. | |||
Each block consists of a sequence of records: | |||
block := record* trailer? | |||
record := | |||
checksum: uint32 // crc32c of type and data[] | |||
length: uint16 | |||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST | |||
data: uint8[length] | |||
A record never starts within the last seven bytes of a block. Any | |||
leftover bytes here form the trailer, which must consist entirely of | |||
zero bytes and must be skipped by readers. In particular, even if | |||
there are exactly seven bytes left in the block, and a zero-length | |||
user record is added (which will fit in these seven bytes), the writer | |||
must skip these trailer bytes and add the record to the next block. | |||
More types may be added in the future. Some Readers may skip record | |||
types they do not understand, others may report that some data was | |||
skipped. | |||
FULL == 1 | |||
FIRST == 2 | |||
MIDDLE == 3 | |||
LAST == 4 | |||
The FULL record contains the contents of an entire user record. | |||
FIRST, MIDDLE, LAST are types used for user records that have been | |||
split into multiple fragments (typically because of block boundaries). | |||
FIRST is the type of the first fragment of a user record, LAST is the | |||
type of the last fragment of a user record, and MIDDLE is the type of all
interior fragments of a user record. | |||
Example: consider a sequence of user records: | |||
A: length 1000 | |||
B: length 97270 | |||
C: length 8000 | |||
A will be stored as a FULL record in the first block. | |||
B will be split into three fragments: first fragment occupies the rest | |||
of the first block, second fragment occupies the entirety of the | |||
second block, and the third fragment occupies a prefix of the third | |||
block. This will leave six bytes free in the third block, which will | |||
be left empty as the trailer. | |||
C will be stored as a FULL record in the fourth block. | |||
=================== | |||
Some benefits over the recordio format: | |||
(1) We do not need any heuristics for resyncing - just go to next | |||
block boundary and scan. If there is a corruption, skip to the next | |||
block. As a side-benefit, we do not get confused when part of the | |||
contents of one log file are embedded as a record inside another log | |||
file. | |||
(2) Splitting at approximate boundaries (e.g., for mapreduce) is | |||
simple: find the next block boundary and skip records until we | |||
hit a FULL or FIRST record. | |||
(3) We do not need extra buffering for large records. | |||
Some downsides compared to recordio format: | |||
(1) No packing of tiny records. This could be fixed by adding a new | |||
record type, so it is a shortcoming of the current implementation, | |||
not necessarily the format. | |||
(2) No compression. Again, this could be fixed by adding new record types. |
@ -0,0 +1,61 @@ | |||
File format | |||
=========== | |||
<beginning_of_file> | |||
[data block 1] | |||
[data block 2] | |||
... | |||
[data block N] | |||
[meta block 1] | |||
... | |||
[meta block K] | |||
[metaindex block] | |||
[index block] | |||
[Footer] (fixed size; starts at file_size - sizeof(Footer)) | |||
<end_of_file> | |||
The file contains internal pointers. Each such pointer is called | |||
a BlockHandle and contains the following information: | |||
offset: varint64 | |||
size: varint64 | |||
(1) The sequence of key/value pairs in the file are stored in sorted | |||
order and partitioned into a sequence of data blocks. These blocks | |||
come one after another at the beginning of the file. Each data block | |||
is formatted according to the code in block_builder.cc, and then | |||
optionally compressed. | |||
(2) After the data blocks we store a bunch of meta blocks. The | |||
supported meta block types are described below. More meta block types | |||
may be added in the future. Each meta block is again formatted using | |||
block_builder.cc and then optionally compressed. | |||
(3) A "metaindex" block. It contains one entry for every other meta | |||
block where the key is the name of the meta block and the value is a | |||
BlockHandle pointing to that meta block. | |||
(4) An "index" block. This block contains one entry per data block, | |||
where the key is a string >= last key in that data block and before | |||
the first key in the successive data block. The value is the | |||
BlockHandle for the data block. | |||
(5) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number. | |||
metaindex_handle: char[p]; // Block handle for metaindex | |||
index_handle: char[q]; // Block handle for index | |||
padding: char[40-p-q]; // 0 bytes to make fixed length | |||
// (40==2*BlockHandle::kMaxEncodedLength) | |||
magic: fixed64; // == 0xdb4775248b80fb57 | |||
"stats" Meta Block | |||
------------------ | |||
This meta block contains a bunch of stats. The key is the name | |||
of the statistic. The value contains the statistic. | |||
TODO(postrelease): record following stats. | |||
data size | |||
index size | |||
key size (uncompressed) | |||
value size (uncompressed) | |||
number of entries | |||
number of data blocks |
@ -0,0 +1,99 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// A Cache is an interface that maps keys to values. It has internal | |||
// synchronization and may be safely accessed concurrently from | |||
// multiple threads. It may automatically evict entries to make room | |||
// for new entries. Values have a specified charge against the cache | |||
// capacity. For example, a cache where the values are variable | |||
// length strings, may use the length of the string as the charge for | |||
// the string. | |||
// | |||
// A builtin cache implementation with a least-recently-used eviction | |||
// policy is provided. Clients may use their own implementations if | |||
// they want something more sophisticated (like scan-resistance, a | |||
// custom eviction policy, variable cache sizing, etc.) | |||
#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ | |||
#include <stdint.h> | |||
#include "include/slice.h" | |||
namespace leveldb { | |||
class Cache; | |||
// Create a new cache with a fixed size capacity.  This implementation
// of Cache uses a least-recently-used eviction policy.
// NOTE(review): the caller presumably owns the returned cache and must
// delete it when finished -- confirm against the implementation.
extern Cache* NewLRUCache(size_t capacity);
// Abstract cache interface mapping keys to opaque values.  Concrete
// implementations (e.g. the LRU cache returned by NewLRUCache) define
// the eviction policy.
class Cache {
 public:
  Cache() { }

  // Destroys all existing entries by calling the "deleter"
  // function that was passed to the constructor.
  virtual ~Cache();

  // Opaque handle to an entry stored in the cache.
  struct Handle { };

  // Insert a mapping from key->value into the cache and assign it
  // the specified charge against the total cache capacity.
  //
  // Returns a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  //
  // When the inserted entry is no longer needed, the key and
  // value will be passed to "deleter".
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value)) = 0;

  // If the cache has no mapping for "key", returns NULL.
  //
  // Else return a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  virtual Handle* Lookup(const Slice& key) = 0;

  // Release a mapping returned by a previous Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void Release(Handle* handle) = 0;

  // Return the value encapsulated in a handle returned by a
  // successful Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void* Value(Handle* handle) = 0;

  // If the cache contains entry for key, erase it.  Note that the
  // underlying entry will be kept around until all existing handles
  // to it have been released.
  virtual void Erase(const Slice& key) = 0;

  // Return a new numeric id.  May be used by multiple clients who are
  // sharing the same cache to partition the key space.  Typically the
  // client will allocate a new id at startup and prepend the id to
  // its cache keys.
  virtual uint64_t NewId() = 0;

 private:
  // NOTE(review): the declarations below look like leftovers from a
  // concrete LRU implementation -- they are never defined in this header
  // and an abstract interface has no use for them.  Confirm whether they
  // can be removed.
  void LRU_Remove(Handle* e);
  void LRU_Append(Handle* e);
  void Unref(Handle* e);

  struct Rep;
  Rep* rep_;

  // No copying allowed
  Cache(const Cache&);
  void operator=(const Cache&);
};
} | |||
#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
@ -0,0 +1,61 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ | |||
#include <string> | |||
namespace leveldb { | |||
class Slice; | |||
// A Comparator object provides a total order across slices that are | |||
// used as keys in an sstable or a database. | |||
class Comparator {
 public:
  virtual ~Comparator();

  // Three-way comparison.  Returns value:
  //   < 0  iff "a" < "b",
  //   == 0 iff "a" == "b",
  //   > 0  iff "a" > "b"
  virtual int Compare(const Slice& a, const Slice& b) const = 0;

  // The name of the comparator.  Used to check for comparator
  // mismatches (i.e., a DB created with one comparator is
  // accessed using a different comparator).
  //
  // The client of this package should switch to a new name whenever
  // the comparator implementation changes in a way that will cause
  // the relative ordering of any two keys to change.
  //
  // Names starting with "leveldb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Advanced functions: these are used to reduce the space requirements
  // for internal data structures like index blocks.

  // If *start < limit, changes *start to a short string in [start,limit).
  // Simple comparator implementations may return with *start unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const = 0;

  // Changes *key to a short string >= *key.
  // Simple comparator implementations may return with *key unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering (i.e., keys compare as strings of unsigned bytes).  The
// result remains the property of this module and must not be deleted.
extern const Comparator* BytewiseComparator();
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ |
@ -0,0 +1,137 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_DB_H_ | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include "include/iterator.h" | |||
#include "include/options.h" | |||
namespace leveldb { | |||
struct Options; | |||
struct ReadOptions; | |||
struct WriteOptions; | |||
class Snapshot; | |||
class WriteBatch; | |||
// Some internal types. Clients should ignore. | |||
class WriteBatchInternal; | |||
struct Range { | |||
Slice start; | |||
Slice limit; | |||
Range(const Slice& s, const Slice& l) : start(s), limit(l) { } | |||
}; | |||
// A DB is a persistent ordered map from keys to values. | |||
class DB {
 public:
  // Open the database with the specified "name".
  // Stores a pointer to a heap-allocated database in *dbptr and returns
  // OK on success.
  // Stores NULL in *dbptr and returns a non-OK status on error.
  // Caller should delete *dbptr when it is no longer needed.
  static Status Open(const Options& options,
                     const std::string& name,
                     DB** dbptr);

  DB() { }
  virtual ~DB();

  // Set the database entry for "key" to "value".  Returns OK on success,
  // and a non-OK status on error.
  // Note: consider setting options.sync = false.
  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& value) = 0;

  // Remove the database entry (if any) for "key".  Returns OK on
  // success, and a non-OK status on error.  It is not an error if "key"
  // did not exist in the database.
  // Note: consider setting options.sync = false.
  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;

  // Apply the specified updates to the database.
  // Returns OK on success, non-OK on failure.
  // Note: consider setting options.sync = false.
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;

  // If the database contains an entry for "key" store the
  // corresponding value in *value and return OK.
  //
  // If there is no entry for "key" leave *value unchanged and return
  // a status for which Status::IsNotFound() returns true.
  //
  // May return some other Status on an error.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, std::string* value) = 0;

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  //
  // Caller should delete the iterator when it is no longer needed.
  // The returned iterator should be deleted before this db is deleted.
  virtual Iterator* NewIterator(const ReadOptions& options) = 0;

  // Return a handle to the current DB state.  Iterators created with
  // this handle will all observe a stable snapshot of the current DB
  // state.  The caller must call ReleaseSnapshot(result) when the
  // snapshot is no longer needed.
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot.  The caller must not
  // use "snapshot" after this call.
  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;

  // DB implementations can export properties about their state
  // via this method.  If "property" is a valid property understood by this
  // DB implementation, fills "*value" with its current value and returns
  // true.  Otherwise returns false.
  //
  // Valid property names include:
  //
  //  "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
  //     where <N> is an ASCII representation of a level number (e.g. "0").
  virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;

  // For each i in [0,n-1], store in "sizes[i]", the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  //
  // The results may not include the sizes of recently written data.
  virtual void GetApproximateSizes(const Range* range, int n,
                                   uint64_t* sizes) = 0;

  // Possible extensions:
  // (1) Add a method to compact a range of keys

 private:
  // No copying allowed
  DB(const DB&);
  void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method: the destroyed data cannot be
// recovered.
Status DestroyDB(const std::string& name, const Options& options);

// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ |
@ -0,0 +1,293 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// An Env is an interface used by the leveldb implementation to access | |||
// operating system functionality like the filesystem etc. Callers | |||
// may wish to provide a custom Env object when opening a database to | |||
// get fine grain control; e.g., to rate limit file system operations.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_ENV_H_ | |||
#include <cstdarg> | |||
#include <string> | |||
#include <vector> | |||
#include <stdint.h> | |||
#include "include/status.h" | |||
namespace leveldb { | |||
class FileLock; | |||
class RandomAccessFile; | |||
class SequentialFile; | |||
class Slice; | |||
class WritableFile; | |||
// Interface through which leveldb accesses operating system services
// (files, directories, locks, threads, clock).  All methods are pure
// virtual except Default().
class Env {
 public:
  Env() { }
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system.  Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to leveldb and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores NULL in *result and returns non-OK.  If the file does
  // not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   SequentialFile** result) = 0;

  // Create a brand new random access read-only file with the
  // specified name.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores NULL in *result and
  // returns non-OK.  If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     RandomAccessFile** result) = 0;

  // Create an object that writes to a new file with the specified
  // name.  Deletes any existing file with the same name and creates a
  // new file.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores NULL in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 WritableFile** result) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *result are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Store the size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;

  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file.  Used to prevent concurrent access to
  // the same db by multiple processes.  On failure, stores NULL in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK.  The caller should call
  // UnlockFile(*lock) to release the lock.  If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure.  I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  // Arrange to run "(*function)(arg)" once in a background thread.
  //
  // "function" may run in an unspecified thread.  Multiple functions
  // added to the same Env may run concurrently in different threads.
  // I.e., the caller may not assume that background work items are
  // serialized.
  virtual void Schedule(
      void (*function)(void* arg),
      void* arg) = 0;

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;

  // *path is set to a temporary directory that can be used for testing. It may
  // or may not have just been created. The directory may or may not differ
  // between runs of the same process, but subsequent calls will return the
  // same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Write an entry to the log file with the specified printf-style format.
  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;

  // Returns the number of micro-seconds since some fixed point in time. Only
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
 public:
  SequentialFile() { }
  virtual ~SequentialFile();

  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
  // written by this routine.  Sets "*result" to the data that was
  // read (including if fewer than "n" bytes were successfully read).
  // If an error was encountered, returns a non-OK status.
  // NOTE(review): "*result" presumably may point into "scratch", so
  // "scratch" must outlive any use of "*result" -- confirm against the
  // concrete implementations.
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
 public:
  RandomAccessFile() { }
  virtual ~RandomAccessFile();

  // Return the length of this file in bytes.
  virtual uint64_t Size() const = 0;

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read).  If an error was encountered, returns a
  // non-OK status.
  // NOTE(review): "*result" presumably may point into "scratch" --
  // confirm against the concrete implementations.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;
};
// A file abstraction for sequential writing.  The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  WritableFile() { }
  virtual ~WritableFile();

  // Append "data" to the end of the file.
  virtual Status Append(const Slice& data) = 0;
  // Close the file.  No other methods should be called afterwards.
  virtual Status Close() = 0;
  // Flush buffered writes.  NOTE(review): the durability guarantees of
  // Flush() vs. Sync() are implementation-defined -- see the concrete
  // Env implementations (e.g. env_posix) for the exact semantics.
  virtual Status Flush() = 0;
  // Force written data to stable storage.
  virtual Status Sync() = 0;

 private:
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);
};
// Identifies a locked file.  Instances are created by Env::LockFile()
// and released via Env::UnlockFile().
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};
// Log the specified data to *info_log if info_log is non-NULL.
// The attribute lets GCC/Clang type-check the printf-style arguments
// (the format string is parameter 3; the variadic args start at 4).
extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
#   if defined(__GNUC__) || defined(__clang__)
    __attribute__((__format__ (__printf__, 3, 4)))
#   endif
    ;
// A utility routine: write "data" to the named file.
// NOTE(review): presumably replaces any existing file (built on
// Env::NewWritableFile, which deletes an existing file) -- confirm.
extern Status WriteStringToFile(Env* env, const Slice& data,
                                const std::string& fname);

// A utility routine: read contents of named file into *data.
extern Status ReadFileToString(Env* env, const std::string& fname,
                               std::string* data);
// An implementation of Env that forwards all calls to another Env. | |||
// May be useful to clients who wish to override just part of the | |||
// functionality of another Env. | |||
class EnvWrapper : public Env { | |||
public: | |||
// Initialize an EnvWrapper that delegates all calls to *target | |||
explicit EnvWrapper(Env* target) : target_(target) { } | |||
virtual ~EnvWrapper(); | |||
// Return the target to which this Env forwards all calls | |||
Env* target() const { return target_; } | |||
// The following text is boilerplate that forwards all methods to target() | |||
Status NewSequentialFile(const std::string& f, SequentialFile** r) { | |||
return target_->NewSequentialFile(f, r); | |||
} | |||
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { | |||
return target_->NewRandomAccessFile(f, r); | |||
} | |||
Status NewWritableFile(const std::string& f, WritableFile** r) { | |||
return target_->NewWritableFile(f, r); | |||
} | |||
bool FileExists(const std::string& f) { return target_->FileExists(f); } | |||
Status GetChildren(const std::string& dir, std::vector<std::string>* r) { | |||
return target_->GetChildren(dir, r); | |||
} | |||
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } | |||
Status CreateDir(const std::string& d) { return target_->CreateDir(d); } | |||
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } | |||
Status GetFileSize(const std::string& f, uint64_t* s) { | |||
return target_->GetFileSize(f, s); | |||
} | |||
Status RenameFile(const std::string& s, const std::string& t) { | |||
return target_->RenameFile(s, t); | |||
} | |||
Status LockFile(const std::string& f, FileLock** l) { | |||
return target_->LockFile(f, l); | |||
} | |||
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } | |||
void Schedule(void (*f)(void*), void* a) { | |||
return target_->Schedule(f, a); | |||
} | |||
void StartThread(void (*f)(void*), void* a) { | |||
return target_->StartThread(f, a); | |||
} | |||
virtual Status GetTestDirectory(std::string* path) { | |||
return target_->GetTestDirectory(path); | |||
} | |||
virtual void Logv(WritableFile* log, const char* format, va_list ap) { | |||
return target_->Logv(log, format, ap); | |||
} | |||
uint64_t NowMicros() { | |||
return target_->NowMicros(); | |||
} | |||
void SleepForMicroseconds(int micros) { | |||
target_->SleepForMicroseconds(micros); | |||
} | |||
private: | |||
Env* target_; | |||
}; | |||
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ |
@ -0,0 +1,95 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// An iterator yields a sequence of key/value pairs from a source. | |||
// The following class defines the interface. Multiple implementations | |||
// are provided by this library. In particular, iterators are provided | |||
// to access the contents of a Table or a DB. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ | |||
#include "include/slice.h" | |||
#include "include/status.h" | |||
namespace leveldb { | |||
class Iterator {
 public:
  Iterator();
  virtual ~Iterator();

  // An iterator is either positioned at a key/value pair, or
  // not valid.  This method returns true iff the iterator is valid.
  virtual bool Valid() const = 0;

  // Position at the first key in the source.  The iterator is Valid()
  // after this call iff the source is not empty.
  virtual void SeekToFirst() = 0;

  // Position at the last key in the source.  The iterator is
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that is at or past target.
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

  // Moves to the next entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the last entry in the source.
  // REQUIRES: Valid()
  virtual void Next() = 0;

  // Moves to the previous entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the first entry in source.
  // REQUIRES: Valid()
  virtual void Prev() = 0;

  // Return the key for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  virtual Slice key() const = 0;

  // Return the value for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()  (the original comment said "!AtEnd() && !AtStart()",
  // but no such methods exist on this interface; Valid() matches key().)
  virtual Slice value() const = 0;

  // If an error has occurred, return it.  Else return an ok status.
  virtual Status status() const = 0;

  // Clients are allowed to register function/arg1/arg2 triples that
  // will be invoked when this iterator is destroyed.
  //
  // Note that unlike all of the preceding methods, this method is
  // not abstract and therefore clients should not override it.
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 private:
  // Singly-linked list of registered cleanup triples; cleanup_ is stored
  // in-place as the list head.  NOTE(review): presumably a NULL "function"
  // marks the head as unused -- confirm in iterator.cc.
  struct Cleanup {
    CleanupFunction function;
    void* arg1;
    void* arg2;
    Cleanup* next;
  };
  Cleanup cleanup_;

  // No copying allowed
  Iterator(const Iterator&);
  void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing). The caller presumably
// owns (and must delete) the returned iterator -- confirm in iterator.cc.
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ |
@ -0,0 +1,203 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ | |||
#include <stddef.h> | |||
namespace leveldb { | |||
class Cache; | |||
class Comparator; | |||
class Env; | |||
class Snapshot; | |||
class WritableFile; | |||
// DB contents are stored in a set of blocks, each of which holds a | |||
// sequence of key,value pairs. Each block may be compressed before | |||
// being stored in a file. The following enum describes which | |||
// compression method (if any) is used to compress a block. | |||
enum CompressionType {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  // Fast, lightweight compression; see Options::compression below for
  // typical speed characteristics.
  kLightweightCompression = 0x1,
};
// Options to control the behavior of a database (passed to DB::Open) | |||
struct Options {
  // -------------------
  // Parameters that affect behavior
  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;
  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;
  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;
  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors.  This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // Default: false
  bool paranoid_checks;
  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;
  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-NULL, or to a file stored
  // in the same directory as the DB contents if info_log is NULL.
  // Default: NULL
  WritableFile* info_log;
  // -------------------
  // Parameters that affect performance
  // Amount of data to build up in memory before converting to an
  // on-disk file.
  //
  // Some DB operations may encounter a delay proportional to the size
  // of this parameter.  Therefore we recommend against increasing
  // this parameter unless you are willing to live with an occasional
  // slow operation in exchange for faster bulk loading throughput.
  //
  // Default: 1MB
  size_t write_buffer_size;
  // Number of open files that can be used by the DB.  You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;
  // Handle values larger than "large_value_threshold" bytes
  // specially, by writing them into their own files (to avoid
  // compaction overhead) and doing content-based elimination of
  // duplicate values to save space.
  //
  // We recommend against changing this value.
  //
  // Default: 64K
  size_t large_value_threshold;
  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).
  // Use the specified cache for blocks (if non-NULL).
  // Default: NULL
  Cache* block_cache;
  // Approximate size of user data packed per block.  Note that the
  // block size specified here corresponds to uncompressed data.  The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled.  This parameter can be changed dynamically.
  //
  // Default: 8K
  int block_size;
  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically.  Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;
  // Compress blocks using the specified compression algorithm.  This
  // parameter can be changed dynamically.
  //
  // Default: kLightweightCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression.  Even if the input data is
  // incompressible, the kLightweightCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;
  // Create an Options object with default values for all fields.
  Options();
};
// Options that control read operations | |||
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;
  // Should the data read for this iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;
  // If "snapshot" is non-NULL, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released).  If "snapshot" is NULL, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: NULL
  const Snapshot* snapshot;
  // Initialize all fields to their documented defaults.
  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
        snapshot(NULL) {
  }
};
// Options that control write operations | |||
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete.  If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost.  Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // Default: true
  bool sync;
  // If "post_write_snapshot" is non-NULL, and the write succeeds,
  // *post_write_snapshot will be modified to point to a snapshot of
  // the DB state immediately after this write.  The caller must call
  // DB::ReleaseSnapshot(*post_write_snapshot) when the
  // snapshot is no longer needed.
  //
  // If "post_write_snapshot" is non-NULL, and the write fails,
  // *post_write_snapshot will be set to NULL.
  //
  // Default: NULL
  const Snapshot** post_write_snapshot;
  // Initialize all fields to their documented defaults.
  WriteOptions()
      : sync(true),
        post_write_snapshot(NULL) {
  }
};
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ |
@ -0,0 +1,104 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Slice is a simple structure containing a pointer into some external | |||
// storage and a size. The user of a Slice must ensure that the slice | |||
// is not used after the corresponding external storage has been | |||
// deallocated. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ | |||
#include <assert.h> | |||
#include <stddef.h> | |||
#include <string.h> | |||
#include <string> | |||
namespace leveldb { | |||
class Slice {
 public:
  // Create an empty slice.
  Slice() : data_(""), size_(0) { }
  // Create a slice that refers to data[0,n-1].
  Slice(const char* data, size_t n) : data_(data), size_(n) { }
  // Create a slice that refers to the contents of "s"
  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
  // Create a slice that refers to s[0,strlen(s)-1]
  Slice(const char* s) : data_(s), size_(strlen(s)) { }
  // Return a pointer to the beginning of the referenced data
  const char* data() const { return data_; }
  // Return the length (in bytes) of the referenced data
  size_t size() const { return size_; }
  // Return true iff the length of the referenced data is zero
  bool empty() const { return size_ == 0; }
  // Return the ith byte in the referenced data.
  // REQUIRES: n < size()
  char operator[](size_t n) const {
    assert(n < size());
    return data_[n];
  }
  // Change this slice to refer to an empty array
  void clear() { data_ = ""; size_ = 0; }
  // Drop the first "n" bytes from this slice.
  // REQUIRES: n <= size()
  void remove_prefix(size_t n) {
    assert(n <= size());
    data_ += n;
    size_ -= n;
  }
  // Return a string that contains the copy of the referenced data.
  std::string ToString() const { return std::string(data_, size_); }
  // Three-way comparison.  Returns value:
  //   <  0 iff "*this" <  "b",
  //   == 0 iff "*this" == "b",
  //   >  0 iff "*this" >  "b"
  int compare(const Slice& b) const;
  // Return true iff "x" is a prefix of "*this"
  bool starts_with(const Slice& x) const {
    return ((size_ >= x.size_) &&
            (memcmp(data_, x.data_, x.size_) == 0));
  }
 private:
  const char* data_;
  size_t size_;
  // Intentionally copyable
};
inline bool operator==(const Slice& x, const Slice& y) {
  return ((x.size() == y.size()) &&
          (memcmp(x.data(), y.data(), x.size()) == 0));
}
inline bool operator!=(const Slice& x, const Slice& y) {
  return !(x == y);
}
inline int Slice::compare(const Slice& b) const {
  // Use size_t for the shared-prefix length: storing it in an int
  // would silently truncate for slices longer than INT_MAX bytes.
  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
  int r = memcmp(data_, b.data_, min_len);
  if (r == 0) {
    // Shared prefix is equal: the shorter slice orders first.
    if (size_ < b.size_) r = -1;
    else if (size_ > b.size_) r = +1;
  }
  return r;
}
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ |
@ -0,0 +1,86 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// A Status encapsulates the result of an operation. It may indicate success, | |||
// or it may indicate an error with an associated error message. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ | |||
#include <string> | |||
#include <utility> | |||
#include "include/slice.h" | |||
namespace leveldb { | |||
class Status { | |||
public: | |||
// Create a success status. | |||
Status() : state_(NULL) { } | |||
~Status() { delete state_; } | |||
// Copy the specified status. | |||
Status(const Status& s); | |||
void operator=(const Status& s); | |||
// Return a success status. | |||
static Status OK() { return Status(); } | |||
// Return error status of an appropriate type. | |||
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { | |||
return Status(kNotFound, msg, Slice()); | |||
} | |||
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { | |||
return Status(kCorruption, msg, msg2); | |||
} | |||
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { | |||
return Status(kNotSupported, msg, msg2); | |||
} | |||
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { | |||
return Status(kInvalidArgument, msg, msg2); | |||
} | |||
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { | |||
return Status(kIOError, msg, msg2); | |||
} | |||
// Returns true iff the status indicates success. | |||
bool ok() const { return (state_ == NULL); } | |||
// Returns true iff the status indicates a NotFound error. | |||
bool IsNotFound() const { return code() == kNotFound; } | |||
// Return a string representation of this status suitable for printing. | |||
// Returns the string "OK" for success. | |||
std::string ToString() const; | |||
private: | |||
enum Code { | |||
kOk = 0, | |||
kNotFound = 1, | |||
kCorruption = 2, | |||
kNotSupported = 3, | |||
kInvalidArgument = 4, | |||
kIOError = 5, | |||
}; | |||
Code code() const { return (state_ == NULL) ? kOk : state_->first; } | |||
Status(Code code, const Slice& msg, const Slice& msg2); | |||
typedef std::pair<Code, std::string> State; | |||
State* state_; | |||
}; | |||
inline Status::Status(const Status& s) { | |||
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); | |||
} | |||
inline void Status::operator=(const Status& s) { | |||
if (this != &s) { | |||
delete state_; | |||
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); | |||
} | |||
} | |||
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ |
@ -0,0 +1,67 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ | |||
#include <stdint.h> | |||
#include "include/iterator.h" | |||
namespace leveldb { | |||
class Block; | |||
class BlockHandle; | |||
struct Options; | |||
class RandomAccessFile; | |||
struct ReadOptions; | |||
// A Table is a sorted map from strings to strings. Tables are | |||
// immutable and persistent. | |||
class Table {
 public:
  // Attempt to open the table that is stored in "file", and read the
  // metadata entries necessary to allow retrieving data from the table.
  //
  // If successful, returns ok and sets "*table" to the newly opened
  // table.  The client should delete "*table" when no longer needed.
  // If there was an error while initializing the table, sets "*table"
  // to NULL and returns a non-ok status.  Does not take ownership of
  // "*file", but the client must ensure that "file" remains live
  // for the duration of the returned table's lifetime.
  //
  // *file must remain live while this Table is in use.
  static Status Open(const Options& options,
                     RandomAccessFile* file,
                     Table** table);
  ~Table();
  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  Iterator* NewIterator(const ReadOptions&) const;
  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file).  The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key) const;
 private:
  // Opaque implementation state (pimpl); defined out of line.
  struct Rep;
  Rep* rep_;
  explicit Table(Rep* rep) { rep_ = rep; }
  // NOTE(review): signature suggests a block-loading callback used by
  // iterators -- confirm against the implementation file.
  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
  // No copying allowed
  Table(const Table&);
  void operator=(const Table&);
};
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ |
@ -0,0 +1,86 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// TableBuilder provides the interface used to build a Table | |||
// (an immutable and sorted map from keys to values). | |||
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ | |||
#include <stdint.h> | |||
#include "include/options.h" | |||
#include "include/status.h" | |||
namespace leveldb { | |||
class BlockBuilder; | |||
class BlockHandle; | |||
class WritableFile; | |||
class TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish().
  TableBuilder(const Options& options, WritableFile* file);
  // REQUIRES: Either Finish() or Abandon() has been called.
  ~TableBuilder();
  // Change the options used by this builder.  Note: only some of the
  // option fields can be changed after construction.  If a field is
  // not allowed to change dynamically and its value in the structure
  // passed to the constructor is different from its value in the
  // structure passed to this method, this method will return an error
  // without changing any fields.
  Status ChangeOptions(const Options& options);
  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value);
  // Advanced operation: flush any buffered key/value pairs to file.
  // Can be used to ensure that two adjacent entries never live in
  // the same data block.  Most clients should not need to use this method.
  // REQUIRES: Finish(), Abandon() have not been called
  void Flush();
  // Return non-ok iff some error has been detected.
  Status status() const;
  // Finish building the table.  Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish();
  // Indicate that the contents of this builder should be abandoned.  Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon();
  // Number of calls to Add() so far.
  uint64_t NumEntries() const;
  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const;
 private:
  // Convenience accessor for the current error state.
  bool ok() const { return status().ok(); }
  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
  // Opaque implementation state (pimpl); defined out of line.
  struct Rep;
  Rep* rep_;
  // No copying allowed
  TableBuilder(const TableBuilder&);
  void operator=(const TableBuilder&);
};
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ |
@ -0,0 +1,49 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// WriteBatch holds a collection of updates to apply atomically to a DB. | |||
// | |||
// The updates are applied in the order in which they are added | |||
// to the WriteBatch. For example, the value of "key" will be "v3" | |||
// after the following batch is written: | |||
// | |||
// batch.Put("key", "v1"); | |||
// batch.Delete("key"); | |||
// batch.Put("key", "v2"); | |||
// batch.Put("key", "v3"); | |||
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ | |||
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ | |||
#include <string> | |||
namespace leveldb { | |||
class Slice; | |||
class WriteBatch {
 public:
  WriteBatch();
  ~WriteBatch();
  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value);
  // If the database contains a mapping for "key", erase it.  Else do nothing.
  void Delete(const Slice& key);
  // Clear all updates buffered in this batch.
  void Clear();
 private:
  // WriteBatchInternal (db/write_batch_internal.h) manipulates rep_
  // directly when applying the batch.
  friend class WriteBatchInternal;
  std::string rep_;  // See comment in write_batch.cc for the format of rep_
  // Intentionally copyable
};
} | |||
#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ |
@ -0,0 +1,329 @@ | |||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
# Use of this source code is governed by a BSD-style license that can be | |||
# found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
{ | |||
'variables': { | |||
'use_snappy%': 0, | |||
}, | |||
'target_defaults': { | |||
'defines': [ | |||
'LEVELDB_PLATFORM_CHROMIUM=1', | |||
], | |||
'include_dirs': [ | |||
# MOE:begin_strip | |||
'../..', | |||
# MOE:end_strip_and_replace '.', | |||
], | |||
'conditions': [ | |||
['OS == "win"', { | |||
'include_dirs': [ | |||
'port/win', | |||
], | |||
}], | |||
['use_snappy', { | |||
'defines': [ | |||
'USE_SNAPPY=1', | |||
], | |||
}], | |||
], | |||
}, | |||
'targets': [ | |||
{ | |||
'target_name': 'leveldb', | |||
'type': '<(library)', | |||
'dependencies': [ | |||
        # The base library is a lightweight abstraction layer for things like
# threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ | |||
# MOE:begin_strip | |||
'../../../../base/base.gyp:base', | |||
# MOE:end_strip_and_replace '../../base/base.gyp:base', | |||
], | |||
'conditions': [ | |||
['use_snappy', { | |||
'dependencies': [ | |||
'../../../../third_party/snappy/snappy.gyp:snappy', | |||
], | |||
}], | |||
], | |||
'sources': [ | |||
# Include and then exclude so that all files show up in IDEs, even if | |||
# they don't build. | |||
'db/builder.cc', | |||
'db/builder.h', | |||
'db/db_impl.cc', | |||
'db/db_impl.h', | |||
'db/db_iter.cc', | |||
'db/db_iter.h', | |||
'db/filename.cc', | |||
'db/filename.h', | |||
'db/dbformat.cc', | |||
'db/dbformat.h', | |||
'db/log_format.h', | |||
'db/log_reader.cc', | |||
'db/log_reader.h', | |||
'db/log_writer.cc', | |||
'db/log_writer.h', | |||
'db/memtable.cc', | |||
'db/memtable.h', | |||
'db/repair.cc', | |||
'db/skiplist.h', | |||
'db/snapshot.h', | |||
'db/table_cache.cc', | |||
'db/table_cache.h', | |||
'db/version_edit.cc', | |||
'db/version_edit.h', | |||
'db/version_set.cc', | |||
'db/version_set.h', | |||
'db/write_batch.cc', | |||
'db/write_batch_internal.h', | |||
'include/cache.h', | |||
'include/comparator.h', | |||
'include/db.h', | |||
'include/env.h', | |||
'include/iterator.h', | |||
'include/options.h', | |||
'include/slice.h', | |||
'include/status.h', | |||
'include/table.h', | |||
'include/table_builder.h', | |||
'include/write_batch.h', | |||
'port/port.h', | |||
'port/port_chromium.cc', | |||
'port/port_chromium.h', | |||
'port/port_example.h', | |||
'port/port_posix.cc', | |||
'port/port_posix.h', | |||
'port/sha1_portable.cc', | |||
'port/sha1_portable.h', | |||
'table/block.cc', | |||
'table/block.h', | |||
'table/block_builder.cc', | |||
'table/block_builder.h', | |||
'table/format.cc', | |||
'table/format.h', | |||
'table/iterator.cc', | |||
'table/iterator_wrapper.h', | |||
'table/merger.cc', | |||
'table/merger.h', | |||
'table/table.cc', | |||
'table/table_builder.cc', | |||
'table/two_level_iterator.cc', | |||
'table/two_level_iterator.h', | |||
'util/arena.cc', | |||
'util/arena.h', | |||
'util/cache.cc', | |||
'util/coding.cc', | |||
'util/coding.h', | |||
'util/comparator.cc', | |||
'util/crc32c.cc', | |||
'util/crc32c.h', | |||
'util/env.cc', | |||
'util/env_chromium.cc', | |||
'util/env_posix.cc', | |||
'util/hash.cc', | |||
'util/hash.h', | |||
'util/logging.cc', | |||
'util/logging.h', | |||
'util/mutexlock.h', | |||
'util/options.cc', | |||
'util/random.h', | |||
'util/status.cc', | |||
], | |||
'sources/': [ | |||
['exclude', '_(android|example|portable|posix)\\.cc$'], | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_testutil', | |||
'type': '<(library)', | |||
'dependencies': [ | |||
# MOE:begin_strip | |||
'../../../../base/base.gyp:base', | |||
# MOE:end_strip_and_replace '../../base/base.gyp:base', | |||
'leveldb', | |||
], | |||
'export_dependent_settings': [ | |||
# The tests use include directories from these projects. | |||
# MOE:begin_strip | |||
'../../../../base/base.gyp:base', | |||
# MOE:end_strip_and_replace '../../base/base.gyp:base', | |||
'leveldb', | |||
], | |||
'sources': [ | |||
'util/histogram.cc', | |||
'util/histogram.h', | |||
'util/testharness.cc', | |||
'util/testharness.h', | |||
'util/testutil.cc', | |||
'util/testutil.h', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_arena_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'util/arena_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_cache_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'util/cache_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_coding_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'util/coding_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_corruption_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/corruption_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_crc32c_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'util/crc32c_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_db_bench', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/db_bench.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_db_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/db_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_dbformat_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/dbformat_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_env_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'util/env_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_filename_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/filename_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_log_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/log_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_sha1_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'port/sha1_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_skiplist_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/skiplist_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_table_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'table/table_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_version_edit_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/version_edit_test.cc', | |||
], | |||
}, | |||
{ | |||
'target_name': 'leveldb_write_batch_test', | |||
'type': 'executable', | |||
'dependencies': [ | |||
'leveldb_testutil', | |||
], | |||
'sources': [ | |||
'db/write_batch_test.cc', | |||
], | |||
}, | |||
], | |||
} | |||
# Local Variables: | |||
# tab-width:2 | |||
# indent-tabs-mode:nil | |||
# End: | |||
# vim: set expandtab tabstop=2 shiftwidth=2: |
@ -0,0 +1,10 @@ | |||
This directory contains interfaces and implementations that isolate the | |||
rest of the package from platform details. | |||
Code in the rest of the package includes "port.h" from this directory. | |||
"port.h" in turn includes a platform specific "port_<platform>.h" file | |||
that provides the platform specific implementation. | |||
See port_posix.h for an example of what must be provided in a platform | |||
specific header file. | |||
@ -0,0 +1,21 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_PORT_PORT_H_ | |||
#define STORAGE_LEVELDB_PORT_PORT_H_ | |||
#include <string.h> | |||
// Include the appropriate platform specific file below. If you are | |||
// porting to a new platform, see "port_example.h" for documentation | |||
// of what the new port_<platform>.h file must provide. | |||
#if defined(LEVELDB_PLATFORM_POSIX) | |||
# include "port/port_posix.h" | |||
#elif defined(LEVELDB_PLATFORM_CHROMIUM) | |||
# include "port/port_chromium.h" | |||
#elif defined(LEVELDB_PLATFORM_ANDROID) | |||
# include "port/port_android.h" | |||
#endif | |||
#endif // STORAGE_LEVELDB_PORT_PORT_H_ |
@ -0,0 +1,65 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "port/port_android.h" | |||
#include <cstdlib> | |||
extern "C" {
// These wrappers supply stdio *_unlocked variants and fdatasync(),
// presumably because this platform's C library lacks them -- confirm
// against the target Bionic version.  Forwarding to the locked/stronger
// functions preserves correctness at some performance cost.
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) {
  return fread(a, b, c, d);
}
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) {
  return fwrite(a, b, c, d);
}
int fflush_unlocked(FILE *f) {
  return fflush(f);
}
// fsync() flushes both data and metadata, which is a strictly stronger
// guarantee than fdatasync() requires.
int fdatasync(int fd) {
  return fsync(fd);
}
}
// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this?
namespace leveldb {
namespace port {

// Aborts the process (after printing the failing call name and errno
// string) when a pthread primitive reports an error; in correct usage
// these calls do not fail, so any failure is treated as fatal.
static void PthreadCall(const char* label, int result) {
  if (result != 0) {
    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
    abort();
  }
}

Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }

Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }

void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }

void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }

CondVar::CondVar(Mutex* mu)
    : mu_(mu) {
  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}

CondVar::~CondVar() {
  PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
}

// REQUIRES: caller holds the associated mutex (pthread_cond_wait
// releases and reacquires mu_->mu_ around the block).
void CondVar::Wait() {
  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}

void CondVar::Signal(){
  PthreadCall("signal", pthread_cond_signal(&cv_));
}

void CondVar::SignalAll() {
  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}

}
}
@ -0,0 +1,131 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// See port_example.h for documentation for the following types/functions. | |||
#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ | |||
#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ | |||
#include <endian.h> | |||
#include <pthread.h> | |||
#include <stdint.h> | |||
#include <sha1.h> | |||
#include <cstdatomic> | |||
#include <string> | |||
#include <cctype> | |||
extern "C" { | |||
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); | |||
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); | |||
int fflush_unlocked(FILE *f); | |||
int fdatasync (int fd); | |||
} | |||
namespace leveldb { | |||
namespace port { | |||
static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; | |||
class CondVar; | |||
// Exclusive pthread-based lock; method bodies live in port_android.cc
// and abort on any pthread error.
class Mutex {
 public:
  Mutex();
  ~Mutex();
  void Lock();
  void Unlock();
  // No-op on this port: holder checking is not implemented.
  void AssertHeld() {
    //TODO(gabor): How can I implement this?
  }

 private:
  friend class CondVar;   // CondVar::Wait needs the raw pthread_mutex_t
  pthread_mutex_t mu_;

  // No copying
  Mutex(const Mutex&);
  void operator=(const Mutex&);
};
// pthread condition variable, permanently bound to the Mutex passed
// at construction (implementations in port_android.cc).
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  void Wait();        // REQUIRES: *mu_ is held by the caller
  void Signal();      // wake at least one waiting thread
  void SignalAll();   // wake all waiting threads
 private:
  Mutex* mu_;
  pthread_cond_t cv_;
};
// A pointer cell that can be read and written without locks.
// The Acquire/Release pair gives publish/observe ordering; the
// NoBarrier variants are relaxed accesses with no ordering guarantees.
class AtomicPointer {
 public:
  AtomicPointer() { }                         // starts with an arbitrary value
  explicit AtomicPointer(void* v) : ptr_(v) { }

  inline void NoBarrier_Store(void* v) {
    ptr_.store(v, std::memory_order_relaxed);
  }
  inline void* NoBarrier_Load() const {
    return ptr_.load(std::memory_order_relaxed);
  }
  inline void Release_Store(void* v) {
    ptr_.store(v, std::memory_order_release);
  }
  inline void* Acquire_Load() const {
    return ptr_.load(std::memory_order_acquire);
  }

 private:
  std::atomic<void*> ptr_;
};
/**
 * TODO(gabor): Implement actual compression.
 * This is a hack - it just copies input to output.
 * No actual compression occurs.
 */
inline void Lightweight_Compress(
    const char* input,
    size_t input_length,
    std::string* output) {
  // BUG FIX: the previous code called output->copy(), which copies bytes
  // *out of* *output into the supplied buffer (and copies nothing at all
  // when *output is empty) — it never stored the input. assign() replaces
  // the contents of *output with input[0,input_length-1], matching the
  // posix port's behavior.
  output->assign(input, input_length);
}
/**
 * TODO(gabor): Implement actual uncompression.
 * This is a hack - it just copies input to output.
 * No actual uncompression occurs.
 */
inline bool Lightweight_Uncompress(
    const char* input_data,
    size_t input_length,
    std::string* output) {
  // BUG FIX: as in Lightweight_Compress, output->copy() read from
  // *output instead of writing to it. assign() stores the bytes.
  output->assign(input_data, input_length);
  return true;  // plain copy-through never fails
}
// Computes the 160-bit SHA-1 digest of data[0..len-1] into
// hash_array[0..19] using the platform's <sha1.h> implementation.
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  SHA1_CTX sha1_ctx;
  SHA1Init(&sha1_ctx);
  SHA1Update(&sha1_ctx, (const u_char*)data, len);
  SHA1Final((u_char*)hash_array, &sha1_ctx);
}
// Returns a 64-bit identifier for the calling thread, built from the
// raw bytes of pthread_self().  pthread_t is opaque, so only as many
// bytes as fit in both pthread_t and uint64_t are copied; the rest of
// the result stays zero.
inline uint64_t ThreadIdentifier() {
  pthread_t self = pthread_self();
  const size_t nbytes =
      sizeof(self) < sizeof(uint64_t) ? sizeof(self) : sizeof(uint64_t);
  uint64_t id = 0;
  memcpy(&id, &self, nbytes);
  return id;
}
// Heap profiling is not hooked up in this port: *func is never invoked
// and the function always reports "unsupported" by returning false.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  static_cast<void>(func);  // intentionally unused
  static_cast<void>(arg);
  return false;
}
} | |||
} | |||
#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ |
@ -0,0 +1,83 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "port/port_chromium.h" | |||
#include "util/logging.h" | |||
#if defined(USE_SNAPPY) | |||
# include "third_party/snappy/src/snappy.h" | |||
# include "third_party/snappy/src/snappy-stubs.h" | |||
#endif | |||
namespace leveldb {
namespace port {

// Mutex and CondVar simply forward to Chromium's base::Lock and
// base::ConditionVariable (members declared in port_chromium.h).
Mutex::Mutex() {
}

Mutex::~Mutex() {
}

void Mutex::Lock() {
  mu_.Acquire();
}

void Mutex::Unlock() {
  mu_.Release();
}

void Mutex::AssertHeld() {
  mu_.AssertAcquired();
}

CondVar::CondVar(Mutex* mu)
    : cv_(&mu->mu_) {
}

CondVar::~CondVar() { }

void CondVar::Wait() {
  cv_.Wait();
}

void CondVar::Signal(){
  cv_.Signal();
}

void CondVar::SignalAll() {
  cv_.Broadcast();  // leveldb's "SignalAll" maps to Chromium's Broadcast()
}

// Uses snappy when USE_SNAPPY is defined; otherwise degenerates to a
// plain copy (no compression at all).
void Lightweight_Compress(const char* input, size_t input_length,
                          std::string* output) {
#if defined(USE_SNAPPY)
  output->resize(snappy::MaxCompressedLength(input_length));
  size_t outlen;
  // NOTE(review): upstream snappy's RawCompress signature is
  // (const char*, size_t, char*, size_t*); this StringPiece overload
  // presumably comes from the bundled snappy-stubs — verify against
  // third_party/snappy.
  snappy::RawCompress(snappy::StringPiece(input, input_length),
                      &(*output)[0], &outlen);
  output->resize(outlen);
#else
  output->assign(input, input_length);
#endif
}

// Returns false only when snappy rejects the input; the copy-through
// fallback always succeeds.
bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                            std::string* output) {
#if defined(USE_SNAPPY)
  snappy::StringPiece input(input_data, input_length);
  size_t ulength;
  if (!snappy::GetUncompressedLength(input, &ulength)) {
    return false;
  }
  output->resize(ulength);
  return snappy::RawUncompress(input, &(*output)[0]);
#else
  output->assign(input_data, input_length);
  return true;
#endif
}

}
}
@ -0,0 +1,104 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// See port_example.h for documentation for the following types/functions. | |||
#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ | |||
#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ | |||
#include <stdint.h> | |||
#include <string> | |||
#include <cstring> | |||
#include "base/atomicops.h" | |||
#include "base/basictypes.h" | |||
#include "base/logging.h" | |||
#include "base/sha1.h" | |||
#include "base/synchronization/condition_variable.h" | |||
#include "base/synchronization/lock.h" | |||
// Linux's ThreadIdentifier() needs this. | |||
#if defined(OS_LINUX) | |||
# include <linux/unistd.h> | |||
#endif | |||
#if defined(OS_WIN) | |||
#define snprintf _snprintf | |||
#define va_copy(a, b) do { (a) = (b); } while (0) | |||
#endif | |||
namespace leveldb {
namespace port {

// Chromium only supports little endian.
static const bool kLittleEndian = true;

// Exclusive lock built on Chromium's base::Lock; forwarding bodies are
// in port_chromium.cc.
class Mutex {
 public:
  Mutex();
  ~Mutex();
  void Lock();
  void Unlock();
  // Crashes (via base::Lock::AssertAcquired) if not held by this thread.
  void AssertHeld();

 private:
  base::Lock mu_;
  friend class CondVar;   // CondVar needs the underlying base::Lock
  DISALLOW_COPY_AND_ASSIGN(Mutex);
};

// Condition variable built on base::ConditionVariable, bound to the
// Mutex supplied at construction.
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  void Wait();        // REQUIRES: the associated mutex is held
  void Signal();      // wake at least one waiter
  void SignalAll();   // wake all waiters
 private:
  base::ConditionVariable cv_;
  DISALLOW_COPY_AND_ASSIGN(CondVar);
};

// Lock-free pointer cell implemented with Chromium's atomicops.
class AtomicPointer {
 private:
  typedef base::subtle::AtomicWord Rep;
  Rep rep_;
 public:
  AtomicPointer() { }
  explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
  // Acquire load: no later memory access may be reordered before it.
  inline void* Acquire_Load() const {
    return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
  }
  // Release store: no earlier memory access may be reordered after it.
  inline void Release_Store(void* v) {
    ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
  }
  inline void* NoBarrier_Load() const {
    return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
  }
  inline void NoBarrier_Store(void* v) {
    ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
  }
};

// 160-bit SHA-1 of data[0..len-1] into hash_array[0..19] via base::SHA1.
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  return ::base::SHA1HashBytes(reinterpret_cast<const unsigned char*>(data),
                               len,
                               reinterpret_cast<unsigned char*>(hash_array));
}

// Implemented in port_chromium.cc (snappy-backed when USE_SNAPPY).
void Lightweight_Compress(const char* input, size_t input_length,
                          std::string* output);
bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                            std::string* output);

// Heap profiling is not wired up in the Chromium port.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  return false;
}

}
}
#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ |
@ -0,0 +1,119 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// This file contains the specification, but not the implementations, | |||
// of the types/operations/etc. that should be defined by a platform | |||
// specific port_<platform>.h file. Use this file as a reference for | |||
// how to port this package to a new platform. | |||
#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ | |||
#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ | |||
namespace leveldb { | |||
namespace port { | |||
// TODO(jorlow): Many of these belong more in the environment class rather than | |||
// here. We should try moving them and see if it affects perf. | |||
// The following boolean constant must be true on a little-endian machine | |||
// and false otherwise. | |||
static const bool kLittleEndian = true /* or some other expression */; | |||
// ------------------ Threading ------------------- | |||
// A Mutex represents an exclusive lock. | |||
class Mutex {
 public:
  Mutex();
  ~Mutex();

  // Lock the mutex.  Waits until other lockers have exited.
  // Will deadlock if the mutex is already locked by this thread.
  void Lock();

  // Unlock the mutex.
  // REQUIRES: This mutex was locked by this thread.
  void Unlock();

  // Optionally crash if this thread does not hold this mutex.
  // The implementation must be fast, especially if NDEBUG is
  // defined.  The implementation is allowed to skip all checks.
  void AssertHeld();

  // Note: the real port headers also disallow copying; a port should
  // do the same even though this example omits it.
};
class CondVar { | |||
public: | |||
explicit CondVar(Mutex* mu); | |||
~CondVar(); | |||
// Atomically release *mu and block on this condition variable until | |||
// either a call to SignalAll(), or a call to Signal() that picks | |||
// this thread to wakeup. | |||
// REQUIRES: this thread holds *mu | |||
void Wait(); | |||
// If there are some threads waiting, wake up at least one of them. | |||
void Signal(); | |||
// Wake up all waiting threads. | |||
void SignallAll(); | |||
}; | |||
// A type that holds a pointer that can be read or written atomically
// (i.e., without word-tearing.)
class AtomicPointer {
 private:
  intptr_t rep_;
 public:
  // Initialize to arbitrary value
  AtomicPointer();
  // Initialize to hold v.
  // (Fixed: initializing an intptr_t member directly from a void* is
  // ill-formed; an explicit pointer-to-integer cast is required.)
  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }
  // Read and return the stored pointer with the guarantee that no
  // later memory access (read or write) by this thread can be
  // reordered ahead of this read.
  void* Acquire_Load() const;
  // Set v as the stored pointer with the guarantee that no earlier
  // memory access (read or write) by this thread can be reordered
  // after this store.
  void Release_Store(void* v);
  // Read the stored pointer with no ordering guarantees.
  void* NoBarrier_Load() const;
  // Set v as the stored pointer with no ordering guarantees.
  void NoBarrier_Store(void* v);
};
// ------------------ Checksumming -------------------

// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
extern void SHA1_Hash(const char* data, size_t len, char* hash_array);

// ------------------ Compression -------------------

// Store the lightweight compression of "input[0,input_length-1]" in *output.
extern void Lightweight_Compress(const char* input, size_t input_length,
                                 std::string* output);

// Attempt to lightweight uncompress input[0,input_length-1] into *output.
// Returns true if successful, false if the input is invalid lightweight
// compressed data.
extern bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                                   std::string* output);

// ------------------ Miscellaneous -------------------

// If heap profiling is not supported, returns false.
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
// NOTE(review): this example header uses size_t and std::string but
// includes no headers; a real port_<platform>.h must pull in
// <stddef.h> and <string> itself.
} | |||
} | |||
#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ |
@ -0,0 +1,50 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "port/port_posix.h" | |||
#include <cstdlib> | |||
#include <stdio.h> | |||
#include <string.h> | |||
#include "util/logging.h" | |||
namespace leveldb {
namespace port {

// Aborts the process (after printing the failing call name and errno
// string) when a pthread primitive reports an error; these calls do
// not fail in correct usage, so any failure is treated as fatal.
static void PthreadCall(const char* label, int result) {
  if (result != 0) {
    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
    abort();
  }
}

Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }

Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }

void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }

void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }

CondVar::CondVar(Mutex* mu)
    : mu_(mu) {
  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}

CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }

// REQUIRES: caller holds *mu_ (pthread_cond_wait releases and
// reacquires it around the block).
void CondVar::Wait() {
  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}

void CondVar::Signal() {
  PthreadCall("signal", pthread_cond_signal(&cv_));
}

void CondVar::SignalAll() {
  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}

}
}
@ -0,0 +1,108 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// See port_example.h for documentation for the following types/functions. | |||
#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ | |||
#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ | |||
#include <endian.h> | |||
#include <pthread.h> | |||
#include <stdint.h> | |||
#include <string> | |||
#include <cstdatomic> | |||
#include <cstring> | |||
#include "port/sha1_portable.h" | |||
namespace leveldb { | |||
namespace port { | |||
static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); | |||
class CondVar; | |||
// Exclusive pthread-mutex wrapper; bodies are in port_posix.cc and
// abort on any pthread error.
class Mutex {
 public:
  Mutex();
  ~Mutex();
  void Lock();
  void Unlock();
  void AssertHeld() { }   // no-op: no portable pthread holder check

 private:
  friend class CondVar;   // CondVar::Wait needs the raw pthread_mutex_t
  pthread_mutex_t mu_;

  // No copying
  Mutex(const Mutex&);
  void operator=(const Mutex&);
};
// pthread condition variable tied to the Mutex supplied at
// construction (bodies in port_posix.cc).
class CondVar {
 public:
  explicit CondVar(Mutex* mu);
  ~CondVar();
  void Wait();        // REQUIRES: the associated mutex is held
  void Signal();      // wake at least one waiting thread
  void SignalAll();   // wake every waiting thread
 private:
  pthread_cond_t cv_;
  Mutex* mu_;
};
// Storage for a pointer that can be updated and read without locks.
// The wrapped std::atomic makes each access indivisible; the memory
// orders mirror the method names.
class AtomicPointer {
 private:
  std::atomic<void*> rep_;

 public:
  AtomicPointer() { }                          // value intentionally unspecified
  explicit AtomicPointer(void* v) : rep_(v) { }

  // Release store / acquire load: a writer publishes, a reader observes.
  inline void Release_Store(void* v) {
    rep_.store(v, std::memory_order_release);
  }
  inline void* Acquire_Load() const {
    return rep_.load(std::memory_order_acquire);
  }

  // Relaxed accessors: atomic, but impose no ordering.
  inline void NoBarrier_Store(void* v) {
    rep_.store(v, std::memory_order_relaxed);
  }
  inline void* NoBarrier_Load() const {
    return rep_.load(std::memory_order_relaxed);
  }
};
// 160-bit SHA-1 of data[0..len-1] into hash_array[0..19], delegating
// to the slow-but-portable fallback in port/sha1_portable.h.
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
  SHA1_Hash_Portable(data, len, hash_array);
}
// Placeholder "compression": stores an uncompressed copy of
// input[0,input_length-1] in *output (any previous contents are
// replaced).
// TODO(gabor): implement real lightweight compression.
inline void Lightweight_Compress(const char* input, size_t input_length,
                                 std::string* output) {
  output->clear();
  output->append(input, input_length);
}
// Placeholder "uncompression": copies the input bytes to *output and
// always reports success, since no real decoding is attempted.
// TODO(gabor): implement real lightweight uncompression.
inline bool Lightweight_Uncompress(const char* input_data, size_t input_length,
                                   std::string* output) {
  output->clear();
  output->append(input_data, input_length);
  return true;
}
// Heap profiling is unavailable in the posix port; reports
// "unsupported" by returning false without ever invoking *func.
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
  (void)func;  // unused: there is no profile data to emit
  (void)arg;
  return false;
}
} | |||
} | |||
#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ |
@ -0,0 +1,298 @@ | |||
// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// This module provides a slow but portable implementation of | |||
// the SHA1 hash function. | |||
// | |||
// It is adapted from free code written by Paul E. Jones | |||
// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/ | |||
// | |||
// The license for the original code is: | |||
/* | |||
Copyright (C) 1998, 2009 | |||
Paul E. Jones <paulej@packetizer.com> | |||
Freeware Public License (FPL) | |||
This software is licensed as "freeware." Permission to distribute | |||
this software in source and binary forms, including incorporation | |||
into other products, is hereby granted without a fee. THIS SOFTWARE | |||
IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES, | |||
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |||
AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD | |||
LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER | |||
DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA | |||
OR DATA BEING RENDERED INACCURATE. | |||
*/ | |||
#include "port/sha1_portable.h" | |||
#include <stdio.h> | |||
#include <stdlib.h> | |||
#include <stdint.h> | |||
namespace leveldb { | |||
namespace port { | |||
/* | |||
* Description: | |||
* This class implements the Secure Hashing Standard as defined | |||
* in FIPS PUB 180-1 published April 17, 1995. | |||
*/ | |||
/* | |||
* This structure will hold context information for the hashing | |||
* operation | |||
*/ | |||
// Holds all intermediate state for one SHA-1 computation: the running
// digest, the 64-bit message length (in bits, split across two 32-bit
// words), and the current partially-filled 512-bit block.
typedef struct SHA1Context {
  unsigned Message_Digest[5];      /* Message Digest (output) */
  unsigned Length_Low;             /* Message length in bits */
  unsigned Length_High;            /* Message length in bits */
  unsigned char Message_Block[64]; /* 512-bit message blocks */
  int Message_Block_Index;         /* Index into message block array */
  bool Computed;                   /* Is the digest computed? */
  bool Corrupted;                  /* Is the message digest corrupted? */
} SHA1Context;
/* | |||
* Portability Issues: | |||
* SHA-1 is defined in terms of 32-bit "words". This code was | |||
* written with the expectation that the processor has at least | |||
* a 32-bit machine word size. If the machine word size is larger, | |||
* the code should still function properly. One caveat to that | |||
* is that the input functions taking characters and character | |||
* arrays assume that only 8 bits of information are stored in each | |||
* character. | |||
*/ | |||
/* | |||
* Define the circular shift macro | |||
*/ | |||
#define SHA1CircularShift(bits,word) \ | |||
((((word) << (bits)) & 0xFFFFFFFF) | \ | |||
((word) >> (32-(bits)))) | |||
/* Function prototypes */ | |||
static void SHA1ProcessMessageBlock(SHA1Context *); | |||
static void SHA1PadMessage(SHA1Context *); | |||
// Initialize the SHA1Context in preparation for computing a new
// message digest.
static void SHA1Reset(SHA1Context* context) {
  context->Length_Low = 0;
  context->Length_High = 0;
  context->Message_Block_Index = 0;
  // Initial hash values H0..H4 as specified in FIPS PUB 180-1.
  context->Message_Digest[0] = 0x67452301;
  context->Message_Digest[1] = 0xEFCDAB89;
  context->Message_Digest[2] = 0x98BADCFE;
  context->Message_Digest[3] = 0x10325476;
  context->Message_Digest[4] = 0xC3D2E1F0;
  context->Computed = false;
  context->Corrupted = false;
}
// This function will return the 160-bit message digest into the
// Message_Digest array within the SHA1Context provided.
// The first call finalizes (pads) the message; later calls are no-ops.
// Returns false iff the context was previously marked corrupted.
static bool SHA1Result(SHA1Context *context) {
  if (context->Corrupted) {
    return false;
  }
  if (!context->Computed) {
    SHA1PadMessage(context);
    context->Computed = true;
  }
  return true;
}
// This function accepts an array of bytes as the next portion of
// the message.
static void SHA1Input(SHA1Context *context,
                      const unsigned char *message_array,
                      unsigned length) {
  if (!length) return;

  // Feeding more data after finalization (or after an earlier error)
  // permanently marks the context corrupted.
  if (context->Computed || context->Corrupted) {
    context->Corrupted = true;
    return;
  }

  while(length-- && !context->Corrupted) {
    context->Message_Block[context->Message_Block_Index++] =
        (*message_array & 0xFF);

    // Track the total message length in bits as a 64-bit count split
    // across Length_Low / Length_High.
    context->Length_Low += 8;
    /* Force it to 32 bits */
    context->Length_Low &= 0xFFFFFFFF;
    if (context->Length_Low == 0) {
      context->Length_High++;
      /* Force it to 32 bits */
      context->Length_High &= 0xFFFFFFFF;
      if (context->Length_High == 0)
      {
        /* Message is too long */
        context->Corrupted = true;
      }
    }

    // A full 512-bit block has accumulated: compress it now.
    if (context->Message_Block_Index == 64)
    {
      SHA1ProcessMessageBlock(context);
    }

    message_array++;
  }
}
// This function will process the next 512 bits of the message stored
// in the Message_Block array.  The 80 SHA-1 rounds below follow
// FIPS PUB 180-1; the statement order is significant.
static void SHA1ProcessMessageBlock(SHA1Context *context) {
  const unsigned K[] =            // Constants defined in SHA-1
  {
    0x5A827999,
    0x6ED9EBA1,
    0x8F1BBCDC,
    0xCA62C1D6
  };
  int t;                          // Loop counter
  unsigned temp;                  // Temporary word value
  unsigned W[80];                 // Word sequence
  unsigned A, B, C, D, E;         // Word buffers

  // Initialize the first 16 words in the array W from the 64 message
  // bytes (big-endian word order, per the standard).
  for(t = 0; t < 16; t++) {
    W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
    W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
  }

  // Message schedule expansion for W[16..79].
  for(t = 16; t < 80; t++) {
    W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
  }

  A = context->Message_Digest[0];
  B = context->Message_Digest[1];
  C = context->Message_Digest[2];
  D = context->Message_Digest[3];
  E = context->Message_Digest[4];

  // Rounds 0-19: f = Ch(B,C,D).
  for(t = 0; t < 20; t++) {
    temp = SHA1CircularShift(5,A) +
        ((B & C) | ((~B) & D)) + E + W[t] + K[0];
    temp &= 0xFFFFFFFF;  // keep arithmetic at 32 bits even if unsigned is wider
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 20-39: f = Parity(B,C,D).
  for(t = 20; t < 40; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 40-59: f = Maj(B,C,D).
  for(t = 40; t < 60; t++) {
    temp = SHA1CircularShift(5,A) +
        ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Rounds 60-79: f = Parity(B,C,D).
  for(t = 60; t < 80; t++) {
    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
    temp &= 0xFFFFFFFF;
    E = D;
    D = C;
    C = SHA1CircularShift(30,B);
    B = A;
    A = temp;
  }

  // Fold this block's result into the running digest (mod 2^32).
  context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
  context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
  context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
  context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
  context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;
  context->Message_Block_Index = 0;
}
// According to the standard, the message must be padded to an even
// 512 bits.  The first padding bit must be a '1'.  The last 64 bits
// represent the length of the original message.  All bits in between
// should be 0.  This function will pad the message according to those
// rules by filling the Message_Block array accordingly.  It will also
// call SHA1ProcessMessageBlock() appropriately.  When it returns, it
// can be assumed that the message digest has been computed.
static void SHA1PadMessage(SHA1Context *context) {
  // Check to see if the current message block is too small to hold
  // the initial padding bits and length.  If so, we will pad the
  // block, process it, and then continue padding into a second block.
  if (context->Message_Block_Index > 55) {
    // Not enough room for the 8 length octets: finish this block...
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 64) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
    SHA1ProcessMessageBlock(context);
    // ...then zero-fill a second block up to the length field.
    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  } else {
    context->Message_Block[context->Message_Block_Index++] = 0x80;
    while(context->Message_Block_Index < 56) {
      context->Message_Block[context->Message_Block_Index++] = 0;
    }
  }

  // Store the message length (in bits, big-endian) as the last 8 octets.
  context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
  context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
  context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
  context->Message_Block[59] = (context->Length_High) & 0xFF;
  context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
  context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
  context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
  context->Message_Block[63] = (context->Length_Low) & 0xFF;

  SHA1ProcessMessageBlock(context);
}
// Public entry point: computes the SHA-1 digest of data[0..len-1] and
// writes the 20 raw digest bytes (big-endian word order) into
// hash_array[0..19].
// NOTE(review): SHA1Input takes an "unsigned" length, so len silently
// truncates for inputs >= 2^32 bytes on LP64 — confirm callers never
// hash that much in one shot.
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
  SHA1Context context;
  SHA1Reset(&context);
  SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
  bool ok = SHA1Result(&context);
  if (!ok) {
    fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
    exit(1);
  }
  // Serialize the five 32-bit digest words big-endian into the output.
  for (int i = 0; i < 5; i++) {
    uint32_t value = context.Message_Digest[i];
    hash_array[i*4 + 0] = (value >> 24) & 0xff;
    hash_array[i*4 + 1] = (value >> 16) & 0xff;
    hash_array[i*4 + 2] = (value >> 8) & 0xff;
    hash_array[i*4 + 3] = value & 0xff;
  }
}
} | |||
} |
@ -0,0 +1,25 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_

#include <stddef.h>

namespace leveldb {
namespace port {

// Compute the SHA1 hash value of "data[0..len-1]" and store it in
// "hash_array[0..19]". hash_array must have 20 bytes of space available.
// The digest is written as raw bytes (big-endian word order), not hex.
//
// This function is portable but may not be as fast as a version
// optimized for your platform.  It is provided as a default method
// that can be used when porting leveldb to a new platform if no
// better SHA1 hash implementation is available.
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);

}
}

#endif  // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
@ -0,0 +1,55 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "port/port.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
namespace port { | |||
class SHA1 { }; | |||
static std::string TestSHA1(const char* data, size_t len) { | |||
char hash_val[20]; | |||
SHA1_Hash(data, len, hash_val); | |||
char buf[41]; | |||
for (int i = 0; i < 20; i++) { | |||
snprintf(buf + i * 2, 41 - i * 2, | |||
"%02x", | |||
static_cast<unsigned int>(static_cast<unsigned char>( | |||
hash_val[i]))); | |||
} | |||
return std::string(buf, 40); | |||
} | |||
// Checks SHA1_Hash against known digests: the standard empty-string
// vector, "hello", and a 10000-byte repeated-'x' input.
TEST(SHA1, Simple) {
  ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
  std::string x(10000, 'x');
  ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
            TestSHA1(x.data(), x.size()));
}
// Rough throughput measurement: hashes a 100 MB buffer kIters times
// and reports MB/s on stderr.  "sha1 |= hash_val[0]" folds a byte of
// each digest into a value printed at the end so the compiler cannot
// optimize the hash calls away.
TEST(SHA1, Benchmark) {
  std::string data(1048576 * 100, 'x');
  double start = Env::Default()->NowMicros() * 1e-6;
  static const int kIters = 10;
  uint32_t sha1 = 0;
  for (int i = 0; i < kIters; i++) {
    char hash_val[20];
    SHA1_Hash(data.data(), data.size(), hash_val);
    sha1 |= hash_val[0];
  }
  double finish = Env::Default()->NowMicros() * 1e-6;
  double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
  fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n",
          mb, (finish - start), mb / (finish - start), sha1);
}
} | |||
} | |||
// Test binary entry point: runs every TEST registered with the harness.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,261 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Decodes the blocks generated by block_builder.cc. | |||
#include "table/block.h" | |||
#include <vector> | |||
#include <algorithm> | |||
#include "include/comparator.h" | |||
#include "util/coding.h" | |||
#include "util/logging.h" | |||
namespace leveldb { | |||
inline uint32_t Block::NumRestarts() const { | |||
assert(size_ >= 2*sizeof(uint32_t)); | |||
return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); | |||
} | |||
// Takes ownership of data[].  If the contents are too small to hold the
// restart array described by their own trailer, size_ is forced to 0,
// which NewIterator() treats as a corruption marker.
Block::Block(const char* data, size_t size)
    : data_(data),
      size_(size) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Unsigned arithmetic: if NumRestarts() is bogus and too large, the
    // subtraction wraps and the check below catches it.
    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
}
// The block owns its contents (see constructor comment in block.h).
Block::~Block() {
  delete[] data_;
}
// Helper routine: decode the next block entry starting at "p", | |||
// storing the number of shared key bytes, non_shared key bytes, | |||
// and the length of the value in "*shared", "*non_shared", and | |||
// "*value_length", respectively. Will not derefence past "limit". | |||
// | |||
// If any errors are detected, returns NULL. Otherwise, returns a | |||
// pointer to the key delta (just past the three decoded values). | |||
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not derefence past "limit".
//
// If any errors are detected, returns NULL.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return NULL;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    // Slow path: at least one length needs a multi-byte varint, so
    // re-decode all three properly.
    if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
  }
  // NOTE(review): mixed signed/unsigned comparison; on platforms where
  // ptrdiff_t is 32 bits, a huge *non_shared + *value_length could wrap
  // the uint32_t sum and slip past this check — verify on such targets.
  if (limit - p < (*non_shared + *value_length)) return NULL;
  return p;
}
// Iterator over the entries of a single block.  Entries are
// prefix-compressed (see the format comment in block_builder.cc), so
// they can only be decoded front-to-back; the restart array provides
// decode-from-scratch positions used by Seek() and Prev().
class Block::Iter : public Iterator {
 private:
  const Comparator* const comparator_;
  const char* const data_;       // underlying block contents
  uint32_t const restarts_;      // Offset of restart array (list of fixed32)
  uint32_t const num_restarts_;  // Number of uint32_t entries in restart array

  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
  uint32_t current_;
  uint32_t restart_index_;  // Index of restart block in which current_ falls
  std::string key_;         // Current key, reassembled from shared prefixes
  Slice value_;             // Points into data_ (not owned)
  Status status_;

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }

  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    return (value_.data() + value_.size()) - data_;
  }

  // Returns the block offset stored in restart array slot "index".
  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  // Positions the iterator just before the entry at restart point
  // "index"; the next ParseNextKey() call decodes that entry.
  void SeekToRestartPoint(uint32_t index) {
    key_.clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();
    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

 public:
  // "data" must outlive this iterator.  Starts out !Valid(); the caller
  // must invoke one of the Seek* methods before use.
  Iter(const Comparator* comparator,
       const char* data,
       uint32_t restarts,
       uint32_t num_restarts)
      : comparator_(comparator),
        data_(data),
        restarts_(restarts),
        num_restarts_(num_restarts),
        current_(restarts_),
        restart_index_(num_restarts_) {
    assert(num_restarts_ > 0);
  }

  virtual bool Valid() const { return current_ < restarts_; }
  virtual Status status() const { return status_; }
  virtual Slice key() const {
    assert(Valid());
    return key_;
  }
  virtual Slice value() const {
    assert(Valid());
    return value_;
  }

  virtual void Next() {
    assert(Valid());
    ParseNextKey();
  }

  virtual void Prev() {
    assert(Valid());

    // Scan backwards to a restart point before current_
    const uint32_t original = current_;
    while (GetRestartPoint(restart_index_) >= original) {
      if (restart_index_ == 0) {
        // No more entries
        current_ = restarts_;
        restart_index_ = num_restarts_;
        return;
      }
      restart_index_--;
    }

    // Re-decode forward from that restart point until we reach the
    // entry immediately before the one we started at.
    SeekToRestartPoint(restart_index_);
    do {
      // Loop until end of current entry hits the start of original entry
    } while (ParseNextKey() && NextEntryOffset() < original);
  }

  virtual void Seek(const Slice& target) {
    // Binary search in restart array to find the first restart point
    // with a key >= target
    uint32_t left = 0;
    uint32_t right = num_restarts_ - 1;
    while (left < right) {
      uint32_t mid = (left + right + 1) / 2;
      uint32_t region_offset = GetRestartPoint(mid);
      uint32_t shared, non_shared, value_length;
      const char* key_ptr = DecodeEntry(data_ + region_offset,
                                        data_ + restarts_,
                                        &shared, &non_shared, &value_length);
      if (key_ptr == NULL || (shared != 0)) {
        // A restart-point entry must store its full key (shared == 0).
        CorruptionError();
        return;
      }
      Slice mid_key(key_ptr, non_shared);
      if (Compare(mid_key, target) < 0) {
        // Key at "mid" is smaller than "target".  Therefore all
        // blocks before "mid" are uninteresting.
        left = mid;
      } else {
        // Key at "mid" is >= "target".  Therefore all blocks at or
        // after "mid" are uninteresting.
        right = mid - 1;
      }
    }

    // Linear search (within restart block) for first key >= target
    SeekToRestartPoint(left);
    while (true) {
      if (!ParseNextKey()) {
        return;
      }
      if (Compare(key_, target) >= 0) {
        return;
      }
    }
  }

  virtual void SeekToFirst() {
    SeekToRestartPoint(0);
    ParseNextKey();
  }

  virtual void SeekToLast() {
    // No backwards decoding: start at the last restart point and walk
    // forward until the final entry.
    SeekToRestartPoint(num_restarts_ - 1);
    while (ParseNextKey() && NextEntryOffset() < restarts_) {
      // Keep skipping
    }
  }

 private:
  // Marks the iterator invalid and records a corruption status.
  void CorruptionError() {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::Corruption("bad entry in block");
    key_.clear();
    value_.clear();
  }

  // Decodes the entry following the current one.  Returns false (and
  // invalidates the iterator) at end-of-block or on corruption.
  bool ParseNextKey() {
    current_ = NextEntryOffset();
    const char* p = data_ + current_;
    const char* limit = data_ + restarts_;  // Restarts come right after data
    if (p >= limit) {
      // No more entries to return.  Mark as invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return false;
    }

    // Decode next entry
    uint32_t shared, non_shared, value_length;
    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
    if (p == NULL || key_.size() < shared) {
      CorruptionError();
      return false;
    } else {
      // Reassemble the key: keep the shared prefix, append the delta.
      key_.resize(shared);
      key_.append(p, non_shared);
      value_ = Slice(p + non_shared, value_length);
      // Advance restart_index_ to the restart block containing current_.
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
      return true;
    }
  }
};
// Returns a new iterator over the block's entries, ordered by "cmp".
// A malformed block (size_ forced to 0 by the constructor, or too small
// for a restart trailer) yields an error iterator; a block with zero
// restart points has no entries and yields an empty iterator.
Iterator* Block::NewIterator(const Comparator* cmp) {
  if (size_ < 2*sizeof(uint32_t)) {
    return NewErrorIterator(Status::Corruption("bad block contents"));
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    return NewEmptyIterator();
  }
  return new Iter(cmp, data_, restart_offset_, num_restarts);
}
} |
@ -0,0 +1,43 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ | |||
#define STORAGE_LEVELDB_TABLE_BLOCK_H_ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "include/iterator.h" | |||
namespace leveldb { | |||
class Comparator; | |||
// An immutable, read-only view of one table block; entries are decoded
// by the iterator defined in block.cc (see the format description in
// block_builder.cc).
class Block {
 public:
  // Initialize the block with the specified contents.
  // Takes ownership of data[] and will delete[] it when done.
  Block(const char* data, size_t size);

  ~Block();

  size_t size() const { return size_; }

  // Returns a new heap-allocated iterator over the block's entries,
  // ordered by "comparator".  The caller owns the result; it must not
  // outlive this Block.
  Iterator* NewIterator(const Comparator* comparator);

 private:
  // Reads the restart count stored in the block's trailing fixed32.
  uint32_t NumRestarts() const;

  const char* data_;         // Owned; released with delete[]
  size_t size_;              // 0 serves as a corruption marker
  uint32_t restart_offset_;  // Offset in data_ of restart array

  // No copying allowed
  Block(const Block&);
  void operator=(const Block&);

  class Iter;
};
} | |||
#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ |
@ -0,0 +1,109 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// BlockBuilder generates blocks where keys are prefix-compressed: | |||
// | |||
// When we store a key, we drop the prefix shared with the previous | |||
// string. This helps reduce the space requirement significantly. | |||
// Furthermore, once every K keys, we do not apply the prefix | |||
// compression and store the entire key. We call this a "restart | |||
// point". The tail end of the block stores the offsets of all of the | |||
// restart points, and can be used to do a binary search when looking | |||
// for a particular key. Values are stored as-is (without compression) | |||
// immediately following the corresponding key. | |||
// | |||
// An entry for a particular key-value pair has the form: | |||
// shared_bytes: varint32 | |||
// unshared_bytes: varint32 | |||
// value_length: varint32 | |||
// key_delta: char[unshared_bytes] | |||
// value: char[value_length] | |||
// shared_bytes == 0 for restart points. | |||
// | |||
// The trailer of the block has the form: | |||
// restarts: uint32[num_restarts] | |||
// num_restarts: uint32 | |||
// restarts[i] contains the offset within the block of the ith restart point. | |||
#include "table/block_builder.h" | |||
#include <algorithm> | |||
#include <assert.h> | |||
#include "include/comparator.h" | |||
#include "include/table_builder.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Constructs an empty builder.  "options" supplies the comparator used
// to sanity-check key ordering and block_restart_interval (entries per
// restart point); it must outlive this builder.
BlockBuilder::BlockBuilder(const Options* options)
    : options_(options),
      restarts_(),
      counter_(0),
      finished_(false) {
  assert(options->block_restart_interval >= 1);
  restarts_.push_back(0);       // First restart point is at offset 0
}
// Discards all buffered entries so this builder can be reused for a
// fresh block, exactly as if it had just been constructed.
void BlockBuilder::Reset() {
  finished_ = false;
  counter_ = 0;
  buffer_.clear();
  last_key_.clear();
  restarts_.clear();
  restarts_.push_back(0);  // First restart point is at offset 0
}
size_t BlockBuilder::CurrentSizeEstimate() const { | |||
return (buffer_.size() + // Raw data buffer | |||
restarts_.size() * sizeof(uint32_t) + // Restart array | |||
sizeof(uint32_t)); // Restart array length | |||
} | |||
// Appends the restart array and its length, and returns a slice over
// the finished block contents.  The slice stays valid until this
// builder is Reset() or destroyed.
//
// Fixes: loop index was a signed int compared against restarts_.size()
// (signed/unsigned mismatch), and restarts_.size() was passed to
// PutFixed32 with an implicit size_t -> uint32_t narrowing.
Slice BlockBuilder::Finish() {
  // Append restart array
  for (size_t i = 0; i < restarts_.size(); i++) {
    PutFixed32(&buffer_, restarts_[i]);
  }
  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
  finished_ = true;
  return Slice(buffer_);
}
// Appends one key/value entry, prefix-compressing "key" against the
// previously added key unless a restart point is due.
// REQUIRES: Finish() has not been called; key > any previously added key.
void BlockBuilder::Add(const Slice& key, const Slice& value) {
  Slice last_key_piece(last_key_);
  assert(!finished_);
  assert(counter_ <= options_->block_restart_interval);
  assert(buffer_.empty() // No values yet?
         || options_->comparator->Compare(key, last_key_piece) > 0);
  size_t shared = 0;
  if (counter_ < options_->block_restart_interval) {
    // See how much sharing to do with previous string
    const size_t min_length = std::min(last_key_piece.size(), key.size());
    while ((shared < min_length) && (last_key_[shared] == key[shared])) {
      shared++;
    }
  } else {
    // Restart compression: emit the full key and record a new restart
    // point at the current buffer offset.
    restarts_.push_back(buffer_.size());
    counter_ = 0;
  }
  const size_t non_shared = key.size() - shared;

  // Add "<shared><non_shared><value_size>" to buffer_
  PutVarint32(&buffer_, shared);
  PutVarint32(&buffer_, non_shared);
  PutVarint32(&buffer_, value.size());

  // Add string delta to buffer_ followed by value
  buffer_.append(key.data() + shared, non_shared);
  buffer_.append(value.data(), value.size());

  // Update state: rebuild last_key_ in place rather than copying the
  // whole key (only the non-shared suffix changes).
  last_key_.resize(shared);
  last_key_.append(key.data() + shared, non_shared);
  assert(Slice(last_key_) == key);
  counter_++;
}
} |
@ -0,0 +1,57 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ | |||
#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ | |||
#include <vector> | |||
#include <stdint.h> | |||
#include "include/slice.h" | |||
namespace leveldb { | |||
struct Options; | |||
// Incrementally builds one prefix-compressed table block; the format is
// documented at the top of block_builder.cc.
class BlockBuilder {
 public:
  explicit BlockBuilder(const Options* options);

  // Reset the contents as if the BlockBuilder was just constructed.
  void Reset();

  // REQUIRES: Finish() has not been called since the last call to Reset().
  // REQUIRES: key is larger than any previously added key
  void Add(const Slice& key, const Slice& value);

  // Finish building the block and return a slice that refers to the
  // block contents.  The returned slice will remain valid for the
  // lifetime of this builder or until Reset() is called.
  Slice Finish();

  // Returns an estimate of the current (uncompressed) size of the block
  // we are building.
  size_t CurrentSizeEstimate() const;

  // Return true iff no entries have been added since the last Reset()
  bool empty() const {
    return buffer_.empty();
  }

 private:
  const Options* options_;          // Not owned
  std::string buffer_;              // Destination buffer
  std::vector<uint32_t> restarts_;  // Restart points
  int counter_;                     // Number of entries emitted since restart
  bool finished_;                   // Has Finish() been called?
  std::string last_key_;            // Most recently added key

  // No copying allowed
  BlockBuilder(const BlockBuilder&);
  void operator=(const BlockBuilder&);
};
} | |||
#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ |
@ -0,0 +1,131 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "table/format.h" | |||
#include "include/env.h" | |||
#include "port/port.h" | |||
#include "table/block.h" | |||
#include "util/coding.h" | |||
#include "util/crc32c.h" | |||
namespace leveldb { | |||
// Appends the varint64 encodings of (offset, size) to *dst.
// REQUIRES: both fields have been set (the ~0 sentinels from the
// default constructor indicate "unset").
void BlockHandle::EncodeTo(std::string* dst) const {
  // Sanity check that all fields have been set
  assert(offset_ != ~static_cast<uint64_t>(0));
  assert(size_ != ~static_cast<uint64_t>(0));
  PutVarint64(dst, offset_);
  PutVarint64(dst, size_);
}
// Parses two varint64s (offset, then size) from the front of *input,
// consuming the decoded bytes.  Returns Corruption if either varint is
// missing or malformed.
Status BlockHandle::DecodeFrom(Slice* input) {
  if (!GetVarint64(input, &offset_) || !GetVarint64(input, &size_)) {
    return Status::Corruption("bad block handle");
  }
  return Status::OK();
}
// Serializes the footer: two block handles, zero-padded out to a fixed
// width, followed by the 64-bit magic number as two fixed32s.  Always
// produces exactly kEncodedLength bytes.
void Footer::EncodeTo(std::string* dst) const {
#ifndef NDEBUG
  const size_t original_size = dst->size();
#endif
  metaindex_handle_.EncodeTo(dst);
  index_handle_.EncodeTo(dst);
  // Varint handles are variable-width; resize() pads with NULs so the
  // magic number always lands at a fixed offset from the end.
  dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding
  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
  assert(dst->size() == original_size + kEncodedLength);
}
// Parses a footer from *input, which must hold at least kEncodedLength
// bytes starting at the footer.  On success, advances *input past the
// footer's padding (to just after the magic number).
//
// Fixes: the magic number was read from input->data() + kEncodedLength
// - 8 without first checking input->size(), so a short slice caused an
// out-of-bounds read before any validation could fail.
Status Footer::DecodeFrom(Slice* input) {
  if (input->size() < kEncodedLength) {
    return Status::InvalidArgument("not an sstable (footer too short)");
  }
  const char* magic_ptr = input->data() + kEncodedLength - 8;
  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
                          (static_cast<uint64_t>(magic_lo)));
  if (magic != kTableMagicNumber) {
    return Status::InvalidArgument("not an sstable (bad magic number)");
  }

  Status result = metaindex_handle_.DecodeFrom(input);
  if (result.ok()) {
    result = index_handle_.DecodeFrom(input);
  }
  if (result.ok()) {
    // We skip over any leftover data (just padding for now) in "input"
    const char* end = magic_ptr + 8;
    *input = Slice(end, input->data() + input->size() - end);
  }
  return result;
}
// Reads the block at "handle" from "file", verifies its checksum if
// requested, undoes any compression, and on success stores a new
// heap-allocated Block in *block.  On failure *block is NULL and all
// temporary buffers have been released.
Status ReadBlock(RandomAccessFile* file,
                 const ReadOptions& options,
                 const BlockHandle& handle,
                 Block** block) {
  *block = NULL;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
  size_t n = handle.size();
  char* buf = new char[n + kBlockTrailerSize];
  Slice contents;
  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
  if (!s.ok()) {
    delete[] buf;
    return s;
  }
  if (contents.size() != n + kBlockTrailerSize) {
    delete[] buf;
    return Status::Corruption("truncated block read");
  }

  // Check the crc of the type and the block contents
  const char* data = contents.data();    // Pointer to where Read put the data
  if (options.verify_checksums) {
    // The stored crc covers the data plus the 1-byte type; it is masked
    // on disk, hence the Unmask before comparing.
    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
    const uint32_t actual = crc32c::Value(data, n + 1);
    if (actual != crc) {
      delete[] buf;
      s = Status::Corruption("block checksum mismatch");
      return s;
    }
  }

  // data[n] is the 1-byte compression type.
  switch (data[n]) {
    case kNoCompression:
      if (data != buf) {
        // File implementation gave us pointer to some other data.
        // Copy into buf[].
        memcpy(buf, data, n + kBlockTrailerSize);
      }
      // Ok
      break;
    case kLightweightCompression: {
      std::string decompressed;
      if (!port::Lightweight_Uncompress(data, n, &decompressed)) {
        delete[] buf;
        s = Status::Corruption("corrupted compressed block contents");
        return s;
      }
      // Replace buf with a freshly sized buffer holding the
      // uncompressed bytes; Block below takes ownership of it.
      delete[] buf;     // Done with uncompressed data
      buf = new char[decompressed.size()];
      memcpy(buf, decompressed.data(), decompressed.size());
      n = decompressed.size();
      break;
    }
    default:
      delete[] buf;
      return Status::Corruption("bad block type");
  }

  *block = new Block(buf, n);  // Block takes ownership of buf[]
  return Status::OK();
}
} |
@ -0,0 +1,103 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ | |||
#define STORAGE_LEVELDB_TABLE_FORMAT_H_ | |||
#include <string> | |||
#include <stdint.h> | |||
#include "include/slice.h" | |||
#include "include/status.h" | |||
#include "include/table_builder.h" | |||
namespace leveldb { | |||
class Block; | |||
class RandomAccessFile; | |||
struct ReadOptions; | |||
// BlockHandle is a pointer to the extent of a file that stores a data | |||
// block or a meta block. | |||
// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block: a (file offset, byte count) pair, serialized
// as two varint64s.
class BlockHandle {
 public:
  // Default-constructs with both fields set to the ~0 "unset" sentinel;
  // EncodeTo() asserts that they have been filled in.
  BlockHandle();

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
  void set_offset(uint64_t offset) { offset_ = offset; }

  // The size of the stored block
  uint64_t size() const { return size_; }
  void set_size(uint64_t size) { size_ = size; }

  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

  // Maximum encoding length of a BlockHandle (two 10-byte varint64s)
  enum { kMaxEncodedLength = 10 + 10 };

 private:
  uint64_t offset_;
  uint64_t size_;
};
// Footer encapsulates the fixed information stored at the tail | |||
// end of every table file. | |||
// Footer encapsulates the fixed information stored at the tail
// end of every table file: the handles of the metaindex and index
// blocks, padding, and the table magic number.
class Footer {
 public:
  Footer() { }

  // The block handle for the metaindex block of the table
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }

  // The block handle for the index block of the table
  const BlockHandle& index_handle() const {
    return index_handle_;
  }
  void set_index_handle(const BlockHandle& h) {
    index_handle_ = h;
  }

  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

  // Encoded length of a Footer.  Note that the serialization of a
  // Footer will always occupy exactly this many bytes.  It consists
  // of two block handles and a magic number.
  enum {
    kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
  };

 private:
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
};
// kTableMagicNumber was picked by running | |||
// echo http://code.google.com/p/leveldb/ | sha1sum | |||
// and taking the leading 64 bits. | |||
static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; | |||
// 1-byte type + 32-bit crc | |||
static const size_t kBlockTrailerSize = 5; | |||
// Read the block identified by "handle" from "file". On success, | |||
// store a pointer to the heap-allocated result in *block and return | |||
// OK. On failure store NULL in *block and return non-OK. | |||
extern Status ReadBlock(RandomAccessFile* file, | |||
const ReadOptions& options, | |||
const BlockHandle& handle, | |||
Block** block); | |||
// Implementation details follow.  Clients should ignore.
// Both fields start at ~0, a sentinel meaning "unset"; EncodeTo()
// asserts they have been filled in before serializing.
inline BlockHandle::BlockHandle()
    : offset_(~static_cast<uint64_t>(0)),
      size_(~static_cast<uint64_t>(0)) {
}
} | |||
#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ |
@ -0,0 +1,68 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/iterator.h" | |||
#include "util/logging.h" | |||
namespace leveldb { | |||
// Start with an empty cleanup list: the head node is stored inline and
// stays unused (function == NULL) until RegisterCleanup() is called.
Iterator::Iterator() {
  cleanup_.next = NULL;
  cleanup_.function = NULL;
}
// Runs every registered cleanup function.  The head node lives inside
// the iterator and is not deleted; all subsequently registered nodes
// were heap-allocated by RegisterCleanup() and are freed here.
Iterator::~Iterator() {
  if (cleanup_.function != NULL) {
    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
    for (Cleanup* c = cleanup_.next; c != NULL; ) {
      (*c->function)(c->arg1, c->arg2);
      Cleanup* next = c->next;
      delete c;
      c = next;
    }
  }
}
// Records (func, arg1, arg2) to be invoked when this iterator is
// destroyed.  The first registration uses the inline head node; later
// ones are heap-allocated and pushed right after the head.
void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
  assert(func != NULL);
  Cleanup* slot;
  if (cleanup_.function == NULL) {
    slot = &cleanup_;  // Head node is still free
  } else {
    slot = new Cleanup;
    slot->next = cleanup_.next;
    cleanup_.next = slot;
  }
  slot->function = func;
  slot->arg1 = arg1;
  slot->arg2 = arg2;
}
namespace { | |||
class EmptyIterator : public Iterator { | |||
public: | |||
EmptyIterator(const Status& s) : status_(s) { } | |||
virtual bool Valid() const { return false; } | |||
virtual void Seek(const Slice& target) { } | |||
virtual void SeekToFirst() { } | |||
virtual void SeekToLast() { } | |||
virtual void Next() { assert(false); } | |||
virtual void Prev() { assert(false); } | |||
Slice key() const { assert(false); return Slice(); } | |||
Slice value() const { assert(false); return Slice(); } | |||
virtual Status status() const { return status_; } | |||
private: | |||
Status status_; | |||
}; | |||
} | |||
// Returns a new heap-allocated iterator over zero entries with an OK
// status.  Caller owns the result.
Iterator* NewEmptyIterator() {
  return new EmptyIterator(Status::OK());
}
// Returns a new heap-allocated iterator over zero entries whose
// status() reports "status".  Caller owns the result.
Iterator* NewErrorIterator(const Status& status) {
  return new EmptyIterator(status);
}
} |
@ -0,0 +1,64 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ | |||
#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ | |||
namespace leveldb { | |||
// A internal wrapper class with an interface similar to Iterator that | |||
// caches the valid() and key() results for an underlying iterator. | |||
// This can help avoid virtual function calls and also gives better | |||
// cache locality. | |||
// A internal wrapper class with an interface similar to Iterator that
// caches the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
class IteratorWrapper {
 private:
  Iterator* iter_;  // Owned; may be NULL
  bool valid_;      // Cached iter_->Valid(); false when iter_ is NULL
  Slice key_;       // Cached iter_->key(); meaningful only when valid_
 public:
  IteratorWrapper(): iter_(NULL), valid_(false) { }
  explicit IteratorWrapper(Iterator* iter): iter_(NULL) {
    Set(iter);
  }
  ~IteratorWrapper() { delete iter_; }
  Iterator* iter() const { return iter_; }

  // Takes ownership of "iter" and will delete it when destroyed, or
  // when Set() is invoked again.
  void Set(Iterator* iter) {
    delete iter_;
    iter_ = iter;
    if (iter_ == NULL) {
      valid_ = false;
    } else {
      Update();
    }
  }

  // Iterator interface methods
  bool Valid() const        { return valid_; }
  Slice key() const         { assert(Valid()); return key_; }
  Slice value() const       { assert(Valid()); return iter_->value(); }
  // Methods below require iter() != NULL
  Status status() const     { assert(iter_); return iter_->status(); }
  void Next()               { assert(iter_); iter_->Next();        Update(); }
  void Prev()               { assert(iter_); iter_->Prev();        Update(); }
  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k);       Update(); }
  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
  void SeekToLast()         { assert(iter_); iter_->SeekToLast();  Update(); }

 private:
  // Refreshes the cached valid_/key_ after any repositioning call.
  void Update() {
    valid_ = iter_->Valid();
    if (valid_) {
      key_ = iter_->key();
    }
  }
};
} | |||
#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ |
@ -0,0 +1,143 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "table/merger.h" | |||
#include "include/comparator.h" | |||
#include "include/iterator.h" | |||
#include "table/iterator_wrapper.h" | |||
namespace leveldb { | |||
namespace { | |||
// Merges n child iterators into a single sorted stream.  current_
// always points at the child whose key is next in iteration order
// (smallest for forward scans, largest for backward scans).
class MergingIterator : public Iterator {
 public:
  // Takes ownership of children[0..n-1] (the array itself is copied).
  MergingIterator(const Comparator* comparator, Iterator** children, int n)
      : comparator_(comparator),
        children_(new IteratorWrapper[n]),
        n_(n),
        current_(NULL) {
    for (int i = 0; i < n; i++) {
      children_[i].Set(children[i]);
    }
  }

  virtual ~MergingIterator() {
    delete[] children_;
  }

  virtual bool Valid() const {
    return (current_ != NULL);
  }

  virtual void SeekToFirst() {
    for (int i = 0; i < n_; i++) {
      children_[i].SeekToFirst();
    }
    FindSmallest();
  }

  virtual void SeekToLast() {
    for (int i = 0; i < n_; i++) {
      children_[i].SeekToLast();
    }
    FindLargest();
  }

  virtual void Seek(const Slice& target) {
    for (int i = 0; i < n_; i++) {
      children_[i].Seek(target);
    }
    FindSmallest();
  }

  // NOTE(review): Next()/Prev() only advance current_ and re-scan; the
  // other children are not repositioned, so alternating Next() and
  // Prev() on the same iterator may not behave symmetrically — verify
  // against callers' usage.
  virtual void Next() {
    assert(Valid());
    current_->Next();
    FindSmallest();
  }

  virtual void Prev() {
    assert(Valid());
    current_->Prev();
    FindLargest();
  }

  virtual Slice key() const {
    assert(Valid());
    return current_->key();
  }

  virtual Slice value() const {
    assert(Valid());
    return current_->value();
  }

  // Returns the first non-OK child status, if any.
  virtual Status status() const {
    Status status;
    for (int i = 0; i < n_; i++) {
      status = children_[i].status();
      if (!status.ok()) {
        break;
      }
    }
    return status;
  }

 private:
  void FindSmallest();
  void FindLargest();

  // We might want to use a heap in case there are lots of children.
  // For now we use a simple array since we expect a very small number
  // of children in leveldb.
  const Comparator* comparator_;
  IteratorWrapper* children_;  // Owned array of n_ wrappers
  int n_;
  IteratorWrapper* current_;   // Points into children_; NULL if !Valid()
};
void MergingIterator::FindSmallest() { | |||
IteratorWrapper* smallest = NULL; | |||
for (int i = 0; i < n_; i++) { | |||
IteratorWrapper* child = &children_[i]; | |||
if (child->Valid()) { | |||
if (smallest == NULL) { | |||
smallest = child; | |||
} else if (comparator_->Compare(child->key(), smallest->key()) < 0) { | |||
smallest = child; | |||
} | |||
} | |||
} | |||
current_ = smallest; | |||
} | |||
void MergingIterator::FindLargest() { | |||
IteratorWrapper* largest = NULL; | |||
for (int i = n_-1; i >= 0; i--) { | |||
IteratorWrapper* child = &children_[i]; | |||
if (child->Valid()) { | |||
if (largest == NULL) { | |||
largest = child; | |||
} else if (comparator_->Compare(child->key(), largest->key()) > 0) { | |||
largest = child; | |||
} | |||
} | |||
} | |||
current_ = largest; | |||
} | |||
} | |||
// Returns an iterator over the union of list[0..n-1], taking ownership
// of the children.  Small cases are special-cased: zero children merge
// to an empty iterator, and a single child needs no merging at all.
Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
  assert(n >= 0);
  switch (n) {
    case 0:
      return NewEmptyIterator();
    case 1:
      return list[0];
    default:
      return new MergingIterator(cmp, list, n);
  }
}
} |
@ -0,0 +1,26 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ | |||
#define STORAGE_LEVELDB_TABLE_MERGER_H_ | |||
namespace leveldb { | |||
class Comparator; | |||
class Iterator; | |||
// Return an iterator that provides the union of the data in | |||
// children[0,n-1]. Takes ownership of the child iterators and | |||
// will delete them when the result iterator is deleted. | |||
// | |||
// The result does no duplicate suppression. I.e., if a particular | |||
// key is present in K child iterators, it will be yielded K times. | |||
// | |||
// REQUIRES: n >= 0 | |||
extern Iterator* NewMergingIterator( | |||
const Comparator* comparator, Iterator** children, int n); | |||
} | |||
#endif // STORAGE_LEVELDB_TABLE_MERGER_H_ |
@ -0,0 +1,175 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/table.h" | |||
#include "include/cache.h" | |||
#include "include/env.h" | |||
#include "table/block.h" | |||
#include "table/format.h" | |||
#include "table/two_level_iterator.h" | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Immutable state shared by a Table and the iterators it creates.
struct Table::Rep {
  ~Rep() {
    delete index_block;
  }

  Options options;
  Status status;
  RandomAccessFile* file;  // Presumably not owned — confirm against callers
  uint64_t cache_id;       // Prefix for block-cache keys; 0 if no cache
  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  Block* index_block;      // Owned; released in the destructor above
};
// Opens the sstable in "file": reads the footer and then the index
// block, and on success stores a new heap-allocated Table in *table.
// On failure *table is NULL and nothing allocated here is leaked.
// "file" must outlive the returned Table; on failure the caller
// presumably still owns it — confirm against callers.
Status Table::Open(const Options& options,
                   RandomAccessFile* file,
                   Table** table) {
  *table = NULL;
  const uint64_t size = file->Size();
  if (size < Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  // The footer occupies the last kEncodedLength bytes of the file.
  char footer_space[Footer::kEncodedLength];
  Slice footer_input;
  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                        &footer_input, footer_space);
  if (!s.ok()) return s;

  Footer footer;
  s = footer.DecodeFrom(&footer_input);
  if (!s.ok()) return s;

  // Read the index block
  Block* index_block = NULL;
  if (s.ok()) {
    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
  }

  if (s.ok()) {
    // We've successfully read the footer and the index block: we're
    // ready to serve requests.
    Rep* rep = new Table::Rep;
    rep->options = options;
    rep->file = file;
    rep->metaindex_handle = footer.metaindex_handle();
    rep->index_block = index_block;
    // A per-table id makes cache keys unique across tables sharing one cache.
    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
    *table = new Table(rep);
  } else {
    if (index_block) delete index_block;
  }

  return s;
}
// Releases the private representation (and hence the index block).
// The underlying RandomAccessFile is deliberately not deleted here.
Table::~Table() {
  delete rep_;
}
// Iterator cleanup callback for uncached blocks: the iterator owns the
// block outright, so delete it when the iterator dies.
static void DeleteBlock(void* arg, void* ignored) {
  delete reinterpret_cast<Block*>(arg);
}

// Cache deleter: invoked by the block cache when an entry is dropped.
static void DeleteCachedBlock(const Slice& key, void* value) {
  Block* block = reinterpret_cast<Block*>(value);
  delete block;
}

// Iterator cleanup callback for cached blocks: only release the cache
// handle; the cache itself decides when to delete the block.
static void ReleaseBlock(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
//
// "arg" is the Table* (threaded through NewTwoLevelIterator).  When a
// block cache is configured, the block is looked up / inserted there
// under the key (cache_id, block offset); otherwise it is read from
// the file on every call.
Iterator* Table::BlockReader(void* arg,
                             const ReadOptions& options,
                             const Slice& index_value) {
  Table* table = reinterpret_cast<Table*>(arg);
  Cache* block_cache = table->rep_->options.block_cache;
  Block* block = NULL;
  Cache::Handle* cache_handle = NULL;

  BlockHandle handle;
  Slice input = index_value;
  Status s = handle.DecodeFrom(&input);
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.

  if (s.ok()) {
    if (block_cache != NULL) {
      // Cache key: 8-byte table cache_id followed by 8-byte block offset.
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
      EncodeFixed64(cache_key_buffer+8, handle.offset());
      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
      cache_handle = block_cache->Lookup(key);
      if (cache_handle != NULL) {
        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
      } else {
        s = ReadBlock(table->rep_->file, options, handle, &block);
        if (s.ok() && options.fill_cache) {
          cache_handle = block_cache->Insert(
              key, block, block->size(), &DeleteCachedBlock);
        }
      }
    } else {
      s = ReadBlock(table->rep_->file, options, handle, &block);
    }
  }

  Iterator* iter;
  if (block != NULL) {
    iter = block->NewIterator(table->rep_->options.comparator);
    // Arrange for the block (or its cache handle) to be released when
    // the iterator is destroyed.
    if (cache_handle == NULL) {
      iter->RegisterCleanup(&DeleteBlock, block, NULL);
    } else {
      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
    }
  } else {
    iter = NewErrorIterator(s);
  }
  return iter;
}
// Returns a two-level iterator over the table: the first level walks
// the index block, and BlockReader turns each index entry into an
// iterator over the corresponding data block.
Iterator* Table::NewIterator(const ReadOptions& options) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  Table* self = const_cast<Table*>(this);
  return NewTwoLevelIterator(index_iter, &Table::BlockReader, self, options);
}
// Returns an approximate byte offset in the file where data for "key"
// would begin.  Implemented by seeking in the index block.  When the
// key is past the last key in the file -- or the matching index entry
// cannot be decoded -- the offset of the metaindex block is returned,
// which is close to the end of the file.
uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
  // Default answer: the metaindex block sits just before the footer,
  // so its offset approximates "end of file".
  uint64_t result = rep_->metaindex_handle.offset();

  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  index_iter->Seek(key);
  if (index_iter->Valid()) {
    BlockHandle handle;
    Slice input = index_iter->value();
    if (handle.DecodeFrom(&input).ok()) {
      result = handle.offset();
    }
    // Otherwise: strange -- the index entry failed to decode; fall
    // back to the metaindex offset chosen above.
  }
  delete index_iter;
  return result;
}
} |
@ -0,0 +1,224 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/table_builder.h" | |||
#include <assert.h> | |||
#include <stdio.h> | |||
#include "include/comparator.h" | |||
#include "include/env.h" | |||
#include "table/block_builder.h" | |||
#include "table/format.h" | |||
#include "util/coding.h" | |||
#include "util/crc32c.h" | |||
#include "util/logging.h" | |||
namespace leveldb { | |||
// Private state for a TableBuilder.  "file" is not owned (nothing here
// deletes it); the two BlockBuilders hold pointers to the Options
// members below.
struct TableBuilder::Rep {
  Options options;              // Options for data blocks.
  Options index_block_options;  // Copy with restart interval forced to 1.
  WritableFile* file;           // Destination file (not owned).
  uint64_t offset;              // Bytes written to "file" so far.
  Status status;                // Latched error from file operations.
  BlockBuilder data_block;
  BlockBuilder index_block;
  std::string last_key;         // Most recently Add()ed key.
  int64_t num_entries;
  bool closed;          // Either Finish() or Abandon() has been called.

  // We do not emit the index entry for a block until we have seen the
  // first key for the next data block.  This allows us to use shorter
  // keys in the index block.  For example, consider a block boundary
  // between the keys "the quick brown fox" and "the who".  We can use
  // "the r" as the key for the index block entry since it is >= all
  // entries in the first block and < all entries in subsequent
  // blocks.
  //
  // Invariant: r->pending_index_entry is true only if data_block is empty.
  bool pending_index_entry;
  BlockHandle pending_handle;  // Handle to add to index block

  std::string compressed_output;  // Scratch buffer reused by WriteBlock().

  Rep(const Options& opt, WritableFile* f)
      : options(opt),
        index_block_options(opt),
        file(f),
        offset(0),
        data_block(&options),
        index_block(&index_block_options),
        num_entries(0),
        closed(false),
        pending_index_entry(false) {
    // NOTE(review): interval 1 presumably makes every index entry
    // independently decodable (no delta-encoding) -- confirm against
    // BlockBuilder's restart semantics.
    index_block_options.block_restart_interval = 1;
  }
};
// Creates a builder that writes a table to "*file".  The caller keeps
// ownership of "file" (nothing in the builder deletes it).
TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
}
// REQUIRES: Finish() or Abandon() has been called.
TableBuilder::~TableBuilder() {
  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
  delete rep_;
}
// Swap in new options mid-build.  The comparator must not change,
// since entries already emitted were ordered by the old one.
Status TableBuilder::ChangeOptions(const Options& options) {
  // Note: if more fields are added to Options, update
  // this function to catch changes that should not be allowed to
  // change in the middle of building a Table.
  const bool comparator_changed =
      (options.comparator != rep_->options.comparator);
  if (comparator_changed) {
    return Status::InvalidArgument("changing comparator while building table");
  }

  // Note that any live BlockBuilders point to rep_->options and therefore
  // will automatically pick up the updated options.
  rep_->options = options;
  rep_->index_block_options = options;
  rep_->index_block_options.block_restart_interval = 1;
  return Status::OK();
}
// Add a key/value pair to the table under construction.
// REQUIRES: keys arrive in strictly increasing order per the
// configured comparator; Finish()/Abandon() not yet called.
void TableBuilder::Add(const Slice& key, const Slice& value) {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->num_entries > 0) {
    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
  }

  if (r->pending_index_entry) {
    // The previous data block was just flushed.  Now that the first
    // key of the next block is known, emit a shortened separator key
    // for the index (see the comment on Rep::pending_index_entry).
    assert(r->data_block.empty());
    r->options.comparator->FindShortestSeparator(&r->last_key, key);
    std::string handle_encoding;
    r->pending_handle.EncodeTo(&handle_encoding);
    r->index_block.Add(r->last_key, Slice(handle_encoding));
    r->pending_index_entry = false;
  }

  r->last_key.assign(key.data(), key.size());
  r->num_entries++;
  r->data_block.Add(key, value);

  // Start a new block once the current one reaches the target size.
  const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
  if (estimated_block_size >= r->options.block_size) {
    Flush();
  }
}
// Write the buffered data block (if non-empty) to the file and flush
// the file.  Called by Add() when a block fills up.
void TableBuilder::Flush() {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->data_block.empty()) return;
  assert(!r->pending_index_entry);
  WriteBlock(&r->data_block, &r->pending_handle);
  if (ok()) {
    // Defer this block's index entry until the next Add() so a short
    // separator key can be chosen (see Rep::pending_index_entry).
    r->pending_index_entry = true;
    r->status = r->file->Flush();
  }
}
// Serialize "*block" to the file, recording its (offset, size) in
// "*handle".  Optionally compresses the contents first, falling back
// to the raw form when compression does not pay off.  Any I/O error is
// latched into rep_->status; "*block" is reset for reuse either way.
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  // File format contains a sequence of blocks where each block has:
  //    block_data: uint8[n]
  //    type: uint8
  //    crc: uint32
  assert(ok());
  Rep* r = rep_;
  Slice raw = block->Finish();

  Slice block_contents;
  CompressionType type = r->options.compression;
  // TODO(postrelease): Support more compression options: zlib?
  switch (type) {
    case kNoCompression:
      block_contents = raw;
      break;

    case kLightweightCompression: {
      port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output);
      block_contents = r->compressed_output;
      if (block_contents.size() >= raw.size() - (raw.size() / 8u)) {
        // Compressed less than 12.5%, so just store uncompressed form
        block_contents = raw;
        type = kNoCompression;
      }
      break;
    }
  }

  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents);
  if (r->status.ok()) {
    // Trailer: 1-byte compression type + masked crc32c covering both
    // the block contents and the type byte.
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
    crc = crc32c::Extend(crc, trailer, 1);  // Extend crc to cover block type
    EncodeFixed32(trailer+1, crc32c::Mask(crc));
    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
    if (r->status.ok()) {
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
  r->compressed_output.clear();
  block->Reset();
}
// Returns the first error hit by a prior operation, or OK.
Status TableBuilder::status() const {
  return rep_->status;
}
// Finalize the table: flush the last data block, then append the
// metaindex block, the index block, and the footer.  Marks the builder
// closed; no further Add/Flush calls are allowed.
Status TableBuilder::Finish() {
  Rep* r = rep_;
  Flush();
  assert(!r->closed);
  r->closed = true;
  BlockHandle metaindex_block_handle;
  BlockHandle index_block_handle;

  // Write (currently empty) metaindex block.
  if (ok()) {
    BlockBuilder meta_index_block(&r->options);
    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle);
  }

  // Write the index block, first emitting the deferred entry for the
  // final data block.  There is no following key to separate from, so
  // a short successor of the last key is used instead.
  if (ok()) {
    if (r->pending_index_entry) {
      r->options.comparator->FindShortSuccessor(&r->last_key);
      std::string handle_encoding;
      r->pending_handle.EncodeTo(&handle_encoding);
      r->index_block.Add(r->last_key, Slice(handle_encoding));
      r->pending_index_entry = false;
    }
    WriteBlock(&r->index_block, &index_block_handle);
  }

  // Write the footer pointing at the metaindex and index blocks.
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding);
    if (r->status.ok()) {
      r->offset += footer_encoding.size();
    }
  }
  return r->status;
}
// Abandon the table under construction: mark the builder closed (so
// the destructor's check passes) without writing any further data.
void TableBuilder::Abandon() {
  assert(!rep_->closed);
  rep_->closed = true;
}
// Number of key/value pairs added so far.
uint64_t TableBuilder::NumEntries() const {
  return rep_->num_entries;
}

// Bytes written to the file so far (offset only advances when a block
// or the footer is appended, so buffered data is excluded).
uint64_t TableBuilder::FileSize() const {
  return rep_->offset;
}
} |
@ -0,0 +1,808 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/table.h" | |||
#include <map> | |||
#include "db/dbformat.h" | |||
#include "db/memtable.h" | |||
#include "db/write_batch_internal.h" | |||
#include "include/db.h" | |||
#include "include/env.h" | |||
#include "include/iterator.h" | |||
#include "include/table_builder.h" | |||
#include "table/block.h" | |||
#include "table/block_builder.h" | |||
#include "table/format.h" | |||
#include "util/random.h" | |||
#include "util/testharness.h" | |||
#include "util/testutil.h" | |||
namespace leveldb { | |||
// Return reverse of "key". | |||
// Used to test non-lexicographic comparators. | |||
static std::string Reverse(const Slice& key) { | |||
std::string str(key.ToString()); | |||
std::string rev(str.rbegin(), str.rend()); | |||
return rev; | |||
} | |||
namespace {
// A comparator that orders keys by the bytewise order of their
// reversals.  Used to exercise code paths that must not assume the
// default bytewise comparator.
class ReverseKeyComparator : public Comparator {
 public:
  virtual const char* Name() const {
    return "leveldb.ReverseBytewiseComparator";
  }

  virtual int Compare(const Slice& a, const Slice& b) const {
    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
  }

  // Delegate to the bytewise implementation in reversed space, then
  // map the result back.
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const {
    std::string s = Reverse(*start);
    std::string l = Reverse(limit);
    BytewiseComparator()->FindShortestSeparator(&s, l);
    *start = Reverse(s);
  }

  virtual void FindShortSuccessor(std::string* key) const {
    std::string s = Reverse(*key);
    BytewiseComparator()->FindShortSuccessor(&s);
    *key = Reverse(s);
  }
};
}  // namespace
// Shared instance used by the reverse-order test configurations.
static ReverseKeyComparator reverse_key_comparator;

// Mutate "*key" into a slightly larger key under "cmp": append '\0'
// for bytewise order, or append in reversed space for the reverse
// comparator.  Only these two comparators are supported.
static void Increment(const Comparator* cmp, std::string* key) {
  if (cmp == BytewiseComparator()) {
    key->push_back('\0');
  } else {
    assert(cmp == &reverse_key_comparator);
    std::string rev = Reverse(*key);
    rev.push_back('\0');
    *key = Reverse(rev);
  }
}
// An STL comparator that uses a Comparator
namespace {
struct STLLessThan {
  const Comparator* cmp;  // Not owned.

  STLLessThan() : cmp(BytewiseComparator()) { }
  STLLessThan(const Comparator* c) : cmp(c) { }
  bool operator()(const std::string& a, const std::string& b) const {
    return cmp->Compare(Slice(a), Slice(b)) < 0;
  }
};
}  // namespace
// An in-memory WritableFile: Append() accumulates bytes into a string;
// Close/Flush/Sync are no-ops that always succeed.
class StringSink: public WritableFile {
 public:
  ~StringSink() { }

  // Everything written so far.
  const std::string& contents() const { return contents_; }

  virtual Status Close() { return Status::OK(); }
  virtual Status Flush() { return Status::OK(); }
  virtual Status Sync() { return Status::OK(); }

  virtual Status Append(const Slice& data) {
    contents_.append(data.data(), data.size());
    return Status::OK();
  }

 private:
  std::string contents_;
};
// An in-memory RandomAccessFile backed by a private copy of a string.
// Read() rejects offsets past the end and clamps reads that extend
// beyond the available bytes.
class StringSource: public RandomAccessFile {
 public:
  StringSource(const Slice& contents)
      : contents_(contents.data(), contents.size()) {
  }

  virtual ~StringSource() { }

  virtual uint64_t Size() const { return contents_.size(); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    if (offset > contents_.size()) {
      return Status::InvalidArgument("invalid Read offset");
    }
    if (offset + n > contents_.size()) {
      n = contents_.size() - offset;  // Clamp to available bytes.
    }
    memcpy(scratch, &contents_[offset], n);
    *result = Slice(scratch, n);
    return Status::OK();
  }

 private:
  std::string contents_;
};
// In-memory model of the data: key -> value, ordered by the wrapped
// Comparator.
typedef std::map<std::string, std::string, STLLessThan> KVMap;

// Helper class for tests to unify the interface between
// BlockBuilder/TableBuilder and Block/Table.
class Constructor {
 public:
  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { }
  virtual ~Constructor() { }

  // Buffer a key/value pair; nothing is built until Finish().
  void Add(const std::string& key, const Slice& value) {
    data_[key] = value.ToString();
  }

  // Finish constructing the data structure with all the keys that have
  // been added so far.  Returns the keys in sorted order in "*keys"
  // and stores the key/value pairs in "*kvmap"
  void Finish(const Options& options,
              std::vector<std::string>* keys,
              KVMap* kvmap) {
    *kvmap = data_;
    keys->clear();
    for (KVMap::const_iterator it = data_.begin();
         it != data_.end();
         ++it) {
      keys->push_back(it->first);
    }
    data_.clear();
    Status s = FinishImpl(options, *kvmap);
    ASSERT_TRUE(s.ok()) << s.ToString();
  }

  // Construct the data structure from the data in "data"
  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;

  // Size in bytes of the built representation.
  virtual size_t NumBytes() const = 0;

  virtual Iterator* NewIterator() const = 0;

  virtual const KVMap& data() { return data_; }

 private:
  KVMap data_;
};
// Constructor implementation that builds a single Block with
// BlockBuilder and iterates over it directly.
class BlockConstructor: public Constructor {
 public:
  explicit BlockConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp),
        block_size_(-1),
        block_(NULL) { }
  ~BlockConstructor() {
    delete block_;
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    delete block_;
    block_ = NULL;
    BlockBuilder builder(&options);

    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      builder.Add(it->first, it->second);
    }
    // Open the block
    // NOTE(review): a heap copy of the builder output is handed to the
    // Block; presumably the Block takes ownership of it -- confirm
    // against Block's contract.
    Slice block_data = builder.Finish();
    block_size_ = block_data.size();
    char* block_data_copy = new char[block_size_];
    memcpy(block_data_copy, block_data.data(), block_size_);
    block_ = new Block(block_data_copy, block_size_);
    return Status::OK();
  }
  virtual size_t NumBytes() const { return block_size_; }
  virtual Iterator* NewIterator() const {
    return block_->NewIterator(comparator_);
  }

 private:
  const Comparator* comparator_;
  int block_size_;
  Block* block_;

  BlockConstructor();  // Default construction disallowed.
};
// Constructor implementation that builds an sstable in memory with
// TableBuilder/StringSink, then reopens it via StringSource and
// Table::Open.
class TableConstructor: public Constructor {
 public:
  TableConstructor(const Comparator* cmp)
      : Constructor(cmp),
        source_(NULL), table_(NULL) {
  }
  ~TableConstructor() {
    Reset();
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    Reset();
    StringSink sink;
    TableBuilder builder(options, &sink);

    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      builder.Add(it->first, it->second);
      ASSERT_TRUE(builder.status().ok());
    }
    Status s = builder.Finish();
    ASSERT_TRUE(s.ok()) << s.ToString();

    // The builder's reported size must match what actually reached the sink.
    ASSERT_EQ(sink.contents().size(), builder.FileSize());

    // Open the table
    source_ = new StringSource(sink.contents());
    Options table_options;
    table_options.comparator = options.comparator;
    return Table::Open(table_options, source_, &table_);
  }
  virtual size_t NumBytes() const { return source_->Size(); }
  virtual Iterator* NewIterator() const {
    return table_->NewIterator(ReadOptions());
  }

  uint64_t ApproximateOffsetOf(const Slice& key) const {
    return table_->ApproximateOffsetOf(key);
  }

 private:
  // Delete the open table before its backing source.
  void Reset() {
    delete table_;
    delete source_;
    table_ = NULL;
    source_ = NULL;
  }

  StringSource* source_;
  Table* table_;

  TableConstructor();  // Default construction disallowed.
};
// A helper class that converts internal format keys into user keys
class KeyConvertingIterator: public Iterator {
 public:
  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
  virtual ~KeyConvertingIterator() { delete iter_; }
  virtual bool Valid() const { return iter_->Valid(); }

  // Encode the user-key target as an internal key (with
  // kMaxSequenceNumber/kTypeValue) before seeking the wrapped iterator.
  virtual void Seek(const Slice& target) {
    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
    std::string encoded;
    AppendInternalKey(&encoded, ikey);
    iter_->Seek(encoded);
  }
  virtual void SeekToFirst() { iter_->SeekToFirst(); }
  virtual void SeekToLast() { iter_->SeekToLast(); }
  virtual void Next() { iter_->Next(); }
  virtual void Prev() { iter_->Prev(); }

  // Strip the internal-key trailer; on parse failure, latch a
  // Corruption status and return a placeholder key.
  virtual Slice key() const {
    assert(Valid());
    ParsedInternalKey key;
    if (!ParseInternalKey(iter_->key(), &key)) {
      status_ = Status::Corruption("malformed internal key");
      return Slice("corrupted key");
    }
    return key.user_key;
  }

  virtual Slice value() const { return iter_->value(); }
  virtual Status status() const {
    return status_.ok() ? iter_->status() : status_;
  }

 private:
  mutable Status status_;  // Set by key() on parse failure.
  Iterator* iter_;         // Owned.

  // No copying allowed
  KeyConvertingIterator(const KeyConvertingIterator&);
  void operator=(const KeyConvertingIterator&);
};
class MemTableConstructor: public Constructor { | |||
public: | |||
explicit MemTableConstructor(const Comparator* cmp) | |||
: Constructor(cmp), | |||
internal_comparator_(cmp) { | |||
memtable_ = new MemTable(internal_comparator_); | |||
} | |||
~MemTableConstructor() { | |||
delete memtable_; | |||
} | |||
virtual Status FinishImpl(const Options& options, const KVMap& data) { | |||
delete memtable_; | |||
memtable_ = new MemTable(internal_comparator_); | |||
int seq = 1; | |||
for (KVMap::const_iterator it = data.begin(); | |||
it != data.end(); | |||
++it) { | |||
memtable_->Add(seq, kTypeValue, it->first, it->second); | |||
seq++; | |||
} | |||
return Status::OK(); | |||
} | |||
virtual size_t NumBytes() const { | |||
return memtable_->ApproximateMemoryUsage(); | |||
} | |||
virtual Iterator* NewIterator() const { | |||
return new KeyConvertingIterator(memtable_->NewIterator()); | |||
} | |||
private: | |||
InternalKeyComparator internal_comparator_; | |||
MemTable* memtable_; | |||
}; | |||
// Constructor implementation backed by a full DB stored in a temporary
// directory; each pair is applied through its own WriteBatch.
class DBConstructor: public Constructor {
 public:
  explicit DBConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp) {
    db_ = NULL;
    NewDB();
  }
  ~DBConstructor() {
    delete db_;
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
    // Start from a fresh database for each Finish().
    delete db_;
    db_ = NULL;
    NewDB();
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
         ++it) {
      WriteBatch batch;
      batch.Put(it->first, it->second);
      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
    }
    return Status::OK();
  }
  virtual size_t NumBytes() const {
    // Approximate size of the entire key range.
    Range r("", "\xff\xff");
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }
  virtual Iterator* NewIterator() const {
    return db_->NewIterator(ReadOptions());
  }

 private:
  // Destroy any leftover database from a previous run, then open a
  // guaranteed-fresh one (error_if_exists catches stale state).
  void NewDB() {
    std::string name = test::TmpDir() + "/table_testdb";

    Options options;
    options.comparator = comparator_;
    Status status = DestroyDB(name, options);
    ASSERT_TRUE(status.ok()) << status.ToString();

    options.create_if_missing = true;
    options.error_if_exists = true;
    status = DB::Open(options, name, &db_);
    ASSERT_TRUE(status.ok()) << status.ToString();
  }

  const Comparator* comparator_;
  DB* db_;  // Owned.
};
// Which Constructor implementation a test configuration exercises.
enum TestType {
  TABLE_TEST,
  BLOCK_TEST,
  MEMTABLE_TEST,
  DB_TEST,
};

// One configuration for the parameterized Harness tests below.
struct TestArgs {
  TestType type;
  bool reverse_compare;   // Use ReverseKeyComparator instead of bytewise.
  int restart_interval;   // Block restart interval for block/table builds.
};

static const TestArgs kTestArgList[] = {
  { TABLE_TEST, false, 16 },
  { TABLE_TEST, false, 1 },
  { TABLE_TEST, false, 1024 },
  { TABLE_TEST, true, 16 },
  { TABLE_TEST, true, 1 },
  { TABLE_TEST, true, 1024 },

  { BLOCK_TEST, false, 16 },
  { BLOCK_TEST, false, 1 },
  { BLOCK_TEST, false, 1024 },
  { BLOCK_TEST, true, 16 },
  { BLOCK_TEST, true, 1 },
  { BLOCK_TEST, true, 1024 },

  // Restart interval does not matter for memtables
  { MEMTABLE_TEST, false, 16 },
  { MEMTABLE_TEST, true, 16 },

  // Do not bother with restart interval variations for DB
  { DB_TEST, false, 16 },
  { DB_TEST, true, 16 },
};
static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]);
// Drives one Constructor implementation (selected by Init from a
// TestArgs entry) through forward-scan, backward-scan, and random
// access checks against an in-memory KVMap model.
class Harness {
 public:
  Harness() : constructor_(NULL) { }

  // (Re)configure for one TestArgs entry, replacing any previous
  // constructor.
  void Init(const TestArgs& args) {
    delete constructor_;
    constructor_ = NULL;
    options_ = Options();

    options_.block_restart_interval = args.restart_interval;
    // Use shorter block size for tests to exercise block boundary
    // conditions more.
    options_.block_size = 256;
    if (args.reverse_compare) {
      options_.comparator = &reverse_key_comparator;
    }
    switch (args.type) {
      case TABLE_TEST:
        constructor_ = new TableConstructor(options_.comparator);
        break;
      case BLOCK_TEST:
        constructor_ = new BlockConstructor(options_.comparator);
        break;
      case MEMTABLE_TEST:
        constructor_ = new MemTableConstructor(options_.comparator);
        break;
      case DB_TEST:
        constructor_ = new DBConstructor(options_.comparator);
        break;
    }
  }

  ~Harness() {
    delete constructor_;
  }

  void Add(const std::string& key, const std::string& value) {
    constructor_->Add(key, value);
  }

  // Build the structure from the added data and run all three
  // validation passes.
  void Test(Random* rnd) {
    std::vector<std::string> keys;
    KVMap data;
    constructor_->Finish(options_, &keys, &data);

    TestForwardScan(keys, data);
    TestBackwardScan(keys, data);
    TestRandomAccess(rnd, keys, data);
  }

  // Verify a full forward scan yields exactly the model's contents.
  void TestForwardScan(const std::vector<std::string>& keys,
                       const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToFirst();
    for (KVMap::const_iterator model_iter = data.begin();
         model_iter != data.end();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Next();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Verify a full backward scan yields the model's contents in reverse.
  void TestBackwardScan(const std::vector<std::string>& keys,
                        const KVMap& data) {
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    iter->SeekToLast();
    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
         model_iter != data.rend();
         ++model_iter) {
      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
      iter->Prev();
    }
    ASSERT_TRUE(!iter->Valid());
    delete iter;
  }

  // Apply 200 random operations to the iterator under test and a model
  // iterator in lockstep, checking that they agree after each step.
  void TestRandomAccess(Random* rnd,
                        const std::vector<std::string>& keys,
                        const KVMap& data) {
    static const bool kVerbose = false;  // Set to trace each operation.
    Iterator* iter = constructor_->NewIterator();
    ASSERT_TRUE(!iter->Valid());
    KVMap::const_iterator model_iter = data.begin();
    if (kVerbose) fprintf(stderr, "---\n");
    for (int i = 0; i < 200; i++) {
      const int toss = rnd->Uniform(5);
      switch (toss) {
        case 0: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Next\n");
            iter->Next();
            ++model_iter;
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 1: {
          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
          iter->SeekToFirst();
          model_iter = data.begin();
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 2: {
          std::string key = PickRandomKey(rnd, keys);
          model_iter = data.lower_bound(key);
          if (kVerbose) fprintf(stderr, "Seek '%s'\n",
                                EscapeString(key).c_str());
          iter->Seek(Slice(key));
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }

        case 3: {
          if (iter->Valid()) {
            if (kVerbose) fprintf(stderr, "Prev\n");
            iter->Prev();
            if (model_iter == data.begin()) {
              model_iter = data.end();   // Wrap around to invalid value
            } else {
              --model_iter;
            }
            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          }
          break;
        }

        case 4: {
          if (kVerbose) fprintf(stderr, "SeekToLast\n");
          iter->SeekToLast();
          if (keys.empty()) {
            model_iter = data.end();
          } else {
            std::string last = data.rbegin()->first;
            model_iter = data.lower_bound(last);
          }
          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
          break;
        }
      }
    }
    delete iter;
  }

  // Render a model iterator position as "'key->value'" or "END".
  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
    if (it == data.end()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  std::string ToString(const KVMap& data,
                       const KVMap::const_reverse_iterator& it) {
    if (it == data.rend()) {
      return "END";
    } else {
      return "'" + it->first + "->" + it->second + "'";
    }
  }

  // Render the iterator under test in the same format as the model.
  std::string ToString(const Iterator* it) {
    if (!it->Valid()) {
      return "END";
    } else {
      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
    }
  }

  // Pick a seek target: an existing key, something slightly smaller,
  // or something slightly larger (via comparator-aware Increment).
  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
    if (keys.empty()) {
      return "foo";
    } else {
      const int index = rnd->Uniform(keys.size());
      std::string result = keys[index];
      switch (rnd->Uniform(3)) {
        case 0:
          // Return an existing key
          break;
        case 1: {
          // Attempt to return something smaller than an existing key
          if (result.size() > 0 && result[result.size()-1] > '\0') {
            result[result.size()-1]--;
          }
          break;
        }
        case 2: {
          // Return something larger than an existing key
          Increment(options_.comparator, &result);
          break;
        }
      }
      return result;
    }
  }

 private:
  Options options_;
  Constructor* constructor_;  // Owned.
};
// Test the empty key
TEST(Harness, SimpleEmptyKey) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 1);
    Add("", "v");
    Test(&rnd);
  }
}

// Test a single short key.
TEST(Harness, SimpleSingle) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 2);
    Add("abc", "v");
    Test(&rnd);
  }
}

// Test a few keys, including one that is a prefix of another.
TEST(Harness, SimpleMulti) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 3);
    Add("abc", "v");
    Add("abcd", "v");
    Add("ac", "v2");
    Test(&rnd);
  }
}

// Test a key of 0xff bytes (upper edge of bytewise key space).
TEST(Harness, SimpleSpecialKey) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 4);
    Add("\xff\xff", "v3");
    Test(&rnd);
  }
}
// Randomized stress test: for each configuration, build structures of
// growing size from random key/value pairs and run the scan and
// random-access validations over them.
TEST(Harness, Randomized) {
  for (int i = 0; i < kNumTestArgs; i++) {
    Init(kTestArgList[i]);
    Random rnd(test::RandomSeed() + 5);
    for (int num_entries = 0; num_entries < 2000;
         num_entries += (num_entries < 50 ? 1 : 200)) {
      if ((num_entries % 10) == 0) {
        // Progress output; the full sweep can take a while.
        fprintf(stderr, "case %d of %d: num_entries = %d\n",
                (i + 1), int(kNumTestArgs), num_entries);
      }
      for (int e = 0; e < num_entries; e++) {
        std::string v;
        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
      }
      Test(&rnd);
    }
  }
}
class MemTableTest { }; | |||
// Smoke test: insert a small batch into a MemTable and dump its contents.
// NOTE(review): this only prints the entries to stderr; it does not assert
// on them, so it mainly checks that insertion and iteration don't crash.
TEST(MemTableTest, Simple) {
  InternalKeyComparator cmp(BytewiseComparator());
  MemTable memtable(cmp);
  WriteBatch batch;
  WriteBatchInternal::SetSequence(&batch, 100);
  batch.Put(std::string("k1"), std::string("v1"));
  batch.Put(std::string("k2"), std::string("v2"));
  batch.Put(std::string("k3"), std::string("v3"));
  batch.Put(std::string("largekey"), std::string("vlarge"));
  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
  Iterator* iter = memtable.NewIterator();
  iter->SeekToFirst();
  while (iter->Valid()) {
    fprintf(stderr, "key: '%s' -> '%s'\n",
            iter->key().ToString().c_str(),
            iter->value().ToString().c_str());
    iter->Next();
  }
  delete iter;  // Caller owns the iterator returned by NewIterator().
}
// Returns true iff "val" lies in the inclusive range [low, high].
// On failure it also logs the offending values to stderr so that the
// ASSERT_TRUE(Between(...)) call sites are easy to diagnose.
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  if (val < low || val > high) {
    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
            (unsigned long long)(val),
            (unsigned long long)(low),
            (unsigned long long)(high));
    return false;
  }
  return true;
}
class TableTest { }; | |||
// With compression disabled, ApproximateOffsetOf() should track the
// cumulative size of the values written before each key (plus modest
// block overhead — hence the ~1000-byte slack in each range).
TEST(TableTest, ApproximateOffsetOfPlain) {
  TableConstructor c(BytewiseComparator());
  c.Add("k01", "hello");
  c.Add("k02", "hello2");
  c.Add("k03", std::string(10000, 'x'));
  c.Add("k04", std::string(200000, 'x'));
  c.Add("k05", std::string(300000, 'x'));
  c.Add("k06", "hello3");
  c.Add("k07", std::string(100000, 'x'));
  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  options.block_size = 1024;
  options.compression = kNoCompression;
  c.Finish(options, &keys, &kvmap);
  // Keys in the first (small) block all map to offset 0.
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
  // A key past the end maps to (roughly) the total data size.
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000));
}
// Same idea as ApproximateOffsetOfPlain, but with lightweight compression
// enabled: the 10000-byte compressible values should shrink to roughly a
// quarter of their size, which the expected offset ranges reflect.
TEST(TableTest, ApproximateOffsetOfCompressed) {
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
  // Compression not supported yet, so skip this test.
  // TODO(sanjay) Reenable after compression support is added
  return;
#endif
  Random rnd(301);
  TableConstructor c(BytewiseComparator());
  std::string tmp;
  c.Add("k01", "hello");
  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
  c.Add("k03", "hello3");
  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
  std::vector<std::string> keys;
  KVMap kvmap;
  Options options;
  options.block_size = 1024;
  options.compression = kLightweightCompression;
  c.Finish(options, &keys, &kvmap);
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
}
} | |||
// Entry point: run every TEST registered above via util/testharness.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,182 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "table/two_level_iterator.h" | |||
#include "include/table.h" | |||
#include "table/block.h" | |||
#include "table/format.h" | |||
#include "table/iterator_wrapper.h" | |||
namespace leveldb { | |||
namespace { | |||
typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); | |||
// Iterator over the concatenation of the data blocks named by an index
// iterator.  index_iter_ walks the index entries; for the current index
// entry, block_function_ converts its value into data_iter_, an iterator
// over that block's contents.  data_iter_ may wrap NULL when positioned
// past the ends or on an empty/unloadable block.
class TwoLevelIterator: public Iterator {
 public:
  TwoLevelIterator(
    Iterator* index_iter,
    BlockFunction block_function,
    void* arg,
    const ReadOptions& options);
  virtual ~TwoLevelIterator();
  virtual void Seek(const Slice& target);
  virtual void SeekToFirst();
  virtual void SeekToLast();
  virtual void Next();
  virtual void Prev();
  // Valid iff a data block is loaded and its iterator is positioned.
  virtual bool Valid() const {
    return data_iter_.Valid();
  }
  virtual Slice key() const {
    assert(Valid());
    return data_iter_.key();
  }
  virtual Slice value() const {
    assert(Valid());
    return data_iter_.value();
  }
  // First error wins: index error, then data error, then any error saved
  // from an earlier (already replaced) data iterator.
  virtual Status status() const {
    // It'd be nice if status() returned a const Status& instead of a Status
    if (!index_iter_.status().ok()) {
      return index_iter_.status();
    } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) {
      return data_iter_.status();
    } else {
      return status_;
    }
  }
 private:
  // Remember the first non-ok status seen (later ones are dropped).
  void SaveError(const Status& s) {
    if (status_.ok() && !s.ok()) status_ = s;
  }
  void SkipEmptyDataBlocksForward();
  void SkipEmptyDataBlocksBackward();
  void SetDataIterator(Iterator* data_iter);
  void InitDataBlock();
  BlockFunction block_function_;  // Converts an index value to a block iterator
  void* arg_;                     // Opaque argument passed to block_function_
  const ReadOptions options_;
  Status status_;                 // Error saved from a replaced data iterator
  IteratorWrapper index_iter_;
  IteratorWrapper data_iter_; // May be NULL
  // If data_iter_ is non-NULL, then "data_block_handle_" holds the
  // "index_value" passed to block_function_ to create the data_iter_.
  std::string data_block_handle_;
};
// Takes ownership of index_iter (via IteratorWrapper); starts with no data
// block loaded — the first positioning call loads one.
TwoLevelIterator::TwoLevelIterator(
    Iterator* index_iter,
    BlockFunction block_function,
    void* arg,
    const ReadOptions& options)
    : block_function_(block_function),
      arg_(arg),
      options_(options),
      index_iter_(index_iter),
      data_iter_(NULL) {
}
// The IteratorWrapper members delete the wrapped iterators.
TwoLevelIterator::~TwoLevelIterator() {
}
// Position at the first entry >= target: seek the index to the candidate
// block, load it, seek inside it, then skip forward past empty blocks.
void TwoLevelIterator::Seek(const Slice& target) {
  index_iter_.Seek(target);
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.Seek(target);
  SkipEmptyDataBlocksForward();
}
// Position at the first entry of the first non-empty block.
void TwoLevelIterator::SeekToFirst() {
  index_iter_.SeekToFirst();
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
  SkipEmptyDataBlocksForward();
}
// Position at the last entry of the last non-empty block.
void TwoLevelIterator::SeekToLast() {
  index_iter_.SeekToLast();
  InitDataBlock();
  if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
  SkipEmptyDataBlocksBackward();
}
// Advance within the current block; if that exhausts it, move to the
// next non-empty block (or become !Valid()).
void TwoLevelIterator::Next() {
  assert(Valid());
  data_iter_.Next();
  SkipEmptyDataBlocksForward();
}
// Step back within the current block; if that exhausts it, move to the
// previous non-empty block (or become !Valid()).
void TwoLevelIterator::Prev() {
  assert(Valid());
  data_iter_.Prev();
  SkipEmptyDataBlocksBackward();
}
// While the current data block yields no entry, advance the index and
// load the next block.  Stops with a NULL data iterator (and hence
// !Valid()) when the index itself is exhausted.
void TwoLevelIterator::SkipEmptyDataBlocksForward() {
  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
    // Move to next block
    if (!index_iter_.Valid()) {
      SetDataIterator(NULL);
      return;
    }
    index_iter_.Next();
    InitDataBlock();
    if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
  }
}
// Mirror of SkipEmptyDataBlocksForward for reverse iteration: while the
// current data block yields no entry, step the index back and load the
// preceding block, positioned at its last entry.
void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
    // Move to previous block
    if (!index_iter_.Valid()) {
      SetDataIterator(NULL);
      return;
    }
    index_iter_.Prev();
    InitDataBlock();
    if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
  }
}
// Replace the data iterator, preserving any error the old one reported
// so status() can still surface it after the iterator is gone.
void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {
  if (data_iter_.iter() != NULL) SaveError(data_iter_.status());
  data_iter_.Set(data_iter);
}
// Make data_iter_ correspond to the block named by the current index
// entry.  Reuses the existing data iterator when the index value (the
// block handle) is unchanged, avoiding a redundant block load.
void TwoLevelIterator::InitDataBlock() {
  if (!index_iter_.Valid()) {
    SetDataIterator(NULL);
  } else {
    Slice handle = index_iter_.value();
    if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) {
      // data_iter_ is already constructed with this iterator, so
      // no need to change anything
    } else {
      Iterator* iter = (*block_function_)(arg_, options_, handle);
      data_block_handle_.assign(handle.data(), handle.size());
      SetDataIterator(iter);
    }
  }
}
} | |||
// Factory declared in table/two_level_iterator.h; caller owns the result
// and, transitively, index_iter.
Iterator* NewTwoLevelIterator(
    Iterator* index_iter,
    BlockFunction block_function,
    void* arg,
    const ReadOptions& options) {
  return new TwoLevelIterator(index_iter, block_function, arg, options);
}
} |
@ -0,0 +1,34 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ | |||
#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ | |||
#include "include/iterator.h" | |||
namespace leveldb { | |||
struct ReadOptions; | |||
// Return a new two level iterator. A two-level iterator contains an | |||
// index iterator whose values point to a sequence of blocks where | |||
// each block is itself a sequence of key,value pairs. The returned | |||
// two-level iterator yields the concatenation of all key/value pairs | |||
// in the sequence of blocks. Takes ownership of "index_iter" and | |||
// will delete it when no longer needed. | |||
// | |||
// Uses a supplied function to convert an index_iter value into | |||
// an iterator over the contents of the corresponding block. | |||
extern Iterator* NewTwoLevelIterator( | |||
Iterator* index_iter, | |||
Iterator* (*block_function)( | |||
void* arg, | |||
const ReadOptions& options, | |||
const Slice& index_value), | |||
void* arg, | |||
const ReadOptions& options); | |||
} | |||
#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ |
@ -0,0 +1,68 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "util/arena.h" | |||
#include <assert.h> | |||
namespace leveldb { | |||
static const int kBlockSize = 4096; | |||
// Start with no block at all; the first Allocate() call falls back to
// AllocateFallback(), which grabs the first block.
Arena::Arena()
    : alloc_ptr_(NULL),
      alloc_bytes_remaining_(0),
      blocks_memory_(0) {
}
// Free every block handed out over the arena's lifetime.  The index is
// size_t to match std::vector::size() — the original `int` triggered a
// signed/unsigned comparison and would (theoretically) overflow.
Arena::~Arena() {
  for (size_t i = 0; i < blocks_.size(); i++) {
    delete[] blocks_[i];
  }
}
// Slow path for Allocate(): called when the current block cannot satisfy
// the request.  Returns memory for exactly "bytes" bytes.
char* Arena::AllocateFallback(size_t bytes) {
  if (bytes > kBlockSize / 4) {
    // Object is more than a quarter of our block size.  Allocate it separately
    // to avoid wasting too much space in leftover bytes.
    char* result = AllocateNewBlock(bytes);
    return result;
  }
  // We waste the remaining space in the current block.
  alloc_ptr_ = AllocateNewBlock(kBlockSize);
  alloc_bytes_remaining_ = kBlockSize;
  char* result = alloc_ptr_;
  alloc_ptr_ += bytes;
  alloc_bytes_remaining_ -= bytes;
  return result;
}
// Like Allocate(), but the returned pointer is aligned to sizeof(void*),
// matching malloc's usual guarantee for these sizes.
char* Arena::AllocateAligned(size_t bytes) {
  const int align = sizeof(void*);    // We'll align to pointer size
  assert((align & (align-1)) == 0);   // Pointer size should be a power of 2
  // "slop" is how many bytes we must skip to reach the next aligned address.
  size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
  size_t slop = (current_mod == 0 ? 0 : align - current_mod);
  size_t needed = bytes + slop;
  char* result;
  if (needed <= alloc_bytes_remaining_) {
    result = alloc_ptr_ + slop;
    alloc_ptr_ += needed;
    alloc_bytes_remaining_ -= needed;
  } else {
    // AllocateFallback always returns aligned memory (fresh new[] blocks).
    result = AllocateFallback(bytes);
  }
  assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0);
  return result;
}
// Grab a fresh block of exactly block_bytes from the heap, record it for
// deletion in ~Arena(), and account for it in MemoryUsage().
char* Arena::AllocateNewBlock(size_t block_bytes) {
  char* result = new char[block_bytes];
  blocks_memory_ += block_bytes;
  blocks_.push_back(result);
  return result;
}
} |
@ -0,0 +1,68 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ | |||
#define STORAGE_LEVELDB_UTIL_ARENA_H_ | |||
#include <cstddef> | |||
#include <vector> | |||
#include <assert.h> | |||
#include <stdint.h> | |||
namespace leveldb { | |||
// Simple bump-pointer allocator: memory is carved out of large blocks and
// is only reclaimed all at once, when the Arena itself is destroyed.
// NOTE(review): not thread-safe — callers must synchronize externally.
class Arena {
 public:
  Arena();
  ~Arena();
  // Return a pointer to a newly allocated memory block of "bytes" bytes.
  char* Allocate(size_t bytes);
  // Allocate memory with the normal alignment guarantees provided by malloc
  char* AllocateAligned(size_t bytes);
  // Returns an estimate of the total memory usage of data allocated
  // by the arena (including space allocated but not yet used for user
  // allocations).
  size_t MemoryUsage() const {
    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
  }
 private:
  char* AllocateFallback(size_t bytes);
  char* AllocateNewBlock(size_t block_bytes);
  // Allocation state: next free byte in the current block, and how many
  // bytes remain in that block.
  char* alloc_ptr_;
  size_t alloc_bytes_remaining_;
  // Array of new[] allocated memory blocks
  std::vector<char*> blocks_;
  // Bytes of memory in blocks allocated so far
  size_t blocks_memory_;
  // No copying allowed
  Arena(const Arena&);
  void operator=(const Arena&);
};
// Fast path: bump the pointer when the current block has room; otherwise
// defer to AllocateFallback() to start a new block.
inline char* Arena::Allocate(size_t bytes) {
  // The semantics of what to return are a bit messy if we allow
  // 0-byte allocations, so we disallow them here (we don't need
  // them for our internal use).
  assert(bytes > 0);
  if (bytes <= alloc_bytes_remaining_) {
    char* result = alloc_ptr_;
    alloc_ptr_ += bytes;
    alloc_bytes_remaining_ -= bytes;
    return result;
  }
  return AllocateFallback(bytes);
}
} | |||
#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ |
@ -0,0 +1,68 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "util/arena.h" | |||
#include "util/random.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
class ArenaTest { }; | |||
// Constructing and destroying an Arena without any allocation must be safe.
TEST(ArenaTest, Empty) {
  Arena arena;
}
// Stress test: perform many allocations of random sizes (a mix of aligned
// and unaligned), fill each with a per-allocation byte pattern, and verify
// every byte afterwards.  Also checks MemoryUsage() stays within 10% of
// the bytes actually handed out.
// Fix: byte/element indices are size_t to match the size_t sizes they are
// compared against (the original used int, a signed/unsigned mismatch).
TEST(ArenaTest, Simple) {
  std::vector<std::pair<size_t, char*> > allocated;
  Arena arena;
  const int N = 100000;
  size_t bytes = 0;
  Random rnd(301);
  for (int i = 0; i < N; i++) {
    size_t s;
    if (i % (N / 10) == 0) {
      s = i;  // Occasionally use a large, slowly growing size
    } else {
      s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
          (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
    }
    if (s == 0) {
      // Our arena disallows size 0 allocations.
      s = 1;
    }
    char* r;
    if (rnd.OneIn(10)) {
      r = arena.AllocateAligned(s);
    } else {
      r = arena.Allocate(s);
    }
    for (size_t b = 0; b < s; b++) {
      // Fill the "i"th allocation with a known bit pattern
      r[b] = i % 256;
    }
    bytes += s;
    allocated.push_back(std::make_pair(s, r));
    ASSERT_GE(arena.MemoryUsage(), bytes);
    if (i > N/10) {
      ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
    }
  }
  for (size_t i = 0; i < allocated.size(); i++) {
    size_t num_bytes = allocated[i].first;
    const char* p = allocated[i].second;
    for (size_t b = 0; b < num_bytes; b++) {
      // Check the "i"th allocation for the known bit pattern
      ASSERT_EQ(int(p[b]) & 0xff, int(i % 256));
    }
  }
}
} | |||
// Entry point: run every TEST registered above via util/testharness.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,253 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) | |||
#include <unordered_set> | |||
#elif defined(LEVELDB_PLATFORM_CHROMIUM) | |||
#include "base/hash_tables.h" | |||
#else | |||
#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible. | |||
#endif | |||
#include <assert.h> | |||
#include "include/cache.h" | |||
#include "port/port.h" | |||
#include "util/hash.h" | |||
#include "util/mutexlock.h" | |||
namespace leveldb { | |||
// Out-of-line definition anchors the abstract base class's vtable.
Cache::~Cache() {
}
namespace { | |||
// LRU cache implementation | |||
// An entry is a variable length heap-allocated structure. Entries | |||
// are kept in a circular doubly linked list ordered by access time. | |||
// An entry is a variable length heap-allocated structure (malloc'd with the
// key bytes appended after key_data[0]).  Entries are kept in a circular
// doubly linked list ordered by access time.
struct LRUHandle {
  void* value;
  void (*deleter)(const Slice&, void* value);  // Invoked when refs drops to 0
  LRUHandle* next;
  LRUHandle* prev;
  size_t charge;      // TODO(opt): Only allow uint32_t?
  size_t key_length;
  size_t refs;        // TODO(opt): Pack with "key_length"?
  char key_data[1];   // Beginning of key (actually key_length bytes)
  Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".  Such stack-built dummies
    // mark themselves by setting next == this (see Lookup/Erase).
    if (next == this) {
      return *(reinterpret_cast<Slice*>(value));
    } else {
      return Slice(key_data, key_length);
    }
  }
};
// Pick a platform specific hash_set instantiation | |||
#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) | |||
// Microsoft's hash_set deviates from the standard. See | |||
// http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx | |||
// for details. Basically the 2 param () operator is a less than and | |||
// the 1 param () operator is a hash function. | |||
// MSVC-style combined hash/less functor: hashes and orders handles by the
// key bytes they carry, so lookups compare keys rather than pointers.
struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
  size_t operator() (LRUHandle* h) const {
    Slice k = h->key();
    return Hash(k.data(), k.size(), 0);
  }
  bool operator() (LRUHandle* a, LRUHandle* b) const {
    return a->key().compare(b->key()) < 0;
  }
};
typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable; | |||
#else | |||
// Hash a handle by its key bytes, so that equal keys collide in the table.
struct HandleHash {
  inline size_t operator()(LRUHandle* h) const {
    Slice k = h->key();
    return Hash(k.data(), k.size(), 0);
  }
};
// Handles are equal iff their keys are byte-wise equal (not pointer equal).
struct HandleEq {
  inline bool operator()(LRUHandle* a, LRUHandle* b) const {
    return a->key() == b->key();
  }
};
# if defined(LEVELDB_PLATFORM_CHROMIUM) | |||
typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; | |||
# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) | |||
typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable; | |||
# else | |||
typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; | |||
# endif | |||
#endif | |||
// Fixed-capacity cache with least-recently-used eviction.  All public
// operations take mutex_, so a single LRUCache is safe for concurrent use.
class LRUCache : public Cache {
 public:
  explicit LRUCache(size_t capacity);
  virtual ~LRUCache();
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value));
  virtual Handle* Lookup(const Slice& key);
  virtual void Release(Handle* handle);
  virtual void* Value(Handle* handle);
  virtual void Erase(const Slice& key);
  virtual uint64_t NewId();
 private:
  // Linked-list helpers; callers must hold mutex_.
  void LRU_Remove(LRUHandle* e);
  void LRU_Append(LRUHandle* e);
  void Unref(LRUHandle* e);
  // Constructor parameters
  const size_t capacity_;
  // mutex_ protects the following state.
  port::Mutex mutex_;
  size_t usage_;    // Sum of charges of resident entries
  uint64_t last_id_;
  // Dummy head of LRU list.
  // lru.prev is newest entry, lru.next is oldest entry.
  LRUHandle lru_;
  HandleTable table_;
};
LRUCache::LRUCache(size_t capacity)
    : capacity_(capacity),
      usage_(0),
      last_id_(0) {
  // Make empty circular linked list: the dummy head points at itself.
  lru_.next = &lru_;
  lru_.prev = &lru_;
}
// Destroy all remaining entries.  The table is cleared first so only the
// list owns them; each must be down to its single cache reference.
LRUCache::~LRUCache() {
  table_.clear();
  for (LRUHandle* e = lru_.next; e != &lru_; ) {
    LRUHandle* next = e->next;  // Grab before Unref() frees e
    assert(e->refs == 1);  // Error if caller has an unreleased handle
    Unref(e);
    e = next;
  }
}
// Drop one reference to "e"; when the last reference disappears, charge
// the usage back, run the user-supplied deleter, and release the entry.
// (refs is an unsigned size_t, so the original "<= 0" test could only ever
// mean "== 0" — spelled explicitly here.)
void LRUCache::Unref(LRUHandle* e) {
  assert(e->refs > 0);
  e->refs--;
  if (e->refs == 0) {
    usage_ -= e->charge;
    (*e->deleter)(e->key(), e->value);
    free(e);  // Entries are malloc'd in Insert(), so free(), not delete.
  }
}
// Unlink "e" from the circular LRU list (does not change its refcount).
void LRUCache::LRU_Remove(LRUHandle* e) {
  e->next->prev = e->prev;
  e->prev->next = e->next;
}
void LRUCache::LRU_Append(LRUHandle* e) {
  // Make "e" newest entry by inserting just before lru_
  e->next = &lru_;
  e->prev = lru_.prev;
  e->prev->next = e;
  e->next->prev = e;
}
// Find the entry for "key".  On a hit, bumps the entry's refcount (the
// caller must Release() the handle) and moves it to the newest LRU slot.
Cache::Handle* LRUCache::Lookup(const Slice& key) {
  MutexLock l(&mutex_);
  // Stack-built probe handle: next == this makes LRUHandle::key() read the
  // Slice stored in "value", avoiding a key copy for the lookup.
  LRUHandle dummy;
  dummy.next = &dummy;
  dummy.value = const_cast<Slice*>(&key);
  HandleTable::iterator iter = table_.find(&dummy);
  if (iter == table_.end()) {
    return NULL;
  } else {
    LRUHandle* e = const_cast<LRUHandle*>(*iter);
    e->refs++;
    LRU_Remove(e);
    LRU_Append(e);
    return reinterpret_cast<Handle*>(e);
  }
}
// Return the user value stored with the entry behind "handle".
void* LRUCache::Value(Handle* handle) {
  return reinterpret_cast<LRUHandle*>(handle)->value;
}
// Give back a handle obtained from Insert() or Lookup().
void LRUCache::Release(Handle* handle) {
  MutexLock l(&mutex_);
  Unref(reinterpret_cast<LRUHandle*>(handle));
}
// Insert "key"->"value" with the given usage charge, displacing any prior
// entry for the same key, then evict oldest entries until usage fits the
// capacity.  Returns a handle the caller must Release().
Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
                                void (*deleter)(const Slice& key, void* value)) {
  MutexLock l(&mutex_);
  // Variable-size allocation: the key bytes live in-line after key_data[0]
  // (hence sizeof(LRUHandle)-1, which drops the placeholder byte).
  LRUHandle* e = reinterpret_cast<LRUHandle*>(
      malloc(sizeof(LRUHandle)-1 + key.size()));
  e->value = value;
  e->deleter = deleter;
  e->charge = charge;
  e->key_length = key.size();
  e->refs = 2;  // One from LRUCache, one for the returned handle
  memcpy(e->key_data, key.data(), key.size());
  LRU_Append(e);
  usage_ += charge;
  std::pair<HandleTable::iterator,bool> p = table_.insert(e);
  if (!p.second) {
    // Kill existing entry: the insert found an equal key, so swap the old
    // entry out of the table and drop the cache's reference to it.
    LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
    LRU_Remove(old);
    table_.erase(p.first);
    table_.insert(e);
    Unref(old);
  }
  // Evict from the oldest end (lru_.next) until within capacity.
  while (usage_ > capacity_ && lru_.next != &lru_) {
    LRUHandle* old = lru_.next;
    LRU_Remove(old);
    table_.erase(old);
    Unref(old);
  }
  return reinterpret_cast<Handle*>(e);
}
// Remove the entry for "key" if present.  The entry is destroyed once all
// outstanding handles to it are released.
void LRUCache::Erase(const Slice& key) {
  MutexLock l(&mutex_);
  // Same stack-probe trick as Lookup(): next == this redirects key() to
  // the Slice stored in "value".
  LRUHandle dummy;
  dummy.next = &dummy;
  dummy.value = const_cast<Slice*>(&key);
  HandleTable::iterator iter = table_.find(&dummy);
  if (iter != table_.end()) {
    LRUHandle* e = const_cast<LRUHandle*>(*iter);
    LRU_Remove(e);
    table_.erase(iter);
    Unref(e);
  }
}
// Hand out a fresh id, distinct from every id previously returned by this
// cache instance (ids start at 1).
uint64_t LRUCache::NewId() {
  MutexLock l(&mutex_);
  return ++(last_id_);
}
} // end anonymous namespace | |||
// Public factory declared in include/cache.h; caller owns the result.
Cache* NewLRUCache(size_t capacity) {
  return new LRUCache(capacity);
}
} |
@ -0,0 +1,169 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "include/cache.h" | |||
#include <vector> | |||
#include "util/coding.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
// Conversions between numeric keys/values and the types expected by Cache. | |||
// Encode an int test key as the 4-byte fixed-width string the cache stores.
static std::string EncodeKey(int k) {
  std::string result;
  PutFixed32(&result, k);
  return result;
}
// Inverse of EncodeKey(): recover the int from a 4-byte cache key.
static int DecodeKey(const Slice& k) {
  assert(k.size() == 4);
  return DecodeFixed32(k.data());
}
static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); } | |||
static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); } | |||
// Test fixture: owns a small LRU cache and records, via a static Deleter
// callback, every (key, value) the cache destroys so tests can assert on
// eviction/replacement behavior.
class CacheTest {
 public:
  // The deleter callback has no closure argument, so the running test
  // registers itself here for Deleter() to find.
  static CacheTest* current_;
  static void Deleter(const Slice& key, void* v) {
    current_->deleted_keys_.push_back(DecodeKey(key));
    current_->deleted_values_.push_back(DecodeValue(v));
  }
  static const int kCacheSize = 100;
  std::vector<int> deleted_keys_;    // Keys destroyed so far, in order
  std::vector<int> deleted_values_;  // Matching values, same order
  Cache* cache_;
  CacheTest() : cache_(NewLRUCache(kCacheSize)) {
    current_ = this;
  }
  ~CacheTest() {
    delete cache_;
  }
  // Return the cached value for "key", or -1 on a miss.
  int Lookup(int key) {
    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
    const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
    if (handle != NULL) {
      cache_->Release(handle);
    }
    return r;
  }
  // Insert and immediately release the handle (the cache keeps its ref).
  void Insert(int key, int value, int charge = 1) {
    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
                                   &CacheTest::Deleter));
  }
  void Erase(int key) {
    cache_->Erase(EncodeKey(key));
  }
};
CacheTest* CacheTest::current_; | |||
// Basic hit/miss behavior, plus replacement: re-inserting a key destroys
// the prior entry (observed via the recorded deletions).
TEST(CacheTest, HitAndMiss) {
  ASSERT_EQ(-1, Lookup(100));
  Insert(100, 101);
  ASSERT_EQ(101, Lookup(100));
  ASSERT_EQ(-1, Lookup(200));
  ASSERT_EQ(-1, Lookup(300));
  Insert(200, 201);
  ASSERT_EQ(101, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
  ASSERT_EQ(-1, Lookup(300));
  Insert(100, 102);  // Replace: old (100,101) must be deleted
  ASSERT_EQ(102, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
  ASSERT_EQ(-1, Lookup(300));
  ASSERT_EQ(1, deleted_keys_.size());
  ASSERT_EQ(100, deleted_keys_[0]);
  ASSERT_EQ(101, deleted_values_[0]);
}
// Erase removes exactly the named entry; erasing a missing key (including
// one already erased) is a harmless no-op.
TEST(CacheTest, Erase) {
  Erase(200);
  ASSERT_EQ(0, deleted_keys_.size());
  Insert(100, 101);
  Insert(200, 201);
  Erase(100);
  ASSERT_EQ(-1, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
  ASSERT_EQ(1, deleted_keys_.size());
  ASSERT_EQ(100, deleted_keys_[0]);
  ASSERT_EQ(101, deleted_values_[0]);
  Erase(100);  // Second erase of the same key: nothing further deleted
  ASSERT_EQ(-1, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
  ASSERT_EQ(1, deleted_keys_.size());
}
// An entry displaced by re-insert or Erase() stays alive until the last
// outstanding handle to it is released.
TEST(CacheTest, EntriesArePinned) {
  Insert(100, 101);
  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
  Insert(100, 102);  // Displaces (100,101), but h1 still pins it
  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
  ASSERT_EQ(0, deleted_keys_.size());
  cache_->Release(h1);  // Last ref on (100,101) — now it dies
  ASSERT_EQ(1, deleted_keys_.size());
  ASSERT_EQ(100, deleted_keys_[0]);
  ASSERT_EQ(101, deleted_values_[0]);
  Erase(100);  // (100,102) leaves the cache but h2 still pins it
  ASSERT_EQ(-1, Lookup(100));
  ASSERT_EQ(1, deleted_keys_.size());
  cache_->Release(h2);
  ASSERT_EQ(2, deleted_keys_.size());
  ASSERT_EQ(100, deleted_keys_[1]);
  ASSERT_EQ(102, deleted_values_[1]);
}
// LRU order: an entry touched on every iteration survives an overflow of
// inserts, while the untouched one is evicted first.
TEST(CacheTest, EvictionPolicy) {
  Insert(100, 101);
  Insert(200, 201);
  // Frequently used entry must be kept around
  for (int i = 0; i < kCacheSize; i++) {
    Insert(1000+i, 2000+i);
    ASSERT_EQ(2000+i, Lookup(1000+i));
    ASSERT_EQ(101, Lookup(100));  // Keeps key 100 at the "newest" end
  }
  ASSERT_EQ(101, Lookup(100));
  ASSERT_EQ(2, deleted_keys_.size());
  ASSERT_EQ(200, deleted_keys_[0]);  // Key 200, never re-touched, went first
  ASSERT_EQ(201, deleted_values_[0]);
}
// One entry whose charge equals the whole capacity forces eviction of
// everything else.
TEST(CacheTest, HeavyEntry) {
  Insert(100, 101);
  Insert(200, 201, kCacheSize);
  ASSERT_EQ(1, deleted_keys_.size());
  ASSERT_EQ(100, deleted_keys_[0]);
  ASSERT_EQ(101, deleted_values_[0]);
}
// Successive NewId() calls must return distinct ids.
TEST(CacheTest, NewId) {
  uint64_t a = cache_->NewId();
  uint64_t b = cache_->NewId();
  ASSERT_NE(a, b);
}
} | |||
// Entry point: run every TEST registered above via util/testharness.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,194 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
// Write "value" into buf[0..3] in fixed-width little-endian order.
void EncodeFixed32(char* buf, uint32_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
  // Host layout already matches the on-disk format: copy the raw bytes.
  memcpy(buf, &value, sizeof(value));
#else
  // Serialize explicitly, least-significant byte first.
  for (int i = 0; i < 4; i++) {
    buf[i] = (value >> (8 * i)) & 0xff;
  }
#endif
}
// Write "value" into buf[0..7] in fixed-width little-endian order.
void EncodeFixed64(char* buf, uint64_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
  // Host layout already matches the on-disk format: copy the raw bytes.
  memcpy(buf, &value, sizeof(value));
#else
  // Serialize explicitly, least-significant byte first.
  for (int i = 0; i < 8; i++) {
    buf[i] = (value >> (8 * i)) & 0xff;
  }
#endif
}
// Append the 4-byte fixed-width little-endian encoding of "value" to *dst.
void PutFixed32(std::string* dst, uint32_t value) {
  char encoded[sizeof(value)];
  EncodeFixed32(encoded, value);
  dst->append(encoded, sizeof(encoded));
}
// Append the 8-byte fixed-width little-endian encoding of "value" to *dst.
void PutFixed64(std::string* dst, uint64_t value) {
  char encoded[sizeof(value)];
  EncodeFixed64(encoded, value);
  dst->append(encoded, sizeof(encoded));
}
// Append the varint encoding of "v" at dst and return one past the last
// byte written (1-5 bytes).  Each byte carries 7 payload bits, least
// significant group first; the high bit marks "more bytes follow".
char* EncodeVarint32(char* dst, uint32_t v) {
  // Operate on characters as unsigneds
  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
  static const unsigned int kMoreFlag = 128;
  while (v >= kMoreFlag) {
    *(ptr++) = (v & (kMoreFlag - 1)) | kMoreFlag;
    v >>= 7;
  }
  *(ptr++) = static_cast<unsigned char>(v);
  return reinterpret_cast<char*>(ptr);
}
void PutVarint32(std::string* dst, uint32_t v) { | |||
char buf[5]; | |||
char* ptr = EncodeVarint32(buf, v); | |||
dst->append(buf, ptr - buf); | |||
} | |||
// Append the varint encoding of "v" at dst and return one past the last
// byte written (1-10 bytes).  Low 7 bits go first; the high bit of each
// byte is set while more payload bits remain.
char* EncodeVarint64(char* dst, uint64_t v) {
  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
  do {
    unsigned char byte = v & 127;
    v >>= 7;
    if (v != 0) {
      byte |= 128;  // Continuation flag: more groups follow
    }
    *(out++) = byte;
  } while (v != 0);
  return reinterpret_cast<char*>(out);
}
void PutVarint64(std::string* dst, uint64_t v) { | |||
char buf[10]; | |||
char* ptr = EncodeVarint64(buf, v); | |||
dst->append(buf, ptr - buf); | |||
} | |||
// Append value.size() as a varint32, followed by the value bytes.
void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
  PutVarint32(dst, value.size());
  dst->append(value.data(), value.size());
}
// Number of bytes the varint encoding of "v" occupies (1-10): one byte
// per 7 bits of payload.
int VarintLength(uint64_t v) {
  int len;
  for (len = 1; v >= 128; len++) {
    v >>= 7;
  }
  return len;
}
// Slow path for GetVarint32Ptr(): decode a varint32 from [p, limit).
// Returns a pointer just past the encoding and stores the value in
// *value, or NULL if the data is truncated or uses more than 5 bytes.
const char* GetVarint32PtrFallback(const char* p,
                                   const char* limit,
                                   uint32_t* value) {
  uint32_t result = 0;
  uint32_t shift = 0;
  while (shift <= 28 && p < limit) {  // At most 5 groups of 7 bits
    uint32_t byte = static_cast<unsigned char>(*p);
    p++;
    if ((byte & 128) == 0) {
      // Terminating byte: no continuation flag.
      *value = result | (byte << shift);
      return p;
    }
    result |= (byte & 127) << shift;
    shift += 7;
  }
  return NULL;
}
// Parses a varint32 from the front of *input into *value and advances
// *input past it.  Returns false (leaving *input unchanged) on a
// truncated or over-long encoding.
bool GetVarint32(Slice* input, uint32_t* value) {
  const char* start = input->data();
  const char* end = start + input->size();
  const char* rest = GetVarint32Ptr(start, end, value);
  if (rest == NULL) {
    return false;
  }
  *input = Slice(rest, end - rest);
  return true;
}
// Decodes a varint64 from [p..limit-1].  On success stores the value in
// *value and returns a pointer just past the last byte consumed; returns
// NULL if the input is truncated or the encoding exceeds 10 bytes.
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
  uint64_t acc = 0;
  // A varint64 uses at most 10 groups of 7 bits (shift 0..63).
  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
    const uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
    p++;
    if (byte < 128) {
      // Final byte: continuation bit clear.
      *value = acc | (byte << shift);
      return reinterpret_cast<const char*>(p);
    }
    acc |= (byte & 127) << shift;
  }
  return NULL;
}
// Parses a varint64 from the front of *input into *value and advances
// *input past it.  Returns false (leaving *input unchanged) on a
// truncated or over-long encoding.
bool GetVarint64(Slice* input, uint64_t* value) {
  const char* start = input->data();
  const char* end = start + input->size();
  const char* rest = GetVarint64Ptr(start, end, value);
  if (rest == NULL) {
    return false;
  }
  *input = Slice(rest, end - rest);
  return true;
}
const char* GetLengthPrefixedSlice(const char* p, const char* limit, | |||
Slice* result) { | |||
uint32_t len; | |||
p = GetVarint32Ptr(p, limit, &len); | |||
if (p == NULL) return NULL; | |||
if (p + len > limit) return NULL; | |||
*result = Slice(p, len); | |||
return p + len; | |||
} | |||
// Slice-based variant: parses a length-prefixed slice from the front of
// *input into *result and advances *input past it.  Returns false on a
// malformed prefix or truncated data (the length varint, if valid, is
// still consumed from *input in that case, matching the && form).
bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
  uint32_t len;
  if (!GetVarint32(input, &len)) {
    return false;
  }
  if (input->size() < len) {
    return false;
  }
  *result = Slice(input->data(), len);
  input->remove_prefix(len);
  return true;
}
} |
@ -0,0 +1,104 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// Endian-neutral encoding: | |||
// * Fixed-length numbers are encoded with least-significant byte first | |||
// * In addition we support variable length "varint" encoding | |||
// * Strings are encoded prefixed by their length in varint format | |||
#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ | |||
#define STORAGE_LEVELDB_UTIL_CODING_H_ | |||
#include <stdint.h> | |||
#include <string.h> | |||
#include <string> | |||
#include "include/slice.h" | |||
#include "port/port.h" | |||
namespace leveldb { | |||
// Standard Put... routines append the encoded form of a value to a string.
extern void PutFixed32(std::string* dst, uint32_t value);
extern void PutFixed64(std::string* dst, uint64_t value);
extern void PutVarint32(std::string* dst, uint32_t value);
extern void PutVarint64(std::string* dst, uint64_t value);
extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);

// Standard Get... routines parse a value from the beginning of a Slice
// and advance the slice past the parsed value.  They return false
// without producing a value if the input is malformed or truncated.
extern bool GetVarint32(Slice* input, uint32_t* value);
extern bool GetVarint64(Slice* input, uint64_t* value);
extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);

// Pointer-based variants of GetVarint...  These either store a value
// in *v and return a pointer just past the parsed value, or return
// NULL on error.  These routines only look at bytes in the range
// [p..limit-1]
extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);

// Returns the length of the varint32 or varint64 encoding of "v"
// (1..5 bytes for a 32-bit value, 1..10 bytes for a 64-bit value).
extern int VarintLength(uint64_t v);

// Lower-level versions of Put... that write directly into a character buffer
// REQUIRES: dst has enough space for the value being written
extern void EncodeFixed32(char* dst, uint32_t value);
extern void EncodeFixed64(char* dst, uint64_t value);

// Lower-level versions of Put... that write directly into a character buffer
// and return a pointer just past the last byte written.
// REQUIRES: dst has enough space for the value being written
extern char* EncodeVarint32(char* dst, uint32_t value);
extern char* EncodeVarint64(char* dst, uint64_t value);

// Lower-level versions of Get... that read directly from a character buffer
// without any bounds checking.
inline uint32_t DecodeFixed32(const char* ptr) { | |||
if (port::kLittleEndian) { | |||
// Load the raw bytes | |||
uint32_t result; | |||
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load | |||
return result; | |||
} else { | |||
return ((static_cast<uint32_t>(ptr[0])) | |||
| (static_cast<uint32_t>(ptr[1]) << 8) | |||
| (static_cast<uint32_t>(ptr[2]) << 16) | |||
| (static_cast<uint32_t>(ptr[3]) << 24)); | |||
} | |||
} | |||
// Reads a fixed-size little-endian uint64 from ptr (no bounds checking).
inline uint64_t DecodeFixed64(const char* ptr) {
  if (port::kLittleEndian) {
    // Load the raw bytes
    uint64_t result;
    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
    return result;
  } else {
    // Assemble from two 32-bit little-endian halves (low word first).
    uint64_t lo = DecodeFixed32(ptr);
    uint64_t hi = DecodeFixed32(ptr + 4);
    return (hi << 32) | lo;
  }
}
// Internal routine for use by fallback path of GetVarint32Ptr | |||
extern const char* GetVarint32PtrFallback(const char* p, | |||
const char* limit, | |||
uint32_t* value); | |||
inline const char* GetVarint32Ptr(const char* p, | |||
const char* limit, | |||
uint32_t* value) { | |||
if (p < limit) { | |||
uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); | |||
if ((result & 128) == 0) { | |||
*value = result; | |||
return p + 1; | |||
} | |||
} | |||
return GetVarint32PtrFallback(p, limit, value); | |||
} | |||
} | |||
#endif // STORAGE_LEVELDB_UTIL_CODING_H_ |
@ -0,0 +1,173 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "util/coding.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
// Empty tag class: the TEST macro keys each test off a class name.
class Coding { };
// Round-trips 100000 consecutive values through PutFixed32/DecodeFixed32.
TEST(Coding, Fixed32) {
  std::string s;
  for (uint32_t v = 0; v < 100000; v++) {
    PutFixed32(&s, v);
  }

  const char* p = s.data();
  for (uint32_t v = 0; v < 100000; v++) {
    uint32_t actual = DecodeFixed32(p);
    ASSERT_EQ(v, actual);
    p += sizeof(uint32_t);  // fixed32 encoding is always 4 bytes
  }
}
// Round-trips values near every power of two through PutFixed64/DecodeFixed64.
TEST(Coding, Fixed64) {
  std::string s;
  for (int power = 0; power <= 63; power++) {
    uint64_t v = static_cast<uint64_t>(1) << power;
    // Encode 2^power - 1, 2^power, 2^power + 1 to probe carry boundaries.
    PutFixed64(&s, v - 1);
    PutFixed64(&s, v + 0);
    PutFixed64(&s, v + 1);
  }

  const char* p = s.data();
  for (int power = 0; power <= 63; power++) {
    uint64_t v = static_cast<uint64_t>(1) << power;
    uint64_t actual;
    actual = DecodeFixed64(p);
    ASSERT_EQ(v-1, actual);
    p += sizeof(uint64_t);

    actual = DecodeFixed64(p);
    ASSERT_EQ(v+0, actual);
    p += sizeof(uint64_t);

    actual = DecodeFixed64(p);
    ASSERT_EQ(v+1, actual);
    p += sizeof(uint64_t);
  }
}
// Round-trips 1024 varint32 values and checks that the consumed byte
// count matches VarintLength for each one.
TEST(Coding, Varint32) {
  std::string s;
  for (uint32_t i = 0; i < (32 * 32); i++) {
    // (i / 32) is a small multiplier, (i % 32) the shift: covers values
    // whose top bit lands in every one of the 32 bit positions.
    uint32_t v = (i / 32) << (i % 32);
    PutVarint32(&s, v);
  }

  const char* p = s.data();
  const char* limit = p + s.size();
  for (uint32_t i = 0; i < (32 * 32); i++) {
    uint32_t expected = (i / 32) << (i % 32);
    uint32_t actual;
    const char* start = p;
    p = GetVarint32Ptr(p, limit, &actual);
    ASSERT_TRUE(p != NULL);
    ASSERT_EQ(expected, actual);
    ASSERT_EQ(VarintLength(actual), p - start);
  }
  ASSERT_EQ(p, s.data() + s.size());  // everything consumed exactly
}
// Round-trips a collection of interesting uint64 values (0, maxima, and
// neighbors of every power of two) through PutVarint64/GetVarint64Ptr,
// checking the consumed byte count against VarintLength.
TEST(Coding, Varint64) {
  // Construct the list of values to check
  std::vector<uint64_t> values;
  // Some special values
  values.push_back(0);
  values.push_back(100);
  values.push_back(~static_cast<uint64_t>(0));
  values.push_back(~static_cast<uint64_t>(0) - 1);
  for (uint32_t k = 0; k < 64; k++) {
    // Test values near powers of two
    const uint64_t power = 1ull << k;
    values.push_back(power);
    values.push_back(power-1);
    values.push_back(power+1);
  }

  std::string s;
  // size_t index: values.size() is unsigned; comparing it against an
  // int triggers sign-compare warnings.
  for (size_t i = 0; i < values.size(); i++) {
    PutVarint64(&s, values[i]);
  }

  const char* p = s.data();
  const char* limit = p + s.size();
  for (size_t i = 0; i < values.size(); i++) {
    ASSERT_TRUE(p < limit);
    uint64_t actual;
    const char* start = p;
    p = GetVarint64Ptr(p, limit, &actual);
    ASSERT_TRUE(p != NULL);
    ASSERT_EQ(values[i], actual);
    ASSERT_EQ(VarintLength(actual), p - start);
  }
  ASSERT_EQ(p, limit);
}
// A 6-byte sequence whose first five bytes all have the continuation bit
// set exceeds the 5-byte varint32 maximum and must be rejected.
TEST(Coding, Varint32Overflow) {
  uint32_t result;
  std::string input("\x81\x82\x83\x84\x85\x11");
  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
              == NULL);
}
// Every strict prefix of a maximum-length (5-byte) varint32 must fail to
// decode; the full encoding must succeed and round-trip the value.
TEST(Coding, Varint32Truncation) {
  uint32_t large_value = (1u << 31) + 100;
  std::string s;
  PutVarint32(&s, large_value);
  uint32_t result;
  // size_t index: s.size() is unsigned, so "int len < s.size() - 1"
  // mixes signedness and triggers sign-compare warnings.
  for (size_t len = 0; len + 1 < s.size(); len++) {
    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
  }
  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
  ASSERT_EQ(large_value, result);
}
// An 11-byte sequence whose first ten bytes all have the continuation bit
// set exceeds the 10-byte varint64 maximum and must be rejected.
TEST(Coding, Varint64Overflow) {
  uint64_t result;
  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
              == NULL);
}
// Every strict prefix of a maximum-length (10-byte) varint64 must fail to
// decode; the full encoding must succeed and round-trip the value.
TEST(Coding, Varint64Truncation) {
  uint64_t large_value = (1ull << 63) + 100ull;
  std::string s;
  PutVarint64(&s, large_value);
  uint64_t result;
  // size_t index: avoids the signed/unsigned comparison that
  // "int len < s.size() - 1" produced.
  for (size_t len = 0; len + 1 < s.size(); len++) {
    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
  }
  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
  ASSERT_EQ(large_value, result);
}
// Encodes four length-prefixed strings back-to-back, then decodes them in
// order and verifies the input is fully consumed.
TEST(Coding, Strings) {
  std::string s;
  PutLengthPrefixedSlice(&s, Slice(""));
  PutLengthPrefixedSlice(&s, Slice("foo"));
  PutLengthPrefixedSlice(&s, Slice("bar"));
  // 200 bytes forces a multi-byte... no: 200 fits in two varint bytes;
  // it exercises a length whose varint has the high bit set.
  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));

  Slice input(s);
  Slice v;
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("foo", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ("bar", v.ToString());
  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
  ASSERT_EQ(std::string(200, 'x'), v.ToString());
  ASSERT_EQ("", input.ToString());  // nothing left over
}
} | |||
// Test entry point: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}
@ -0,0 +1,72 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include <assert.h>
#include <stdint.h>

#include <algorithm>

#include "include/comparator.h"
#include "include/slice.h"
#include "util/logging.h"
namespace leveldb { | |||
Comparator::~Comparator() { } | |||
namespace { | |||
class BytewiseComparatorImpl : public Comparator { | |||
public: | |||
BytewiseComparatorImpl() { } | |||
virtual const char* Name() const { | |||
return "leveldb.BytewiseComparator"; | |||
} | |||
virtual int Compare(const Slice& a, const Slice& b) const { | |||
return a.compare(b); | |||
} | |||
virtual void FindShortestSeparator( | |||
std::string* start, | |||
const Slice& limit) const { | |||
// Find length of common prefix | |||
size_t min_length = std::min(start->size(), limit.size()); | |||
size_t diff_index = 0; | |||
while ((diff_index < min_length) && | |||
((*start)[diff_index] == limit[diff_index])) { | |||
diff_index++; | |||
} | |||
if (diff_index >= min_length) { | |||
// Do not shorten if one string is a prefix of the other | |||
} else { | |||
uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]); | |||
if (diff_byte < static_cast<uint8_t>(0xff) && | |||
diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) { | |||
(*start)[diff_index]++; | |||
start->resize(diff_index + 1); | |||
assert(Compare(*start, limit) < 0); | |||
} | |||
} | |||
} | |||
virtual void FindShortSuccessor(std::string* key) const { | |||
// Find first character that can be incremented | |||
size_t n = key->size(); | |||
for (int i = 0; i < n; i++) { | |||
const uint8_t byte = (*key)[i]; | |||
if (byte != static_cast<uint8_t>(0xff)) { | |||
(*key)[i] = byte + 1; | |||
key->resize(i+1); | |||
return; | |||
} | |||
} | |||
// *key is a run of 0xffs. Leave it alone. | |||
} | |||
}; | |||
} | |||
// Single process-wide instance; BytewiseComparator() hands out a pointer
// to it, so callers must never delete the returned comparator.
static const BytewiseComparatorImpl bytewise;

const Comparator* BytewiseComparator() {
  return &bytewise;
}
} |
@ -0,0 +1,332 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
// | |||
// A portable implementation of crc32c, optimized to handle | |||
// four bytes at a time. | |||
#include "util/crc32c.h" | |||
#include <stdint.h> | |||
#include "util/coding.h" | |||
namespace leveldb { | |||
namespace crc32c { | |||
static const uint32_t table0_[256] = { | |||
0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, | |||
0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, | |||
0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, | |||
0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, | |||
0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, | |||
0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, | |||
0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, | |||
0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, | |||
0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, | |||
0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, | |||
0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, | |||
0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, | |||
0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, | |||
0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, | |||
0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, | |||
0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, | |||
0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, | |||
0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, | |||
0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, | |||
0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, | |||
0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, | |||
0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, | |||
0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, | |||
0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, | |||
0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, | |||
0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, | |||
0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, | |||
0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, | |||
0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, | |||
0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, | |||
0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, | |||
0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, | |||
0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, | |||
0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, | |||
0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, | |||
0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, | |||
0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, | |||
0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, | |||
0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, | |||
0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, | |||
0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, | |||
0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, | |||
0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, | |||
0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, | |||
0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, | |||
0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, | |||
0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, | |||
0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, | |||
0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, | |||
0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, | |||
0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, | |||
0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, | |||
0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, | |||
0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, | |||
0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, | |||
0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, | |||
0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, | |||
0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, | |||
0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, | |||
0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, | |||
0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, | |||
0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, | |||
0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, | |||
0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 | |||
}; | |||
static const uint32_t table1_[256] = { | |||
0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, | |||
0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, | |||
0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, | |||
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, | |||
0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, | |||
0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, | |||
0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, | |||
0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, | |||
0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, | |||
0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, | |||
0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, | |||
0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, | |||
0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, | |||
0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, | |||
0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, | |||
0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, | |||
0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, | |||
0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, | |||
0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, | |||
0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, | |||
0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, | |||
0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, | |||
0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, | |||
0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, | |||
0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, | |||
0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, | |||
0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, | |||
0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, | |||
0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, | |||
0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, | |||
0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, | |||
0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, | |||
0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, | |||
0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, | |||
0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, | |||
0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, | |||
0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, | |||
0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, | |||
0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, | |||
0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, | |||
0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, | |||
0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, | |||
0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, | |||
0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, | |||
0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, | |||
0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, | |||
0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, | |||
0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, | |||
0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, | |||
0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, | |||
0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, | |||
0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, | |||
0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, | |||
0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, | |||
0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, | |||
0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, | |||
0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, | |||
0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, | |||
0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, | |||
0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, | |||
0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, | |||
0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, | |||
0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, | |||
0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 | |||
}; | |||
static const uint32_t table2_[256] = { | |||
0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, | |||
0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, | |||
0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, | |||
0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, | |||
0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, | |||
0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, | |||
0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, | |||
0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, | |||
0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, | |||
0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, | |||
0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, | |||
0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, | |||
0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, | |||
0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, | |||
0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, | |||
0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, | |||
0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, | |||
0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, | |||
0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, | |||
0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, | |||
0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, | |||
0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, | |||
0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, | |||
0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, | |||
0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, | |||
0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, | |||
0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, | |||
0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, | |||
0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, | |||
0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, | |||
0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, | |||
0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, | |||
0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, | |||
0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, | |||
0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, | |||
0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, | |||
0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, | |||
0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, | |||
0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, | |||
0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, | |||
0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, | |||
0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, | |||
0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, | |||
0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, | |||
0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, | |||
0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, | |||
0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, | |||
0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, | |||
0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, | |||
0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, | |||
0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, | |||
0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, | |||
0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, | |||
0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, | |||
0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, | |||
0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, | |||
0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, | |||
0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, | |||
0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, | |||
0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, | |||
0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, | |||
0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, | |||
0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, | |||
0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 | |||
}; | |||
static const uint32_t table3_[256] = { | |||
0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, | |||
0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, | |||
0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, | |||
0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, | |||
0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, | |||
0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, | |||
0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, | |||
0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, | |||
0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, | |||
0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, | |||
0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, | |||
0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, | |||
0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, | |||
0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, | |||
0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, | |||
0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, | |||
0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, | |||
0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, | |||
0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, | |||
0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, | |||
0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, | |||
0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, | |||
0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, | |||
0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, | |||
0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, | |||
0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, | |||
0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, | |||
0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, | |||
0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, | |||
0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, | |||
0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, | |||
0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, | |||
0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, | |||
0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, | |||
0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, | |||
0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, | |||
0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, | |||
0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, | |||
0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, | |||
0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, | |||
0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, | |||
0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, | |||
0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, | |||
0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, | |||
0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, | |||
0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, | |||
0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, | |||
0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, | |||
0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, | |||
0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, | |||
0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, | |||
0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, | |||
0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, | |||
0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, | |||
0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, | |||
0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, | |||
0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, | |||
0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, | |||
0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, | |||
0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, | |||
0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, | |||
0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, | |||
0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, | |||
0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 | |||
}; | |||
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order | |||
static inline uint32_t LE_LOAD32(const uint8_t *p) { | |||
return DecodeFixed32(reinterpret_cast<const char*>(p)); | |||
} | |||
// Extends crc with buf[0,size-1] using the four 256-entry lookup tables
// above: after aligning, the main loops consume 4 input bytes per round
// (one lookup in each table), unrolled to 16 bytes per iteration.
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  // The running state is kept bit-inverted; invert on entry and exit so
  // that Extend composes correctly across calls.
  uint32_t l = crc ^ 0xffffffffu;

// Advance the CRC by one input byte via table0_.
#define STEP1 do {                              \
    int c = (l & 0xff) ^ *p++;                  \
    l = table0_[c] ^ (l >> 8);                  \
} while (0)
// Advance the CRC by four input bytes at once: each table corresponds to
// one of the four byte positions.
#define STEP4 do {                              \
    uint32_t c = l ^ LE_LOAD32(p);              \
    p += 4;                                     \
    l = table3_[c & 0xff] ^                     \
        table2_[(c >> 8) & 0xff] ^              \
        table1_[(c >> 16) & 0xff] ^             \
        table0_[c >> 24];                       \
} while (0)

  // Point x at first 4-byte aligned byte in string.  This might be
  // just past the end of the string.
  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
  const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
  if (x <= e) {
    // Process bytes until finished or p is 4-byte aligned
    while (p != x) {
      STEP1;
    }
  }
  // Process bytes 16 at a time
  while ((e-p) >= 16) {
    STEP4; STEP4; STEP4; STEP4;
  }
  // Process bytes 4 at a time
  while ((e-p) >= 4) {
    STEP4;
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP4
#undef STEP1
  return l ^ 0xffffffffu;
}
} | |||
} |
@ -0,0 +1,45 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ | |||
#define STORAGE_LEVELDB_UTIL_CRC32C_H_ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
namespace leveldb { | |||
namespace crc32c { | |||
// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the | |||
// crc32c of some string A. Extend() is often used to maintain the | |||
// crc32c of a stream of data. | |||
extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); | |||
// Return the crc32c of data[0,n-1] | |||
inline uint32_t Value(const char* data, size_t n) {
  // Equivalent to extending the CRC of the empty string (init_crc == 0).
  return Extend(0, data, n);
}
static const uint32_t kMaskDelta = 0xa282ead8ul;

// Return a masked representation of crc.
//
// Motivation: it is problematic to compute the CRC of a string that
// contains embedded CRCs.  Therefore we recommend that CRCs stored
// somewhere (e.g., in files) should be masked before being stored.
inline uint32_t Mask(uint32_t crc) {
  // Rotate right by 15 bits, then add a constant so that the masked
  // value is not a trivial function of the original CRC.
  const uint32_t rotated = (crc >> 15) | (crc << 17);
  return rotated + kMaskDelta;
}

// Return the crc whose masked representation is masked_crc
// (the exact inverse of Mask()).
inline uint32_t Unmask(uint32_t masked_crc) {
  // Undo the addition, then rotate left by 15 bits.
  const uint32_t rot = masked_crc - kMaskDelta;
  return (rot >> 17) | (rot << 15);
}
} | |||
} | |||
#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_ |
@ -0,0 +1,86 @@ | |||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
// Use of this source code is governed by a BSD-style license that can be | |||
// found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
#include "util/crc32c.h" | |||
#include "util/testharness.h" | |||
namespace leveldb { | |||
namespace crc32c { | |||
// Empty tag class: the TEST macro keys each test off a class name.
class CRC { };
// Checks Value() against the published CRC32C vectors from
// RFC 3720 section B.4.
TEST(CRC, StandardResults) {
  // From rfc3720 section B.4.
  char buf[32];

  memset(buf, 0, sizeof(buf));
  ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));

  memset(buf, 0xff, sizeof(buf));
  ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));

  for (int i = 0; i < 32; i++) {
    buf[i] = i;
  }
  ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));

  for (int i = 0; i < 32; i++) {
    buf[i] = 31 - i;
  }
  ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));

  // An iSCSI "Read" command PDU from the RFC's examples.
  unsigned char data[48] = {
    0x01, 0xc0, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x14, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x04, 0x00,
    0x00, 0x00, 0x00, 0x14,
    0x00, 0x00, 0x00, 0x18,
    0x28, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
    0x02, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00,
  };
  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
}
// Sanity check: different inputs produce different CRCs.
TEST(CRC, Values) {
  ASSERT_NE(Value("a", 1), Value("foo", 3));
}
// Extending a prefix CRC with the remaining bytes must equal the CRC of
// the whole string.
TEST(CRC, Extend) {
  ASSERT_EQ(Value("hello world", 11),
            Extend(Value("hello ", 6), "world", 5));
}
// Mask() must change the value, not be idempotent, and be exactly
// inverted by Unmask().
TEST(CRC, Mask) {
  uint32_t crc = Value("foo", 3);
  ASSERT_NE(crc, Mask(crc));
  ASSERT_NE(crc, Mask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Mask(crc)));
  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
}
// Throughput measurement over 100 MB of 'x'; the result is printed to
// stderr rather than asserted.
// NOTE(review): uses Env::Default() but this file does not include an env
// header directly -- presumably pulled in transitively via testharness.h;
// verify.
TEST(CRC, Benchmark) {
  std::string data(1048576 * 100, 'x');
  double start = Env::Default()->NowMicros() * 1e-6;
  static const int kIters = 10;
  uint32_t crc = 0;
  for (int i = 0; i < kIters; i++) {
    // OR the results so the compiler cannot discard the computation.
    crc |= Value(data.data(), data.size());
  }
  double finish = Env::Default()->NowMicros() * 1e-6;
  double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
  fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
          mb, (finish - start), mb / (finish - start), crc);
}
} | |||
} | |||
// Test entry point: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}