Browse Source

Initial checkin.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529
ld
jorlow@chromium.org 13 years ago
parent
commit
f67e15e50f
118 changed files with 19207 additions and 0 deletions
  1. +8
    -0
      AUTHORS
  2. +64
    -0
      Android.mk
  3. +6
    -0
      Application.mk
  4. +27
    -0
      LICENSE
  5. +134
    -0
      Makefile
  6. +51
    -0
      README
  7. +23
    -0
      TODO
  8. +97
    -0
      db/builder.cc
  9. +36
    -0
      db/builder.h
  10. +366
    -0
      db/corruption_test.cc
  11. +376
    -0
      db/db_bench.cc
  12. +1195
    -0
      db/db_impl.cc
  13. +192
    -0
      db/db_impl.h
  14. +412
    -0
      db/db_iter.cc
  15. +26
    -0
      db/db_iter.h
  16. +963
    -0
      db/db_test.cc
  17. +152
    -0
      db/dbformat.cc
  18. +198
    -0
      db/dbformat.h
  19. +127
    -0
      db/dbformat_test.cc
  20. +154
    -0
      db/filename.cc
  21. +92
    -0
      db/filename.h
  22. +156
    -0
      db/filename_test.cc
  23. +35
    -0
      db/log_format.h
  24. +172
    -0
      db/log_reader.cc
  25. +75
    -0
      db/log_reader.h
  26. +361
    -0
      db/log_test.cc
  27. +101
    -0
      db/log_writer.cc
  28. +48
    -0
      db/log_writer.h
  29. +109
    -0
      db/memtable.cc
  30. +69
    -0
      db/memtable.h
  31. +396
    -0
      db/repair.cc
  32. +378
    -0
      db/skiplist.h
  33. +378
    -0
      db/skiplist_test.cc
  34. +66
    -0
      db/snapshot.h
  35. +94
    -0
      db/table_cache.cc
  36. +49
    -0
      db/table_cache.h
  37. +282
    -0
      db/version_edit.cc
  38. +118
    -0
      db/version_edit.h
  39. +50
    -0
      db/version_edit_test.cc
  40. +1003
    -0
      db/version_set.cc
  41. +290
    -0
      db/version_set.h
  42. +164
    -0
      db/write_batch.cc
  43. +73
    -0
      db/write_batch_internal.h
  44. +110
    -0
      db/write_batch_test.cc
  45. +89
    -0
      doc/doc.css
  46. +222
    -0
      doc/impl.html
  47. +508
    -0
      doc/index.html
  48. +72
    -0
      doc/log_format.txt
  49. +61
    -0
      doc/table_format.txt
  50. +99
    -0
      include/cache.h
  51. +61
    -0
      include/comparator.h
  52. +137
    -0
      include/db.h
  53. +293
    -0
      include/env.h
  54. +95
    -0
      include/iterator.h
  55. +203
    -0
      include/options.h
  56. +104
    -0
      include/slice.h
  57. +86
    -0
      include/status.h
  58. +67
    -0
      include/table.h
  59. +86
    -0
      include/table_builder.h
  60. +49
    -0
      include/write_batch.h
  61. +329
    -0
      leveldb.gyp
  62. +10
    -0
      port/README
  63. +21
    -0
      port/port.h
  64. +65
    -0
      port/port_android.cc
  65. +131
    -0
      port/port_android.h
  66. +83
    -0
      port/port_chromium.cc
  67. +104
    -0
      port/port_chromium.h
  68. +119
    -0
      port/port_example.h
  69. +50
    -0
      port/port_posix.cc
  70. +108
    -0
      port/port_posix.h
  71. +298
    -0
      port/sha1_portable.cc
  72. +25
    -0
      port/sha1_portable.h
  73. +55
    -0
      port/sha1_test.cc
  74. +261
    -0
      table/block.cc
  75. +43
    -0
      table/block.h
  76. +109
    -0
      table/block_builder.cc
  77. +57
    -0
      table/block_builder.h
  78. +131
    -0
      table/format.cc
  79. +103
    -0
      table/format.h
  80. +68
    -0
      table/iterator.cc
  81. +64
    -0
      table/iterator_wrapper.h
  82. +143
    -0
      table/merger.cc
  83. +26
    -0
      table/merger.h
  84. +175
    -0
      table/table.cc
  85. +224
    -0
      table/table_builder.cc
  86. +808
    -0
      table/table_test.cc
  87. +182
    -0
      table/two_level_iterator.cc
  88. +34
    -0
      table/two_level_iterator.h
  89. +68
    -0
      util/arena.cc
  90. +68
    -0
      util/arena.h
  91. +68
    -0
      util/arena_test.cc
  92. +253
    -0
      util/cache.cc
  93. +169
    -0
      util/cache_test.cc
  94. +194
    -0
      util/coding.cc
  95. +104
    -0
      util/coding.h
  96. +173
    -0
      util/coding_test.cc
  97. +72
    -0
      util/comparator.cc
  98. +332
    -0
      util/crc32c.cc
  99. +45
    -0
      util/crc32c.h
  100. +86
    -0
      util/crc32c_test.cc

+ 8
- 0
AUTHORS View File

@ -0,0 +1,8 @@
# Names should be added to this file like so:
# Name or Organization <email address>
Google Inc.
# Initial version authors:
Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com>

+ 64
- 0
Android.mk View File

@ -0,0 +1,64 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
# INSTRUCTIONS
# After you've downloaded and installed the Android NDK from:
# http://developer.android.com/sdk/ndk/index.html
# 1. In the same directory as this file, Android.mk, type:
# $ ln -s leveldb ../jni
# (The Android NDK will only build native projects in
# subdirectories named "jni".)
# 2. $ cd ..
# 3. Execute ndk-build:
# $ $(ANDROID_NDK_DIR)/ndk-build
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := leveldb
# Build flags:
# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h
LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x
LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../
LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := ./db/builder.cc \
./db/db_bench.cc \
./db/db_impl.cc \
./db/db_iter.cc \
./db/filename.cc \
./db/dbformat.cc \
./db/log_reader.cc \
./db/log_writer.cc \
./db/memtable.cc \
./db/repair.cc \
./db/table_cache.cc \
./db/version_edit.cc \
./db/version_set.cc \
./db/write_batch.cc \
./port/port_android.cc \
./table/block.cc \
./table/block_builder.cc \
./table/format.cc \
./table/iterator.cc \
./table/merger.cc \
./table/table.cc \
./table/table_builder.cc \
./table/two_level_iterator.cc \
./util/arena.cc \
./util/cache.cc \
./util/coding.cc \
./util/comparator.cc \
./util/crc32c.cc \
./util/env.cc \
./util/env_posix.cc \
./util/hash.cc \
./util/histogram.cc \
./util/logging.cc \
./util/options.cc \
./util/status.cc \
./util/testharness.cc \
./util/testutil.cc
include $(BUILD_SHARED_LIBRARY)

+ 6
- 0
Application.mk View File

@ -0,0 +1,6 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
APP_ABI := armeabi-v7a
APP_STL := gnustl_static

+ 27
- 0
LICENSE View File

@ -0,0 +1,27 @@
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 134
- 0
Makefile View File

@ -0,0 +1,134 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
CC = g++
# Uncomment one of the following to switch between debug and opt mode
#OPT = -O2 -DNDEBUG
OPT = -g2
CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -std=c++0x $(OPT)
LDFLAGS=-lpthread
LIBOBJECTS = \
./db/builder.o \
./db/db_impl.o \
./db/db_iter.o \
./db/filename.o \
./db/format.o \
./db/log_reader.o \
./db/log_writer.o \
./db/memtable.o \
./db/repair.o \
./db/table_cache.o \
./db/version_edit.o \
./db/version_set.o \
./db/write_batch.o \
./port/port_posix.o \
./port/sha1_portable.o \
./table/block.o \
./table/block_builder.o \
./table/format.o \
./table/iterator.o \
./table/merger.o \
./table/table.o \
./table/table_builder.o \
./table/two_level_iterator.o \
./util/arena.o \
./util/cache.o \
./util/coding.o \
./util/comparator.o \
./util/crc32c.o \
./util/env.o \
./util/env_posix.o \
./util/hash.o \
./util/histogram.o \
./util/logging.o \
./util/options.o \
./util/status.o
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
TESTS = \
arena_test \
cache_test \
coding_test \
corruption_test \
crc32c_test \
db_test \
dbformat_test \
env_test \
filename_test \
log_test \
sha1_test \
skiplist_test \
table_test \
version_edit_test \
write_batch_test
PROGRAMS = db_bench $(TESTS)
all: $(PROGRAMS)
check: $(TESTS)
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
clean:
rm -f $(PROGRAMS) */*.o
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
.cc.o:
$(CC) $(CFLAGS) $< -o $@
# TODO(gabor): dependencies for .o files
# TODO(gabor): Build library

+ 51
- 0
README View File

@ -0,0 +1,51 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
The code under this directory implements a system for maintaining a
persistent key/value store.
See doc/index.html for more explanation.
See doc/db_layout.txt for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
include/db.h
Main interface to the DB: Start here
include/options.h
Control over the behavior of an entire database, and also
control over the behavior of individual reads and writes.
include/comparator.h
Abstraction for user-specified comparison function. If you want
just bytewise comparison of keys, you can use the default comparator,
but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character
encodings, etc.)
include/iterator.h
Interface for iterating over data. You can get an iterator
from a DB object.
include/write_batch.h
Interface for atomically applying multiple updates to a database.
include/slice.h
A simple module for maintaining a pointer and a length into some
other byte array.
include/status.h
Status is returned from many of the public interfaces and is used
to report success and various kinds of errors.
include/env.h
Abstraction of the OS environment. A posix implementation of
this interface is in util/env_posix.cc
include/table.h
include/table_builder.h
Lower-level modules that most clients probably won't use directly

+ 23
- 0
TODO View File

@ -0,0 +1,23 @@
Before adding to chrome
-----------------------
- multi-threaded test/benchmark
- Allow missing crc32c in Table format?
Maybe afterwards
----------------
ss
- Stats
- Speed up backwards scan (avoid three passes over data)
db
- Maybe implement DB::BulkDeleteForRange(start_key, end_key)
that would blow away files whose ranges are entirely contained
within [start_key..end_key]? For Chrome, deletion of obsolete
object stores, etc. can be done in the background anyway, so
probably not that important.
api changes?
- Efficient large value reading and writing
Faster Get implementation

+ 97
- 0
db/builder.cc View File

@ -0,0 +1,97 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/builder.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "include/db.h"
#include "include/env.h"
#include "include/iterator.h"
namespace leveldb {
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
VersionEdit* edit) {
Status s;
meta->file_size = 0;
iter->SeekToFirst();
std::string fname = TableFileName(dbname, meta->number);
if (iter->Valid()) {
WritableFile* file;
s = env->NewWritableFile(fname, &file);
if (!s.ok()) {
return s;
}
TableBuilder* builder = new TableBuilder(options, file);
meta->smallest.DecodeFrom(iter->key());
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
meta->largest.DecodeFrom(key);
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
s = Status::Corruption("invalid indirect reference hash value (L0)");
break;
}
edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
meta->number,
iter->key());
}
builder->Add(key, iter->value());
}
// Finish and check for builder errors
if (s.ok()) {
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
assert(meta->file_size > 0);
}
} else {
builder->Abandon();
}
delete builder;
// Finish and check for file errors
if (s.ok()) {
s = file->Sync();
}
if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
s = it->status();
delete it;
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
if (s.ok() && meta->file_size > 0) {
edit->AddFile(0, meta->number, meta->file_size,
meta->smallest, meta->largest);
} else {
env->DeleteFile(fname);
}
return s;
}
}

+ 36
- 0
db/builder.h View File

@ -0,0 +1,36 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
#define STORAGE_LEVELDB_DB_BUILDER_H_
#include "include/status.h"
namespace leveldb {
struct Options;
struct FileMetaData;
class Env;
class Iterator;
class TableCache;
class VersionEdit;
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table, and
// large value refs and the added file information will be added to
// *edit. If no data is present in *iter, meta->file_size will be set
// to zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
VersionEdit* edit);
}
#endif // STORAGE_LEVELDB_DB_BUILDER_H_

+ 366
- 0
db/corruption_test.cc View File

@ -0,0 +1,366 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "include/env.h"
#include "include/table.h"
#include "include/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kValueSize = 1000;
class CorruptionTest {
public:
test::ErrorEnv env_;
Random rnd_;
std::string dbname_;
Options options_;
DB* db_;
CorruptionTest() : rnd_(test::RandomSeed()) {
options_.env = &env_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
db_ = NULL;
options_.create_if_missing = true;
Reopen();
options_.create_if_missing = false;
}
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
}
Status TryReopen(Options* options = NULL) {
delete db_;
db_ = NULL;
Options opt = (options ? *options : options_);
opt.env = &env_;
return DB::Open(opt, dbname_, &db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
}
void RepairDB() {
delete db_;
db_ = NULL;
ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
}
void Build(int n) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = 0; i < n; i++) {
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
}
}
void Check(int min_expected, int max_expected) {
int next_expected = 0;
int missed = 0;
int bad_keys = 0;
int bad_values = 0;
int correct = 0;
std::string value_space;
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
bad_keys++;
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
bad_values++;
} else {
correct++;
}
}
delete iter;
fprintf(stderr,
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
min_expected, max_expected, correct, bad_keys, bad_values, missed);
ASSERT_LE(min_expected, correct);
ASSERT_GE(max_expected, correct);
}
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
// Pick file to corrupt
std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number;
LargeValueRef large_ref;
FileType type;
std::vector<std::string> candidates;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == filetype) {
candidates.push_back(dbname_ + "/" + filenames[i]);
}
}
ASSERT_TRUE(!candidates.empty()) << filetype;
std::string fname = candidates[rnd_.Uniform(candidates.size())];
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
ASSERT_TRUE(false) << fname << ": " << msg;
}
if (offset < 0) {
// Relative to end of file; make it absolute
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
}
// Do it
std::string contents;
Status s = ReadFileToString(Env::Default(), fname, &contents);
ASSERT_TRUE(s.ok()) << s.ToString();
for (int i = 0; i < bytes_to_corrupt; i++) {
contents[i + offset] ^= 0x80;
}
s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString();
}
uint64_t Property(const std::string& name) {
uint64_t result;
if (!db_->GetProperty(name, &result)) {
result = ~static_cast<uint64_t>(0);
}
return result;
}
// Return the ith key
Slice Key(int i, std::string* storage) {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
};
TEST(CorruptionTest, Recovery) {
Build(10);
Check(10, 10);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, 2*kValueSize, 1); // Somewhere in second log record?
Reopen();
Check(8, 8);
}
TEST(CorruptionTest, RecoverWriteError) {
env_.writable_file_error_ = true;
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
}
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
for (int i = 0; s.ok() && i < num; i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch);
}
ASSERT_TRUE(!s.ok());
ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false;
Reopen();
}
TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
Corrupt(kTableFile, 100, 1);
Check(99, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
Corrupt(kTableFile, -1000, 500);
Reopen();
Check(5000, 9999);
}
TEST(CorruptionTest, MissingDescriptor) {
Build(1000);
RepairDB();
Reopen();
Check(1000, 1000);
}
TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v5", v);
// Write something. If sequence number was not recovered properly,
// it will be hidden by an earlier write.
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
}
TEST(CorruptionTest, LargeValueRecovery) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big;
ASSERT_OK(db_->Put(WriteOptions(),
"foo", test::RandomString(&rnd, 100000, &big)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
RepairDB();
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
}
TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("hello", v);
}
TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Force compactions by writing lots of values
Build(10000);
Check(10000, 10000);
dbi->TEST_CompactRange(0, "", "~");
ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
Reopen(&options);
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Write must eventually fail because of corrupted table
Status s;
std::string tmp1, tmp2;
for (int i = 0; i < 10000 && s.ok(); i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
}
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1);
std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
dbi->TEST_CompactMemTable();
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 376
- 0
db/db_bench.cc View File

@ -0,0 +1,376 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "include/cache.h"
#include "include/db.h"
#include "include/env.h"
#include "include/write_batch.h"
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
// writeseq -- write N values in sequential key order
// writerandom -- write N values in random key order
// writebig -- write N/1000 100K valuesin random order
// readseq -- read N values sequentially
// readrandom -- read N values in random order
// Meta operations:
// compact -- Compact the entire DB
// heapprofile -- Dump a heap profile (if supported by this port)
// sync -- switch to synchronous writes (not the default)
// nosync -- switch to asynchronous writes (the default)
// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
// normal -- reset N back to its normal value (1000000)
static const char* FLAGS_benchmarks =
"writeseq,"
"writeseq,"
"writerandom,"
"sync,tenth,tenth,writerandom,nosync,normal,"
"readseq,"
"readrandom,"
"compact,"
"readseq,"
"readrandom,"
"writebig";
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Size of each value
static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.25;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Number of bytes to buffer in memtable before compacting
static int FLAGS_write_buffer_size = 1 << 20;
namespace leveldb {
// Helper for quickly generating random data.
namespace {
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
}
class Benchmark {
private:
Cache* cache_;
DB* db_;
int num_;
bool sync_;
int heap_counter_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) {
next_report_ += 100;
} else if (next_report_ < 10000) {
next_report_ += 1000;
} else if (next_report_ < 100000) {
next_report_ += 10000;
} else {
next_report_ += 100000;
}
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does nto call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%5.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_.push_back(' ');
}
message_.append(rate);
}
fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
public:
enum Order { SEQUENTIAL, RANDOM };
Benchmark() : cache_(NewLRUCache(200<<20)),
db_(NULL),
num_(FLAGS_num),
sync_(false),
heap_counter_(0),
bytes_(0),
rand_(301) {
std::vector<std::string> files;
Env::Default()->GetChildren("/tmp/dbbench", &files);
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
}
}
DestroyDB("/tmp/dbbench", Options());
}
~Benchmark() {
delete db_;
delete cache_;
}
void Run() {
Options options;
options.create_if_missing = true;
options.max_open_files = 10000;
options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size;
Start();
Status s = DB::Open(options, "/tmp/dbbench", &db_);
Stop("open");
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
exit(1);
}
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
Start();
if (name == Slice("writeseq")) {
Write(SEQUENTIAL, num_, FLAGS_value_size);
} else if (name == Slice("writerandom")) {
Write(RANDOM, num_, FLAGS_value_size);
} else if (name == Slice("writebig")) {
Write(RANDOM, num_ / 1000, 100 * 1000);
} else if (name == Slice("readseq")) {
Read(SEQUENTIAL);
} else if (name == Slice("readrandom")) {
Read(RANDOM);
} else if (name == Slice("compact")) {
Compact();
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("sync")) {
sync_ = true;
} else if (name == Slice("nosync")) {
sync_ = false;
} else if (name == Slice("tenth")) {
num_ = num_ / 10;
} else if (name == Slice("normal")) {
num_ = FLAGS_num;
} else {
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
Stop(name);
}
}
void Write(Order order, int num_entries, int value_size) {
WriteBatch batch;
Status s;
std::string val;
WriteOptions options;
options.sync = sync_;
for (int i = 0; i < num_entries; i++) {
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%012d", k);
batch.Clear();
batch.Put(key, gen_.Generate(value_size));
s = db_->Write(options, &batch);
bytes_ += value_size + strlen(key);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
FinishedSingleOp();
}
}
void Read(Order order) {
ReadOptions options;
if (order == SEQUENTIAL) {
Iterator* iter = db_->NewIterator(options);
int i = 0;
for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
bytes_ += iter->key().size() + iter->value().size();
FinishedSingleOp();
++i;
}
delete iter;
} else {
std::string value;
for (int i = 0; i < num_; i++) {
char key[100];
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
snprintf(key, sizeof(key), "%012d", k);
db_->Get(options, key, &value);
FinishedSingleOp();
}
}
}
void Compact() {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
uint64_t v;
char name[100];
snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
if (db_->GetProperty(name, &v) && v > 0) {
max_level_with_files = level;
}
}
for (int level = 0; level < max_level_with_files; level++) {
dbi->TEST_CompactRange(level, "", "~");
}
}
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_);
WritableFile* file;
Status s = Env::Default()->NewWritableFile(fname, &file);
if (!s.ok()) {
message_ = s.ToString();
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file);
delete file;
if (!ok) {
message_ = "not supported";
Env::Default()->DeleteFile(fname);
}
}
};
}
int main(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}

+ 1195
- 0
db/db_impl.cc
File diff suppressed because it is too large
View File


+ 192
- 0
db/db_impl.h View File

@ -0,0 +1,192 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
#define STORAGE_LEVELDB_DB_DB_IMPL_H_
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "include/db.h"
#include "include/env.h"
#include "port/port.h"
namespace leveldb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class DBImpl : public DB {
public:
DBImpl(const Options& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, uint64_t* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [begin,end]
void TEST_CompactRange(
int level,
const std::string& begin,
const std::string& end);
// Force current memtable contents to be compacted.
Status TEST_CompactMemTable();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
private:
friend class DB;
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot);
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit);
// Apply the specified updates and save the resulting descriptor to
// persistent storage. If cleanup_mem is non-NULL, arrange to
// delete it when all existing snapshots have gone away iff Install()
// returns OK.
Status Install(VersionEdit* edit,
uint64_t new_log_number,
MemTable* cleanup_mem);
void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Called when an iterator over a particular version of the
// descriptor goes away.
static void Unref(void* arg1, void* arg2);
// Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status CompactMemTable();
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
SequenceNumber* max_sequence);
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
bool HasLargeValues(const WriteBatch& batch) const;
// Process data in "*updates" and return a status. "assigned_seq"
// is the sequence number assigned to the first mod in "*updates".
// If no large values are encountered, "*final" is set to "updates".
// If large values were encountered, registers the references of the
// large values with the VersionSet, writes the large values to
// files (if appropriate), and allocates a new WriteBatch with the
// large values replaced with indirect references and stores a
// pointer to the new WriteBatch in *final. If *final != updates on
// return, then the client should delete *final when no longer
// needed. Returns OK on success, and an appropriate error
// otherwise.
Status HandleLargeValues(SequenceNumber assigned_seq,
WriteBatch* updates,
WriteBatch** final);
// Helper routine for HandleLargeValues
void MaybeCompressLargeValue(
const Slice& raw_value,
Slice* file_bytes,
std::string* scratch,
LargeValueRef* ref);
struct CompactionState;
void MaybeScheduleCompaction();
static void BGWork(void* db);
void BackgroundCall();
void BackgroundCompaction();
void CleanupCompaction(CompactionState* compact);
Status DoCompactionWork(CompactionState* compact);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact);
// Constant after construction
Env* const env_;
const InternalKeyComparator internal_comparator_;
const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_;
const std::string dbname_;
// table_cache_ provides its own synchronization
TableCache* table_cache_;
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
port::CondVar compacting_cv_; // Signalled when !compacting_
SequenceNumber last_sequence_;
MemTable* mem_;
WritableFile* logfile_;
log::Writer* log_;
uint64_t log_number_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// Has a background compaction been scheduled or is running?
bool bg_compaction_scheduled_;
// Is there a compaction running?
bool compacting_;
VersionSet* versions_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const Options& src);
}
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_

+ 412
- 0
db/db_iter.cc View File

@ -0,0 +1,412 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_iter.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "include/env.h"
#include "include/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
namespace leveldb {
#if 0
static void DumpInternalIter(Iterator* iter) {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey k;
if (!ParseInternalKey(iter->key(), &k)) {
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
} else {
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
}
}
}
#endif
namespace {
// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
public:
DBIter(const std::string* dbname, Env* env,
const Comparator* cmp, Iterator* iter, SequenceNumber s)
: dbname_(dbname),
env_(env),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
large_(NULL),
valid_(false) {
}
virtual ~DBIter() {
delete iter_;
delete large_;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
assert(valid_);
return key_;
}
virtual Slice value() const {
assert(valid_);
if (large_ == NULL) {
return value_;
} else {
MutexLock l(&large_->mutex);
if (!large_->produced) {
ReadIndirectValue();
}
return large_->value;
}
}
virtual void Next() {
assert(valid_);
// iter_ is already positioned past DBIter::key()
FindNextUserEntry();
}
virtual void Prev() {
assert(valid_);
bool ignored;
ScanUntilBeforeCurrentKey(&ignored);
FindPrevUserEntry();
}
virtual void Seek(const Slice& target) {
ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
std::string tmp;
AppendInternalKey(&tmp, ikey);
iter_->Seek(tmp);
FindNextUserEntry();
}
virtual void SeekToFirst() {
iter_->SeekToFirst();
FindNextUserEntry();
}
virtual void SeekToLast();
virtual Status status() const {
if (status_.ok()) {
if (large_ != NULL && !large_->status.ok()) return large_->status;
return iter_->status();
} else {
return status_;
}
}
private:
void FindNextUserEntry();
void FindPrevUserEntry();
void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
void SaveValue(const Slice& v) {
if (value_.capacity() > v.size() + 1048576) {
std::string empty;
swap(empty, value_);
}
value_.assign(v.data(), v.size());
}
bool ParseKey(ParsedInternalKey* key);
void SkipPast(const Slice& k);
void ScanUntilBeforeCurrentKey(bool* found_live);
void ReadIndirectValue() const;
struct Large {
port::Mutex mutex;
std::string value;
bool produced;
Status status;
};
const std::string* const dbname_;
Env* const env_;
const Comparator* const user_comparator_;
// iter_ is positioned just past current entry for DBIter if valid_
Iterator* const iter_;
SequenceNumber const sequence_;
Status status_;
std::string key_; // Always a user key
std::string value_;
Large* large_; // Non-NULL if value is an indirect reference
bool valid_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {
return true;
}
}
void DBIter::FindNextUserEntry() {
if (large_ != NULL) {
if (status_.ok() && !large_->status.ok()) {
status_ = large_->status;
}
delete large_;
large_ = NULL;
}
while (iter_->Valid()) {
ParsedInternalKey ikey;
if (!ParseKey(&ikey)) {
// Skip past corrupted entry
iter_->Next();
continue;
}
if (ikey.sequence > sequence_) {
// Ignore entries newer than the snapshot
iter_->Next();
continue;
}
switch (ikey.type) {
case kTypeDeletion:
SaveKey(ikey.user_key); // Make local copy for use by SkipPast()
iter_->Next();
SkipPast(key_);
// Do not return deleted entries. Instead keep looping.
break;
case kTypeValue:
SaveKey(ikey.user_key);
SaveValue(iter_->value());
iter_->Next();
SkipPast(key_);
// Yield the value we just found.
valid_ = true;
return;
case kTypeLargeValueRef:
SaveKey(ikey.user_key);
// Save the large value ref as value_, and read it lazily on a call
// to value()
SaveValue(iter_->value());
large_ = new Large;
large_->produced = false;
iter_->Next();
SkipPast(key_);
// Yield the value we just found.
valid_ = true;
return;
}
}
valid_ = false;
key_.clear();
value_.clear();
assert(large_ == NULL);
}
void DBIter::SkipPast(const Slice& k) {
while (iter_->Valid()) {
ParsedInternalKey ikey;
// Note that if we cannot parse an internal key, we keep looping
// so that if we have a run like the following:
// <x,100,v> => value100
// <corrupted entry for user key x>
// <x,50,v> => value50
// we will skip over the corrupted entry as well as value50.
if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) {
break;
}
iter_->Next();
}
}
void DBIter::SeekToLast() {
// Position iter_ at the last uncorrupted user key and then
// let FindPrevUserEntry() do the heavy lifting to find
// a user key that is live.
iter_->SeekToLast();
ParsedInternalKey current;
while (iter_->Valid() && !ParseKey(&current)) {
iter_->Prev();
}
if (iter_->Valid()) {
SaveKey(current.user_key);
}
FindPrevUserEntry();
}
// Let X be the user key at which iter_ is currently positioned.
// Adjust DBIter to point at the last entry with a key <= X that
// has a live value.
void DBIter::FindPrevUserEntry() {
// Consider the following example:
//
// A@540
// A@400
//
// B@300
// B@200
// B@100 <- iter_
//
// C@301
// C@201
//
// The comments marked "(first iteration)" below relate what happens
// for the preceding example in the first iteration of the while loop
// below. There may be more than one iteration either if there are
// no live values for B, or if there is a corruption.
while (iter_->Valid()) {
std::string saved = key_;
bool found_live;
ScanUntilBeforeCurrentKey(&found_live);
// (first iteration) iter_ at A@400
if (found_live) {
// Step forward into range of entries with user key >= saved
if (!iter_->Valid()) {
iter_->SeekToFirst();
} else {
iter_->Next();
}
// (first iteration) iter_ at B@300
FindNextUserEntry(); // Sets key_ to the key of the next value it found
if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
// (first iteration) iter_ at C@301
return;
}
// FindNextUserEntry() could not find any entries under the
// user key "saved". This is probably a corruption since
// ScanUntilBefore(saved) found a live value. So we skip
// backwards to an earlier key and ignore the corrupted
// entries for "saved".
//
// (first iteration) iter_ at C@301 and saved == "B"
key_ = saved;
bool ignored;
ScanUntilBeforeCurrentKey(&ignored);
// (first iteration) iter_ at A@400
}
}
valid_ = false;
key_.clear();
value_.clear();
}
void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
*found_live = false;
if (!iter_->Valid()) {
iter_->SeekToLast();
}
while (iter_->Valid()) {
ParsedInternalKey current;
if (!ParseKey(&current)) {
iter_->Prev();
continue;
}
if (current.sequence > sequence_) {
// Ignore entries that are serialized after this read
iter_->Prev();
continue;
}
const int cmp = user_comparator_->Compare(current.user_key, key_);
if (cmp < 0) {
SaveKey(current.user_key);
return;
} else if (cmp == 0) {
switch (current.type) {
case kTypeDeletion:
*found_live = false;
break;
case kTypeValue:
case kTypeLargeValueRef:
*found_live = true;
break;
}
} else { // cmp > 0
*found_live = false;
}
iter_->Prev();
}
}
void DBIter::ReadIndirectValue() const {
assert(!large_->produced);
large_->produced = true;
LargeValueRef large_ref;
if (value_.size() != LargeValueRef::ByteSize()) {
large_->status = Status::Corruption("malformed large value reference");
return;
}
memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
std::string fname = LargeValueFileName(*dbname_, large_ref);
RandomAccessFile* file;
Status s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) {
uint64_t file_size = file->Size();
uint64_t value_size = large_ref.ValueSize();
large_->value.resize(value_size);
Slice result;
s = file->Read(0, file_size, &result,
const_cast<char*>(large_->value.data()));
if (s.ok()) {
if (result.size() == file_size) {
switch (large_ref.compression_type()) {
case kNoCompression: {
if (result.data() != large_->value.data()) {
large_->value.assign(result.data(), result.size());
}
break;
}
case kLightweightCompression: {
std::string uncompressed;
if (port::Lightweight_Uncompress(result.data(), result.size(),
&uncompressed) &&
uncompressed.size() == large_ref.ValueSize()) {
swap(uncompressed, large_->value);
} else {
s = Status::Corruption(
"Unable to read entire compressed large value file");
}
}
}
} else {
s = Status::Corruption("Unable to read entire large value file");
}
}
delete file; // Ignore errors on closing
}
if (!s.ok()) {
large_->value.clear();
large_->status = s;
}
}
} // anonymous namespace
Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
}
}

+ 26
- 0
db/db_iter.h View File

@ -0,0 +1,26 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
#define STORAGE_LEVELDB_DB_DB_ITER_H_
#include <stdint.h>
#include "include/db.h"
#include "db/dbformat.h"
namespace leveldb {
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
}
#endif // STORAGE_LEVELDB_DB_DB_ITER_H_

+ 963
- 0
db/db_test.cc View File

@ -0,0 +1,963 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "include/env.h"
#include "include/table.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
class DBTest {
public:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
DBTest() : env_(Env::Default()) {
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options());
db_ = NULL;
Reopen();
}
~DBTest() {
delete db_;
DestroyDB(dbname_, Options());
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
}
void DestroyAndReopen(Options* options = NULL) {
delete db_;
db_ = NULL;
DestroyDB(dbname_, Options());
ASSERT_OK(TryReopen(options));
}
Status TryReopen(Options* options) {
delete db_;
db_ = NULL;
Options opts;
if (options != NULL) {
opts = *options;
} else {
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const std::string& k, const std::string& v) {
WriteBatch batch;
batch.Put(k, v);
return db_->Write(WriteOptions(), &batch);
}
Status Delete(const std::string& k) {
WriteBatch batch;
batch.Delete(k);
return db_->Write(WriteOptions(), &batch);
}
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
std::string AllEntriesFor(const Slice& user_key) {
Iterator* iter = dbfull()->TEST_NewInternalIterator();
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
iter->Seek(target.Encode());
std::string result;
if (!iter->status().ok()) {
result = iter->status().ToString();
} else {
result = "[ ";
bool first = true;
while (iter->Valid()) {
ParsedInternalKey ikey;
if (!ParseInternalKey(iter->key(), &ikey)) {
result += "CORRUPTED";
} else {
if (last_options_.comparator->Compare(
ikey.user_key, user_key) != 0) {
break;
}
if (!first) {
result += ", ";
}
first = false;
switch (ikey.type) {
case kTypeValue:
result += iter->value().ToString();
break;
case kTypeLargeValueRef:
result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
break;
case kTypeDeletion:
result += "DEL";
break;
}
}
iter->Next();
}
if (!first) {
result += " ";
}
result += "]";
}
delete iter;
return result;
}
int NumTableFilesAtLevel(int level) {
uint64_t val;
ASSERT_TRUE(
db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
&val));
return val;
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
std::set<LargeValueRef> LargeValueFiles() const {
// Return the set of large value files that exist in the database
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number;
LargeValueRef large_ref;
FileType type;
std::set<LargeValueRef> live;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == kLargeValueFile) {
fprintf(stderr, " live: %s\n",
LargeValueRefToFilenameString(large_ref).c_str());
live.insert(large_ref);
}
}
fprintf(stderr, "Found %d live large value files\n", (int)live.size());
return live;
}
};
TEST(DBTest, Empty) {
ASSERT_TRUE(db_ != NULL);
ASSERT_EQ("NOT_FOUND", Get("foo"));
}
TEST(DBTest, ReadWrite) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
}
TEST(DBTest, PutDeleteGet) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
ASSERT_EQ("NOT_FOUND", Get("foo"));
}
TEST(DBTest, Recover) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("baz", "v5"));
Reopen();
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v5", Get("baz"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
ASSERT_OK(Put("foo", "v4"));
ASSERT_EQ("v4", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ("v5", Get("baz"));
}
TEST(DBTest, RecoveryWithEmptyLog) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("foo", "v2"));
Reopen();
Reopen();
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
TEST(DBTest, MinorCompactionsHappen) {
Options options;
options.write_buffer_size = 10000;
Reopen(&options);
const int N = 100;
int starting_num_tables = NumTableFilesAtLevel(0);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
}
int ending_num_tables = NumTableFilesAtLevel(0);
ASSERT_GT(ending_num_tables, starting_num_tables);
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
Reopen();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
}
TEST(DBTest, RecoverWithLargeLog) {
{
Options options;
options.large_value_threshold = 1048576;
Reopen(&options);
ASSERT_OK(Put("big1", std::string(200000, '1')));
ASSERT_OK(Put("big2", std::string(200000, '2')));
ASSERT_OK(Put("small3", std::string(10, '3')));
ASSERT_OK(Put("small4", std::string(10, '4')));
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
}
// Make sure that if we re-open with a small write buffer size that
// we flush table files in the middle of a large log file.
Options options;
options.write_buffer_size = 100000;
options.large_value_threshold = 1048576;
Reopen(&options);
ASSERT_EQ(NumTableFilesAtLevel(0), 3);
ASSERT_EQ(std::string(200000, '1'), Get("big1"));
ASSERT_EQ(std::string(200000, '2'), Get("big2"));
ASSERT_EQ(std::string(10, '3'), Get("small3"));
ASSERT_EQ(std::string(10, '4'), Get("small4"));
ASSERT_GT(NumTableFilesAtLevel(0), 1);
}
TEST(DBTest, CompactionsGenerateMultipleFiles) {
Options options;
options.write_buffer_size = 100000000; // Large write buffer
options.large_value_threshold = 1048576;
Reopen(&options);
Random rnd(301);
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
std::vector<std::string> values;
for (int i = 0; i < 80; i++) {
values.push_back(RandomString(&rnd, 100000));
ASSERT_OK(Put(Key(i), values[i]));
}
// Reopening moves updates to level-0
Reopen(&options);
dbfull()->TEST_CompactRange(0, "", Key(100000));
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1);
for (int i = 0; i < 80; i++) {
ASSERT_EQ(Get(Key(i)), values[i]);
}
}
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
bool result = (val >= low) && (val <= high);
if (!result) {
fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
(unsigned long long)(val),
(unsigned long long)(low),
(unsigned long long)(high));
}
return result;
}
TEST(DBTest, ApproximateSizes) {
for (int test = 0; test < 2; test++) {
// test==0: default large_value_threshold
// test==1: 1 MB large_value_threshold
Options options;
options.large_value_threshold = (test == 0) ? 65536 : 1048576;
options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression;
DestroyAndReopen();
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
Reopen(&options);
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
const int N = 80;
Random rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
}
if (test == 1) {
// 0 because GetApproximateSizes() does not account for memtable space for
// non-large values
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
} else {
ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
ASSERT_TRUE(Between(Size(Key(20), Key(30)),
100000*10, 100000*10 + 10000));
}
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
for (int compact_start = 0; compact_start < N; compact_start += 10) {
for (int i = 0; i < N; i += 10) {
ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
100000 * (i+1), 100000 * (i+1) + 10000));
ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
100000 * 10, 100000 * 10 + 10000));
}
ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
dbfull()->TEST_CompactRange(0,
Key(compact_start),
Key(compact_start + 9));
}
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 0);
}
}
}
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
Options options;
options.large_value_threshold = 65536;
options.compression = kNoCompression;
Reopen();
Random rnd(301);
std::string big1 = RandomString(&rnd, 100000);
ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(2), big1));
ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(4), big1));
ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
dbfull()->TEST_CompactRange(0, Key(0), Key(100));
}
}
TEST(DBTest, IteratorPinsRef) {
Put("foo", "hello");
// Get iterator that will yield the current contents of the DB.
Iterator* iter = db_->NewIterator(ReadOptions());
// Write to force compactions
Put("foo", "newvalue1");
for (int i = 0; i < 100; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
}
Put("foo", "newvalue2");
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("foo", iter->key().ToString());
ASSERT_EQ("hello", iter->value().ToString());
iter->Next();
ASSERT_TRUE(!iter->Valid());
delete iter;
}
TEST(DBTest, Snapshot) {
Put("foo", "v1");
const Snapshot* s1 = db_->GetSnapshot();
Put("foo", "v2");
const Snapshot* s2 = db_->GetSnapshot();
Put("foo", "v3");
const Snapshot* s3 = db_->GetSnapshot();
Put("foo", "v4");
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v3", Get("foo", s3));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s3);
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s1);
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s2);
ASSERT_EQ("v4", Get("foo"));
}
TEST(DBTest, HiddenValuesAreRemoved) {
Random rnd(301);
std::string big = RandomString(&rnd, 50000);
Put("foo", big);
Put("pastfoo", "v");
const Snapshot* snapshot = db_->GetSnapshot();
Put("foo", "tiny");
Put("pastfoo2", "v2"); // Advance sequence number one more
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_GT(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(big, Get("foo", snapshot));
ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
dbfull()->TEST_CompactRange(0, "", "x");
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GE(NumTableFilesAtLevel(1), 1);
dbfull()->TEST_CompactRange(1, "", "x");
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
}
TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
Delete("foo");
Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
dbfull()->TEST_CompactRange(0, "", "z");
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
dbfull()->TEST_CompactRange(1, "", "z");
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
// (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
}
TEST(DBTest, DeletionMarkers2) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(0, "", "z");
// DEL kept: L2 file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(1, "", "z");
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
// (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}
TEST(DBTest, ComparatorCheck) {
class NewComparator : public Comparator {
public:
virtual const char* Name() const { return "leveldb.NewComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
return BytewiseComparator()->Compare(a, b);
}
virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
BytewiseComparator()->FindShortestSeparator(s, l);
}
virtual void FindShortSuccessor(std::string* key) const {
BytewiseComparator()->FindShortSuccessor(key);
}
};
NewComparator cmp;
Options new_options;
new_options.comparator = &cmp;
Status s = TryReopen(&new_options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
<< s.ToString();
}
static bool LargeValuesOK(DBTest* db,
const std::set<LargeValueRef>& expected) {
std::set<LargeValueRef> actual = db->LargeValueFiles();
if (actual.size() != expected.size()) {
fprintf(stderr, "Sets differ in size: %d vs %d\n",
(int)actual.size(), (int)expected.size());
return false;
}
for (std::set<LargeValueRef>::const_iterator it = expected.begin();
it != expected.end();
++it) {
if (actual.count(*it) != 1) {
fprintf(stderr, " key '%s' not found in actual set\n",
LargeValueRefToFilenameString(*it).c_str());
return false;
}
}
return true;
}
TEST(DBTest, LargeValues1) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1;
test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
std::set<LargeValueRef> expected;
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Delete("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
// No handling of deletion markers on memtable compactions, so big1 remains
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, LargeValues2) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1, big2;
test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
std::set<LargeValueRef> expected;
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_EQ(big1, Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big2", big2));
ASSERT_EQ(big2, Get("big2"));
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
// TODO(sanjay) Reenable after compression support is added
expected.insert(LargeValueRef::Make(big2, kNoCompression));
#else
expected.insert(LargeValueRef::Make(big2, kLightweightCompression));
#endif
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big2", big2));
ASSERT_OK(Put("big2_b", big2));
ASSERT_EQ(big1, Get("big1"));
ASSERT_EQ(big2, Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Delete("big1"));
ASSERT_EQ("NOT_FOUND", Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_OK(Delete("big2"));
ASSERT_EQ("NOT_FOUND", Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
// Make sure the large value refs survive a reload and compactions after
// the reload.
Reopen();
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, LargeValues3) {
// Make sure we don't compress values if
Options options;
options.large_value_threshold = 10000;
options.compression = kNoCompression;
Reopen(&options);
Random rnd(301);
std::string big1 = std::string(100000, 'x'); // Very compressible
std::set<LargeValueRef> expected;
ASSERT_OK(Put("big1", big1));
ASSERT_EQ(big1, Get("big1"));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, DBOpen_Options) {
std::string dbname = test::TmpDir() + "/db_options_test";
DestroyDB(dbname, Options());
// Does not exist, and create_if_missing == false: error
DB* db = NULL;
Options opts;
opts.create_if_missing = false;
Status s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
ASSERT_TRUE(db == NULL);
// Does not exist, and create_if_missing == true: OK
opts.create_if_missing = true;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
// Does exist, and error_if_exists == true: error
opts.create_if_missing = false;
opts.error_if_exists = true;
s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
ASSERT_TRUE(db == NULL);
// Does exist, and error_if_exists == false: OK
opts.create_if_missing = true;
opts.error_if_exists = false;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
}
class ModelDB: public DB {
public:
explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
return DB::Put(o, k, v);
}
virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) {
assert(false); // Not implemented
return Status::NotFound(key);
}
virtual Iterator* NewIterator(const ReadOptions& options) {
if (options.snapshot == NULL) {
KVMap* saved = new KVMap;
*saved = map_;
return new ModelIter(saved, true);
} else {
const KVMap* snapshot_state =
reinterpret_cast<const KVMap*>(options.snapshot->number_);
return new ModelIter(snapshot_state, false);
}
}
virtual const Snapshot* GetSnapshot() {
KVMap* saved = new KVMap;
*saved = map_;
return snapshots_.New(
reinterpret_cast<SequenceNumber>(saved));
}
virtual void ReleaseSnapshot(const Snapshot* snapshot) {
const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
delete saved;
snapshots_.Delete(snapshot);
}
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
assert(options.post_write_snapshot == NULL); // Not supported
for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
switch (it.op()) {
case kTypeValue:
map_[it.key().ToString()] = it.value().ToString();
break;
case kTypeLargeValueRef:
assert(false); // Should not occur
break;
case kTypeDeletion:
map_.erase(it.key().ToString());
break;
}
}
return Status::OK();
}
virtual bool GetProperty(const Slice& property, uint64_t* value) {
return false;
}
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
for (int i = 0; i < n; i++) {
sizes[i] = 0;
}
}
private:
typedef std::map<std::string, std::string> KVMap;
class ModelIter: public Iterator {
public:
ModelIter(const KVMap* map, bool owned)
: map_(map), owned_(owned), iter_(map_->end()) {
}
~ModelIter() {
if (owned_) delete map_;
}
virtual bool Valid() const { return iter_ != map_->end(); }
virtual void SeekToFirst() { iter_ = map_->begin(); }
virtual void SeekToLast() {
if (map_->empty()) {
iter_ = map_->end();
} else {
iter_ = map_->find(map_->rbegin()->first);
}
}
virtual void Seek(const Slice& k) {
iter_ = map_->lower_bound(k.ToString());
}
virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;
const bool owned_; // Do we own map_
KVMap::const_iterator iter_;
};
const Options options_;
KVMap map_;
SnapshotList snapshots_;
};
static std::string RandomKey(Random* rnd) {
int len = (rnd->OneIn(3)
? 1 // Short sometimes to encourage collisions
: (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
return test::RandomKey(rnd, len);
}
static bool CompareIterators(int step,
DB* model,
DB* db,
const Snapshot* model_snap,
const Snapshot* db_snap) {
ReadOptions options;
options.snapshot = model_snap;
Iterator* miter = model->NewIterator(options);
options.snapshot = db_snap;
Iterator* dbiter = db->NewIterator(options);
bool ok = true;
int count = 0;
for (miter->SeekToFirst(), dbiter->SeekToFirst();
ok && miter->Valid() && dbiter->Valid();
miter->Next(), dbiter->Next()) {
count++;
if (miter->key().compare(dbiter->key()) != 0) {
fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(dbiter->key()).c_str());
ok = false;
break;
}
if (miter->value().compare(dbiter->value()) != 0) {
fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(miter->value()).c_str(),
EscapeString(miter->value()).c_str());
ok = false;
}
}
if (ok) {
if (miter->Valid() != dbiter->Valid()) {
fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
step, miter->Valid(), dbiter->Valid());
ok = false;
}
}
fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
delete miter;
delete dbiter;
return ok;
}
TEST(DBTest, Randomized) {
Random rnd(test::RandomSeed());
ModelDB model(last_options_);
const int N = 10000;
const Snapshot* model_snap = NULL;
const Snapshot* db_snap = NULL;
std::string k, v;
for (int step = 0; step < N; step++) {
if (step % 100 == 0) {
fprintf(stderr, "Step %d of %d\n", step, N);
}
int p = rnd.Uniform(100);
if (p < 45) { // Put
k = RandomKey(&rnd);
v = RandomString(&rnd,
rnd.OneIn(20)
? 100 + rnd.Uniform(100)
: rnd.Uniform(8));
ASSERT_OK(model.Put(WriteOptions(), k, v));
ASSERT_OK(db_->Put(WriteOptions(), k, v));
} else if (p < 90) { // Delete
k = RandomKey(&rnd);
ASSERT_OK(model.Delete(WriteOptions(), k));
ASSERT_OK(db_->Delete(WriteOptions(), k));
} else { // Multi-element batch
WriteBatch b;
const int num = rnd.Uniform(8);
for (int i = 0; i < num; i++) {
if (i == 0 || !rnd.OneIn(10)) {
k = RandomKey(&rnd);
} else {
// Periodically re-use the same key from the previous iter, so
// we have multiple entries in the write batch for the same key
}
if (rnd.OneIn(2)) {
v = RandomString(&rnd, rnd.Uniform(10));
b.Put(k, v);
} else {
b.Delete(k);
}
}
ASSERT_OK(model.Write(WriteOptions(), &b));
ASSERT_OK(db_->Write(WriteOptions(), &b));
}
if ((step % 100) == 0) {
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
// Save a snapshot from each DB this time that we'll use next
// time we compare things, to make sure the current state is
// preserved with the snapshot
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
Reopen();
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
model_snap = model.GetSnapshot();
db_snap = db_->GetSnapshot();
}
}
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 152
- 0
db/dbformat.cc View File

@ -0,0 +1,152 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "util/coding.h"
namespace leveldb {
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
return (seq << 8) | t;
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
std::string ParsedInternalKey::DebugString() const {
char buf[50];
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += user_key.ToString();
result += buf;
return result;
}
const char* InternalKeyComparator::Name() const {
return "leveldb.InternalKeyComparator";
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Attempt to shorten the user portion of the key
Slice user_start = ExtractUserKey(*start);
Slice user_limit = ExtractUserKey(limit);
std::string tmp(user_start.data(), user_start.size());
user_comparator_->FindShortestSeparator(&tmp, user_limit);
if (user_comparator_->Compare(*start, tmp) < 0) {
// User key has become larger. Tack on the earliest possible
// number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*start, tmp) < 0);
assert(this->Compare(tmp, limit) < 0);
start->swap(tmp);
}
}
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
Slice user_key = ExtractUserKey(*key);
std::string tmp(user_key.data(), user_key.size());
user_comparator_->FindShortSuccessor(&tmp);
if (user_comparator_->Compare(user_key, tmp) < 0) {
// User key has become larger. Tack on the earliest possible
// number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*key, tmp) < 0);
key->swap(tmp);
}
}
LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
LargeValueRef result;
port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
EncodeFixed64(&result.data[20], value.size());
result.data[28] = static_cast<unsigned char>(ctype);
return result;
}
std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
assert(sizeof(h.data) == LargeValueRef::ByteSize());
assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf
static const char tohex[] = "0123456789abcdef";
char buf[20*2];
for (int i = 0; i < 20; i++) {
buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
buf[2*i+1] = tohex[h.data[i] & 0xf];
}
std::string result = std::string(buf, sizeof(buf));
result += "-";
result += NumberToString(h.ValueSize());
result += "-";
result += NumberToString(static_cast<uint64_t>(h.compression_type()));
return result;
}
static uint32_t hexvalue(char c) {
if (c >= '0' && c <= '9') {
return c - '0';
} else if (c >= 'A' && c <= 'F') {
return 10 + c - 'A';
} else {
assert(c >= 'a' && c <= 'f');
return 10 + c - 'a';
}
}
bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
Slice in = s;
if (in.size() < 40) {
return false;
}
for (int i = 0; i < 20; i++) {
if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
return false;
}
unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
h->data[i] = c;
}
in.remove_prefix(40);
uint64_t value_size, ctype;
if (ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &value_size) &&
ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &ctype) &&
in.empty() &&
(ctype <= kLightweightCompression)) {
EncodeFixed64(&h->data[20], value_size);
h->data[28] = static_cast<unsigned char>(ctype);
return true;
} else {
return false;
}
}
}

+ 198
- 0
db/dbformat.h View File

@ -0,0 +1,198 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
#define STORAGE_LEVELDB_DB_FORMAT_H_
#include <stdio.h>
#include "include/comparator.h"
#include "include/db.h"
#include "include/slice.h"
#include "include/table_builder.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1,
kTypeLargeValueRef = 0x2,
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
typedef uint64_t SequenceNumber;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
((0x1ull << 56) - 1);
struct ParsedInternalKey {
Slice user_key;
SequenceNumber sequence;
ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
std::string DebugString() const;
};
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key". On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
private:
const Comparator* user_comparator_;
public:
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const;
virtual void FindShortSuccessor(std::string* key) const;
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
private:
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
}
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
Slice Encode() const {
assert(!rep_.empty());
return rep_;
}
Slice user_key() const { return ExtractUserKey(rep_); }
void SetFrom(const ParsedInternalKey& p) {
rep_.clear();
AppendInternalKey(&rep_, p);
}
void Clear() { rep_.clear(); }
};
inline int InternalKeyComparator::Compare(
const InternalKey& a, const InternalKey& b) const {
return Compare(a.Encode(), b.Encode());
}
// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
// uncompressed size, and a 1 byte CompressionType code. An
// encoded form of it is embedded in the filenames of large value
// files stored in the database, and the raw binary form is stored as
// the iter->value() result for values of type kTypeLargeValueRef in
// the table and log files that make up the database.
struct LargeValueRef {
char data[29];
// Initialize a large value ref for the given data
static LargeValueRef Make(const Slice& data,
CompressionType compression_type);
// Initialize a large value ref from a serialized, 29-byte reference value
static LargeValueRef FromRef(const Slice& ref) {
LargeValueRef result;
assert(ref.size() == sizeof(result.data));
memcpy(result.data, ref.data(), sizeof(result.data));
return result;
}
// Return the number of bytes in a LargeValueRef (not the
// number of bytes in the value referenced).
static size_t ByteSize() { return sizeof(LargeValueRef().data); }
// Return the number of bytes in the value referenced by "*this".
uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }
CompressionType compression_type() const {
return static_cast<CompressionType>(data[28]);
}
bool operator==(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) == 0;
}
bool operator<(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) < 0;
}
};
// Convert the large value ref to a human-readable string suitable
// for embedding in a large value filename.
extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
// Parse the large value filename string in "input" and store it in
// "*h". If successful, returns true. Otherwise returns false.
extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) {
const size_t n = internal_key.size();
if (n < 8) return false;
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kTypeLargeValueRef));
}
}
#endif // STORAGE_LEVELDB_DB_FORMAT_H_

+ 127
- 0
db/dbformat_test.cc View File

@ -0,0 +1,127 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string IKey(const std::string& user_key,
uint64_t seq,
ValueType vt) {
std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
return encoded;
}
static std::string Shorten(const std::string& s, const std::string& l) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
return result;
}
static std::string ShortSuccessor(const std::string& s) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
return result;
}
static void TestKey(const std::string& key,
uint64_t seq,
ValueType vt) {
std::string encoded = IKey(key, seq, vt);
Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString());
ASSERT_EQ(seq, decoded.sequence);
ASSERT_EQ(vt, decoded.type);
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
class FormatTest { };
TEST(FormatTest, InternalKey_EncodeDecode) {
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
const uint64_t seq[] = {
1, 2, 3,
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
};
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion);
}
}
}
TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeLargeValueRef)));
// When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("bar", 99, kTypeValue)));
// When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue),
IKey("hello", 200, kTypeValue)));
// When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foobar", 200, kTypeValue)));
// When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue),
IKey("foo", 200, kTypeValue)));
}
TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
TEST(FormatTest, SHA1) {
// Check that we are computing the same value as sha1.
// Note that the last two numbers are the length of the input and the
// compression type.
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kNoCompression)));
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kLightweightCompression)));
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 154
- 0
db/filename.cc View File

@ -0,0 +1,154 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <ctype.h>
#include <stdio.h>
#include "db/filename.h"
#include "db/dbformat.h"
#include "include/env.h"
#include "util/logging.h"
namespace leveldb {
static std::string MakeFileName(const std::string& name, uint64_t number,
const char* suffix) {
char buf[100];
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);
return name + buf;
}
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
std::string LargeValueFileName(const std::string& name,
const LargeValueRef& large_ref) {
std::string result = name + "/";
result += LargeValueRefToFilenameString(large_ref);
result += ".val";
return result;
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string CurrentFileName(const std::string& dbname) {
return dbname + "/CURRENT";
}
std::string LockFileName(const std::string& dbname) {
return dbname + "/LOCK";
}
std::string TempFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
return MakeFileName(dbname, number, "dbtmp");
}
std::string InfoLogFileName(const std::string& dbname) {
return dbname + "/LOG";
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname) {
return dbname + "/LOG.old";
}
// Owned filenames have the form:
// dbname/CURRENT
// dbname/LOCK
// dbname/LOG
// dbname/LOG.old
// dbname/MANIFEST-[0-9]+
// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val
// dbname/[0-9]+.(log|sst)
bool ParseFileName(const std::string& fname,
uint64_t* number,
LargeValueRef* large_ref,
FileType* type) {
Slice rest(fname);
if (rest == "CURRENT") {
*number = 0;
*type = kCurrentFile;
} else if (rest == "LOCK") {
*number = 0;
*type = kDBLockFile;
} else if (rest == "LOG" || rest == "LOG.old") {
*number = 0;
*type = kInfoLogFile;
} else if (rest.size() >= 4 &&
Slice(rest.data() + rest.size() - 4, 4) == ".val") {
LargeValueRef h;
if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4),
&h)) {
return false;
}
*large_ref = h;
*type = kLargeValueFile;
} else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kDescriptorFile;
*number = num;
} else {
// Avoid strtoull() to keep filename format independent of the
// current locale
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
} else if (suffix == Slice(".sst")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
} else {
return false;
}
*number = num;
}
return true;
}
Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
assert(contents.starts_with(dbname + "/"));
contents.remove_prefix(dbname.size() + 1);
std::string tmp = TempFileName(dbname, descriptor_number);
Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp);
if (s.ok()) {
s = env->RenameFile(tmp, CurrentFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
}

+ 92
- 0
db/filename.h View File

@ -0,0 +1,92 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// File names used by DB code
#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
#define STORAGE_LEVELDB_DB_FILENAME_H_
#include <stdint.h>
#include <string>
#include "include/slice.h"
#include "include/status.h"
#include "port/port.h"
namespace leveldb {
class Env;
struct LargeValueRef;
enum FileType {
kLogFile,
kDBLockFile,
kTableFile,
kLargeValueFile,
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile, // Either the current one, or an old one
};
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the large value file with the specified large
// value reference in the db named by "dbname". The result will be
// prefixed with "dbname".
extern std::string LargeValueFileName(const std::string& dbname,
const LargeValueRef& large_ref);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
uint64_t number);
// Return the name of the current file. This file contains the name
// of the current manifest file. The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);
// Return the name of the lock file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);
// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);
// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname);
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname);
// If filename is a leveldb file, store the type of the file in *type.
// If *type is kLargeValueFile, then the large value reference data
// from the filename is stored in "*large_ref. For all other types of
// files, the number encoded in the filename is stored in *number. If
// the filename was successfully parsed, returns true. Else return
// false.
extern bool ParseFileName(const std::string& filename,
uint64_t* number,
LargeValueRef* large_ref,
FileType* type);
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number);
}
#endif // STORAGE_LEVELDB_DB_FILENAME_H_

+ 156
- 0
db/filename_test.cc View File

@ -0,0 +1,156 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
class FileNameTest { };
TEST(FileNameTest, Parse) {
Slice db;
FileType type;
uint64_t number;
LargeValueRef large_ref;
// Successful parses
static struct {
const char* fname;
uint64_t number;
const char* large_ref;
FileType type;
} cases[] = {
{ "100.log", 100, "", kLogFile },
{ "0.log", 0, "", kLogFile },
{ "0.sst", 0, "", kTableFile },
{ "CURRENT", 0, "", kCurrentFile },
{ "LOCK", 0, "", kDBLockFile },
{ "MANIFEST-2", 2, "", kDescriptorFile },
{ "MANIFEST-7", 7, "", kDescriptorFile },
{ "LOG", 0, "", kInfoLogFile },
{ "LOG.old", 0, "", kInfoLogFile },
{ "18446744073709551615.log", 18446744073709551615ull, "",
kLogFile },
{ "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
{ "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
kLargeValueFile },
};
for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
std::string f = cases[i].fname;
ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
ASSERT_EQ(cases[i].type, type) << f;
if (type == kLargeValueFile) {
ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
<< f;
} else {
ASSERT_EQ(cases[i].number, number) << f;
}
}
// Errors
static const char* errors[] = {
"",
"foo",
"foo-dx-100.log",
".log",
"",
"manifest",
"CURREN",
"CURRENTX",
"MANIFES",
"MANIFEST",
"MANIFEST-",
"XMANIFEST-3",
"MANIFEST-3x",
"LOC",
"LOCKx",
"LO",
"LOGx",
"18446744073709551616.log",
"184467440737095516150.log",
"100",
"100.",
"100.lop",
"100.val",
".val",
"123456789012345678901234567890123456789-12340.val",
"1234567890123456789012345678901234567-123-0.val",
"12345678901234567890123456789012345678902-100-1-.val",
// Overflow on value size
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
// '03.val' is a bad compression type
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
};
}
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
LargeValueRef large_ref;
std::string fname;
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(100, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);
for (int i = 0; i <= kLightweightCompression; i++) {
CompressionType ctype = static_cast<CompressionType>(i);
std::string value = "abcdef";
LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
fname = LargeValueFileName("tmp", real_large_ref);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_TRUE(real_large_ref == large_ref);
ASSERT_EQ(kLargeValueFile, type);
ASSERT_EQ(large_ref.compression_type(), ctype);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 35
- 0
db/log_format.h View File

@ -0,0 +1,35 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.txt for more detail.
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
namespace leveldb {
namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4,
};
static const int kMaxRecordType = kLastType;
static const int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
}
}
#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_

+ 172
- 0
db/log_reader.cc View File

@ -0,0 +1,172 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdint.h>
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
scratch->clear();
record->clear();
bool in_fragmented_record = false;
Slice fragment;
while (true) {
switch (ReadPhysicalRecord(&fragment)) {
case kFullType:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
}
scratch->clear();
*record = fragment;
return true;
case kFirstType:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
}
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
return true;
}
break;
case kEof:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default:
ReportDrop(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
"unknown record type");
in_fragmented_record = false;
scratch->clear();
break;
}
}
return false;
}
void Reader::ReportDrop(size_t bytes, const char* reason) {
if (reporter_ != NULL) {
reporter_->Corruption(bytes, Status::Corruption(reason));
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() <= kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
if (!status.ok()) {
if (reporter_ != NULL) {
reporter_->Corruption(kBlockSize, status);
}
buffer_.clear();
eof_ = true;
return kEof;
} else if (buffer_.size() < kBlockSize) {
eof_ = true;
}
continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else if (buffer_.size() < kHeaderSize) {
ReportDrop(buffer_.size(), "truncated record at end of file");
buffer_.clear();
return kEof;
} else {
// We have a trailing zero-length record. Fall through and check it.
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
ReportDrop(buffer_.size(), "bad record length");
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
if (type == kZeroType && length == 0) {
// Skip zero length record
buffer_.remove_prefix(kHeaderSize + length);
return kBadRecord;
}
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
ReportDrop(length, "checksum mismatch");
buffer_.remove_prefix(kHeaderSize + length);
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
*result = Slice(header + kHeaderSize, length);
return type;
}
}
}
}

+ 75
- 0
db/log_reader.h View File

@ -0,0 +1,75 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
#define STORAGE_LEVELDB_DB_LOG_READER_H_
#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"
namespace leveldb {
class SequentialFile;
namespace log {
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-NULL, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
Reader(SequentialFile* file, Reporter* reporter, bool checksum);
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch);
private:
SequentialFile* const file_;
Reporter* const reporter_;
bool const checksum_;
char* const backing_store_;
Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
kBadRecord = kMaxRecordType + 2
};
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result);
void ReportDrop(size_t bytes, const char* reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
}
}
#endif // STORAGE_LEVELDB_DB_LOG_READER_H_

+ 361
- 0
db/log_test.cc View File

@ -0,0 +1,361 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
namespace log {
// Construct a string of the specified length made out of the supplied
// partial string.
static std::string BigString(const std::string& partial_string, size_t n) {
std::string result;
while (result.size() < n) {
result.append(partial_string);
}
result.resize(n);
return result;
}
// Construct a string from a number
static std::string NumberString(int n) {
char buf[50];
snprintf(buf, sizeof(buf), "%d.", n);
return std::string(buf);
}
// Return a skewed potentially long string
static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest {
private:
class StringDest : public WritableFile {
public:
std::string contents_;
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& slice) {
contents_.append(slice.data(), slice.size());
return Status::OK();
}
};
class StringSource : public SequentialFile {
public:
Slice contents_;
bool force_error_;
bool returned_partial_;
StringSource() : force_error_(false), returned_partial_(false) { }
virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
ASSERT_EQ(kBlockSize, n);
if (force_error_) {
force_error_ = false;
returned_partial_ = true;
return Status::Corruption("read error");
}
if (contents_.size() < n) {
n = contents_.size();
returned_partial_ = true;
}
*result = Slice(contents_.data(), n);
contents_.remove_prefix(n);
return Status::OK();
}
};
class ReportCollector : public Reader::Reporter {
public:
size_t dropped_bytes_;
std::string message_;
ReportCollector() : dropped_bytes_(0) { }
virtual void Corruption(size_t bytes, const Status& status) {
dropped_bytes_ += bytes;
message_.append(status.ToString());
}
};
StringDest dest_;
StringSource source_;
ReportCollector report_;
bool reading_;
Writer writer_;
Reader reader_;
public:
LogTest() : reading_(false),
writer_(&dest_),
reader_(&source_, &report_, true/*checksum*/) {
}
void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_.AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
return dest_.contents_.size();
}
std::string Read() {
if (!reading_) {
reading_ = true;
source_.contents_ = Slice(dest_.contents_);
}
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
}
}
void IncrementByte(int offset, int delta) {
dest_.contents_[offset] += delta;
}
void SetByte(int offset, char new_byte) {
dest_.contents_[offset] = new_byte;
}
void ShrinkSize(int bytes) {
dest_.contents_.resize(dest_.contents_.size() - bytes);
}
void FixChecksum(int header_offset, int len) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_.contents_[header_offset], crc);
}
void ForceError() {
source_.force_error_ = true;
}
size_t DroppedBytes() const {
return report_.dropped_bytes_;
}
// Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) {
return report_.message_;
} else {
return "OK";
}
}
};
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
Write("xxxx");
ASSERT_EQ("foo", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("", Read());
ASSERT_EQ("xxxx", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
for (int i = 0; i < 100000; i++) {
ASSERT_EQ(NumberString(i), Read());
}
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
ASSERT_EQ("small", Read());
ASSERT_EQ(BigString("medium", 50000), Read());
ASSERT_EQ(BigString("large", 100000), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
Write(RandomSkewedString(i, &write_rnd));
}
Random read_rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
}
ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:
TEST(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
TEST(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST(LogTest, TruncatedTrailingRecord) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}
TEST(LogTest, BadLength) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
// first(R1),last(R2) to get joined and returned as a valid record.
// Write records that span two blocks
Write(BigString("foo", kBlockSize));
Write(BigString("bar", kBlockSize));
Write("correct");
// Wipe the middle block
for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
SetByte(offset, 'x');
}
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 101
- 0
db/log_writer.cc View File

@ -0,0 +1,101 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_writer.h"
#include <stdint.h>
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Writer::Writer(WritableFile* dest)
: dest_(dest),
block_offset_(0) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}
}
Writer::~Writer() {
}
Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
Status s;
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover <= kHeaderSize) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave <= kHeaderSize bytes in a block.
const int avail = kBlockSize - block_offset_ - kHeaderSize;
assert(avail > 0);
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool begin = (ptr == slice.data());
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
} else if (begin) {
type = kFirstType;
} else if (end) {
type = kLastType;
} else {
type = kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length;
left -= fragment_length;
} while (s.ok() && left > 0);
return s;
}
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
return s;
}
}
}

+ 48
- 0
db/log_writer.h View File

@ -0,0 +1,48 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
#include <stdint.h>
#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"
namespace leveldb {
class WritableFile;
namespace log {
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);
~Writer();
Status AddRecord(const Slice& slice);
private:
WritableFile* dest_;
int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
// No copying allowed
Writer(const Writer&);
void operator=(const Writer&);
};
}
}
#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_

+ 109
- 0
db/memtable.cc View File

@ -0,0 +1,109 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/memtable.h"
#include "db/dbformat.h"
#include "include/comparator.h"
#include "include/env.h"
#include "include/iterator.h"
#include "util/coding.h"
namespace leveldb {
static Slice GetLengthPrefixedSlice(const char* data) {
uint32_t len;
const char* p = data;
p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
return Slice(p, len);
}
MemTable::MemTable(const InternalKeyComparator& cmp)
: comparator_(cmp),
table_(comparator_, &arena_) {
}
MemTable::~MemTable() {
}
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(aptr);
Slice b = GetLengthPrefixedSlice(bptr);
return comparator.Compare(a, b);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
return scratch->data();
}
class MemTableIterator: public Iterator {
public:
explicit MemTableIterator(MemTable::Table* table) {
iter_ = new MemTable::Table::Iterator(table);
}
virtual ~MemTableIterator() { delete iter_; }
virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
virtual void SeekToFirst() { iter_->SeekToFirst(); }
virtual void SeekToLast() { iter_->SeekToLast(); }
virtual void Next() { iter_->Next(); }
virtual void Prev() { iter_->Prev(); }
virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); }
virtual Slice value() const {
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual Status status() const { return Status::OK(); }
private:
MemTable::Table::Iterator* iter_;
std::string tmp_; // For passing to EncodeKey
// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator() {
return new MemTableIterator(&table_);
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = arena_.Allocate(encoded_len);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((p + val_size) - buf == encoded_len);
table_.Insert(buf);
}
}

+ 69
- 0
db/memtable.h View File

@ -0,0 +1,69 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
#define STORAGE_LEVELDB_DB_MEMTABLE_H_
#include <string>
#include "include/db.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "util/arena.h"
namespace leveldb {
class InternalKeyComparator;
class Mutex;
class MemTableIterator;
class MemTable {
public:
explicit MemTable(const InternalKeyComparator& comparator);
~MemTable();
// Returns an estimate of the number of bytes of data in use by this
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable.
//
// The caller must ensure that the underlying MemTable remains live
// while the returned iterator is live. The keys returned by this
// iterator are internal keys encoded by AppendInternalKey in the
// db/format.{h,cc} module.
Iterator* NewIterator();
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
private:
struct KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
int operator()(const char* a, const char* b) const;
};
friend class MemTableIterator;
friend class MemTableBackwardIterator;
typedef SkipList<const char*, KeyComparator> Table;
KeyComparator comparator_;
Arena arena_;
Table table_;
// No copying allowed
MemTable(const MemTable&);
void operator=(const MemTable&);
};
}
#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_

+ 396
- 0
db/repair.cc View File

@ -0,0 +1,396 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) large value refs from the table
// (c) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#,
// large-value-refs, ...) in the table's meta section to speed up
// ScanTable.
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "include/comparator.h"
#include "include/db.h"
#include "include/env.h"
namespace leveldb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
options_(SanitizeOptions(dbname, &icmp_, options)),
owns_info_log_(options_.info_log != options.info_log),
next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10);
}
~Repairer() {
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (int i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(env_, options_.info_log,
"**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
Options const options_;
bool owns_info_log_;
TableCache* table_cache_;
VersionEdit edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
LargeValueRef large_ref;
FileType type;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
if (type == kLargeValueFile) {
// Will be picked up when we process a Table that points to it
} else if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (int i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
WritableFile* info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
SequentialFile* lfile;
Status status = env_->NewSequentialFile(logname, &lfile);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(lfile, &reporter, false/*do not checksum*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable mem(icmp_);
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, &mem);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(env_, options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
delete lfile;
// We ignore any version edits generated by the conversion to a Table
// since ExtractMetaData() will also generate edits.
VersionEdit skipped;
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem.NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter,
&meta, &skipped);
delete iter;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
std::vector<TableInfo> kept;
for (int i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(dbname_, table_numbers_[i]);
Log(env_, options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) table_numbers_[i],
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number);
bool empty = true;
ParsedInternalKey parsed;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
Log(env_, options_.info_log, "Table #%llu: bad large value ref",
(unsigned long long) t->meta.number);
} else {
edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
t->meta.number,
key);
}
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(env_, options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());
return status;
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file);
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (int i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence);
for (int i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(file);
std::string record;
edit_.EncodeTo(&record);
status = log.AddRecord(record);
}
if (status.ok()) {
status = file->Close();
}
delete file;
file = NULL;
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (int i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(env_, options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
}
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
}

+ 378
- 0
db/skiplist.h View File

@ -0,0 +1,378 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread safety
// -------------
//
// Writes require external synchronization, most likely a mutex.
// Reads require a guarantee that the SkipList will not be destroyed
// while the read is in progress. Apart from that, reads progress
// without any internal locking or synchronization.
//
// Invariants:
//
// (1) Allocated nodes are never deleted until the SkipList is
// destroyed. This is trivially guaranteed by the code since we
// never delete any skip list nodes.
//
// (2) The contents of a Node except for the next/prev pointers are
// immutable after the Node has been linked into the SkipList.
// Only Insert() modifies the list, and it is careful to initialize
// a node and use release-stores to publish the nodes in one or
// more lists.
//
// ... prev vs. next pointer ordering ...
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
namespace leveldb {
class Arena;
template<typename Key, class Comparator>
class SkipList {
private:
struct Node;
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(const Key& key);
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(const SkipList* list);
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const;
// Returns the key at the current position.
// REQUIRES: Valid()
const Key& key() const;
// Advances to the next position.
// REQUIRES: Valid()
void Next();
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
// Advance to the first entry with a key >= target
void Seek(const Key& target);
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst();
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast();
private:
const SkipList* list_;
Node* node_;
// Intentionally copyable
};
private:
enum { kMaxHeight = 12 };
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
inline int GetMaxHeight() const {
return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
}
// Read/written only by Insert().
Random rnd_;
Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
// Return true if key is greater than the data stored in "n"
bool KeyIsAfterNode(const Key& key, Node* n) const;
// Return the earliest node that comes at or after key.
// Return NULL if there is no such node.
//
// If prev is non-NULL, fills prev[level] with pointer to previous
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;
// Return the last node in the list.
// Return head_ if list is empty.
Node* FindLast() const;
// No copying allowed
SkipList(const SkipList&);
void operator=(const SkipList&);
};
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
explicit Node(const Key& k) : key(k) { }
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next(int n) {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
return new (mem) Node(key);
}
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
list_ = list;
node_ = NULL;
}
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
return node_ != NULL;
}
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
assert(Valid());
return node_->key;
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
assert(Valid());
node_ = node_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->key);
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
node_ = list_->FindGreaterOrEqual(target, NULL);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
node_ = list_->head_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
node_ = list_->FindLast();
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
static const unsigned int kBranching = 4;
int height = 1;
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight);
return height;
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
// NULL n is considered infinite
return (n != NULL) && (compare_(n->key, key) < 0);
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != NULL) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
assert(x == head_ || compare_(x->key, key) < 0);
Node* next = x->Next(level);
if (next == NULL || compare_(next->key, key) >= 0) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (next == NULL) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
: compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)),
rnd_(0xdeadbeef) {
for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, NULL);
}
}
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* prev[kMaxHeight];
Node* x = FindGreaterOrEqual(key, prev);
// Our data structure does not allow duplicate insertion
assert(x == NULL || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (NULL), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since NULL sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
}
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x);
}
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
Node* x = FindGreaterOrEqual(key, NULL);
if (x != NULL && Equal(key, x->key)) {
return true;
} else {
return false;
}
}
}

+ 378
- 0
db/skiplist_test.cc View File

@ -0,0 +1,378 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/skiplist.h"
#include <set>
#include "include/env.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
typedef uint64_t Key;
struct Comparator {
int operator()(const Key& a, const Key& b) const {
if (a < b) {
return -1;
} else if (a > b) {
return +1;
} else {
return 0;
}
}
};
class SkipTest { };
TEST(SkipTest, Empty) {
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());
iter.Seek(100);
ASSERT_TRUE(!iter.Valid());
iter.SeekToLast();
ASSERT_TRUE(!iter.Valid());
}
TEST(SkipTest, InsertAndLookup) {
const int N = 2000;
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
for (int i = 0; i < R; i++) {
if (list.Contains(i)) {
ASSERT_EQ(keys.count(i), 1);
} else {
ASSERT_EQ(keys.count(i), 0);
}
}
// Simple iterator tests
{
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.Seek(0);
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToFirst();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToLast();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.rbegin()), iter.key());
}
// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, Comparator>::Iterator iter(&list);
iter.Seek(i);
// Compare against model iterator
std::set<Key>::iterator model_iter = keys.lower_bound(i);
for (int j = 0; j < 3; j++) {
if (model_iter == keys.end()) {
ASSERT_TRUE(!iter.Valid());
break;
} else {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
++model_iter;
iter.Next();
}
}
}
// Backward iteration test
{
SkipList<Key, Comparator>::Iterator iter(&list);
iter.SeekToLast();
// Compare against model iterator
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
model_iter != keys.rend();
++model_iter) {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
iter.Prev();
}
ASSERT_TRUE(!iter.Valid());
}
}
// We want to make sure that with a single writer and multiple
// concurrent readers (with no synchronization other than when a
// reader's iterator is created), the reader always observes all the
// data that was present in the skip list when the iterator was
// constructor. Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was
// constructed, but we should never miss any values that were present
// at iterator construction time.
//
// We generate multi-part keys:
// <key,gen,hash>
// where:
// key is in range [0..K-1]
// gen is a generation number for key
// hash is hash(key,gen)
//
// The insertion code picks a random key, sets gen to be 1 + the last
// generation number inserted for that key, and sets hash to Hash(key,gen).
//
// At the beginning of a read, we snapshot the last inserted
// generation number for each key. We then iterate, including random
// calls to Next() and Seek(). For every key we encounter, we
// check that it is either expected given the initial snapshot or has
// been concurrently added since the iterator started.
class ConcurrentTest {
private:
static const uint32_t K = 4;
static uint64_t key(Key key) { return (key >> 40); }
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
static uint64_t hash(Key key) { return key & 0xff; }
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
uint64_t data[2] = { k, g };
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
}
static Key MakeKey(uint64_t k, uint64_t g) {
assert(sizeof(Key) == sizeof(uint64_t));
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
assert(g <= 0xffffffffu);
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
}
static bool IsValidKey(Key k) {
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
}
static Key RandomTarget(Random* rnd) {
switch (rnd->Next() % 10) {
case 0:
// Seek to beginning
return MakeKey(0, 0);
case 1:
// Seek to end
return MakeKey(K, 0);
default:
// Seek to middle
return MakeKey(rnd->Next() % K, 0);
}
}
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
}
State() {
for (int k = 0; k < K; k++) {
Set(k, 0);
}
}
};
// Current state of the test
State current_;
Arena arena_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, Comparator> list_;
public:
ConcurrentTest() : list_(Comparator(), &arena_) { }
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
current_.Set(k, g);
}
void ReadStep(Random* rnd) {
// Remember the initial committed state of the skiplist.
State initial_state;
for (int k = 0; k < K; k++) {
initial_state.Set(k, current_.Get(k));
}
Key pos = RandomTarget(rnd);
SkipList<Key, Comparator>::Iterator iter(&list_);
iter.Seek(pos);
while (true) {
Key current;
if (!iter.Valid()) {
current = MakeKey(K, 0);
} else {
current = iter.key();
ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
}
ASSERT_LE(pos, current) << "should not go backwards";
// Verify that everything in [pos,current) was not present in
// initial_state.
while (pos < current) {
ASSERT_LT(key(pos), K) << std::hex << pos;
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) ||
(gen(pos) > initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
pos = MakeKey(key(pos) + 1, 0);
} else {
pos = MakeKey(key(pos), gen(pos) + 1);
}
}
if (!iter.Valid()) {
break;
}
if (rnd->Next() % 2) {
iter.Next();
pos = MakeKey(key(pos), gen(pos) + 1);
} else {
Key new_target = RandomTarget(rnd);
if (new_target > pos) {
pos = new_target;
iter.Seek(new_target);
}
}
}
}
};
const uint32_t ConcurrentTest::K;
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
ConcurrentTest test;
Random rnd(test::RandomSeed());
for (int i = 0; i < 10000; i++) {
test.ReadStep(&rnd);
test.WriteStep(&rnd);
}
}
class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
enum ReaderState {
STARTING,
RUNNING,
DONE
};
explicit TestState(int s)
: seed_(s),
quit_flag_(NULL),
state_(STARTING),
state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}
void Change(ReaderState s) {
mu_.Lock();
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}
private:
port::Mutex mu_;
ReaderState state_;
port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) {
TestState* state = reinterpret_cast<TestState*>(arg);
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
state->t_.ReadStep(&rnd);
++reads;
}
state->Change(TestState::DONE);
}
static void RunConcurrent(int run) {
const int seed = test::RandomSeed() + (run * 100);
Random rnd(seed);
const int N = 1000;
const int kSize = 1000;
for (int i = 0; i < N; i++) {
if ((i % 100) == 0) {
fprintf(stderr, "Run %d of %d\n", i, N);
}
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do
state.Wait(TestState::DONE);
}
}
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 66
- 0
db/snapshot.h View File

@ -0,0 +1,66 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
#include "include/db.h"
namespace leveldb {
class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB.
// Each Snapshot corresponds to a particular sequence number.
class Snapshot {
public:
SequenceNumber number_; // const after creation
private:
friend class SnapshotList;
// Snapshot is kept in a doubly-linked circular list
Snapshot* prev_;
Snapshot* next_;
SnapshotList* list_; // just for sanity checks
};
class SnapshotList {
public:
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
}
bool empty() const { return list_.next_ == &list_; }
Snapshot* oldest() const { assert(!empty()); return list_.next_; }
Snapshot* newest() const { assert(!empty()); return list_.prev_; }
const Snapshot* New(SequenceNumber seq) {
Snapshot* s = new Snapshot;
s->number_ = seq;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
return s;
}
void Delete(const Snapshot* s) {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
delete s;
}
private:
// Dummy head of doubly-linked list of snapshots
Snapshot list_;
};
}
#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_

+ 94
- 0
db/table_cache.cc View File

@ -0,0 +1,94 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/table_cache.h"
#include "db/filename.h"
#include "include/env.h"
#include "include/table.h"
#include "util/coding.h"
namespace leveldb {
struct TableAndFile {
RandomAccessFile* file;
Table* table;
};
static void DeleteEntry(const Slice& key, void* value) {
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
delete tf->table;
delete tf->file;
delete tf;
}
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
cache->Release(h);
}
TableCache::TableCache(const std::string& dbname,
const Options* options,
int entries)
: env_(options->env),
dbname_(dbname),
options_(options),
cache_(NewLRUCache(entries)) {
}
TableCache::~TableCache() {
delete cache_;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number,
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
Cache::Handle* handle = cache_->Lookup(key);
if (handle == NULL) {
std::string fname = TableFileName(dbname_, file_number);
RandomAccessFile* file = NULL;
Table* table = NULL;
Status s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) {
s = Table::Open(*options_, file, &table);
}
if (!s.ok()) {
assert(table == NULL);
delete file;
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
return NewErrorIterator(s);
}
TableAndFile* tf = new TableAndFile;
tf->file = file;
tf->table = table;
handle = cache_->Insert(key, tf, 1, &DeleteEntry);
}
Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
Iterator* result = table->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_, handle);
if (tableptr != NULL) {
*tableptr = table;
}
return result;
}
void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
cache_->Erase(Slice(buf, sizeof(buf)));
}
}

+ 49
- 0
db/table_cache.h View File

@ -0,0 +1,49 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)
#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "include/cache.h"
#include "include/table.h"
#include "port/port.h"
namespace leveldb {
class Env;
class TableCache {
public:
TableCache(const std::string& dbname, const Options* options, int entries);
~TableCache();
// Get an iterator for the specified file number and return it. If
// "tableptr" is non-NULL, also sets "*tableptr" to point to the
// Table object underlying the returned iterator, or NULL if no
// Table object underlies the returned iterator. The returned
// "*tableptr" object is owned by the cache and should not be
// deleted, and is valid for as long as the returned iterator is
// live.
Iterator* NewIterator(const ReadOptions& options,
uint64_t file_number,
Table** tableptr = NULL);
// Evict any entry for the specified file number
void Evict(uint64_t file_number);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
Cache* cache_;
};
}
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_

+ 282
- 0
db/version_edit.cc View File

@ -0,0 +1,282 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/coding.h"
namespace leveldb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
kLargeValueRef = 8,
};
void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
has_comparator_ = false;
has_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
deleted_files_.clear();
new_files_.clear();
large_refs_added_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_);
}
for (int i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, iter->first); // level
PutVarint64(dst, iter->second); // file number
}
for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
}
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
PutVarint32(dst, kLargeValueRef);
PutLengthPrefixedSlice(dst,
Slice(l.large_ref.data, LargeValueRef::ByteSize()));
PutVarint64(dst, l.fnum);
PutLengthPrefixedSlice(dst, l.internal_key.Encode());
}
}
static bool GetInternalKey(Slice* input, InternalKey* dst) {
Slice str;
if (GetLengthPrefixedSlice(input, &str)) {
dst->DecodeFrom(str);
return true;
} else {
return false;
}
}
static bool GetLevel(Slice* input, int* level) {
uint32_t v;
if (GetVarint32(input, &v) &&
v < config::kNumLevels) {
*level = v;
return true;
} else {
return false;
}
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
const char* msg = NULL;
uint32_t tag;
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
Large large;
InternalKey key;
while (msg == NULL && GetVarint32(&input, &tag)) {
switch (tag) {
case kComparator:
if (GetLengthPrefixedSlice(&input, &str)) {
comparator_ = str.ToString();
has_comparator_ = true;
} else {
msg = "comparator name";
}
break;
case kLogNumber:
if (GetVarint64(&input, &log_number_)) {
has_log_number_ = true;
} else {
msg = "log number";
}
break;
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
} else {
msg = "next file number";
}
break;
case kLastSequence:
if (GetVarint64(&input, &last_sequence_)) {
has_last_sequence_ = true;
} else {
msg = "last sequence number";
}
break;
case kCompactPointer:
if (GetLevel(&input, &level) &&
GetInternalKey(&input, &key)) {
compact_pointers_.push_back(std::make_pair(level, key));
} else {
msg = "compaction pointer";
}
break;
case kDeletedFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
msg = "deleted file";
}
break;
case kNewFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file entry";
}
break;
case kLargeValueRef:
if (GetLengthPrefixedSlice(&input, &str) &&
(str.size() == LargeValueRef::ByteSize()) &&
GetVarint64(&input, &large.fnum) &&
GetInternalKey(&input, &large.internal_key)) {
large.large_ref = LargeValueRef::FromRef(str);
large_refs_added_.push_back(large);
} else {
msg = "large ref";
}
break;
default:
msg = "unknown tag";
break;
}
}
if (msg == NULL && !input.empty()) {
msg = "invalid tag";
}
Status result;
if (msg != NULL) {
result = Status::Corruption("VersionEdit", msg);
}
return result;
}
std::string VersionEdit::DebugString() const {
std::string r;
r.append("VersionEdit {");
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (int i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" '");
AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
r.append("'");
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" '");
AppendEscapedStringTo(&r, f.smallest.Encode());
r.append("' .. '");
AppendEscapedStringTo(&r, f.largest.Encode());
r.append("'");
}
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
r.append("\n LargeRef: ");
AppendNumberTo(&r, l.fnum);
r.append(" ");
r.append(LargeValueRefToFilenameString(l.large_ref));
r.append(" '");
AppendEscapedStringTo(&r, l.internal_key.Encode());
r.append("'");
}
r.append("\n}\n");
return r;
}
}

+ 118
- 0
db/version_edit.h View File

@ -0,0 +1,118 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#include <set>
#include <utility>
#include <vector>
#include "db/dbformat.h"
namespace leveldb {
class VersionSet;
struct FileMetaData {
int refs;
uint64_t number;
uint64_t file_size; // File size in bytes
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
FileMetaData() : refs(0), file_size(0) { }
};
class VersionEdit {
public:
VersionEdit() { Clear(); }
~VersionEdit() { }
void Clear();
void SetComparatorName(const Slice& name) {
has_comparator_ = true;
comparator_ = name.ToString();
}
void SetLogNumber(uint64_t num) {
has_log_number_ = true;
log_number_ = num;
}
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
}
void SetLastSequence(SequenceNumber seq) {
has_last_sequence_ = true;
last_sequence_ = seq;
}
void SetCompactPointer(int level, const InternalKey& key) {
compact_pointers_.push_back(std::make_pair(level, key));
}
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
// Record that a large value with the specified large_ref was
// written to the output file numbered "fnum"
void AddLargeValueRef(const LargeValueRef& large_ref,
uint64_t fnum,
const Slice& internal_key) {
large_refs_added_.resize(large_refs_added_.size() + 1);
Large* large = &(large_refs_added_.back());
large->large_ref = large_ref;
large->fnum = fnum;
large->internal_key.DecodeFrom(internal_key);
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
std::string DebugString() const;
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
std::string comparator_;
uint64_t log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_;
struct Large {
LargeValueRef large_ref;
uint64_t fnum;
InternalKey internal_key;
};
std::vector<Large> large_refs_added_;
};
}
#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_

+ 50
- 0
db/version_edit_test.cc View File

@ -0,0 +1,50 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/testharness.h"
namespace leveldb {
static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2);
ASSERT_EQ(encoded, encoded2);
}
class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
edit.DeleteFile(4, kBig + 700 + i);
edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
kBig + 800 + i, "foobar");
edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression),
kBig + 801 + i, "baz");
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
}
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 1003
- 0
db/version_set.cc
File diff suppressed because it is too large
View File


+ 290
- 0
db/version_set.h View File

@ -0,0 +1,290 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
#include <map>
#include <set>
#include <vector>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
namespace leveldb {
// Grouping of constants. We may want to make some of these
// parameters set via options.
namespace config {
static const int kNumLevels = 7;
}
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableBuilder;
class TableCache;
class Version;
class VersionSet;
class WritableFile;
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
// Return a human readable string that describes this version's contents.
std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
int refs_; // Number of live refs to this version
MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
// List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels];
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_;
int compaction_level_;
explicit Version(VersionSet* vset)
: vset_(vset), next_(NULL), refs_(0),
cleanup_mem_(NULL),
compaction_score_(-1),
compaction_level_(-1) {
}
~Version();
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Iff Apply() returns OK, arrange to delete
// cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
// by older versions.
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
// Recover the last saved descriptor from persistent storage.
Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns NULL if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey& begin,
const InternalKey& end);
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }
// Add all files listed in any live version to *live.
// May also mutate some internal state.
void AddLiveFiles(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Register a reference to a large value with the specified
// large_ref from the specified file number. Returns "true" if this
// is the first recorded reference to the "large_ref" value in the
// database, and false otherwise.
bool RegisterLargeValueRef(const LargeValueRef& large_ref,
uint64_t filenum,
const InternalKey& internal_key);
// Cleanup the large value reference state by eliminating any
// references from files that are not includes in either "live_tables"
// or "log_file".
void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
uint64_t log_file_num);
// Returns true if a large value with the given reference is live.
bool LargeValueIsLive(const LargeValueRef& large_ref);
private:
class Builder;
friend class Compaction;
friend class Version;
Status Finalize(Version* v);
// Delete any old versions that are no longer needed.
void MaybeDeleteOldVersions();
struct BySmallestKey;
Status SortLevel(Version* v, uint64_t level);
void GetOverlappingInputs(
int level,
const InternalKey& begin,
const InternalKey& end,
std::vector<FileMetaData*>* inputs);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
// Versions are kept in a singly linked list that is never empty
Version* current_; // Pointer to the last (newest) list entry
Version* oldest_; // Pointer to the first (oldest) list entry
// Map from large value reference to the set of <file numbers,internal_key>
// values containing references to the value. We keep the
// internal key as a std::string rather than as an InternalKey because
// we want to be able to easily use a set.
typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
LargeValueMap large_value_refs_;
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// and "level+1" will be merged to produce a set of "level+1" files.
int level() const { return level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return &edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level);
int level_;
uint64_t max_output_file_size_;
Version* input_version_;
VersionEdit edit_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State for implementing IsBaseLevelForKey
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
int level_ptrs_[config::kNumLevels];
};
}
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_

+ 164
- 0
db/write_batch.cc View File

@ -0,0 +1,164 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring |
// kTypeLargeValueRef varstring varstring |
// kTypeDeletion varstring
// varstring :=
// len: varint32
// data: uint8[len]
#include "include/write_batch.h"
#include "include/db.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "util/coding.h"
namespace leveldb {
WriteBatch::WriteBatch() {
Clear();
}
WriteBatch::~WriteBatch() { }
void WriteBatch::Clear() {
rep_.clear();
rep_.resize(12);
}
int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8);
}
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
EncodeFixed32(&b->rep_[8], n);
}
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
return SequenceNumber(DecodeFixed64(b->rep_.data()));
}
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq);
}
void WriteBatch::Put(const Slice& key, const Slice& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue));
PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value);
}
void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
const Slice& key,
const LargeValueRef& large_ref) {
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
PutLengthPrefixedSlice(&b->rep_, key);
PutLengthPrefixedSlice(&b->rep_,
Slice(large_ref.data, sizeof(large_ref.data)));
}
void WriteBatch::Delete(const Slice& key) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion));
PutLengthPrefixedSlice(&rep_, key);
}
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
MemTable* memtable) {
const int count = WriteBatchInternal::Count(b);
int found = 0;
Iterator it(*b);
for (; !it.Done(); it.Next()) {
switch (it.op()) {
case kTypeDeletion:
memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
break;
case kTypeValue:
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
break;
case kTypeLargeValueRef:
memtable->Add(it.sequence_number(), kTypeLargeValueRef,
it.key(), it.value());
break;
}
found++;
}
if (!it.status().ok()) {
return it.status();
} else if (found != count) {
return Status::Corruption("wrong count in WriteBatch");
}
return Status::OK();
}
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
assert(contents.size() >= 12);
b->rep_.assign(contents.data(), contents.size());
}
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
: input_(WriteBatchInternal::Contents(&batch)),
done_(false) {
if (input_.size() < 12) {
done_ = true;
} else {
seq_ = WriteBatchInternal::Sequence(&batch),
input_.remove_prefix(12);
GetNextEntry();
}
}
void WriteBatchInternal::Iterator::Next() {
assert(!done_);
seq_++;
GetNextEntry();
}
void WriteBatchInternal::Iterator::GetNextEntry() {
if (input_.empty()) {
done_ = true;
return;
}
char tag = input_[0];
input_.remove_prefix(1);
switch (tag) {
case kTypeValue:
case kTypeLargeValueRef:
if (GetLengthPrefixedSlice(&input_, &key_) &&
GetLengthPrefixedSlice(&input_, &value_)) {
op_ = static_cast<ValueType>(tag);
} else {
status_ = Status::Corruption("bad WriteBatch Put");
done_ = true;
input_.clear();
}
break;
case kTypeDeletion:
if (GetLengthPrefixedSlice(&input_, &key_)) {
op_ = kTypeDeletion;
} else {
status_ = Status::Corruption("bad WriteBatch Delete");
done_ = true;
input_.clear();
}
break;
default:
status_ = Status::Corruption("unknown WriteBatch tag");
done_ = true;
input_.clear();
break;
}
}
}

+ 73
- 0
db/write_batch_internal.h View File

@ -0,0 +1,73 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#include "include/write_batch.h"
namespace leveldb {
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
public:
static void PutLargeValueRef(WriteBatch* batch,
const Slice& key,
const LargeValueRef& large_ref);
// Return the number of entries in the batch.
static int Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
// Return the seqeunce number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the seqeunce number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static void SetContents(WriteBatch* batch, const Slice& contents);
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
// Iterate over the contents of a write batch.
class Iterator {
public:
explicit Iterator(const WriteBatch& batch);
bool Done() const { return done_; }
void Next();
ValueType op() const { return op_; }
const Slice& key() const { return key_; }
const Slice& value() const { return value_; }
SequenceNumber sequence_number() const { return seq_; }
Status status() const { return status_; }
private:
void GetNextEntry();
Slice input_;
bool done_;
ValueType op_;
Slice key_;
Slice value_;
SequenceNumber seq_;
Status status_;
};
};
}
#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_

+ 110
- 0
db/write_batch_test.cc View File

@ -0,0 +1,110 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "include/env.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator());
MemTable mem(cmp);
std::string state;
Status s = WriteBatchInternal::InsertInto(b, &mem);
Iterator* iter = mem.NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey;
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
switch (ikey.type) {
case kTypeValue:
state.append("Put(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
break;
case kTypeLargeValueRef:
state.append("PutRef(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
break;
case kTypeDeletion:
state.append("Delete(");
state.append(ikey.user_key.ToString());
state.append(")");
break;
}
state.append("@");
state.append(NumberToString(ikey.sequence));
}
delete iter;
if (!s.ok()) {
state.append("ParseError()");
}
return state;
}
class WriteBatchTest { };
TEST(WriteBatchTest, Empty) {
WriteBatch batch;
ASSERT_EQ("", PrintContents(&batch));
ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
}
TEST(WriteBatchTest, Multiple) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
batch.Put(Slice("baz"), Slice("boo"));
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@102"
"Delete(box)@101"
"Put(foo, bar)@100",
PrintContents(&batch));
}
TEST(WriteBatchTest, PutIndirect) {
WriteBatch batch;
batch.Put(Slice("baz"), Slice("boo"));
LargeValueRef h;
for (int i = 0; i < LargeValueRef::ByteSize(); i++) {
h.data[i] = (i < 20) ? 'a' : 'b';
}
WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h);
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(2, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@100"
"PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
PrintContents(&batch));
}
TEST(WriteBatchTest, Corruption) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
WriteBatchInternal::SetSequence(&batch, 200);
Slice contents = WriteBatchInternal::Contents(&batch);
WriteBatchInternal::SetContents(&batch,
Slice(contents.data(),contents.size()-1));
ASSERT_EQ("Put(foo, bar)@200"
"ParseError()",
PrintContents(&batch));
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 89
- 0
doc/doc.css View File

@ -0,0 +1,89 @@
body {
margin-left: 0.5in;
margin-right: 0.5in;
background: white;
color: black;
}
h1 {
margin-left: -0.2in;
font-size: 14pt;
}
h2 {
margin-left: -0in;
font-size: 12pt;
}
h3 {
margin-left: -0in;
}
h4 {
margin-left: -0in;
}
hr {
margin-left: -0in;
}
/* Definition lists: definition term bold */
dt {
font-weight: bold;
}
address {
text-align: center;
}
code,samp,var {
color: blue;
}
kbd {
color: #600000;
}
div.note p {
float: right;
width: 3in;
margin-right: 0%;
padding: 1px;
border: 2px solid #6060a0;
background-color: #fffff0;
}
ul {
margin-top: -0em;
margin-bottom: -0em;
}
ol {
margin-top: -0em;
margin-bottom: -0em;
}
UL.nobullets {
list-style-type: none;
list-style-image: none;
margin-left: -1em;
}
p {
margin: 1em 0 1em 0;
padding: 0 0 0 0;
}
pre {
line-height: 1.3em;
padding: 0.4em 0 0.8em 0;
margin: 0 0 0 0;
border: 0 0 0 0;
color: blue;
}
.datatable {
margin-left: auto;
margin-right: auto;
margin-top: 2em;
margin-bottom: 2em;
border: 1px solid;
}
.datatable td,th {
padding: 0 0.5em 0 0.5em;
text-align: right;
}

+ 222
- 0
doc/impl.html View File

@ -0,0 +1,222 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb file layout and compactions</title>
</head>
<body>
<h1>Files</h1>
The implementation of leveldb is similar in spirit to the
representation of a single
<a href="http://labs.google.com/papers/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.
<p>
Each database is represented by a set of file stored in a directory.
There are several different types of files as documented below:
<p>
<h2>Log files</h2>
<p>
A log file (*.log) stores a sequence of recent updates. Each update
is appended to the current log file. When the log file reaches a
pre-determined size (approximately 1MB by default), it is converted
to a sorted table (see below) and a new log file is created for future
updates.
<p>
A copy of the current log file is kept in an in-memory structure (the
<code>memtable</code>). This copy is consulted on every read so that read
operations reflect all logged updates.
<p>
<h2>Sorted tables</h2>
<p>
A sorted table (*.sst) stores a sequence of entries sorted by key.
Each entry is either a value for the key, or a deletion marker for the
key. (Deletion markers are kept around to hide obsolete values
present in older sorted tables).
<p>
The set of sorted tables are organized into a sequence of levels. The
sorted table generated from a log file is placed in a special <code>young</code>
level (also called level-0). When the number of young files exceeds a
certain threshold (currently four), all of the young files are merged
together with all of the overlapping level-1 files to produce a
sequence of new level-1 files (we create a new level-1 file for every
2MB of data.)
<p>
Files in the young level may contain overlapping keys. However files
in other levels have distinct non-overlapping key ranges. Consider
level number L where L >= 1. When the combined size of files in
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
...), one file in level-L, and all of the overlapping files in
level-(L+1) are merged to form a set of new files for level-(L+1).
These merges have the effect of gradually migrating new updates from
the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).
<h2>Large value files</h2>
<p>
Each large value (greater than 64KB by default) is placed in a large
value file (*.val) of its own. An entry is maintained in the log
and/or sorted tables that maps from the corresponding key to the
name of this large value file. The name of the large value file
is derived from a SHA1 hash of the value and its length so that
identical values share the same file.
<p>
<h2>Manifest</h2>
<p>
A MANIFEST file lists the set of sorted tables that make up each
level, the corresponding key ranges, and other important metadata.
A new MANIFEST file (with a new number embedded in the file name)
is created whenever the database is reopened. The MANIFEST file is
formatted as a log, and changes made to the serving state (as files
are added or removed) are appended to this log.
<p>
<h2>Current</h2>
<p>
CURRENT is a simple text file that contains the name of the latest
MANIFEST file.
<p>
<h2>Info logs</h2>
<p>
Informational messages are printed to files named LOG and LOG.old.
<p>
<h2>Others</h2>
<p>
Other files used for miscellaneous purposes may also be present
(LOCK, *.dbtmp).
<h1>Level 0</h1>
When the log file grows above a certain size (1MB by default):
<ul>
<li>Write the contents of the current memtable to an sstable
<li>Replace the current memtable by a brand new empty memtable
<li>Switch to a new log file
<li>Delete the old log file and the old memtable
</ul>
Experimental measurements show that generating an sstable from a 1MB
log file takes ~12ms, which seems like an acceptable latency hiccup to
add infrequently to a log write.
<p>
The new sstable is added to a special level-0 level. level-0 contains
a set of files (up to 4 by default). However unlike other levels,
these files do not cover disjoint ranges, but may overlap each other.
<h1>Compactions</h1>
<p>
When the size of level L exceeds its limit, we compact it in a
background thread. The compaction picks a file from level L and all
overlapping files from the next level L+1. Note that if a level-L
file overlaps only part of a level-(L+1) file, the entire file at
level-(L+1) is used as an input to the compaction and will be
discarded after the compaction. Aside: because level-0 is special
(files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than
one level-0 file in case some of these files overlap each other.
<p>
A compaction merges the contents of the picked files to produce a
sequence of level-(L+1) files. We switch to producing a new
level-(L+1) file after the current output file has reached the target
file size (2MB). The old files are discarded and the new files are
added to the serving state.
<p>
Compactions for a particular level rotate through the key space. In
more detail, for each level L, we remember the ending key of the last
compaction at level L. The next compaction for level L will pick the
first file that starts after this key (wrapping around to the
beginning of the key space if there is no such file).
<p>
Compactions drop overwritten values. They also drop deletion markers
if there are no higher numbered levels that contain a file whose range
overlaps the current key.
<h2>Timing</h2>
Level-0 compactions will read up to four 1MB files from level-0, and
at worst all the level-1 files (10MB). I.e., we will read 14MB and
write 14MB.
<p>
Other than the special level-0 compactions, we will pick one 2MB file
from level L. In the worst case, this will overlap ~ 12 files from
level L+1 (10 because level-(L+1) is ten times the size of level-L,
and another two at the boundaries since the file ranges at level-L
will usually not be aligned with the file ranges at level-L+1). The
compaction will therefore read 26MB and write 26MB. Assuming a disk
IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.
<p>
If we throttle the background writing to something small, say 10% of
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
user is writing at 10MB/s, we might build up lots of level-0 files
(~50 to hold the 5*10MB). This may signficantly increase the cost of
reads due to the overhead of merging more files together on every
read.
<p>
Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large. Though
the downside is that the larger this threshold, the larger the delay
that we will add to write latency when a write triggers a log switch.
<p>
Solution 2: We might want to decrease write rate artificially when the
number of level-0 files goes up.
<p>
Solution 3: We work on reducing the cost of very wide merges.
Perhaps most of the level-0 files will have their blocks sitting
uncompressed in the cache and we will only need to worry about the
O(N) complexity in the merging iterator.
<h2>Number of files</h2>
Instead of always making 2MB files, we could make larger files for
larger levels to reduce the total file count, though at the expense of
more bursty compactions. Alternatively, we could shard the set of
files into multiple directories.
<p>
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
the following timings to do 100K file opens in directories with
varying number of files:
<table class="datatable">
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
<tr><td>1000</td><td>9</td>
<tr><td>10000</td><td>10</td>
<tr><td>100000</td><td>16</td>
</table>
So maybe even the sharding is not necessary on modern filesystems?
<h1>Recovery</h1>
<ul>
<li> Read CURRENT to find name of the latest committed MANIFEST
<li> Read the named MANIFEST file
<li> Clean up stale files
<li> We could open all sstables here, but it is probably better to be lazy...
<li> Convert log chunk to a new level-0 sstable
<li> Start directing new writes to a new log file with recovered sequence#
</ul>
<h1>Garbage collection of files</h1>
<code>DeleteObsoleteFiles()</code> is called at the end of every
compaction and at the end of recovery. It finds the names of all
files in the database. It deletes all log files that are not the
current log file. It deletes all table files that are not referenced
from some level and are not the output of an active compaction. It
deletes all large value files that are not referenced from any live
table or log file.
</body>
</html>

+ 508
- 0
doc/index.html View File

@ -0,0 +1,508 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb</title>
</head>
<body>
<h1>Leveldb</h1>
<address>Jeff Dean, Sanjay Ghemawat</address>
<p>
The <code>leveldb</code> library provides a persistent key value store. Keys and
values are arbitrary byte arrays. The keys are ordered within the key
value store according to a user-specified comparator function.
<p>
<h1>Opening A Database</h1>
<p>
A <code>leveldb</code> database has a name which corresponds to a file system
directory. All of the contents of database are stored in this
directory. The following example shows how to open a database,
creating it if necessary:
<p>
<pre>
#include &lt;assert&gt;
#include "leveldb/include/db.h"
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
If you want to raise an error if the database already exists, add
the following line before the <code>leveldb::DB::Open</code> call:
<pre>
options.error_if_exists = true;
</pre>
<h1>Status</h1>
<p>
You may have noticed the <code>leveldb::Status</code> type above. Values of this
type are returned by most functions in <code>leveldb</code> that may encounter an
error. You can check if such a result is ok, and also print an
associated error message:
<p>
<pre>
leveldb::Status s = ...;
if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
</pre>
<h1>Closing A Database</h1>
<p>
When you are done with a database, just delete the database object.
Example:
<p>
<pre>
... open the db as described above ...
... do something with db ...
delete db;
</pre>
<h1>Reads And Writes</h1>
<p>
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
<p>
<pre>
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
</pre>
See <a href="#async">important performance note</a> below for how to
speed up writes significantly.
<h1>Atomic Updates</h1>
<p>
Note that if the process dies after the Put of key2 but before the
delete of key1, the same value may be left stored under multiple keys.
Such problems can be avoided by using the <code>WriteBatch</code> class to
atomically apply a set of updates:
<p>
<pre>
#include "leveldb/include/write_batch.h"
...
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) {
leveldb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
}
</pre>
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
and these edits within the batch are applied in order. Note that we
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
we do not end up erroneously dropping the value entirely.
<p>
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time. The <code>leveldb</code>
implementation acquires a lock from the operating system to prevent
misuse. Within a single process, the same <code>leveldb::DB</code> object may
be safely used by multiple concurrent threads.
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
}
assert(it-&gt;status().ok()); // Check for any errors found during the scan
delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
for (it-&gt;Seek(start);
it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
it-&gt;Next()) {
...
}
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration is currently a factor of two or three slower than forward
iteration.)
<p>
<pre>
for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
...
}
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots typically are created by the DB::GetSnapshot() method:
<p>
<pre>
leveldb::ReadOptions options;
options.snapshot = db-&gt;GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db-&gt;NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db-&gt;ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<p>
A Write operation can also return a snapshot that
represents the state of the database just after applying a particular
set of updates:
<p>
<pre>
leveldb::Snapshot* snapshot;
leveldb::WriteOptions write_options;
write_options.post_write_snapshot = &amp;snapshot;
leveldb::Status status = db-&gt;Write(write_options, ...);
... perform other mutations to db ...
leveldb::ReadOptions read_options;
read_options.snapshot = snapshot;
leveldb::Iterator* iter = db-&gt;NewIterator(read_options);
... read as of the state just after the Write call returned ...
delete iter;
db-&gt;ReleaseSnapshot(snapshot);
</pre>
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>leveldb</code> methods do not return null-terminated
C-style strings since <code>leveldb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
leveldb::Slice s1 = "hello";
std::string str("world");
leveldb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
std::string str = s1.ToString();
assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
leveldb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for key,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>leveldb::Comparator</code> that expresses these rules:
<p>
<pre>
class TwoPartComparator : public leveldb::Comparator {
public:
// Three-way comparison function:
// if a &lt; b: negative result
// if a &gt; b: positive result
// else: zero result
int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
int a1, a2, b1, b2;
ParseKey(a, &amp;a1, &amp;a2);
ParseKey(b, &amp;b1, &amp;b2);
if (a1 &lt; b1) return -1;
if (a1 &gt; b1) return +1;
if (a2 &lt; b2) return -1;
if (a2 &gt; b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
void FindShortSuccessor(std::string*) const { }
};
</pre>
Now create a database using this custom comparator:
<p>
<pre>
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &amp;cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name (b) increment the version number
for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>leveldb/include/options.h</code>.
<p>
<h2><a name="async">Asynchronous Writes</a></h2>
By default, each write to <code>leveldb</code> is synchronous: it does
not return until the write has been pushed from memory to persistent
storage. (On Posix systems, this is implemented by calling either
<code>fdatasync(...)</code> or <code>msync(..., MS_SYNC)</code>.)
<strong>Synchronous writes may be very slow and the synchrony can be
optionally disabled</strong>:
<pre>
leveldb::WriteOptions write_options;
write_options.sync = false;
db-&gt;Put(write_options, ...);
</pre>
Asynchronous writes are often more than a hundred times as fast as
synchronous writes. The downside of asynchronous writes is that a
crash of the machine may cause the last few updates to be lost. Note
that a crash of just the writing process (i.e., not a reboot) will not
cause any loss since even when <code>sync</code> is false, an update
is pushed from the process memory into the operating system before it
is considered done.
<p>
Asynchronous writes can be particularly beneficial when loading a
large amount of data into the database since you can mitigate the
problem of lost updates by restarting the bulk load. A hybrid scheme
is also possible where every Nth write is synchronous, and in the
event of a crash, the bulk load is restarted just after the last
synchronous write finished by the previous run.
<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write. The extra cost of the
synchronous write will be amortized across all of the writes in the batch.
<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 8192 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes.
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.cache</code> is non-NULL, it is used to cache frequently used
uncompressed block contents.
<p>
<pre>
#include "leveldb/include/cache.h"
leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db
delete options.cache;
</pre>
Note that the cache holds uncompressed data, and therefore it should
be sized according to application level data sizes, without any
reduction from compression. (Caching of compressed blocks is left to
the operating system buffer cache, or any custom <code>Env</code>
implementation provided by the client.)
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db-&gt;NewIterator(options);
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
...
}
</pre>
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>leveldb</code>. The types of entries we might wish to store are:
<p>
<pre>
filename -&gt; permission-bits, length, list of file_block_ids
file_block_id -&gt; data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
<p>
<h2>Large Values</h2>
<p>
<code>leveldb</code> has special treatment of large values (by default, a value
of length greater than or equal to 64K is considered large, though a
field in Options can be used to adjust this threshold). Each such
large value is placed in a separate operating system file, and the
normal database blocks just contain pointers to such files.
<p>
Furthermore, if the same large value occurs multiple times in a single
database, it will be stored just once.
<p>
<h1>Checksums</h1>
<p>
<code>leveldb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
may be used to recover as much of the data as possible
<p>
</ul>
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>leveldb</code> on other activities in the system.
<p>
<pre>
class SlowEnv : public leveldb::Env {
.. implementation of the Env interface ...
};
SlowEnv env;
leveldb::Options options;
options.env = &amp;env;
Status s = leveldb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>leveldb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>leveldb::Env</code>
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
<h1>Other Information</h1>
<p>
Details about the <code>leveldb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="impl.html">Implementation notes</a>
<li> <a href="table_format.txt">Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>
</body>
</html>

+ 72
- 0
doc/log_format.txt View File

@ -0,0 +1,72 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
checksum: uint32 // crc32c of type and data[]
length: uint16
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last seven bytes of a block. Any
leftover bytes here form the trailer, which must consist entirely of
zero bytes and must be skipped by readers. In particular, even if
there are exactly seven bytes left in the block, and a zero-length
user record is added (which will fit in these seven bytes), the writer
must skip these trailer bytes and add the record to the next block.
More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MID is the type of all
interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
===================
Some benefits over the recordio format:
(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.
(3) We do not need extra buffering for large records.
Some downsides compared to recordio format:
(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.
(2) No compression. Again, this could be fixed by adding new record types.

+ 61
- 0
doc/table_format.txt View File

@ -0,0 +1,61 @@
File format
===========
<beginning_of_file>
[data block 1]
[data block 2]
...
[data block N]
[meta block 1]
...
[meta block K]
[metaindex block]
[index block]
[Footer] (fixed size; starts at file_size - sizeof(Footer))
<end_of_file>
The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
offset: varint64
size: varint64
(1) The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in block_builder.cc, and then
optionally compressed.
(2) After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
block_builder.cc and then optionally compressed.
(3) A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.
(4) An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.
(6) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
metaindex_handle: char[p]; // Block handle for metaindex
index_handle: char[q]; // Block handle for index
padding: char[40-p-q]; // 0 bytes to make fixed length
// (40==2*BlockHandle::kMaxEncodedLength)
magic: fixed64; // == 0xdb4775248b80fb57
"stats" Meta Block
------------------
This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
data size
index size
key size (uncompressed)
value size (uncompressed)
number of entries
number of data blocks

+ 99
- 0
include/cache.h View File

@ -0,0 +1,99 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values. It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads. It may automatically evict entries to make room
// for new entries. Values have a specified charge against the cache
// capacity. For example, a cache where the values are variable
// length strings, may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided. Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)
#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
#include <stdint.h>
#include "include/slice.h"
namespace leveldb {
class Cache;
// Create a new cache with a fixed size capacity. This implementation
// of Cache uses a least-recently-used eviction policy.
extern Cache* NewLRUCache(size_t capacity);
class Cache {
public:
Cache() { }
// Destroys all existing entries by calling the "deleter"
// function that was passed to the constructor.
virtual ~Cache();
// Opaque handle to an entry stored in the cache.
struct Handle { };
// Insert a mapping from key->value into the cache and assign it
// the specified charge against the total cache capacity.
//
// Returns a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
//
// When the inserted entry is no longer needed, the key and
// value will be passed to "deleter".
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) = 0;
// If the cache has no mapping for "key", returns NULL.
//
// Else return a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
virtual Handle* Lookup(const Slice& key) = 0;
// Release a mapping returned by a previous Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void Release(Handle* handle) = 0;
// Return the value encapsulated in a handle returned by a
// successful Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void* Value(Handle* handle) = 0;
// If the cache contains entry for key, erase it. Note that the
// underlying entry will be kept around until all existing handles
// to it have been released.
virtual void Erase(const Slice& key) = 0;
// Return a new numeric id. May be used by multiple clients who are
// sharing the same cache to partition the key space. Typically the
// client will allocate a new id at startup and prepend the id to
// its cache keys.
virtual uint64_t NewId() = 0;
private:
void LRU_Remove(Handle* e);
void LRU_Append(Handle* e);
void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed
Cache(const Cache&);
void operator=(const Cache&);
};
}
#endif // STORAGE_LEVELDB_UTIL_CACHE_H_

+ 61
- 0
include/comparator.h View File

@ -0,0 +1,61 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
#include <string>
namespace leveldb {
class Slice;
// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database.
class Comparator {
public:
virtual ~Comparator();
// Three-way comparison. Returns value:
// < 0 iff "a" < "b",
// == 0 iff "a" == "b",
// > 0 iff "a" > "b"
virtual int Compare(const Slice& a, const Slice& b) const = 0;
// The name of the comparator. Used to check for comparator
// mismatches (i.e., a DB created with one comparator is
// accessed using a different comparator.
//
// The client of this package should switch to a new name whenever
// the comparator implementation changes in a way that will cause
// the relative ordering of any two keys to change.
//
// Names starting with "leveldb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Advanced functions: these are used to reduce the space requirements
// for internal data structures like index blocks.
// If *start < limit, changes *start to a short string in [start,limit).
// Simple comparator implementations may return with *start unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const = 0;
// Changes *key to a short string >= *key.
// Simple comparator implementations may return with *key unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering. The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();
}
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_

+ 137
- 0
include/db.h View File

@ -0,0 +1,137 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
#define STORAGE_LEVELDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include "include/iterator.h"
#include "include/options.h"
namespace leveldb {
struct Options;
struct ReadOptions;
struct WriteOptions;
class Snapshot;
class WriteBatch;
// Some internal types. Clients should ignore.
class WriteBatchInternal;
struct Range {
Slice start;
Slice limit;
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A DB is a persistent ordered map from keys to values.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores NULL in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
DB() { }
virtual ~DB();
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
// Note: consider setting options.sync = false.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = false.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = false.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) = 0;
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Possible extensions:
// (1) Add a method to compact a range of keys
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
}
#endif // STORAGE_LEVELDB_INCLUDE_DB_H_

+ 293
- 0
include/env.h View File

@ -0,0 +1,293 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the leveldb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <vector>
#include <stdint.h>
#include "include/status.h"
namespace leveldb {
class FileLock;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to leveldb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores NULL in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) = 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) = 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory.
virtual Status CreateDir(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores NULL in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
// Arrange to run "(*function)(arg)" once in a background thread.
//
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Write an entry to the log file with the specified format.
virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Return the length of this file in bytes.
virtual uint64_t Size() const = 0;
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). If an error was encountered, returns a
// non-OK status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() { }
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0;
private:
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
// Log the specified data to *info_log if info_log is non-NULL.
extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 3, 4)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *target
explicit EnvWrapper(Env* target) : target_(target) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f, SequentialFile** r) {
return target_->NewSequentialFile(f, r);
}
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
return target_->NewRandomAccessFile(f, r);
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
return target_->NewWritableFile(f, r);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a) {
return target_->Schedule(f, a);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual void Logv(WritableFile* log, const char* format, va_list ap) {
return target_->Logv(log, format, ap);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
private:
Env* target_;
};
}
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_

+ 95
- 0
include/iterator.h View File

@ -0,0 +1,95 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
#include "include/slice.h"
#include "include/status.h"
namespace leveldb {
class Iterator {
public:
Iterator();
virtual ~Iterator();
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
virtual bool Valid() const = 0;
// Position at the first key in the source. The iterator is Valid()
// after this call iff the source is not empty.
virtual void SeekToFirst() = 0;
// Position at the last key in the source. The iterator is
// Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0;
// Position at the first key in the source that at or past target
// The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0;
// Moves to the next entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the last entry in the source.
// REQUIRES: Valid()
virtual void Next() = 0;
// Moves to the previous entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the first entry in source.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Return the key for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice key() const = 0;
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: !AtEnd() && !AtStart()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that
// will be invoked when this iterator is destroyed.
//
// Note that unlike all of the preceding methods, this method is
// not abstract and therefore clients should not override it.
typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
private:
struct Cleanup {
CleanupFunction function;
void* arg1;
void* arg2;
Cleanup* next;
};
Cleanup cleanup_;
// No copying allowed
Iterator(const Iterator&);
void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
}
#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_

+ 203
- 0
include/options.h View File

@ -0,0 +1,203 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
#include <stddef.h>
namespace leveldb {
class Cache;
class Comparator;
class Env;
class Snapshot;
class WritableFile;
// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType {
// NOTE: do not change the values of existing entries, as these are
// part of the persistent format on disk.
kNoCompression = 0x0,
kLightweightCompression = 0x1,
};
// Options to control the behavior of a database (passed to DB::Open)
struct Options {
// -------------------
// Parameters that affect behavior
// Comparator used to define the order of keys in the table.
// Default: a comparator that uses lexicographic byte-wise ordering
//
// REQUIRES: The client must ensure that the comparator supplied
// here has the same name and orders keys *exactly* the same as the
// comparator provided to previous open calls on the same DB.
const Comparator* comparator;
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// Default: false
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be to written to info_log if it is non-NULL, or to a file stored
// in the same directory as the DB contents if info_log is NULL.
// Default: NULL
WritableFile* info_log;
// -------------------
// Parameters that affect performance
// Amount of data to build up in memory before converting to an
// on-disk file.
//
// Some DB operations may encounter a delay proportional to the size
// of this parameter. Therefore we recommend against increasing
// this parameter unless you are willing to live with an occasional
// slow operation in exchange for faster bulk loading throughput.
//
// Default: 1MB
size_t write_buffer_size;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set (budget
// one open file per 2MB of working set).
//
// Default: 1000
int max_open_files;
// Handle values larger than "large_value_threshold" bytes
// specially, by writing them into their own files (to avoid
// compaction overhead) and doing content-based elimination of
// duplicate values to save space.
//
// We recommend against changing this value.
//
// Default: 64K
size_t large_value_threshold;
// Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk).
// Use the specified cache for blocks (if non-NULL).
// Default: NULL
Cache* block_cache;
// Approximate size of user data packed per block. Note that the
// block size specified here corresponds to uncompressed data. The
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
//
// Default: 8K
int block_size;
// Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should
// leave this parameter alone.
//
// Default: 16
int block_restart_interval;
// Compress blocks using the specified compression algorithm. This
// parameter can be changed dynamically.
//
// Default: kLightweightCompression, which gives lightweight but fast
// compression.
//
// Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz:
// ~200-500MB/s compression
// ~400-800MB/s decompression
// Note that these speeds are significantly faster than most
// persistent storage speeds, and therefore it is typically never
// worth switching to kNoCompression. Even if the input data is
// incompressible, the kLightweightCompression implementation will
// efficiently detect that and will switch to uncompressed mode.
CompressionType compression;
// Create an Options object with default values for all fields.
Options();
};
// Options that control read operations
struct ReadOptions {
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: false
bool verify_checksums;
// Should the data read for this iteration be cached in memory?
// Callers may wish to set this field to false for bulk scans.
// Default: true
bool fill_cache;
// If "snapshot" is non-NULL, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is NULL, use an impliicit
// snapshot of the state at the beginning of this read operation.
// Default: NULL
const Snapshot* snapshot;
ReadOptions()
: verify_checksums(false),
fill_cache(true),
snapshot(NULL) {
}
};
// Options that control write operations
struct WriteOptions {
// If true, the write will be flushed from the operating system
// buffer cache (by calling WritableFile::Sync()) before the write
// is considered complete. If this flag is true, writes will be
// slower.
//
// If this flag is false, and the machine crashes, some recent
// writes may be lost. Note that if it is just the process that
// crashes (i.e., the machine does not reboot), no writes will be
// lost even if sync==false.
//
// Default: true
bool sync;
// If "post_write_snapshot" is non-NULL, and the write succeeds,
// *post_write_snapshot will be modified to point to a snapshot of
// the DB state immediately after this write. The caller must call
// DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the
// snapshot is no longer needed.
//
// If "post_write_snapshot" is non-NULL, and the write fails,
// *post_write_snapshot will be set to NULL.
//
// Default: NULL
const Snapshot** post_write_snapshot;
WriteOptions()
: sync(true),
post_write_snapshot(NULL) {
}
};
}
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_

+ 104
- 0
include/slice.h View File

@ -0,0 +1,104 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Slice is a simple structure containing a pointer into some external
// storage and a size. The user of a Slice must ensure that the slice
// is not used after the corresponding external storage has been
// deallocated.
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <string>
namespace leveldb {
class Slice {
public:
// Create an empty slice.
Slice() : data_(""), size_(0) { }
// Create a slice that refers to data[0,n-1].
Slice(const char* data, size_t n) : data_(data), size_(n) { }
// Create a slice that refers to the contents of "s"
Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
// Create a slice that refers to s[0,strlen(s)-1]
Slice(const char* s) : data_(s), size_(strlen(s)) { }
// Return a pointer to the beginning of the referenced data
const char* data() const { return data_; }
// Return the length (in bytes) of the referenced data
size_t size() const { return size_; }
// Return true iff the length of the referenced data is zero
bool empty() const { return size_ == 0; }
// Return the ith byte in the referenced data.
// REQUIRES: n < size()
char operator[](size_t n) const {
assert(n < size());
return data_[n];
}
// Change this slice to refer to an empty array
void clear() { data_ = ""; size_ = 0; }
// Drop the first "n" bytes from this slice.
void remove_prefix(size_t n) {
assert(n <= size());
data_ += n;
size_ -= n;
}
// Return a string that contains the copy of the referenced data.
std::string ToString() const { return std::string(data_, size_); }
// Three-way comparison. Returns value:
// < 0 iff "*this" < "b",
// == 0 iff "*this" == "b",
// > 0 iff "*this" > "b"
int compare(const Slice& b) const;
// Return true iff "x" is a prefix of "*this"
bool starts_with(const Slice& x) const {
return ((size_ >= x.size_) &&
(memcmp(data_, x.data_, x.size_) == 0));
}
private:
const char* data_;
size_t size_;
// Intentionally copyable
};
inline bool operator==(const Slice& x, const Slice& y) {
return ((x.size() == y.size()) &&
(memcmp(x.data(), y.data(), x.size()) == 0));
}
inline bool operator!=(const Slice& x, const Slice& y) {
return !(x == y);
}
inline int Slice::compare(const Slice& b) const {
const int min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len);
if (r == 0) {
if (size_ < b.size_) r = -1;
else if (size_ > b.size_) r = +1;
}
return r;
}
}
#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_

+ 86
- 0
include/status.h View File

@ -0,0 +1,86 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Status encapsulates the result of an operation. It may indicate success,
// or it may indicate an error with an associated error message.
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
#include <string>
#include <utility>
#include "include/slice.h"
namespace leveldb {
class Status {
public:
// Create a success status.
Status() : state_(NULL) { }
~Status() { delete state_; }
// Copy the specified status.
Status(const Status& s);
void operator=(const Status& s);
// Return a success status.
static Status OK() { return Status(); }
// Return error status of an appropriate type.
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotFound, msg, Slice());
}
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kCorruption, msg, msg2);
}
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotSupported, msg, msg2);
}
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kInvalidArgument, msg, msg2);
}
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIOError, msg, msg2);
}
// Returns true iff the status indicates success.
bool ok() const { return (state_ == NULL); }
// Returns true iff the status indicates a NotFound error.
bool IsNotFound() const { return code() == kNotFound; }
// Return a string representation of this status suitable for printing.
// Returns the string "OK" for success.
std::string ToString() const;
private:
enum Code {
kOk = 0,
kNotFound = 1,
kCorruption = 2,
kNotSupported = 3,
kInvalidArgument = 4,
kIOError = 5,
};
Code code() const { return (state_ == NULL) ? kOk : state_->first; }
Status(Code code, const Slice& msg, const Slice& msg2);
typedef std::pair<Code, std::string> State;
State* state_;
};
inline Status::Status(const Status& s) {
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
}
inline void Status::operator=(const Status& s) {
if (this != &s) {
delete state_;
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
}
}
}
#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_

+ 67
- 0
include/table.h View File

@ -0,0 +1,67 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
#include <stdint.h>
#include "include/iterator.h"
namespace leveldb {
class Block;
class BlockHandle;
struct Options;
class RandomAccessFile;
struct ReadOptions;
// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent.
class Table {
public:
// Attempt to open the table that is stored in "file", and read the
// metadata entries necessary to allow retrieving data from the table.
//
// If successful, returns ok and sets "*table" to the newly opened
// table. The client should delete "*table" when no longer needed.
// If there was an error while initializing the table, sets "*table"
// to NULL and returns a non-ok status. Does not take ownership of
// "*source", but the client must ensure that "source" remains live
// for the duration of the returned table's lifetime.
//
// *file must remain live while this Table is in use.
static Status Open(const Options& options,
RandomAccessFile* file,
Table** table);
~Table();
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
Iterator* NewIterator(const ReadOptions&) const;
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
uint64_t ApproximateOffsetOf(const Slice& key) const;
private:
struct Rep;
Rep* rep_;
explicit Table(Rep* rep) { rep_ = rep; }
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
// No copying allowed
Table(const Table&);
void operator=(const Table&);
};
}
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_

+ 86
- 0
include/table_builder.h View File

@ -0,0 +1,86 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values).
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
#include <stdint.h>
#include "include/options.h"
#include "include/status.h"
namespace leveldb {
class BlockBuilder;
class BlockHandle;
class WritableFile;
class TableBuilder {
public:
// Create a builder that will store the contents of the table it is
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish().
TableBuilder(const Options& options, WritableFile* file);
// REQUIRES: Either Finish() or Abandon() has been called.
~TableBuilder();
// Change the options used by this builder. Note: only some of the
// option fields can be changed after construction. If a field is
// not allowed to change dynamically and its value in the structure
// passed to the constructor is different from its value in the
// structure passed to this method, this method will return an error
// without changing any fields.
Status ChangeOptions(const Options& options);
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
void Add(const Slice& key, const Slice& value);
// Advanced operation: flush any buffered key/value pairs to file.
// Can be used to ensure that two adjacent entries never live in
// the same data block. Most clients should not need to use this method.
// REQUIRES: Finish(), Abandon() have not been called
void Flush();
// Return non-ok iff some error has been detected.
Status status() const;
// Finish building the table. Stops using the file passed to the
// constructor after this function returns.
// REQUIRES: Finish(), Abandon() have not been called
Status Finish();
// Indicate that the contents of this builder should be abandoned. Stops
// using the file passed to the constructor after this function returns.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
void Abandon();
// Number of calls to Add() so far.
uint64_t NumEntries() const;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const;
private:
bool ok() const { return status().ok(); }
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
struct Rep;
Rep* rep_;
// No copying allowed
TableBuilder(const TableBuilder&);
void operator=(const TableBuilder&);
};
}
#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_

+ 49
- 0
include/write_batch.h View File

@ -0,0 +1,49 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch holds a collection of updates to apply atomically to a DB.
//
// The updates are applied in the order in which they are added
// to the WriteBatch. For example, the value of "key" will be "v3"
// after the following batch is written:
//
// batch.Put("key", "v1");
// batch.Delete("key");
// batch.Put("key", "v2");
// batch.Put("key", "v3");
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
#include <string>
namespace leveldb {
class Slice;
class WriteBatch {
public:
WriteBatch();
~WriteBatch();
// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value);
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key);
// Clear all updates buffered in this batch.
void Clear();
private:
friend class WriteBatchInternal;
std::string rep_; // See comment in write_batch.cc for the format of rep_
// Intentionally copyable
};
}
#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_

+ 329
- 0
leveldb.gyp View File

@ -0,0 +1,329 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
{
'variables': {
'use_snappy%': 0,
},
'target_defaults': {
'defines': [
'LEVELDB_PLATFORM_CHROMIUM=1',
],
'include_dirs': [
# MOE:begin_strip
'../..',
# MOE:end_strip_and_replace '.',
],
'conditions': [
['OS == "win"', {
'include_dirs': [
'port/win',
],
}],
['use_snappy', {
'defines': [
'USE_SNAPPY=1',
],
}],
],
},
'targets': [
{
'target_name': 'leveldb',
'type': '<(library)',
'dependencies': [
# The base libary is a lightweight abstraction layer for things like
# threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/
# MOE:begin_strip
'../../../../base/base.gyp:base',
# MOE:end_strip_and_replace '../../base/base.gyp:base',
],
'conditions': [
['use_snappy', {
'dependencies': [
'../../../../third_party/snappy/snappy.gyp:snappy',
],
}],
],
'sources': [
# Include and then exclude so that all files show up in IDEs, even if
# they don't build.
'db/builder.cc',
'db/builder.h',
'db/db_impl.cc',
'db/db_impl.h',
'db/db_iter.cc',
'db/db_iter.h',
'db/filename.cc',
'db/filename.h',
'db/dbformat.cc',
'db/dbformat.h',
'db/log_format.h',
'db/log_reader.cc',
'db/log_reader.h',
'db/log_writer.cc',
'db/log_writer.h',
'db/memtable.cc',
'db/memtable.h',
'db/repair.cc',
'db/skiplist.h',
'db/snapshot.h',
'db/table_cache.cc',
'db/table_cache.h',
'db/version_edit.cc',
'db/version_edit.h',
'db/version_set.cc',
'db/version_set.h',
'db/write_batch.cc',
'db/write_batch_internal.h',
'include/cache.h',
'include/comparator.h',
'include/db.h',
'include/env.h',
'include/iterator.h',
'include/options.h',
'include/slice.h',
'include/status.h',
'include/table.h',
'include/table_builder.h',
'include/write_batch.h',
'port/port.h',
'port/port_chromium.cc',
'port/port_chromium.h',
'port/port_example.h',
'port/port_posix.cc',
'port/port_posix.h',
'port/sha1_portable.cc',
'port/sha1_portable.h',
'table/block.cc',
'table/block.h',
'table/block_builder.cc',
'table/block_builder.h',
'table/format.cc',
'table/format.h',
'table/iterator.cc',
'table/iterator_wrapper.h',
'table/merger.cc',
'table/merger.h',
'table/table.cc',
'table/table_builder.cc',
'table/two_level_iterator.cc',
'table/two_level_iterator.h',
'util/arena.cc',
'util/arena.h',
'util/cache.cc',
'util/coding.cc',
'util/coding.h',
'util/comparator.cc',
'util/crc32c.cc',
'util/crc32c.h',
'util/env.cc',
'util/env_chromium.cc',
'util/env_posix.cc',
'util/hash.cc',
'util/hash.h',
'util/logging.cc',
'util/logging.h',
'util/mutexlock.h',
'util/options.cc',
'util/random.h',
'util/status.cc',
],
'sources/': [
['exclude', '_(android|example|portable|posix)\\.cc$'],
],
},
{
'target_name': 'leveldb_testutil',
'type': '<(library)',
'dependencies': [
# MOE:begin_strip
'../../../../base/base.gyp:base',
# MOE:end_strip_and_replace '../../base/base.gyp:base',
'leveldb',
],
'export_dependent_settings': [
# The tests use include directories from these projects.
# MOE:begin_strip
'../../../../base/base.gyp:base',
# MOE:end_strip_and_replace '../../base/base.gyp:base',
'leveldb',
],
'sources': [
'util/histogram.cc',
'util/histogram.h',
'util/testharness.cc',
'util/testharness.h',
'util/testutil.cc',
'util/testutil.h',
],
},
{
'target_name': 'leveldb_arena_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'util/arena_test.cc',
],
},
{
'target_name': 'leveldb_cache_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'util/cache_test.cc',
],
},
{
'target_name': 'leveldb_coding_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'util/coding_test.cc',
],
},
{
'target_name': 'leveldb_corruption_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/corruption_test.cc',
],
},
{
'target_name': 'leveldb_crc32c_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'util/crc32c_test.cc',
],
},
{
'target_name': 'leveldb_db_bench',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/db_bench.cc',
],
},
{
'target_name': 'leveldb_db_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/db_test.cc',
],
},
{
'target_name': 'leveldb_dbformat_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/dbformat_test.cc',
],
},
{
'target_name': 'leveldb_env_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'util/env_test.cc',
],
},
{
'target_name': 'leveldb_filename_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/filename_test.cc',
],
},
{
'target_name': 'leveldb_log_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/log_test.cc',
],
},
{
'target_name': 'leveldb_sha1_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'port/sha1_test.cc',
],
},
{
'target_name': 'leveldb_skiplist_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/skiplist_test.cc',
],
},
{
'target_name': 'leveldb_table_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'table/table_test.cc',
],
},
{
'target_name': 'leveldb_version_edit_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/version_edit_test.cc',
],
},
{
'target_name': 'leveldb_write_batch_test',
'type': 'executable',
'dependencies': [
'leveldb_testutil',
],
'sources': [
'db/write_batch_test.cc',
],
},
],
}
# Local Variables:
# tab-width:2
# indent-tabs-mode:nil
# End:
# vim: set expandtab tabstop=2 shiftwidth=2:

+ 10
- 0
port/README View File

@ -0,0 +1,10 @@
This directory contains interfaces and implementations that isolate the
rest of the package from platform details.
Code in the rest of the package includes "port.h" from this directory.
"port.h" in turn includes a platform specific "port_<platform>.h" file
that provides the platform specific implementation.
See port_posix.h for an example of what must be provided in a platform
specific header file.

+ 21
- 0
port/port.h View File

@ -0,0 +1,21 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_PORT_PORT_H_
#define STORAGE_LEVELDB_PORT_PORT_H_
#include <string.h>
// Include the appropriate platform specific file below. If you are
// porting to a new platform, see "port_example.h" for documentation
// of what the new port_<platform>.h file must provide.
#if defined(LEVELDB_PLATFORM_POSIX)
# include "port/port_posix.h"
#elif defined(LEVELDB_PLATFORM_CHROMIUM)
# include "port/port_chromium.h"
#elif defined(LEVELDB_PLATFORM_ANDROID)
# include "port/port_android.h"
#endif
#endif // STORAGE_LEVELDB_PORT_PORT_H_

+ 65
- 0
port/port_android.cc View File

@ -0,0 +1,65 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/port_android.h"
#include <cstdlib>
extern "C" {
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) {
return fread(a, b, c, d);
}
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) {
return fwrite(a, b, c, d);
}
int fflush_unlocked(FILE *f) {
return fflush(f);
}
int fdatasync(int fd) {
return fsync(fd);
}
}
// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this?
namespace leveldb {
namespace port {
static void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
abort();
}
}
Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
CondVar::CondVar(Mutex* mu)
: mu_(mu) {
PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}
CondVar::~CondVar() {
PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
}
void CondVar::Wait() {
PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}
void CondVar::Signal(){
PthreadCall("signal", pthread_cond_signal(&cv_));
}
void CondVar::SignalAll() {
PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}
}
}

+ 131
- 0
port/port_android.h View File

@ -0,0 +1,131 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// See port_example.h for documentation for the following types/functions.
#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
#include <endian.h>
#include <pthread.h>
#include <stdint.h>
#include <sha1.h>
#include <cstdatomic>
#include <string>
#include <cctype>
extern "C" {
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d);
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d);
int fflush_unlocked(FILE *f);
int fdatasync (int fd);
}
namespace leveldb {
namespace port {
static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN;
class CondVar;
class Mutex {
public:
Mutex();
~Mutex();
void Lock();
void Unlock();
void AssertHeld() {
//TODO(gabor): How can I implement this?
}
private:
friend class CondVar;
pthread_mutex_t mu_;
// No copying
Mutex(const Mutex&);
void operator=(const Mutex&);
};
class CondVar {
public:
explicit CondVar(Mutex* mu);
~CondVar();
void Wait();
void Signal();
void SignalAll();
private:
Mutex* mu_;
pthread_cond_t cv_;
};
// Storage for a lock-free pointer
class AtomicPointer {
private:
std::atomic<void*> rep_;
public:
AtomicPointer() { }
explicit AtomicPointer(void* v) : rep_(v) { }
inline void* Acquire_Load() const {
return rep_.load(std::memory_order_acquire);
}
inline void Release_Store(void* v) {
rep_.store(v, std::memory_order_release);
}
inline void* NoBarrier_Load() const {
return rep_.load(std::memory_order_relaxed);
}
inline void NoBarrier_Store(void* v) {
rep_.store(v, std::memory_order_relaxed);
}
};
/**
* TODO(gabor): Implement actual compress
* This is a hack - it just copies input to output.
* No actual compression occurs.
*/
inline void Lightweight_Compress(
const char* input,
size_t input_length,
std::string* output) {
output->copy((char*)input,0,input_length);
}
/**
* TODO(gabor): Implement actual compress
* This is a hack - it just copies input to output.
* No actual uncompression occurs.
*/
inline bool Lightweight_Uncompress(
const char* input_data,
size_t input_length,
std::string* output) {
output->copy((char*)input_data,0,input_length);
return (bool)1;
}
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
SHA1_CTX sha1_ctx;
SHA1Init(&sha1_ctx);
SHA1Update(&sha1_ctx, (const u_char*)data, len);
SHA1Final((u_char*)hash_array, &sha1_ctx);
}
inline uint64_t ThreadIdentifier() {
pthread_t tid = pthread_self();
uint64_t r = 0;
memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid));
return r;
}
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
return false;
}
}
}
#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_

+ 83
- 0
port/port_chromium.cc View File

@ -0,0 +1,83 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/port_chromium.h"
#include "util/logging.h"
#if defined(USE_SNAPPY)
# include "third_party/snappy/src/snappy.h"
# include "third_party/snappy/src/snappy-stubs.h"
#endif
namespace leveldb {
namespace port {
Mutex::Mutex() {
}
Mutex::~Mutex() {
}
void Mutex::Lock() {
mu_.Acquire();
}
void Mutex::Unlock() {
mu_.Release();
}
void Mutex::AssertHeld() {
mu_.AssertAcquired();
}
CondVar::CondVar(Mutex* mu)
: cv_(&mu->mu_) {
}
CondVar::~CondVar() { }
void CondVar::Wait() {
cv_.Wait();
}
void CondVar::Signal(){
cv_.Signal();
}
void CondVar::SignalAll() {
cv_.Broadcast();
}
void Lightweight_Compress(const char* input, size_t input_length,
std::string* output) {
#if defined(USE_SNAPPY)
output->resize(snappy::MaxCompressedLength(input_length));
size_t outlen;
snappy::RawCompress(snappy::StringPiece(input, input_length),
&(*output)[0], &outlen);
output->resize(outlen);
#else
output->assign(input, input_length);
#endif
}
bool Lightweight_Uncompress(const char* input_data, size_t input_length,
std::string* output) {
#if defined(USE_SNAPPY)
snappy::StringPiece input(input_data, input_length);
size_t ulength;
if (!snappy::GetUncompressedLength(input, &ulength)) {
return false;
}
output->resize(ulength);
return snappy::RawUncompress(input, &(*output)[0]);
#else
output->assign(input_data, input_length);
return true;
#endif
}
}
}

+ 104
- 0
port/port_chromium.h View File

@ -0,0 +1,104 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// See port_example.h for documentation for the following types/functions.
#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
#include <stdint.h>
#include <string>
#include <cstring>
#include "base/atomicops.h"
#include "base/basictypes.h"
#include "base/logging.h"
#include "base/sha1.h"
#include "base/synchronization/condition_variable.h"
#include "base/synchronization/lock.h"
// Linux's ThreadIdentifier() needs this.
#if defined(OS_LINUX)
# include <linux/unistd.h>
#endif
#if defined(OS_WIN)
#define snprintf _snprintf
#define va_copy(a, b) do { (a) = (b); } while (0)
#endif
namespace leveldb {
namespace port {
// Chromium only supports little endian.
static const bool kLittleEndian = true;
class Mutex {
public:
Mutex();
~Mutex();
void Lock();
void Unlock();
void AssertHeld();
private:
base::Lock mu_;
friend class CondVar;
DISALLOW_COPY_AND_ASSIGN(Mutex);
};
class CondVar {
public:
explicit CondVar(Mutex* mu);
~CondVar();
void Wait();
void Signal();
void SignalAll();
private:
base::ConditionVariable cv_;
DISALLOW_COPY_AND_ASSIGN(CondVar);
};
class AtomicPointer {
private:
typedef base::subtle::AtomicWord Rep;
Rep rep_;
public:
AtomicPointer() { }
explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
inline void* Acquire_Load() const {
return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
}
inline void Release_Store(void* v) {
::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
}
inline void* NoBarrier_Load() const {
return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
}
inline void NoBarrier_Store(void* v) {
::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
}
};
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
return ::base::SHA1HashBytes(reinterpret_cast<const unsigned char*>(data),
len,
reinterpret_cast<unsigned char*>(hash_array));
}
void Lightweight_Compress(const char* input, size_t input_length,
std::string* output);
bool Lightweight_Uncompress(const char* input_data, size_t input_length,
std::string* output);
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
return false;
}
}
}
#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_

+ 119
- 0
port/port_example.h View File

@ -0,0 +1,119 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// This file contains the specification, but not the implementations,
// of the types/operations/etc. that should be defined by a platform
// specific port_<platform>.h file. Use this file as a reference for
// how to port this package to a new platform.
#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
namespace leveldb {
namespace port {
// TODO(jorlow): Many of these belong more in the environment class rather than
// here. We should try moving them and see if it affects perf.
// The following boolean constant must be true on a little-endian machine
// and false otherwise.
static const bool kLittleEndian = true /* or some other expression */;
// ------------------ Threading -------------------
// A Mutex represents an exclusive lock.
class Mutex {
public:
Mutex();
~Mutex();
// Lock the mutex. Waits until other lockers have exited.
// Will deadlock if the mutex is already locked by this thread.
void Lock();
// Unlock the mutex.
// REQUIRES: This mutex was locked by this thread.
void Unlock();
// Optionally crash if this thread does not hold this mutex.
// The implementation must be fast, especially if NDEBUG is
// defined. The implementation is allowed to skip all checks.
void AssertHeld();
};
class CondVar {
public:
explicit CondVar(Mutex* mu);
~CondVar();
// Atomically release *mu and block on this condition variable until
// either a call to SignalAll(), or a call to Signal() that picks
// this thread to wakeup.
// REQUIRES: this thread holds *mu
void Wait();
// If there are some threads waiting, wake up at least one of them.
void Signal();
// Wake up all waiting threads.
void SignallAll();
};
// A type that holds a pointer that can be read or written atomically
// (i.e., without word-tearing.)
class AtomicPointer {
private:
intptr_t rep_;
public:
// Initialize to arbitrary value
AtomicPointer();
// Initialize to hold v
explicit AtomicPointer(void* v) : rep_(v) { }
// Read and return the stored pointer with the guarantee that no
// later memory access (read or write) by this thread can be
// reordered ahead of this read.
void* Acquire_Load() const;
// Set v as the stored pointer with the guarantee that no earlier
// memory access (read or write) by this thread can be reordered
// after this store.
void Release_Store(void* v);
// Read the stored pointer with no ordering guarantees.
void* NoBarrier_Load() const;
// Set va as the stored pointer with no ordering guarantees.
void NoBarrier_Store(void* v);
};
// ------------------ Checksumming -------------------
// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
extern void SHA1_Hash(const char* data, size_t len, char* hash_array);
// ------------------ Compression -------------------
// Store the lightweight compression of "input[0,input_length-1]" in *output.
extern void Lightweight_Compress(const char* input, size_t input_length,
std::string* output);
// Attempt to lightweight uncompress input[0,input_length-1] into *output.
// Returns true if successful, false if the input is invalid lightweight
// compressed data.
extern bool Lightweight_Uncompress(const char* input_data, size_t input_length,
std::string* output);
// ------------------ Miscellaneous -------------------
// If heap profiling is not supported, returns false.
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
}
}
#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_

+ 50
- 0
port/port_posix.cc View File

@ -0,0 +1,50 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/port_posix.h"
#include <cstdlib>
#include <stdio.h>
#include <string.h>
#include "util/logging.h"
namespace leveldb {
namespace port {
static void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
abort();
}
}
Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
CondVar::CondVar(Mutex* mu)
: mu_(mu) {
PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
}
CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
void CondVar::Wait() {
PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
}
void CondVar::Signal() {
PthreadCall("signal", pthread_cond_signal(&cv_));
}
void CondVar::SignalAll() {
PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
}
}
}

+ 108
- 0
port/port_posix.h View File

@ -0,0 +1,108 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// See port_example.h for documentation for the following types/functions.
#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
#include <endian.h>
#include <pthread.h>
#include <stdint.h>
#include <string>
#include <cstdatomic>
#include <cstring>
#include "port/sha1_portable.h"
namespace leveldb {
namespace port {
static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN);
class CondVar;
class Mutex {
public:
Mutex();
~Mutex();
void Lock();
void Unlock();
void AssertHeld() { }
private:
friend class CondVar;
pthread_mutex_t mu_;
// No copying
Mutex(const Mutex&);
void operator=(const Mutex&);
};
class CondVar {
public:
explicit CondVar(Mutex* mu);
~CondVar();
void Wait();
void Signal();
void SignalAll();
private:
pthread_cond_t cv_;
Mutex* mu_;
};
// Storage for a lock-free pointer
class AtomicPointer {
private:
std::atomic<void*> rep_;
public:
AtomicPointer() { }
explicit AtomicPointer(void* v) : rep_(v) { }
inline void* Acquire_Load() const {
return rep_.load(std::memory_order_acquire);
}
inline void Release_Store(void* v) {
rep_.store(v, std::memory_order_release);
}
inline void* NoBarrier_Load() const {
return rep_.load(std::memory_order_relaxed);
}
inline void NoBarrier_Store(void* v) {
rep_.store(v, std::memory_order_relaxed);
}
};
inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
SHA1_Hash_Portable(data, len, hash_array);
}
/**
* TODO(gabor): Implement actual compress
* This is a hack - it just copies input to output.
* No actual compression occurs.
*/
inline void Lightweight_Compress(const char* input, size_t input_length,
std::string* output) {
output->assign(input, input_length);
}
/**
* TODO(gabor): Implement actual uncompress
* This is a hack - it just copies input to output.
* No actual uncompression occurs.
*/
inline bool Lightweight_Uncompress(const char* input_data, size_t input_length,
std::string* output) {
output->assign(input_data, input_length);
return true;
}
inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
return false;
}
}
}
#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_

+ 298
- 0
port/sha1_portable.cc View File

@ -0,0 +1,298 @@
// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// This module provides a slow but portable implementation of
// the SHA1 hash function.
//
// It is adapted from free code written by Paul E. Jones
// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/
//
// The license for the original code is:
/*
Copyright (C) 1998, 2009
Paul E. Jones <paulej@packetizer.com>
Freeware Public License (FPL)
This software is licensed as "freeware." Permission to distribute
this software in source and binary forms, including incorporation
into other products, is hereby granted without a fee. THIS SOFTWARE
IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD
LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER
DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA
OR DATA BEING RENDERED INACCURATE.
*/
#include "port/sha1_portable.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
namespace leveldb {
namespace port {
/*
* Description:
* This class implements the Secure Hashing Standard as defined
* in FIPS PUB 180-1 published April 17, 1995.
*/
/*
* This structure will hold context information for the hashing
* operation
*/
typedef struct SHA1Context {
unsigned Message_Digest[5]; /* Message Digest (output) */
unsigned Length_Low; /* Message length in bits */
unsigned Length_High; /* Message length in bits */
unsigned char Message_Block[64]; /* 512-bit message blocks */
int Message_Block_Index; /* Index into message block array */
bool Computed; /* Is the digest computed? */
bool Corrupted; /* Is the message digest corruped? */
} SHA1Context;
/*
* Portability Issues:
* SHA-1 is defined in terms of 32-bit "words". This code was
* written with the expectation that the processor has at least
* a 32-bit machine word size. If the machine word size is larger,
* the code should still function properly. One caveat to that
* is that the input functions taking characters and character
* arrays assume that only 8 bits of information are stored in each
* character.
*/
/*
* Define the circular shift macro
*/
#define SHA1CircularShift(bits,word) \
((((word) << (bits)) & 0xFFFFFFFF) | \
((word) >> (32-(bits))))
/* Function prototypes */
static void SHA1ProcessMessageBlock(SHA1Context *);
static void SHA1PadMessage(SHA1Context *);
// Initialize the SHA1Context in preparation for computing a new
// message digest.
static void SHA1Reset(SHA1Context* context) {
context->Length_Low = 0;
context->Length_High = 0;
context->Message_Block_Index = 0;
context->Message_Digest[0] = 0x67452301;
context->Message_Digest[1] = 0xEFCDAB89;
context->Message_Digest[2] = 0x98BADCFE;
context->Message_Digest[3] = 0x10325476;
context->Message_Digest[4] = 0xC3D2E1F0;
context->Computed = false;
context->Corrupted = false;
}
// This function will return the 160-bit message digest into the
// Message_Digest array within the SHA1Context provided
static bool SHA1Result(SHA1Context *context) {
if (context->Corrupted) {
return false;
}
if (!context->Computed) {
SHA1PadMessage(context);
context->Computed = true;
}
return true;
}
// This function accepts an array of bytes as the next portion of
// the message.
static void SHA1Input(SHA1Context *context,
const unsigned char *message_array,
unsigned length) {
if (!length) return;
if (context->Computed || context->Corrupted) {
context->Corrupted = true;
return;
}
while(length-- && !context->Corrupted) {
context->Message_Block[context->Message_Block_Index++] =
(*message_array & 0xFF);
context->Length_Low += 8;
/* Force it to 32 bits */
context->Length_Low &= 0xFFFFFFFF;
if (context->Length_Low == 0) {
context->Length_High++;
/* Force it to 32 bits */
context->Length_High &= 0xFFFFFFFF;
if (context->Length_High == 0)
{
/* Message is too long */
context->Corrupted = true;
}
}
if (context->Message_Block_Index == 64)
{
SHA1ProcessMessageBlock(context);
}
message_array++;
}
}
// This function will process the next 512 bits of the message stored
// in the Message_Block array.
static void SHA1ProcessMessageBlock(SHA1Context *context) {
const unsigned K[] = // Constants defined in SHA-1
{
0x5A827999,
0x6ED9EBA1,
0x8F1BBCDC,
0xCA62C1D6
};
int t; // Loop counter
unsigned temp; // Temporary word value
unsigned W[80]; // Word sequence
unsigned A, B, C, D, E; // Word buffers
// Initialize the first 16 words in the array W
for(t = 0; t < 16; t++) {
W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
}
for(t = 16; t < 80; t++) {
W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
}
A = context->Message_Digest[0];
B = context->Message_Digest[1];
C = context->Message_Digest[2];
D = context->Message_Digest[3];
E = context->Message_Digest[4];
for(t = 0; t < 20; t++) {
temp = SHA1CircularShift(5,A) +
((B & C) | ((~B) & D)) + E + W[t] + K[0];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
for(t = 20; t < 40; t++) {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
for(t = 40; t < 60; t++) {
temp = SHA1CircularShift(5,A) +
((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
for(t = 60; t < 80; t++) {
temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
temp &= 0xFFFFFFFF;
E = D;
D = C;
C = SHA1CircularShift(30,B);
B = A;
A = temp;
}
context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;
context->Message_Block_Index = 0;
}
// According to the standard, the message must be padded to an even
// 512 bits. The first padding bit must be a '1'. The last 64 bits
// represent the length of the original message. All bits in between
// should be 0. This function will pad the message according to those
// rules by filling the Message_Block array accordingly. It will also
// call SHA1ProcessMessageBlock() appropriately. When it returns, it
// can be assumed that the message digest has been computed.
static void SHA1PadMessage(SHA1Context *context) {
// Check to see if the current message block is too small to hold
// the initial padding bits and length. If so, we will pad the
// block, process it, and then continue padding into a second block.
if (context->Message_Block_Index > 55) {
context->Message_Block[context->Message_Block_Index++] = 0x80;
while(context->Message_Block_Index < 64) {
context->Message_Block[context->Message_Block_Index++] = 0;
}
SHA1ProcessMessageBlock(context);
while(context->Message_Block_Index < 56) {
context->Message_Block[context->Message_Block_Index++] = 0;
}
} else {
context->Message_Block[context->Message_Block_Index++] = 0x80;
while(context->Message_Block_Index < 56) {
context->Message_Block[context->Message_Block_Index++] = 0;
}
}
// Store the message length as the last 8 octets
context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
context->Message_Block[59] = (context->Length_High) & 0xFF;
context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
context->Message_Block[63] = (context->Length_Low) & 0xFF;
SHA1ProcessMessageBlock(context);
}
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
SHA1Context context;
SHA1Reset(&context);
SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
bool ok = SHA1Result(&context);
if (!ok) {
fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
exit(1);
}
for (int i = 0; i < 5; i++) {
uint32_t value = context.Message_Digest[i];
hash_array[i*4 + 0] = (value >> 24) & 0xff;
hash_array[i*4 + 1] = (value >> 16) & 0xff;
hash_array[i*4 + 2] = (value >> 8) & 0xff;
hash_array[i*4 + 3] = value & 0xff;
}
}
}
}

+ 25
- 0
port/sha1_portable.h View File

@ -0,0 +1,25 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
#include <stddef.h>
namespace leveldb {
namespace port {
// Compute the SHA1 hash value of "data[0..len-1]" and store it in
// "hash_array[0..19]". hash_array must have 20 bytes of space available.
//
// This function is portable but may not be as fast as a version
// optimized for your platform. It is provided as a default method
// that can be used when porting leveldb to a new platform if no
// better SHA1 hash implementation is available.
void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
}
}
#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_

+ 55
- 0
port/sha1_test.cc View File

@ -0,0 +1,55 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/port.h"
#include "util/testharness.h"
namespace leveldb {
namespace port {
class SHA1 { };
static std::string TestSHA1(const char* data, size_t len) {
char hash_val[20];
SHA1_Hash(data, len, hash_val);
char buf[41];
for (int i = 0; i < 20; i++) {
snprintf(buf + i * 2, 41 - i * 2,
"%02x",
static_cast<unsigned int>(static_cast<unsigned char>(
hash_val[i])));
}
return std::string(buf, 40);
}
TEST(SHA1, Simple) {
ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
std::string x(10000, 'x');
ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
TestSHA1(x.data(), x.size()));
}
TEST(SHA1, Benchmark) {
std::string data(1048576 * 100, 'x');
double start = Env::Default()->NowMicros() * 1e-6;
static const int kIters = 10;
uint32_t sha1 = 0;
for (int i = 0; i < kIters; i++) {
char hash_val[20];
SHA1_Hash(data.data(), data.size(), hash_val);
sha1 |= hash_val[0];
}
double finish = Env::Default()->NowMicros() * 1e-6;
double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n",
mb, (finish - start), mb / (finish - start), sha1);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 261
- 0
table/block.cc View File

@ -0,0 +1,261 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.
#include "table/block.h"
#include <vector>
#include <algorithm>
#include "include/comparator.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
inline uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
Block::Block(const char* data, size_t size)
: data_(data),
size_(size) {
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
if (restart_offset_ > size_ - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size_ = 0;
}
}
}
Block::~Block() {
delete[] data_;
}
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not derefence past "limit".
//
// If any errors are detected, returns NULL. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
uint32_t* shared,
uint32_t* non_shared,
uint32_t* value_length) {
if (limit - p < 3) return NULL;
*shared = reinterpret_cast<const unsigned char*>(p)[0];
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
if ((*shared | *non_shared | *value_length) < 128) {
// Fast path: all three values are encoded in one byte each
p += 3;
} else {
if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
}
if (limit - p < (*non_shared + *value_length)) return NULL;
return p;
}
class Block::Iter : public Iterator {
private:
const Comparator* const comparator_;
const char* const data_; // underlying block contents
uint32_t const restarts_; // Offset of restart array (list of fixed32)
uint32_t const num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
uint32_t restart_index_; // Index of restart block in which current_ falls
std::string key_;
Slice value_;
Status status_;
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_;
}
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
void SeekToRestartPoint(uint32_t index) {
key_.clear();
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly
uint32_t offset = GetRestartPoint(index);
value_ = Slice(data_ + offset, 0);
}
public:
Iter(const Comparator* comparator,
const char* data,
uint32_t restarts,
uint32_t num_restarts)
: comparator_(comparator),
data_(data),
restarts_(restarts),
num_restarts_(num_restarts),
current_(restarts_),
restart_index_(num_restarts_) {
assert(num_restarts_ > 0);
}
virtual bool Valid() const { return current_ < restarts_; }
virtual Status status() const { return status_; }
virtual Slice key() const {
assert(Valid());
return key_;
}
virtual Slice value() const {
assert(Valid());
return value_;
}
virtual void Next() {
assert(Valid());
ParseNextKey();
}
virtual void Prev() {
assert(Valid());
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
SeekToRestartPoint(restart_index_);
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original);
}
virtual void Seek(const Slice& target) {
// Binary search in restart array to find the first restart point
// with a key >= target
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset,
data_ + restarts_,
&shared, &non_shared, &value_length);
if (key_ptr == NULL || (shared != 0)) {
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
// Linear search (within restart block) for first key >= target
SeekToRestartPoint(left);
while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target) >= 0) {
return;
}
}
}
virtual void SeekToFirst() {
SeekToRestartPoint(0);
ParseNextKey();
}
virtual void SeekToLast() {
SeekToRestartPoint(num_restarts_ - 1);
while (ParseNextKey() && NextEntryOffset() < restarts_) {
// Keep skipping
}
}
private:
void CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
key_.clear();
value_.clear();
}
bool ParseNextKey() {
current_ = NextEntryOffset();
const char* p = data_ + current_;
const char* limit = data_ + restarts_; // Restarts come right after data
if (p >= limit) {
// No more entries to return. Mark as invalid.
current_ = restarts_;
restart_index_ = num_restarts_;
return false;
}
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
if (p == NULL || key_.size() < shared) {
CorruptionError();
return false;
} else {
key_.resize(shared);
key_.append(p, non_shared);
value_ = Slice(p + non_shared, value_length);
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
return true;
}
}
};
Iterator* Block::NewIterator(const Comparator* cmp) {
if (size_ < 2*sizeof(uint32_t)) {
return NewErrorIterator(Status::Corruption("bad block contents"));
}
const uint32_t num_restarts = NumRestarts();
if (num_restarts == 0) {
return NewEmptyIterator();
} else {
return new Iter(cmp, data_, restart_offset_, num_restarts);
}
}
}

+ 43
- 0
table/block.h View File

@ -0,0 +1,43 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_
#define STORAGE_LEVELDB_TABLE_BLOCK_H_
#include <stddef.h>
#include <stdint.h>
#include "include/iterator.h"
namespace leveldb {
class Comparator;
class Block {
public:
// Initialize the block with the specified contents.
// Takes ownership of data[] and will delete[] it when done.
Block(const char* data, size_t size);
~Block();
size_t size() const { return size_; }
Iterator* NewIterator(const Comparator* comparator);
private:
uint32_t NumRestarts() const;
const char* data_;
size_t size_;
uint32_t restart_offset_; // Offset in data_ of restart array
// No copying allowed
Block(const Block&);
void operator=(const Block&);
class Iter;
};
}
#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_

+ 109
- 0
table/block_builder.cc View File

@ -0,0 +1,109 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// BlockBuilder generates blocks where keys are prefix-compressed:
//
// When we store a key, we drop the prefix shared with the previous
// string. This helps reduce the space requirement significantly.
// Furthermore, once every K keys, we do not apply the prefix
// compression and store the entire key. We call this a "restart
// point". The tail end of the block stores the offsets of all of the
// restart points, and can be used to do a binary search when looking
// for a particular key. Values are stored as-is (without compression)
// immediately following the corresponding key.
//
// An entry for a particular key-value pair has the form:
// shared_bytes: varint32
// unshared_bytes: varint32
// value_length: varint32
// key_delta: char[unshared_bytes]
// value: char[value_length]
// shared_bytes == 0 for restart points.
//
// The trailer of the block has the form:
// restarts: uint32[num_restarts]
// num_restarts: uint32
// restarts[i] contains the offset within the block of the ith restart point.
#include "table/block_builder.h"
#include <algorithm>
#include <assert.h>
#include "include/comparator.h"
#include "include/table_builder.h"
#include "util/coding.h"
namespace leveldb {
BlockBuilder::BlockBuilder(const Options* options)
: options_(options),
restarts_(),
counter_(0),
finished_(false) {
assert(options->block_restart_interval >= 1);
restarts_.push_back(0); // First restart point is at offset 0
}
void BlockBuilder::Reset() {
buffer_.clear();
restarts_.clear();
restarts_.push_back(0); // First restart point is at offset 0
counter_ = 0;
finished_ = false;
last_key_.clear();
}
size_t BlockBuilder::CurrentSizeEstimate() const {
return (buffer_.size() + // Raw data buffer
restarts_.size() * sizeof(uint32_t) + // Restart array
sizeof(uint32_t)); // Restart array length
}
Slice BlockBuilder::Finish() {
// Append restart array
for (int i = 0; i < restarts_.size(); i++) {
PutFixed32(&buffer_, restarts_[i]);
}
PutFixed32(&buffer_, restarts_.size());
finished_ = true;
return Slice(buffer_);
}
void BlockBuilder::Add(const Slice& key, const Slice& value) {
Slice last_key_piece(last_key_);
assert(!finished_);
assert(counter_ <= options_->block_restart_interval);
assert(buffer_.empty() // No values yet?
|| options_->comparator->Compare(key, last_key_piece) > 0);
size_t shared = 0;
if (counter_ < options_->block_restart_interval) {
// See how much sharing to do with previous string
const size_t min_length = std::min(last_key_piece.size(), key.size());
while ((shared < min_length) && (last_key_[shared] == key[shared])) {
shared++;
}
} else {
// Restart compression
restarts_.push_back(buffer_.size());
counter_ = 0;
}
const size_t non_shared = key.size() - shared;
// Add "<shared><non_shared><value_size>" to buffer_
PutVarint32(&buffer_, shared);
PutVarint32(&buffer_, non_shared);
PutVarint32(&buffer_, value.size());
// Add string delta to buffer_ followed by value
buffer_.append(key.data() + shared, non_shared);
buffer_.append(value.data(), value.size());
// Update state
last_key_.resize(shared);
last_key_.append(key.data() + shared, non_shared);
assert(Slice(last_key_) == key);
counter_++;
}
}

+ 57
- 0
table/block_builder.h View File

@ -0,0 +1,57 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
#include <vector>
#include <stdint.h>
#include "include/slice.h"
namespace leveldb {
struct Options;
class BlockBuilder {
public:
explicit BlockBuilder(const Options* options);
// Reset the contents as if the BlockBuilder was just constructed.
void Reset();
// REQUIRES: Finish() has not been callled since the last call to Reset().
// REQUIRES: key is larger than any previously added key
void Add(const Slice& key, const Slice& value);
// Finish building the block and return a slice that refers to the
// block contents. The returned slice will remain valid for the
// lifetime of this builder or until Reset() is called.
Slice Finish();
// Returns an estimate of the current (uncompressed) size of the block
// we are building.
size_t CurrentSizeEstimate() const;
// Return true iff no entries have been added since the last Reset()
bool empty() const {
return buffer_.empty();
}
private:
const Options* options_;
std::string buffer_; // Destination buffer
std::vector<uint32_t> restarts_; // Restart points
int counter_; // Number of entries emitted since restart
bool finished_; // Has Finish() been called?
std::string last_key_;
// No copying allowed
BlockBuilder(const BlockBuilder&);
void operator=(const BlockBuilder&);
};
}
#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_

+ 131
- 0
table/format.cc View File

@ -0,0 +1,131 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/format.h"
#include "include/env.h"
#include "port/port.h"
#include "table/block.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
void BlockHandle::EncodeTo(std::string* dst) const {
// Sanity check that all fields have been set
assert(offset_ != ~static_cast<uint64_t>(0));
assert(size_ != ~static_cast<uint64_t>(0));
PutVarint64(dst, offset_);
PutVarint64(dst, size_);
}
Status BlockHandle::DecodeFrom(Slice* input) {
if (GetVarint64(input, &offset_) &&
GetVarint64(input, &size_)) {
return Status::OK();
} else {
return Status::Corruption("bad block handle");
}
}
void Footer::EncodeTo(std::string* dst) const {
#ifndef NDEBUG
const size_t original_size = dst->size();
#endif
metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst);
dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
assert(dst->size() == original_size + kEncodedLength);
}
Status Footer::DecodeFrom(Slice* input) {
const char* magic_ptr = input->data() + kEncodedLength - 8;
const uint32_t magic_lo = DecodeFixed32(magic_ptr);
const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
(static_cast<uint64_t>(magic_lo)));
if (magic != kTableMagicNumber) {
return Status::InvalidArgument("not an sstable (bad magic number)");
}
Status result = metaindex_handle_.DecodeFrom(input);
if (result.ok()) {
result = index_handle_.DecodeFrom(input);
}
if (result.ok()) {
// We skip over any leftover data (just padding for now) in "input"
const char* end = magic_ptr + 8;
*input = Slice(end, input->data() + input->size() - end);
}
return result;
}
Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,
Block** block) {
*block = NULL;
// Read the block contents as well as the type/crc footer.
// See table_builder.cc for the code that built this structure.
size_t n = handle.size();
char* buf = new char[n + kBlockTrailerSize];
Slice contents;
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
if (!s.ok()) {
delete[] buf;
return s;
}
if (contents.size() != n + kBlockTrailerSize) {
delete[] buf;
return Status::Corruption("truncated block read");
}
// Check the crc of the type and the block contents
const char* data = contents.data(); // Pointer to where Read put the data
if (options.verify_checksums) {
const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
const uint32_t actual = crc32c::Value(data, n + 1);
if (actual != crc) {
delete[] buf;
s = Status::Corruption("block checksum mismatch");
return s;
}
}
switch (data[n]) {
case kNoCompression:
if (data != buf) {
// File implementation gave us pointer to some other data.
// Copy into buf[].
memcpy(buf, data, n + kBlockTrailerSize);
}
// Ok
break;
case kLightweightCompression: {
std::string decompressed;
if (!port::Lightweight_Uncompress(data, n, &decompressed)) {
delete[] buf;
s = Status::Corruption("corrupted compressed block contents");
return s;
}
delete[] buf; // Done with uncompressed data
buf = new char[decompressed.size()];
memcpy(buf, decompressed.data(), decompressed.size());
n = decompressed.size();
break;
}
default:
delete[] buf;
return Status::Corruption("bad block type");
}
*block = new Block(buf, n); // Block takes ownership of buf[]
return Status::OK();
}
}

+ 103
- 0
table/format.h View File

@ -0,0 +1,103 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_
#define STORAGE_LEVELDB_TABLE_FORMAT_H_
#include <string>
#include <stdint.h>
#include "include/slice.h"
#include "include/status.h"
#include "include/table_builder.h"
namespace leveldb {
class Block;
class RandomAccessFile;
struct ReadOptions;
// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block.
class BlockHandle {
public:
BlockHandle();
// The offset of the block in the file.
uint64_t offset() const { return offset_; }
void set_offset(uint64_t offset) { offset_ = offset; }
// The size of the stored block
uint64_t size() const { return size_; }
void set_size(uint64_t size) { size_ = size; }
void EncodeTo(std::string* dst) const;
Status DecodeFrom(Slice* input);
// Maximum encoding length of a BlockHandle
enum { kMaxEncodedLength = 10 + 10 };
private:
uint64_t offset_;
uint64_t size_;
};
// Footer encapsulates the fixed information stored at the tail
// end of every table file.
class Footer {
public:
Footer() { }
// The block handle for the metaindex block of the table
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
// The block handle for the index block of the table
const BlockHandle& index_handle() const {
return index_handle_;
}
void set_index_handle(const BlockHandle& h) {
index_handle_ = h;
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(Slice* input);
// Encoded length of a Footer. Note that the serialization of a
// Footer will always occupy exactly this many bytes. It consists
// of two block handles and a magic number.
enum {
kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
};
private:
BlockHandle metaindex_handle_;
BlockHandle index_handle_;
};
// kTableMagicNumber was picked by running
// echo http://code.google.com/p/leveldb/ | sha1sum
// and taking the leading 64 bits.
static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
// 1-byte type + 32-bit crc
static const size_t kBlockTrailerSize = 5;
// Read the block identified by "handle" from "file". On success,
// store a pointer to the heap-allocated result in *block and return
// OK. On failure store NULL in *block and return non-OK.
extern Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,
Block** block);
// Implementation details follow. Clients should ignore,
inline BlockHandle::BlockHandle()
: offset_(~static_cast<uint64_t>(0)),
size_(~static_cast<uint64_t>(0)) {
}
}
#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_

+ 68
- 0
table/iterator.cc View File

@ -0,0 +1,68 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/iterator.h"
#include "util/logging.h"
namespace leveldb {
Iterator::Iterator() {
cleanup_.function = NULL;
cleanup_.next = NULL;
}
Iterator::~Iterator() {
if (cleanup_.function != NULL) {
(*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
for (Cleanup* c = cleanup_.next; c != NULL; ) {
(*c->function)(c->arg1, c->arg2);
Cleanup* next = c->next;
delete c;
c = next;
}
}
}
void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
assert(func != NULL);
Cleanup* c;
if (cleanup_.function == NULL) {
c = &cleanup_;
} else {
c = new Cleanup;
c->next = cleanup_.next;
cleanup_.next = c;
}
c->function = func;
c->arg1 = arg1;
c->arg2 = arg2;
}
namespace {
class EmptyIterator : public Iterator {
public:
EmptyIterator(const Status& s) : status_(s) { }
virtual bool Valid() const { return false; }
virtual void Seek(const Slice& target) { }
virtual void SeekToFirst() { }
virtual void SeekToLast() { }
virtual void Next() { assert(false); }
virtual void Prev() { assert(false); }
Slice key() const { assert(false); return Slice(); }
Slice value() const { assert(false); return Slice(); }
virtual Status status() const { return status_; }
private:
Status status_;
};
}
Iterator* NewEmptyIterator() {
return new EmptyIterator(Status::OK());
}
Iterator* NewErrorIterator(const Status& status) {
return new EmptyIterator(status);
}
}

+ 64
- 0
table/iterator_wrapper.h View File

@ -0,0 +1,64 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
namespace leveldb {
// A internal wrapper class with an interface similar to Iterator that
// caches the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
class IteratorWrapper {
private:
Iterator* iter_;
bool valid_;
Slice key_;
public:
IteratorWrapper(): iter_(NULL), valid_(false) { }
explicit IteratorWrapper(Iterator* iter): iter_(NULL) {
Set(iter);
}
~IteratorWrapper() { delete iter_; }
Iterator* iter() const { return iter_; }
// Takes ownership of "iter" and will delete it when destroyed, or
// when Set() is invoked again.
void Set(Iterator* iter) {
delete iter_;
iter_ = iter;
if (iter_ == NULL) {
valid_ = false;
} else {
Update();
}
}
// Iterator interface methods
bool Valid() const { return valid_; }
Slice key() const { assert(Valid()); return key_; }
Slice value() const { assert(Valid()); return iter_->value(); }
// Methods below require iter() != NULL
Status status() const { assert(iter_); return iter_->status(); }
void Next() { assert(iter_); iter_->Next(); Update(); }
void Prev() { assert(iter_); iter_->Prev(); Update(); }
void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); }
void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); }
void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); }
private:
void Update() {
valid_ = iter_->Valid();
if (valid_) {
key_ = iter_->key();
}
}
};
}
#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_

+ 143
- 0
table/merger.cc View File

@ -0,0 +1,143 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/merger.h"
#include "include/comparator.h"
#include "include/iterator.h"
#include "table/iterator_wrapper.h"
namespace leveldb {
namespace {
class MergingIterator : public Iterator {
public:
MergingIterator(const Comparator* comparator, Iterator** children, int n)
: comparator_(comparator),
children_(new IteratorWrapper[n]),
n_(n),
current_(NULL) {
for (int i = 0; i < n; i++) {
children_[i].Set(children[i]);
}
}
virtual ~MergingIterator() {
delete[] children_;
}
virtual bool Valid() const {
return (current_ != NULL);
}
virtual void SeekToFirst() {
for (int i = 0; i < n_; i++) {
children_[i].SeekToFirst();
}
FindSmallest();
}
virtual void SeekToLast() {
for (int i = 0; i < n_; i++) {
children_[i].SeekToLast();
}
FindLargest();
}
virtual void Seek(const Slice& target) {
for (int i = 0; i < n_; i++) {
children_[i].Seek(target);
}
FindSmallest();
}
virtual void Next() {
assert(Valid());
current_->Next();
FindSmallest();
}
virtual void Prev() {
assert(Valid());
current_->Prev();
FindLargest();
}
virtual Slice key() const {
assert(Valid());
return current_->key();
}
virtual Slice value() const {
assert(Valid());
return current_->value();
}
virtual Status status() const {
Status status;
for (int i = 0; i < n_; i++) {
status = children_[i].status();
if (!status.ok()) {
break;
}
}
return status;
}
private:
void FindSmallest();
void FindLargest();
// We might want to use a heap in case there are lots of children.
// For now we use a simple array since we expect a very small number
// of children in leveldb.
const Comparator* comparator_;
IteratorWrapper* children_;
int n_;
IteratorWrapper* current_;
};
void MergingIterator::FindSmallest() {
IteratorWrapper* smallest = NULL;
for (int i = 0; i < n_; i++) {
IteratorWrapper* child = &children_[i];
if (child->Valid()) {
if (smallest == NULL) {
smallest = child;
} else if (comparator_->Compare(child->key(), smallest->key()) < 0) {
smallest = child;
}
}
}
current_ = smallest;
}
void MergingIterator::FindLargest() {
IteratorWrapper* largest = NULL;
for (int i = n_-1; i >= 0; i--) {
IteratorWrapper* child = &children_[i];
if (child->Valid()) {
if (largest == NULL) {
largest = child;
} else if (comparator_->Compare(child->key(), largest->key()) > 0) {
largest = child;
}
}
}
current_ = largest;
}
}
Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
assert(n >= 0);
if (n == 0) {
return NewEmptyIterator();
} else if (n == 1) {
return list[0];
} else {
return new MergingIterator(cmp, list, n);
}
}
}

+ 26
- 0
table/merger.h View File

@ -0,0 +1,26 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_
#define STORAGE_LEVELDB_TABLE_MERGER_H_
namespace leveldb {
class Comparator;
class Iterator;
// Return an iterator that provided the union of the data in
// children[0,n-1]. Takes ownership of the child iterators and
// will delete them when the result iterator is deleted.
//
// The result does no duplicate suppression. I.e., if a particular
// key is present in K child iterators, it will be yielded K times.
//
// REQUIRES: n >= 0
extern Iterator* NewMergingIterator(
const Comparator* comparator, Iterator** children, int n);
}
#endif // STORAGE_LEVELDB_TABLE_MERGER_H_

+ 175
- 0
table/table.cc View File

@ -0,0 +1,175 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/table.h"
#include "include/cache.h"
#include "include/env.h"
#include "table/block.h"
#include "table/format.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
namespace leveldb {
struct Table::Rep {
~Rep() {
delete index_block;
}
Options options;
Status status;
RandomAccessFile* file;
uint64_t cache_id;
BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer
Block* index_block;
};
Status Table::Open(const Options& options,
RandomAccessFile* file,
Table** table) {
*table = NULL;
const uint64_t size = file->Size();
if (size < Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}
char footer_space[Footer::kEncodedLength];
Slice footer_input;
Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
&footer_input, footer_space);
if (!s.ok()) return s;
Footer footer;
s = footer.DecodeFrom(&footer_input);
if (!s.ok()) return s;
// Read the index block
Block* index_block = NULL;
if (s.ok()) {
s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
}
if (s.ok()) {
// We've successfully read the footer and the index block: we're
// ready to serve requests.
Rep* rep = new Table::Rep;
rep->options = options;
rep->file = file;
rep->metaindex_handle = footer.metaindex_handle();
rep->index_block = index_block;
rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
*table = new Table(rep);
} else {
if (index_block) delete index_block;
}
return s;
}
Table::~Table() {
delete rep_;
}
static void DeleteBlock(void* arg, void* ignored) {
delete reinterpret_cast<Block*>(arg);
}
static void DeleteCachedBlock(const Slice& key, void* value) {
Block* block = reinterpret_cast<Block*>(value);
delete block;
}
static void ReleaseBlock(void* arg, void* h) {
Cache* cache = reinterpret_cast<Cache*>(arg);
Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
cache->Release(handle);
}
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
Iterator* Table::BlockReader(void* arg,
const ReadOptions& options,
const Slice& index_value) {
Table* table = reinterpret_cast<Table*>(arg);
Cache* block_cache = table->rep_->options.block_cache;
Block* block = NULL;
Cache::Handle* cache_handle = NULL;
BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
if (s.ok()) {
if (block_cache != NULL) {
char cache_key_buffer[16];
EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
EncodeFixed64(cache_key_buffer+8, handle.offset());
Slice key(cache_key_buffer, sizeof(cache_key_buffer));
cache_handle = block_cache->Lookup(key);
if (cache_handle != NULL) {
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
} else {
s = ReadBlock(table->rep_->file, options, handle, &block);
if (s.ok() && options.fill_cache) {
cache_handle = block_cache->Insert(
key, block, block->size(), &DeleteCachedBlock);
}
}
} else {
s = ReadBlock(table->rep_->file, options, handle, &block);
}
}
Iterator* iter;
if (block != NULL) {
iter = block->NewIterator(table->rep_->options.comparator);
if (cache_handle == NULL) {
iter->RegisterCleanup(&DeleteBlock, block, NULL);
} else {
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
}
} else {
iter = NewErrorIterator(s);
}
return iter;
}
Iterator* Table::NewIterator(const ReadOptions& options) const {
return NewTwoLevelIterator(
rep_->index_block->NewIterator(rep_->options.comparator),
&Table::BlockReader, const_cast<Table*>(this), options);
}
uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
Iterator* index_iter =
rep_->index_block->NewIterator(rep_->options.comparator);
index_iter->Seek(key);
uint64_t result;
if (index_iter->Valid()) {
BlockHandle handle;
Slice input = index_iter->value();
Status s = handle.DecodeFrom(&input);
if (s.ok()) {
result = handle.offset();
} else {
// Strange: we can't decode the block handle in the index block.
// We'll just return the offset of the metaindex block, which is
// close to the whole file size for this case.
result = rep_->metaindex_handle.offset();
}
} else {
// key is past the last key in the file. Approximate the offset
// by returning the offset of the metaindex block (which is
// right near the end of the file).
result = rep_->metaindex_handle.offset();
}
delete index_iter;
return result;
}
}

+ 224
- 0
table/table_builder.cc View File

@ -0,0 +1,224 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/table_builder.h"
#include <assert.h>
#include <stdio.h>
#include "include/comparator.h"
#include "include/env.h"
#include "table/block_builder.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/logging.h"
namespace leveldb {
struct TableBuilder::Rep {
Options options;
Options index_block_options;
WritableFile* file;
uint64_t offset;
Status status;
BlockBuilder data_block;
BlockBuilder index_block;
std::string last_key;
int64_t num_entries;
bool closed; // Either Finish() or Abandon() has been called.
// We do not emit the index entry for a block until we have seen the
// first key for the next data block. This allows us to use shorter
// keys in the index block. For example, consider a block boundary
// between the keys "the quick brown fox" and "the who". We can use
// "the r" as the key for the index block entry since it is >= all
// entries in the first block and < all entries in subsequent
// blocks.
//
// Invariant: r->pending_index_entry is true only if data_block is empty.
bool pending_index_entry;
BlockHandle pending_handle; // Handle to add to index block
std::string compressed_output;
Rep(const Options& opt, WritableFile* f)
: options(opt),
index_block_options(opt),
file(f),
offset(0),
data_block(&options),
index_block(&index_block_options),
num_entries(0),
closed(false),
pending_index_entry(false) {
index_block_options.block_restart_interval = 1;
}
};
TableBuilder::TableBuilder(const Options& options, WritableFile* file)
: rep_(new Rep(options, file)) {
}
TableBuilder::~TableBuilder() {
assert(rep_->closed); // Catch errors where caller forgot to call Finish()
delete rep_;
}
Status TableBuilder::ChangeOptions(const Options& options) {
// Note: if more fields are added to Options, update
// this function to catch changes that should not be allowed to
// change in the middle of building a Table.
if (options.comparator != rep_->options.comparator) {
return Status::InvalidArgument("changing comparator while building table");
}
// Note that any live BlockBuilders point to rep_->options and therefore
// will automatically pick up the updated options.
rep_->options = options;
rep_->index_block_options = options;
rep_->index_block_options.block_restart_interval = 1;
return Status::OK();
}
void TableBuilder::Add(const Slice& key, const Slice& value) {
Rep* r = rep_;
assert(!r->closed);
if (!ok()) return;
if (r->num_entries > 0) {
assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
}
if (r->pending_index_entry) {
assert(r->data_block.empty());
r->options.comparator->FindShortestSeparator(&r->last_key, key);
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
r->index_block.Add(r->last_key, Slice(handle_encoding));
r->pending_index_entry = false;
}
r->last_key.assign(key.data(), key.size());
r->num_entries++;
r->data_block.Add(key, value);
const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
if (estimated_block_size >= r->options.block_size) {
Flush();
}
}
void TableBuilder::Flush() {
Rep* r = rep_;
assert(!r->closed);
if (!ok()) return;
if (r->data_block.empty()) return;
assert(!r->pending_index_entry);
WriteBlock(&r->data_block, &r->pending_handle);
if (ok()) {
r->pending_index_entry = true;
r->status = r->file->Flush();
}
}
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
// File format contains a sequence of blocks where each block has:
// block_data: uint8[n]
// type: uint8
// crc: uint32
assert(ok());
Rep* r = rep_;
Slice raw = block->Finish();
Slice block_contents;
CompressionType type = r->options.compression;
// TODO(postrelease): Support more compression options: zlib?
switch (type) {
case kNoCompression:
block_contents = raw;
break;
case kLightweightCompression: {
port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output);
block_contents = r->compressed_output;
if (block_contents.size() >= raw.size() - (raw.size() / 8u)) {
// Compressed less than 12.5%, so just store uncompressed form
block_contents = raw;
type = kNoCompression;
}
break;
}
}
handle->set_offset(r->offset);
handle->set_size(block_contents.size());
r->status = r->file->Append(block_contents);
if (r->status.ok()) {
char trailer[kBlockTrailerSize];
trailer[0] = type;
uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type
EncodeFixed32(trailer+1, crc32c::Mask(crc));
r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
if (r->status.ok()) {
r->offset += block_contents.size() + kBlockTrailerSize;
}
}
r->compressed_output.clear();
block->Reset();
}
Status TableBuilder::status() const {
return rep_->status;
}
Status TableBuilder::Finish() {
Rep* r = rep_;
Flush();
assert(!r->closed);
r->closed = true;
BlockHandle metaindex_block_handle;
BlockHandle index_block_handle;
if (ok()) {
BlockBuilder meta_index_block(&r->options);
// TODO(postrelease): Add stats and other meta blocks
WriteBlock(&meta_index_block, &metaindex_block_handle);
}
if (ok()) {
if (r->pending_index_entry) {
r->options.comparator->FindShortSuccessor(&r->last_key);
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
r->index_block.Add(r->last_key, Slice(handle_encoding));
r->pending_index_entry = false;
}
WriteBlock(&r->index_block, &index_block_handle);
}
if (ok()) {
Footer footer;
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
r->status = r->file->Append(footer_encoding);
if (r->status.ok()) {
r->offset += footer_encoding.size();
}
}
return r->status;
}
void TableBuilder::Abandon() {
Rep* r = rep_;
assert(!r->closed);
r->closed = true;
}
uint64_t TableBuilder::NumEntries() const {
return rep_->num_entries;
}
uint64_t TableBuilder::FileSize() const {
return rep_->offset;
}
}

+ 808
- 0
table/table_test.cc View File

@ -0,0 +1,808 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/table.h"
#include <map>
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "include/db.h"
#include "include/env.h"
#include "include/iterator.h"
#include "include/table_builder.h"
#include "table/block.h"
#include "table/block_builder.h"
#include "table/format.h"
#include "util/random.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
// Return reverse of "key".
// Used to test non-lexicographic comparators.
static std::string Reverse(const Slice& key) {
std::string str(key.ToString());
std::string rev(str.rbegin(), str.rend());
return rev;
}
namespace {
class ReverseKeyComparator : public Comparator {
public:
virtual const char* Name() const {
return "leveldb.ReverseBytewiseComparator";
}
virtual int Compare(const Slice& a, const Slice& b) const {
return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
}
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const {
std::string s = Reverse(*start);
std::string l = Reverse(limit);
BytewiseComparator()->FindShortestSeparator(&s, l);
*start = Reverse(s);
}
virtual void FindShortSuccessor(std::string* key) const {
std::string s = Reverse(*key);
BytewiseComparator()->FindShortSuccessor(&s);
*key = Reverse(s);
}
};
}
static ReverseKeyComparator reverse_key_comparator;
static void Increment(const Comparator* cmp, std::string* key) {
if (cmp == BytewiseComparator()) {
key->push_back('\0');
} else {
assert(cmp == &reverse_key_comparator);
std::string rev = Reverse(*key);
rev.push_back('\0');
*key = Reverse(rev);
}
}
// An STL comparator that uses a Comparator
namespace {
struct STLLessThan {
const Comparator* cmp;
STLLessThan() : cmp(BytewiseComparator()) { }
STLLessThan(const Comparator* c) : cmp(c) { }
bool operator()(const std::string& a, const std::string& b) const {
return cmp->Compare(Slice(a), Slice(b)) < 0;
}
};
}
class StringSink: public WritableFile {
public:
~StringSink() { }
const std::string& contents() const { return contents_; }
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& data) {
contents_.append(data.data(), data.size());
return Status::OK();
}
private:
std::string contents_;
};
class StringSource: public RandomAccessFile {
public:
StringSource(const Slice& contents)
: contents_(contents.data(), contents.size()) {
}
virtual ~StringSource() { }
virtual uint64_t Size() const { return contents_.size(); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
if (offset > contents_.size()) {
return Status::InvalidArgument("invalid Read offset");
}
if (offset + n > contents_.size()) {
n = contents_.size() - offset;
}
memcpy(scratch, &contents_[offset], n);
*result = Slice(scratch, n);
return Status::OK();
}
private:
std::string contents_;
};
typedef std::map<std::string, std::string, STLLessThan> KVMap;
// Helper class for tests to unify the interface between
// BlockBuilder/TableBuilder and Block/Table.
class Constructor {
public:
explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { }
virtual ~Constructor() { }
void Add(const std::string& key, const Slice& value) {
data_[key] = value.ToString();
}
// Finish constructing the data structure with all the keys that have
// been added so far. Returns the keys in sorted order in "*keys"
// and stores the key/value pairs in "*kvmap"
void Finish(const Options& options,
std::vector<std::string>* keys,
KVMap* kvmap) {
*kvmap = data_;
keys->clear();
for (KVMap::const_iterator it = data_.begin();
it != data_.end();
++it) {
keys->push_back(it->first);
}
data_.clear();
Status s = FinishImpl(options, *kvmap);
ASSERT_TRUE(s.ok()) << s.ToString();
}
// Construct the data structure from the data in "data"
virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
virtual size_t NumBytes() const = 0;
virtual Iterator* NewIterator() const = 0;
virtual const KVMap& data() { return data_; }
private:
KVMap data_;
};
class BlockConstructor: public Constructor {
public:
explicit BlockConstructor(const Comparator* cmp)
: Constructor(cmp),
comparator_(cmp),
block_size_(-1),
block_(NULL) { }
~BlockConstructor() {
delete block_;
}
virtual Status FinishImpl(const Options& options, const KVMap& data) {
delete block_;
block_ = NULL;
BlockBuilder builder(&options);
for (KVMap::const_iterator it = data.begin();
it != data.end();
++it) {
builder.Add(it->first, it->second);
}
// Open the block
Slice block_data = builder.Finish();
block_size_ = block_data.size();
char* block_data_copy = new char[block_size_];
memcpy(block_data_copy, block_data.data(), block_size_);
block_ = new Block(block_data_copy, block_size_);
return Status::OK();
}
virtual size_t NumBytes() const { return block_size_; }
virtual Iterator* NewIterator() const {
return block_->NewIterator(comparator_);
}
private:
const Comparator* comparator_;
int block_size_;
Block* block_;
BlockConstructor();
};
class TableConstructor: public Constructor {
public:
TableConstructor(const Comparator* cmp)
: Constructor(cmp),
source_(NULL), table_(NULL) {
}
~TableConstructor() {
Reset();
}
virtual Status FinishImpl(const Options& options, const KVMap& data) {
Reset();
StringSink sink;
TableBuilder builder(options, &sink);
for (KVMap::const_iterator it = data.begin();
it != data.end();
++it) {
builder.Add(it->first, it->second);
ASSERT_TRUE(builder.status().ok());
}
Status s = builder.Finish();
ASSERT_TRUE(s.ok()) << s.ToString();
ASSERT_EQ(sink.contents().size(), builder.FileSize());
// Open the table
source_ = new StringSource(sink.contents());
Options table_options;
table_options.comparator = options.comparator;
return Table::Open(table_options, source_, &table_);
}
virtual size_t NumBytes() const { return source_->Size(); }
virtual Iterator* NewIterator() const {
return table_->NewIterator(ReadOptions());
}
uint64_t ApproximateOffsetOf(const Slice& key) const {
return table_->ApproximateOffsetOf(key);
}
private:
void Reset() {
delete table_;
delete source_;
table_ = NULL;
source_ = NULL;
}
StringSource* source_;
Table* table_;
TableConstructor();
};
// A helper class that converts internal format keys into user keys
class KeyConvertingIterator: public Iterator {
public:
explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
virtual ~KeyConvertingIterator() { delete iter_; }
virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& target) {
ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
std::string encoded;
AppendInternalKey(&encoded, ikey);
iter_->Seek(encoded);
}
virtual void SeekToFirst() { iter_->SeekToFirst(); }
virtual void SeekToLast() { iter_->SeekToLast(); }
virtual void Next() { iter_->Next(); }
virtual void Prev() { iter_->Prev(); }
virtual Slice key() const {
assert(Valid());
ParsedInternalKey key;
if (!ParseInternalKey(iter_->key(), &key)) {
status_ = Status::Corruption("malformed internal key");
return Slice("corrupted key");
}
return key.user_key;
}
virtual Slice value() const { return iter_->value(); }
virtual Status status() const {
return status_.ok() ? iter_->status() : status_;
}
private:
mutable Status status_;
Iterator* iter_;
// No copying allowed
KeyConvertingIterator(const KeyConvertingIterator&);
void operator=(const KeyConvertingIterator&);
};
class MemTableConstructor: public Constructor {
public:
explicit MemTableConstructor(const Comparator* cmp)
: Constructor(cmp),
internal_comparator_(cmp) {
memtable_ = new MemTable(internal_comparator_);
}
~MemTableConstructor() {
delete memtable_;
}
virtual Status FinishImpl(const Options& options, const KVMap& data) {
delete memtable_;
memtable_ = new MemTable(internal_comparator_);
int seq = 1;
for (KVMap::const_iterator it = data.begin();
it != data.end();
++it) {
memtable_->Add(seq, kTypeValue, it->first, it->second);
seq++;
}
return Status::OK();
}
virtual size_t NumBytes() const {
return memtable_->ApproximateMemoryUsage();
}
virtual Iterator* NewIterator() const {
return new KeyConvertingIterator(memtable_->NewIterator());
}
private:
InternalKeyComparator internal_comparator_;
MemTable* memtable_;
};
class DBConstructor: public Constructor {
public:
explicit DBConstructor(const Comparator* cmp)
: Constructor(cmp),
comparator_(cmp) {
db_ = NULL;
NewDB();
}
~DBConstructor() {
delete db_;
}
virtual Status FinishImpl(const Options& options, const KVMap& data) {
delete db_;
db_ = NULL;
NewDB();
for (KVMap::const_iterator it = data.begin();
it != data.end();
++it) {
WriteBatch batch;
batch.Put(it->first, it->second);
ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
}
return Status::OK();
}
virtual size_t NumBytes() const {
Range r("", "\xff\xff");
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
virtual Iterator* NewIterator() const {
return db_->NewIterator(ReadOptions());
}
private:
void NewDB() {
std::string name = test::TmpDir() + "/table_testdb";
Options options;
options.comparator = comparator_;
Status status = DestroyDB(name, options);
ASSERT_TRUE(status.ok()) << status.ToString();
options.create_if_missing = true;
options.error_if_exists = true;
status = DB::Open(options, name, &db_);
ASSERT_TRUE(status.ok()) << status.ToString();
}
const Comparator* comparator_;
DB* db_;
};
enum TestType {
TABLE_TEST,
BLOCK_TEST,
MEMTABLE_TEST,
DB_TEST,
};
struct TestArgs {
TestType type;
bool reverse_compare;
int restart_interval;
};
static const TestArgs kTestArgList[] = {
{ TABLE_TEST, false, 16 },
{ TABLE_TEST, false, 1 },
{ TABLE_TEST, false, 1024 },
{ TABLE_TEST, true, 16 },
{ TABLE_TEST, true, 1 },
{ TABLE_TEST, true, 1024 },
{ BLOCK_TEST, false, 16 },
{ BLOCK_TEST, false, 1 },
{ BLOCK_TEST, false, 1024 },
{ BLOCK_TEST, true, 16 },
{ BLOCK_TEST, true, 1 },
{ BLOCK_TEST, true, 1024 },
// Restart interval does not matter for memtables
{ MEMTABLE_TEST, false, 16 },
{ MEMTABLE_TEST, true, 16 },
// Do not bother with restart interval variations for DB
{ DB_TEST, false, 16 },
{ DB_TEST, true, 16 },
};
static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]);
class Harness {
public:
Harness() : constructor_(NULL) { }
void Init(const TestArgs& args) {
delete constructor_;
constructor_ = NULL;
options_ = Options();
options_.block_restart_interval = args.restart_interval;
// Use shorter block size for tests to exercise block boundary
// conditions more.
options_.block_size = 256;
if (args.reverse_compare) {
options_.comparator = &reverse_key_comparator;
}
switch (args.type) {
case TABLE_TEST:
constructor_ = new TableConstructor(options_.comparator);
break;
case BLOCK_TEST:
constructor_ = new BlockConstructor(options_.comparator);
break;
case MEMTABLE_TEST:
constructor_ = new MemTableConstructor(options_.comparator);
break;
case DB_TEST:
constructor_ = new DBConstructor(options_.comparator);
break;
}
}
~Harness() {
delete constructor_;
}
void Add(const std::string& key, const std::string& value) {
constructor_->Add(key, value);
}
void Test(Random* rnd) {
std::vector<std::string> keys;
KVMap data;
constructor_->Finish(options_, &keys, &data);
TestForwardScan(keys, data);
TestBackwardScan(keys, data);
TestRandomAccess(rnd, keys, data);
}
void TestForwardScan(const std::vector<std::string>& keys,
const KVMap& data) {
Iterator* iter = constructor_->NewIterator();
ASSERT_TRUE(!iter->Valid());
iter->SeekToFirst();
for (KVMap::const_iterator model_iter = data.begin();
model_iter != data.end();
++model_iter) {
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
iter->Next();
}
ASSERT_TRUE(!iter->Valid());
delete iter;
}
void TestBackwardScan(const std::vector<std::string>& keys,
const KVMap& data) {
Iterator* iter = constructor_->NewIterator();
ASSERT_TRUE(!iter->Valid());
iter->SeekToLast();
for (KVMap::const_reverse_iterator model_iter = data.rbegin();
model_iter != data.rend();
++model_iter) {
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
iter->Prev();
}
ASSERT_TRUE(!iter->Valid());
delete iter;
}
void TestRandomAccess(Random* rnd,
const std::vector<std::string>& keys,
const KVMap& data) {
static const bool kVerbose = false;
Iterator* iter = constructor_->NewIterator();
ASSERT_TRUE(!iter->Valid());
KVMap::const_iterator model_iter = data.begin();
if (kVerbose) fprintf(stderr, "---\n");
for (int i = 0; i < 200; i++) {
const int toss = rnd->Uniform(5);
switch (toss) {
case 0: {
if (iter->Valid()) {
if (kVerbose) fprintf(stderr, "Next\n");
iter->Next();
++model_iter;
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
}
break;
}
case 1: {
if (kVerbose) fprintf(stderr, "SeekToFirst\n");
iter->SeekToFirst();
model_iter = data.begin();
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
break;
}
case 2: {
std::string key = PickRandomKey(rnd, keys);
model_iter = data.lower_bound(key);
if (kVerbose) fprintf(stderr, "Seek '%s'\n",
EscapeString(key).c_str());
iter->Seek(Slice(key));
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
break;
}
case 3: {
if (iter->Valid()) {
if (kVerbose) fprintf(stderr, "Prev\n");
iter->Prev();
if (model_iter == data.begin()) {
model_iter = data.end(); // Wrap around to invalid value
} else {
--model_iter;
}
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
}
break;
}
case 4: {
if (kVerbose) fprintf(stderr, "SeekToLast\n");
iter->SeekToLast();
if (keys.empty()) {
model_iter = data.end();
} else {
std::string last = data.rbegin()->first;
model_iter = data.lower_bound(last);
}
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
break;
}
}
}
delete iter;
}
std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
if (it == data.end()) {
return "END";
} else {
return "'" + it->first + "->" + it->second + "'";
}
}
std::string ToString(const KVMap& data,
const KVMap::const_reverse_iterator& it) {
if (it == data.rend()) {
return "END";
} else {
return "'" + it->first + "->" + it->second + "'";
}
}
std::string ToString(const Iterator* it) {
if (!it->Valid()) {
return "END";
} else {
return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
}
}
std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
if (keys.empty()) {
return "foo";
} else {
const int index = rnd->Uniform(keys.size());
std::string result = keys[index];
switch (rnd->Uniform(3)) {
case 0:
// Return an existing key
break;
case 1: {
// Attempt to return something smaller than an existing key
if (result.size() > 0 && result[result.size()-1] > '\0') {
result[result.size()-1]--;
}
break;
}
case 2: {
// Return something larger than an existing key
Increment(options_.comparator, &result);
break;
}
}
return result;
}
}
private:
Options options_;
Constructor* constructor_;
};
// Test the empty key
TEST(Harness, SimpleEmptyKey) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 1);
Add("", "v");
Test(&rnd);
}
}
TEST(Harness, SimpleSingle) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 2);
Add("abc", "v");
Test(&rnd);
}
}
TEST(Harness, SimpleMulti) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 3);
Add("abc", "v");
Add("abcd", "v");
Add("ac", "v2");
Test(&rnd);
}
}
TEST(Harness, SimpleSpecialKey) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 4);
Add("\xff\xff", "v3");
Test(&rnd);
}
}
TEST(Harness, Randomized) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 5);
for (int num_entries = 0; num_entries < 2000;
num_entries += (num_entries < 50 ? 1 : 200)) {
if ((num_entries % 10) == 0) {
fprintf(stderr, "case %d of %d: num_entries = %d\n",
(i + 1), int(kNumTestArgs), num_entries);
}
for (int e = 0; e < num_entries; e++) {
std::string v;
Add(test::RandomKey(&rnd, rnd.Skewed(4)),
test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
}
Test(&rnd);
}
}
}
class MemTableTest { };
TEST(MemTableTest, Simple) {
InternalKeyComparator cmp(BytewiseComparator());
MemTable memtable(cmp);
WriteBatch batch;
WriteBatchInternal::SetSequence(&batch, 100);
batch.Put(std::string("k1"), std::string("v1"));
batch.Put(std::string("k2"), std::string("v2"));
batch.Put(std::string("k3"), std::string("v3"));
batch.Put(std::string("largekey"), std::string("vlarge"));
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
Iterator* iter = memtable.NewIterator();
iter->SeekToFirst();
while (iter->Valid()) {
fprintf(stderr, "key: '%s' -> '%s'\n",
iter->key().ToString().c_str(),
iter->value().ToString().c_str());
iter->Next();
}
delete iter;
}
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
bool result = (val >= low) && (val <= high);
if (!result) {
fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
(unsigned long long)(val),
(unsigned long long)(low),
(unsigned long long)(high));
}
return result;
}
class TableTest { };
TEST(TableTest, ApproximateOffsetOfPlain) {
TableConstructor c(BytewiseComparator());
c.Add("k01", "hello");
c.Add("k02", "hello2");
c.Add("k03", std::string(10000, 'x'));
c.Add("k04", std::string(200000, 'x'));
c.Add("k05", std::string(300000, 'x'));
c.Add("k06", "hello3");
c.Add("k07", std::string(100000, 'x'));
std::vector<std::string> keys;
KVMap kvmap;
Options options;
options.block_size = 1024;
options.compression = kNoCompression;
c.Finish(options, &keys, &kvmap);
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000));
}
TEST(TableTest, ApproximateOffsetOfCompressed) {
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
// Compression not supported yet, so skip this test.
// TODO(sanjay) Reenable after compression support is added
return;
#endif
Random rnd(301);
TableConstructor c(BytewiseComparator());
std::string tmp;
c.Add("k01", "hello");
c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
c.Add("k03", "hello3");
c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
std::vector<std::string> keys;
KVMap kvmap;
Options options;
options.block_size = 1024;
options.compression = kLightweightCompression;
c.Finish(options, &keys, &kvmap);
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 182
- 0
table/two_level_iterator.cc View File

@ -0,0 +1,182 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/two_level_iterator.h"
#include "include/table.h"
#include "table/block.h"
#include "table/format.h"
#include "table/iterator_wrapper.h"
namespace leveldb {
namespace {
typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&);
class TwoLevelIterator: public Iterator {
public:
TwoLevelIterator(
Iterator* index_iter,
BlockFunction block_function,
void* arg,
const ReadOptions& options);
virtual ~TwoLevelIterator();
virtual void Seek(const Slice& target);
virtual void SeekToFirst();
virtual void SeekToLast();
virtual void Next();
virtual void Prev();
virtual bool Valid() const {
return data_iter_.Valid();
}
virtual Slice key() const {
assert(Valid());
return data_iter_.key();
}
virtual Slice value() const {
assert(Valid());
return data_iter_.value();
}
virtual Status status() const {
// It'd be nice if status() returned a const Status& instead of a Status
if (!index_iter_.status().ok()) {
return index_iter_.status();
} else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) {
return data_iter_.status();
} else {
return status_;
}
}
private:
void SaveError(const Status& s) {
if (status_.ok() && !s.ok()) status_ = s;
}
void SkipEmptyDataBlocksForward();
void SkipEmptyDataBlocksBackward();
void SetDataIterator(Iterator* data_iter);
void InitDataBlock();
BlockFunction block_function_;
void* arg_;
const ReadOptions options_;
Status status_;
IteratorWrapper index_iter_;
IteratorWrapper data_iter_; // May be NULL
// If data_iter_ is non-NULL, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the data_iter_.
std::string data_block_handle_;
};
TwoLevelIterator::TwoLevelIterator(
Iterator* index_iter,
BlockFunction block_function,
void* arg,
const ReadOptions& options)
: block_function_(block_function),
arg_(arg),
options_(options),
index_iter_(index_iter),
data_iter_(NULL) {
}
TwoLevelIterator::~TwoLevelIterator() {
}
void TwoLevelIterator::Seek(const Slice& target) {
index_iter_.Seek(target);
InitDataBlock();
if (data_iter_.iter() != NULL) data_iter_.Seek(target);
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::SeekToFirst() {
index_iter_.SeekToFirst();
InitDataBlock();
if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::SeekToLast() {
index_iter_.SeekToLast();
InitDataBlock();
if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
SkipEmptyDataBlocksBackward();
}
void TwoLevelIterator::Next() {
assert(Valid());
data_iter_.Next();
SkipEmptyDataBlocksForward();
}
void TwoLevelIterator::Prev() {
assert(Valid());
data_iter_.Prev();
SkipEmptyDataBlocksBackward();
}
void TwoLevelIterator::SkipEmptyDataBlocksForward() {
while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
// Move to next block
if (!index_iter_.Valid()) {
SetDataIterator(NULL);
return;
}
index_iter_.Next();
InitDataBlock();
if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
}
}
void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
// Move to next block
if (!index_iter_.Valid()) {
SetDataIterator(NULL);
return;
}
index_iter_.Prev();
InitDataBlock();
if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
}
}
void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {
if (data_iter_.iter() != NULL) SaveError(data_iter_.status());
data_iter_.Set(data_iter);
}
void TwoLevelIterator::InitDataBlock() {
if (!index_iter_.Valid()) {
SetDataIterator(NULL);
} else {
Slice handle = index_iter_.value();
if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) {
// data_iter_ is already constructed with this iterator, so
// no need to change anything
} else {
Iterator* iter = (*block_function_)(arg_, options_, handle);
data_block_handle_.assign(handle.data(), handle.size());
SetDataIterator(iter);
}
}
}
}
Iterator* NewTwoLevelIterator(
Iterator* index_iter,
BlockFunction block_function,
void* arg,
const ReadOptions& options) {
return new TwoLevelIterator(index_iter, block_function, arg, options);
}
}

+ 34
- 0
table/two_level_iterator.h View File

@ -0,0 +1,34 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
#include "include/iterator.h"
namespace leveldb {
struct ReadOptions;
// Return a new two level iterator. A two-level iterator contains an
// index iterator whose values point to a sequence of blocks where
// each block is itself a sequence of key,value pairs. The returned
// two-level iterator yields the concatenation of all key/value pairs
// in the sequence of blocks. Takes ownership of "index_iter" and
// will delete it when no longer needed.
//
// Uses a supplied function to convert an index_iter value into
// an iterator over the contents of the corresponding block.
extern Iterator* NewTwoLevelIterator(
Iterator* index_iter,
Iterator* (*block_function)(
void* arg,
const ReadOptions& options,
const Slice& index_value),
void* arg,
const ReadOptions& options);
}
#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_

+ 68
- 0
util/arena.cc View File

@ -0,0 +1,68 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena.h"
#include <assert.h>
namespace leveldb {
static const int kBlockSize = 4096;
Arena::Arena() {
blocks_memory_ = 0;
alloc_ptr_ = NULL; // First allocation will allocate a block
alloc_bytes_remaining_ = 0;
}
Arena::~Arena() {
for (int i = 0; i < blocks_.size(); i++) {
delete[] blocks_[i];
}
}
char* Arena::AllocateFallback(size_t bytes) {
if (bytes > kBlockSize / 4) {
// Object is more than a quarter of our block size. Allocate it separately
// to avoid wasting too much space in leftover bytes.
char* result = AllocateNewBlock(bytes);
return result;
}
// We waste the remaining space in the current block.
alloc_ptr_ = AllocateNewBlock(kBlockSize);
alloc_bytes_remaining_ = kBlockSize;
char* result = alloc_ptr_;
alloc_ptr_ += bytes;
alloc_bytes_remaining_ -= bytes;
return result;
}
char* Arena::AllocateAligned(size_t bytes) {
const int align = sizeof(void*); // We'll align to pointer size
assert((align & (align-1)) == 0); // Pointer size should be a power of 2
size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
size_t slop = (current_mod == 0 ? 0 : align - current_mod);
size_t needed = bytes + slop;
char* result;
if (needed <= alloc_bytes_remaining_) {
result = alloc_ptr_ + slop;
alloc_ptr_ += needed;
alloc_bytes_remaining_ -= needed;
} else {
// AllocateFallback always returned aligned memory
result = AllocateFallback(bytes);
}
assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0);
return result;
}
char* Arena::AllocateNewBlock(size_t block_bytes) {
char* result = new char[block_bytes];
blocks_memory_ += block_bytes;
blocks_.push_back(result);
return result;
}
}

+ 68
- 0
util/arena.h View File

@ -0,0 +1,68 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
#define STORAGE_LEVELDB_UTIL_ARENA_H_
#include <cstddef>
#include <vector>
#include <assert.h>
#include <stdint.h>
namespace leveldb {
class Arena {
public:
Arena();
~Arena();
// Return a pointer to a newly allocated memory block of "bytes" bytes.
char* Allocate(size_t bytes);
// Allocate memory with the normal alignment guarantees provided by malloc
char* AllocateAligned(size_t bytes);
// Returns an estimate of the total memory usage of data allocated
// by the arena (including space allocated but not yet used for user
// allocations).
size_t MemoryUsage() const {
return blocks_memory_ + blocks_.capacity() * sizeof(char*);
}
private:
char* AllocateFallback(size_t bytes);
char* AllocateNewBlock(size_t block_bytes);
// Allocation state
char* alloc_ptr_;
size_t alloc_bytes_remaining_;
// Array of new[] allocated memory blocks
std::vector<char*> blocks_;
// Bytes of memory in blocks allocated so far
size_t blocks_memory_;
// No copying allowed
Arena(const Arena&);
void operator=(const Arena&);
};
inline char* Arena::Allocate(size_t bytes) {
// The semantics of what to return are a bit messy if we allow
// 0-byte allocations, so we disallow them here (we don't need
// them for our internal use).
assert(bytes > 0);
if (bytes <= alloc_bytes_remaining_) {
char* result = alloc_ptr_;
alloc_ptr_ += bytes;
alloc_bytes_remaining_ -= bytes;
return result;
}
return AllocateFallback(bytes);
}
}
#endif // STORAGE_LEVELDB_UTIL_ARENA_H_

+ 68
- 0
util/arena_test.cc View File

@ -0,0 +1,68 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
class ArenaTest { };
TEST(ArenaTest, Empty) {
Arena arena;
}
TEST(ArenaTest, Simple) {
std::vector<std::pair<size_t, char*> > allocated;
Arena arena;
const int N = 100000;
size_t bytes = 0;
Random rnd(301);
for (int i = 0; i < N; i++) {
size_t s;
if (i % (N / 10) == 0) {
s = i;
} else {
s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
(rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
}
if (s == 0) {
// Our arena disallows size 0 allocations.
s = 1;
}
char* r;
if (rnd.OneIn(10)) {
r = arena.AllocateAligned(s);
} else {
r = arena.Allocate(s);
}
for (int b = 0; b < s; b++) {
// Fill the "i"th allocation with a known bit pattern
r[b] = i % 256;
}
bytes += s;
allocated.push_back(std::make_pair(s, r));
ASSERT_GE(arena.MemoryUsage(), bytes);
if (i > N/10) {
ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
}
}
for (int i = 0; i < allocated.size(); i++) {
size_t num_bytes = allocated[i].first;
const char* p = allocated[i].second;
for (int b = 0; b < num_bytes; b++) {
// Check the "i"th allocation for the known bit pattern
ASSERT_EQ(int(p[b]) & 0xff, i % 256);
}
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 253
- 0
util/cache.cc View File

@ -0,0 +1,253 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
#include <unordered_set>
#elif defined(LEVELDB_PLATFORM_CHROMIUM)
#include "base/hash_tables.h"
#else
#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible.
#endif
#include <assert.h>
#include "include/cache.h"
#include "port/port.h"
#include "util/hash.h"
#include "util/mutexlock.h"
namespace leveldb {
Cache::~Cache() {
}
namespace {
// LRU cache implementation
// An entry is a variable length heap-allocated structure. Entries
// are kept in a circular doubly linked list ordered by access time.
struct LRUHandle {
void* value;
void (*deleter)(const Slice&, void* value);
LRUHandle* next;
LRUHandle* prev;
size_t charge; // TODO(opt): Only allow uint32_t?
size_t key_length;
size_t refs; // TODO(opt): Pack with "key_length"?
char key_data[1]; // Beginning of key
Slice key() const {
// For cheaper lookups, we allow a temporary Handle object
// to store a pointer to a key in "value".
if (next == this) {
return *(reinterpret_cast<Slice*>(value));
} else {
return Slice(key_data, key_length);
}
}
};
// Pick a platform specific hash_set instantiation
#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
// Microsoft's hash_set deviates from the standard. See
// http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
// for details. Basically the 2 param () operator is a less than and
// the 1 param () operator is a hash function.
struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
size_t operator() (LRUHandle* h) const {
Slice k = h->key();
return Hash(k.data(), k.size(), 0);
}
bool operator() (LRUHandle* a, LRUHandle* b) const {
return a->key().compare(b->key()) < 0;
}
};
typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
#else
struct HandleHash {
inline size_t operator()(LRUHandle* h) const {
Slice k = h->key();
return Hash(k.data(), k.size(), 0);
}
};
struct HandleEq {
inline bool operator()(LRUHandle* a, LRUHandle* b) const {
return a->key() == b->key();
}
};
# if defined(LEVELDB_PLATFORM_CHROMIUM)
typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
# else
typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
# endif
#endif
class LRUCache : public Cache {
public:
explicit LRUCache(size_t capacity);
virtual ~LRUCache();
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value));
virtual Handle* Lookup(const Slice& key);
virtual void Release(Handle* handle);
virtual void* Value(Handle* handle);
virtual void Erase(const Slice& key);
virtual uint64_t NewId();
private:
void LRU_Remove(LRUHandle* e);
void LRU_Append(LRUHandle* e);
void Unref(LRUHandle* e);
// Constructor parameters
const size_t capacity_;
// mutex_ protects the following state.
port::Mutex mutex_;
size_t usage_;
uint64_t last_id_;
// Dummy head of LRU list.
// lru.prev is newest entry, lru.next is oldest entry.
LRUHandle lru_;
HandleTable table_;
};
LRUCache::LRUCache(size_t capacity)
: capacity_(capacity),
usage_(0),
last_id_(0) {
// Make empty circular linked list
lru_.next = &lru_;
lru_.prev = &lru_;
}
LRUCache::~LRUCache() {
table_.clear();
for (LRUHandle* e = lru_.next; e != &lru_; ) {
LRUHandle* next = e->next;
assert(e->refs == 1); // Error if caller has an unreleased handle
Unref(e);
e = next;
}
}
void LRUCache::Unref(LRUHandle* e) {
assert(e->refs > 0);
e->refs--;
if (e->refs <= 0) {
usage_ -= e->charge;
(*e->deleter)(e->key(), e->value);
free(e);
}
}
void LRUCache::LRU_Remove(LRUHandle* e) {
e->next->prev = e->prev;
e->prev->next = e->next;
}
void LRUCache::LRU_Append(LRUHandle* e) {
// Make "e" newest entry by inserting just before lru_
e->next = &lru_;
e->prev = lru_.prev;
e->prev->next = e;
e->next->prev = e;
}
Cache::Handle* LRUCache::Lookup(const Slice& key) {
MutexLock l(&mutex_);
LRUHandle dummy;
dummy.next = &dummy;
dummy.value = const_cast<Slice*>(&key);
HandleTable::iterator iter = table_.find(&dummy);
if (iter == table_.end()) {
return NULL;
} else {
LRUHandle* e = const_cast<LRUHandle*>(*iter);
e->refs++;
LRU_Remove(e);
LRU_Append(e);
return reinterpret_cast<Handle*>(e);
}
}
void* LRUCache::Value(Handle* handle) {
return reinterpret_cast<LRUHandle*>(handle)->value;
}
void LRUCache::Release(Handle* handle) {
MutexLock l(&mutex_);
Unref(reinterpret_cast<LRUHandle*>(handle));
}
Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) {
MutexLock l(&mutex_);
LRUHandle* e = reinterpret_cast<LRUHandle*>(
malloc(sizeof(LRUHandle)-1 + key.size()));
e->value = value;
e->deleter = deleter;
e->charge = charge;
e->key_length = key.size();
e->refs = 2; // One from LRUCache, one for the returned handle
memcpy(e->key_data, key.data(), key.size());
LRU_Append(e);
usage_ += charge;
std::pair<HandleTable::iterator,bool> p = table_.insert(e);
if (!p.second) {
// Kill existing entry
LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
LRU_Remove(old);
table_.erase(p.first);
table_.insert(e);
Unref(old);
}
while (usage_ > capacity_ && lru_.next != &lru_) {
LRUHandle* old = lru_.next;
LRU_Remove(old);
table_.erase(old);
Unref(old);
}
return reinterpret_cast<Handle*>(e);
}
void LRUCache::Erase(const Slice& key) {
MutexLock l(&mutex_);
LRUHandle dummy;
dummy.next = &dummy;
dummy.value = const_cast<Slice*>(&key);
HandleTable::iterator iter = table_.find(&dummy);
if (iter != table_.end()) {
LRUHandle* e = const_cast<LRUHandle*>(*iter);
LRU_Remove(e);
table_.erase(iter);
Unref(e);
}
}
uint64_t LRUCache::NewId() {
MutexLock l(&mutex_);
return ++(last_id_);
}
} // end anonymous namespace
Cache* NewLRUCache(size_t capacity) {
return new LRUCache(capacity);
}
}

+ 169
- 0
util/cache_test.cc View File

@ -0,0 +1,169 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/cache.h"
#include <vector>
#include "util/coding.h"
#include "util/testharness.h"
namespace leveldb {
// Conversions between numeric keys/values and the types expected by Cache.
static std::string EncodeKey(int k) {
std::string result;
PutFixed32(&result, k);
return result;
}
static int DecodeKey(const Slice& k) {
assert(k.size() == 4);
return DecodeFixed32(k.data());
}
static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
class CacheTest {
public:
static CacheTest* current_;
static void Deleter(const Slice& key, void* v) {
current_->deleted_keys_.push_back(DecodeKey(key));
current_->deleted_values_.push_back(DecodeValue(v));
}
static const int kCacheSize = 100;
std::vector<int> deleted_keys_;
std::vector<int> deleted_values_;
Cache* cache_;
CacheTest() : cache_(NewLRUCache(kCacheSize)) {
current_ = this;
}
~CacheTest() {
delete cache_;
}
int Lookup(int key) {
Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
if (handle != NULL) {
cache_->Release(handle);
}
return r;
}
void Insert(int key, int value, int charge = 1) {
cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
&CacheTest::Deleter));
}
void Erase(int key) {
cache_->Erase(EncodeKey(key));
}
};
CacheTest* CacheTest::current_;
TEST(CacheTest, HitAndMiss) {
ASSERT_EQ(-1, Lookup(100));
Insert(100, 101);
ASSERT_EQ(101, Lookup(100));
ASSERT_EQ(-1, Lookup(200));
ASSERT_EQ(-1, Lookup(300));
Insert(200, 201);
ASSERT_EQ(101, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(-1, Lookup(300));
Insert(100, 102);
ASSERT_EQ(102, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(-1, Lookup(300));
ASSERT_EQ(1, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
}
TEST(CacheTest, Erase) {
Erase(200);
ASSERT_EQ(0, deleted_keys_.size());
Insert(100, 101);
Insert(200, 201);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1, deleted_keys_.size());
}
TEST(CacheTest, EntriesArePinned) {
Insert(100, 101);
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
Insert(100, 102);
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
ASSERT_EQ(0, deleted_keys_.size());
cache_->Release(h1);
ASSERT_EQ(1, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(1, deleted_keys_.size());
cache_->Release(h2);
ASSERT_EQ(2, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[1]);
ASSERT_EQ(102, deleted_values_[1]);
}
TEST(CacheTest, EvictionPolicy) {
Insert(100, 101);
Insert(200, 201);
// Frequently used entry must be kept around
for (int i = 0; i < kCacheSize; i++) {
Insert(1000+i, 2000+i);
ASSERT_EQ(2000+i, Lookup(1000+i));
ASSERT_EQ(101, Lookup(100));
}
ASSERT_EQ(101, Lookup(100));
ASSERT_EQ(2, deleted_keys_.size());
ASSERT_EQ(200, deleted_keys_[0]);
ASSERT_EQ(201, deleted_values_[0]);
}
TEST(CacheTest, HeavyEntry) {
Insert(100, 101);
Insert(200, 201, kCacheSize);
ASSERT_EQ(1, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
}
TEST(CacheTest, NewId) {
uint64_t a = cache_->NewId();
uint64_t b = cache_->NewId();
ASSERT_NE(a, b);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 194
- 0
util/coding.cc View File

@ -0,0 +1,194 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/coding.h"
namespace leveldb {
void EncodeFixed32(char* buf, uint32_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
#endif
}
void EncodeFixed64(char* buf, uint64_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
buf[4] = (value >> 32) & 0xff;
buf[5] = (value >> 40) & 0xff;
buf[6] = (value >> 48) & 0xff;
buf[7] = (value >> 56) & 0xff;
#endif
}
void PutFixed32(std::string* dst, uint32_t value) {
char buf[sizeof(value)];
EncodeFixed32(buf, value);
dst->append(buf, sizeof(buf));
}
void PutFixed64(std::string* dst, uint64_t value) {
char buf[sizeof(value)];
EncodeFixed64(buf, value);
dst->append(buf, sizeof(buf));
}
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1<<7)) {
*(ptr++) = v;
} else if (v < (1<<14)) {
*(ptr++) = v | B;
*(ptr++) = v>>7;
} else if (v < (1<<21)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = v>>14;
} else if (v < (1<<28)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = v>>21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = (v>>21) | B;
*(ptr++) = v>>28;
}
return reinterpret_cast<char*>(ptr);
}
void PutVarint32(std::string* dst, uint32_t v) {
char buf[5];
char* ptr = EncodeVarint32(buf, v);
dst->append(buf, ptr - buf);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B-1)) | B;
v >>= 7;
}
*(ptr++) = v;
return reinterpret_cast<char*>(ptr);
}
void PutVarint64(std::string* dst, uint64_t v) {
char buf[10];
char* ptr = EncodeVarint64(buf, v);
dst->append(buf, ptr - buf);
}
void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
PutVarint32(dst, value.size());
dst->append(value.data(), value.size());
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
const char* GetVarint32PtrFallback(const char* p,
const char* limit,
uint32_t* value) {
uint32_t result = 0;
for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
p++;
if (byte & 128) {
// More bytes are present
result |= ((byte & 127) << shift);
} else {
result |= (byte << shift);
*value = result;
return reinterpret_cast<const char*>(p);
}
}
return NULL;
}
bool GetVarint32(Slice* input, uint32_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint32Ptr(p, limit, value);
if (q == NULL) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
uint64_t result = 0;
for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
p++;
if (byte & 128) {
// More bytes are present
result |= ((byte & 127) << shift);
} else {
result |= (byte << shift);
*value = result;
return reinterpret_cast<const char*>(p);
}
}
return NULL;
}
bool GetVarint64(Slice* input, uint64_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint64Ptr(p, limit, value);
if (q == NULL) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetLengthPrefixedSlice(const char* p, const char* limit,
Slice* result) {
uint32_t len;
p = GetVarint32Ptr(p, limit, &len);
if (p == NULL) return NULL;
if (p + len > limit) return NULL;
*result = Slice(p, len);
return p + len;
}
bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
uint32_t len;
if (GetVarint32(input, &len) &&
input->size() >= len) {
*result = Slice(input->data(), len);
input->remove_prefix(len);
return true;
} else {
return false;
}
}
}

+ 104
- 0
util/coding.h View File

@ -0,0 +1,104 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Endian-neutral encoding:
// * Fixed-length numbers are encoded with least-significant byte first
// * In addition we support variable length "varint" encoding
// * Strings are encoded prefixed by their length in varint format
#ifndef STORAGE_LEVELDB_UTIL_CODING_H_
#define STORAGE_LEVELDB_UTIL_CODING_H_
#include <stdint.h>
#include <string.h>
#include <string>
#include "include/slice.h"
#include "port/port.h"
namespace leveldb {
// Standard Put... routines append to a string
extern void PutFixed32(std::string* dst, uint32_t value);
extern void PutFixed64(std::string* dst, uint64_t value);
extern void PutVarint32(std::string* dst, uint32_t value);
extern void PutVarint64(std::string* dst, uint64_t value);
extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
// Standard Get... routines parse a value from the beginning of a Slice
// and advance the slice past the parsed value.
extern bool GetVarint32(Slice* input, uint32_t* value);
extern bool GetVarint64(Slice* input, uint64_t* value);
extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
// Pointer-based variants of GetVarint... These either store a value
// in *v and return a pointer just past the parsed value, or return
// NULL on error. These routines only look at bytes in the range
// [p..limit-1]
extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
// Returns the length of the varint32 or varint64 encoding of "v"
extern int VarintLength(uint64_t v);
// Lower-level versions of Put... that write directly into a character buffer
// REQUIRES: dst has enough space for the value being written
extern void EncodeFixed32(char* dst, uint32_t value);
extern void EncodeFixed64(char* dst, uint64_t value);
// Lower-level versions of Put... that write directly into a character buffer
// and return a pointer just past the last byte written.
// REQUIRES: dst has enough space for the value being written
extern char* EncodeVarint32(char* dst, uint32_t value);
extern char* EncodeVarint64(char* dst, uint64_t value);
// Lower-level versions of Get... that read directly from a character buffer
// without any bounds checking.
inline uint32_t DecodeFixed32(const char* ptr) {
if (port::kLittleEndian) {
// Load the raw bytes
uint32_t result;
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
return result;
} else {
return ((static_cast<uint32_t>(ptr[0]))
| (static_cast<uint32_t>(ptr[1]) << 8)
| (static_cast<uint32_t>(ptr[2]) << 16)
| (static_cast<uint32_t>(ptr[3]) << 24));
}
}
inline uint64_t DecodeFixed64(const char* ptr) {
if (port::kLittleEndian) {
// Load the raw bytes
uint64_t result;
memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
return result;
} else {
uint64_t lo = DecodeFixed32(ptr);
uint64_t hi = DecodeFixed32(ptr + 4);
return (hi << 32) | lo;
}
}
// Internal routine for use by fallback path of GetVarint32Ptr
extern const char* GetVarint32PtrFallback(const char* p,
const char* limit,
uint32_t* value);
inline const char* GetVarint32Ptr(const char* p,
const char* limit,
uint32_t* value) {
if (p < limit) {
uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
if ((result & 128) == 0) {
*value = result;
return p + 1;
}
}
return GetVarint32PtrFallback(p, limit, value);
}
}
#endif // STORAGE_LEVELDB_UTIL_CODING_H_

+ 173
- 0
util/coding_test.cc View File

@ -0,0 +1,173 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/coding.h"
#include "util/testharness.h"
namespace leveldb {
class Coding { };
TEST(Coding, Fixed32) {
std::string s;
for (uint32_t v = 0; v < 100000; v++) {
PutFixed32(&s, v);
}
const char* p = s.data();
for (uint32_t v = 0; v < 100000; v++) {
uint32_t actual = DecodeFixed32(p);
ASSERT_EQ(v, actual);
p += sizeof(uint32_t);
}
}
TEST(Coding, Fixed64) {
std::string s;
for (int power = 0; power <= 63; power++) {
uint64_t v = static_cast<uint64_t>(1) << power;
PutFixed64(&s, v - 1);
PutFixed64(&s, v + 0);
PutFixed64(&s, v + 1);
}
const char* p = s.data();
for (int power = 0; power <= 63; power++) {
uint64_t v = static_cast<uint64_t>(1) << power;
uint64_t actual;
actual = DecodeFixed64(p);
ASSERT_EQ(v-1, actual);
p += sizeof(uint64_t);
actual = DecodeFixed64(p);
ASSERT_EQ(v+0, actual);
p += sizeof(uint64_t);
actual = DecodeFixed64(p);
ASSERT_EQ(v+1, actual);
p += sizeof(uint64_t);
}
}
TEST(Coding, Varint32) {
std::string s;
for (uint32_t i = 0; i < (32 * 32); i++) {
uint32_t v = (i / 32) << (i % 32);
PutVarint32(&s, v);
}
const char* p = s.data();
const char* limit = p + s.size();
for (uint32_t i = 0; i < (32 * 32); i++) {
uint32_t expected = (i / 32) << (i % 32);
uint32_t actual;
const char* start = p;
p = GetVarint32Ptr(p, limit, &actual);
ASSERT_TRUE(p != NULL);
ASSERT_EQ(expected, actual);
ASSERT_EQ(VarintLength(actual), p - start);
}
ASSERT_EQ(p, s.data() + s.size());
}
TEST(Coding, Varint64) {
// Construct the list of values to check
std::vector<uint64_t> values;
// Some special values
values.push_back(0);
values.push_back(100);
values.push_back(~static_cast<uint64_t>(0));
values.push_back(~static_cast<uint64_t>(0) - 1);
for (uint32_t k = 0; k < 64; k++) {
// Test values near powers of two
const uint64_t power = 1ull << k;
values.push_back(power);
values.push_back(power-1);
values.push_back(power+1);
};
std::string s;
for (int i = 0; i < values.size(); i++) {
PutVarint64(&s, values[i]);
}
const char* p = s.data();
const char* limit = p + s.size();
for (int i = 0; i < values.size(); i++) {
ASSERT_TRUE(p < limit);
uint64_t actual;
const char* start = p;
p = GetVarint64Ptr(p, limit, &actual);
ASSERT_TRUE(p != NULL);
ASSERT_EQ(values[i], actual);
ASSERT_EQ(VarintLength(actual), p - start);
}
ASSERT_EQ(p, limit);
}
TEST(Coding, Varint32Overflow) {
uint32_t result;
std::string input("\x81\x82\x83\x84\x85\x11");
ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
== NULL);
}
TEST(Coding, Varint32Truncation) {
uint32_t large_value = (1u << 31) + 100;
std::string s;
PutVarint32(&s, large_value);
uint32_t result;
for (int len = 0; len < s.size() - 1; len++) {
ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
}
ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
ASSERT_EQ(large_value, result);
}
TEST(Coding, Varint64Overflow) {
uint64_t result;
std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
== NULL);
}
TEST(Coding, Varint64Truncation) {
uint64_t large_value = (1ull << 63) + 100ull;
std::string s;
PutVarint64(&s, large_value);
uint64_t result;
for (int len = 0; len < s.size() - 1; len++) {
ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
}
ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
ASSERT_EQ(large_value, result);
}
TEST(Coding, Strings) {
std::string s;
PutLengthPrefixedSlice(&s, Slice(""));
PutLengthPrefixedSlice(&s, Slice("foo"));
PutLengthPrefixedSlice(&s, Slice("bar"));
PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
Slice input(s);
Slice v;
ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
ASSERT_EQ("", v.ToString());
ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
ASSERT_EQ("foo", v.ToString());
ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
ASSERT_EQ("bar", v.ToString());
ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
ASSERT_EQ(std::string(200, 'x'), v.ToString());
ASSERT_EQ("", input.ToString());
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

+ 72
- 0
util/comparator.cc View File

@ -0,0 +1,72 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdint.h>
#include "include/comparator.h"
#include "include/slice.h"
#include "util/logging.h"
namespace leveldb {
Comparator::~Comparator() { }
namespace {
class BytewiseComparatorImpl : public Comparator {
public:
BytewiseComparatorImpl() { }
virtual const char* Name() const {
return "leveldb.BytewiseComparator";
}
virtual int Compare(const Slice& a, const Slice& b) const {
return a.compare(b);
}
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Find length of common prefix
size_t min_length = std::min(start->size(), limit.size());
size_t diff_index = 0;
while ((diff_index < min_length) &&
((*start)[diff_index] == limit[diff_index])) {
diff_index++;
}
if (diff_index >= min_length) {
// Do not shorten if one string is a prefix of the other
} else {
uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
if (diff_byte < static_cast<uint8_t>(0xff) &&
diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
(*start)[diff_index]++;
start->resize(diff_index + 1);
assert(Compare(*start, limit) < 0);
}
}
}
virtual void FindShortSuccessor(std::string* key) const {
// Find first character that can be incremented
size_t n = key->size();
for (int i = 0; i < n; i++) {
const uint8_t byte = (*key)[i];
if (byte != static_cast<uint8_t>(0xff)) {
(*key)[i] = byte + 1;
key->resize(i+1);
return;
}
}
// *key is a run of 0xffs. Leave it alone.
}
};
}
static const BytewiseComparatorImpl bytewise;
const Comparator* BytewiseComparator() {
return &bytewise;
}
}

+ 332
- 0
util/crc32c.cc View File

@ -0,0 +1,332 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A portable implementation of crc32c, optimized to handle
// four bytes at a time.
#include "util/crc32c.h"
#include <stdint.h>
#include "util/coding.h"
namespace leveldb {
namespace crc32c {
static const uint32_t table0_[256] = {
0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
};
static const uint32_t table1_[256] = {
0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
};
static const uint32_t table2_[256] = {
0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
};
static const uint32_t table3_[256] = {
0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
};
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
static inline uint32_t LE_LOAD32(const uint8_t *p) {
return DecodeFixed32(reinterpret_cast<const char*>(p));
}
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint32_t l = crc ^ 0xffffffffu;
#define STEP1 do { \
int c = (l & 0xff) ^ *p++; \
l = table0_[c] ^ (l >> 8); \
} while (0)
#define STEP4 do { \
uint32_t c = l ^ LE_LOAD32(p); \
p += 4; \
l = table3_[c & 0xff] ^ \
table2_[(c >> 8) & 0xff] ^ \
table1_[(c >> 16) & 0xff] ^ \
table0_[c >> 24]; \
} while (0)
// Point x at first 4-byte aligned byte in string. This might be
// just past the end of the string.
const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
if (x <= e) {
// Process bytes until finished or p is 4-byte aligned
while (p != x) {
STEP1;
}
}
// Process bytes 16 at a time
while ((e-p) >= 16) {
STEP4; STEP4; STEP4; STEP4;
}
// Process bytes 4 at a time
while ((e-p) >= 4) {
STEP4;
}
// Process the last few bytes
while (p != e) {
STEP1;
}
#undef STEP4
#undef STEP1
return l ^ 0xffffffffu;
}
}
}

+ 45
- 0
util/crc32c.h View File

@ -0,0 +1,45 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_
#define STORAGE_LEVELDB_UTIL_CRC32C_H_
#include <stddef.h>
#include <stdint.h>
namespace leveldb {
namespace crc32c {
// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
// crc32c of some string A. Extend() is often used to maintain the
// crc32c of a stream of data.
extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
// Return the crc32c of data[0,n-1]
inline uint32_t Value(const char* data, size_t n) {
return Extend(0, data, n);
}
static const uint32_t kMaskDelta = 0xa282ead8ul;
// Return a masked representation of crc.
//
// Motivation: it is problematic to compute the CRC of a string that
// contains embedded CRCs. Therefore we recommend that CRCs stored
// somewhere (e.g., in files) should be masked before being stored.
inline uint32_t Mask(uint32_t crc) {
// Rotate right by 15 bits and add a constant.
return ((crc >> 15) | (crc << 17)) + kMaskDelta;
}
// Return the crc whose masked representation is masked_crc.
inline uint32_t Unmask(uint32_t masked_crc) {
uint32_t rot = masked_crc - kMaskDelta;
return ((rot >> 17) | (rot << 15));
}
}
}
#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_

+ 86
- 0
util/crc32c_test.cc View File

@ -0,0 +1,86 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/crc32c.h"
#include "util/testharness.h"
namespace leveldb {
namespace crc32c {
class CRC { };
TEST(CRC, StandardResults) {
// From rfc3720 section B.4.
char buf[32];
memset(buf, 0, sizeof(buf));
ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));
memset(buf, 0xff, sizeof(buf));
ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));
for (int i = 0; i < 32; i++) {
buf[i] = i;
}
ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));
for (int i = 0; i < 32; i++) {
buf[i] = 31 - i;
}
ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));
unsigned char data[48] = {
0x01, 0xc0, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00,
0x00, 0x00, 0x04, 0x00,
0x00, 0x00, 0x00, 0x14,
0x00, 0x00, 0x00, 0x18,
0x28, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
};
ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
}
TEST(CRC, Values) {
ASSERT_NE(Value("a", 1), Value("foo", 3));
}
TEST(CRC, Extend) {
ASSERT_EQ(Value("hello world", 11),
Extend(Value("hello ", 6), "world", 5));
}
TEST(CRC, Mask) {
uint32_t crc = Value("foo", 3);
ASSERT_NE(crc, Mask(crc));
ASSERT_NE(crc, Mask(Mask(crc)));
ASSERT_EQ(crc, Unmask(Mask(crc)));
ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
}
TEST(CRC, Benchmark) {
std::string data(1048576 * 100, 'x');
double start = Env::Default()->NowMicros() * 1e-6;
static const int kIters = 10;
uint32_t crc = 0;
for (int i = 0; i < kIters; i++) {
crc |= Value(data.data(), data.size());
}
double finish = Env::Default()->NowMicros() * 1e-6;
double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
mb, (finish - start), mb / (finish - start), crc);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save