From 69c6d38342a1fab5f7f2921aa2e9c0e60ba90e35 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Tue, 19 Apr 2011 23:11:15 +0000 Subject: [PATCH] reverting disastrous MOE commit, returning to r21 git-svn-id: https://leveldb.googlecode.com/svn/trunk@23 62dab493-f737-651d-591e-8d6aee1b9529 --- AUTHORS | 8 + LICENSE | 27 + Makefile | 134 +++ README | 51 ++ TODO | 14 + db/builder.cc | 99 +++ db/builder.h | 36 + db/corruption_test.cc | 378 +++++++++ db/db_bench.cc | 635 +++++++++++++++ db/db_impl.cc | 1345 +++++++++++++++++++++++++++++++ db/db_impl.h | 207 +++++ db/db_iter.cc | 397 +++++++++ db/db_iter.h | 26 + db/db_test.cc | 1211 ++++++++++++++++++++++++++++ db/dbformat.cc | 152 ++++ db/dbformat.h | 204 +++++ db/dbformat_test.cc | 127 +++ db/filename.cc | 154 ++++ db/filename.h | 92 +++ db/filename_test.cc | 156 ++++ db/log_format.h | 35 + db/log_reader.cc | 176 ++++ db/log_reader.h | 75 ++ db/log_test.cc | 361 +++++++++ db/log_writer.cc | 102 +++ db/log_writer.h | 48 ++ db/memtable.cc | 109 +++ db/memtable.h | 69 ++ db/repair.cc | 396 +++++++++ db/skiplist.h | 378 +++++++++ db/skiplist_test.cc | 378 +++++++++ db/snapshot.h | 66 ++ db/table_cache.cc | 95 +++ db/table_cache.h | 50 ++ db/version_edit.cc | 301 +++++++ db/version_edit.h | 124 +++ db/version_edit_test.cc | 50 ++ db/version_set.cc | 1120 +++++++++++++++++++++++++ db/version_set.h | 332 ++++++++ db/write_batch.cc | 164 ++++ db/write_batch_internal.h | 73 ++ db/write_batch_test.cc | 110 +++ doc/doc.css | 89 ++ doc/impl.html | 228 ++++++ doc/index.html | 509 ++++++++++++ doc/log_format.txt | 75 ++ doc/table_format.txt | 61 ++ include/leveldb/cache.h | 99 +++ include/leveldb/comparator.h | 61 ++ include/leveldb/db.h | 142 ++++ include/leveldb/env.h | 290 +++++++ include/leveldb/iterator.h | 95 +++ include/leveldb/options.h | 208 +++++ include/leveldb/slice.h | 104 +++ include/leveldb/status.h | 86 ++ include/leveldb/table.h | 69 ++ include/leveldb/table_builder.h | 86 ++ include/leveldb/write_batch.h | 49 ++ leveldb.gyp | 327 ++++++++ leveldb/AUTHORS | 8 - leveldb/LICENSE | 27 - leveldb/Makefile | 129 --- leveldb/README | 51 -- leveldb/TODO | 14 - leveldb/db/builder.cc | 90 --- leveldb/db/builder.h | 36 - leveldb/db/corruption_test.cc | 354 -------- leveldb/db/db_bench.cc | 613 -------------- leveldb/db/db_impl.cc | 1188 --------------------------- leveldb/db/db_impl.h | 184 ----- leveldb/db/db_iter.cc | 298 ------- leveldb/db/db_iter.h | 26 - leveldb/db/db_test.cc | 1030 ----------------------- leveldb/db/dbformat.cc | 87 -- leveldb/db/dbformat.h | 155 ---- leveldb/db/dbformat_test.cc | 112 --- leveldb/db/filename.cc | 135 ---- leveldb/db/filename.h | 80 -- leveldb/db/filename_test.cc | 122 --- leveldb/db/log_format.h | 35 - leveldb/db/log_reader.cc | 176 ---- leveldb/db/log_reader.h | 75 -- leveldb/db/log_test.cc | 361 --------- leveldb/db/log_writer.cc | 102 --- leveldb/db/log_writer.h | 48 -- leveldb/db/memtable.cc | 109 --- leveldb/db/memtable.h | 69 -- leveldb/db/repair.cc | 380 --------- leveldb/db/skiplist.h | 378 --------- leveldb/db/skiplist_test.cc | 378 --------- leveldb/db/snapshot.h | 66 -- leveldb/db/table_cache.cc | 95 --- leveldb/db/table_cache.h | 50 -- leveldb/db/version_edit.cc | 268 ------ leveldb/db/version_edit.h | 106 --- leveldb/db/version_edit_test.cc | 46 -- leveldb/db/version_set.cc | 1027 ----------------------- leveldb/db/version_set.h | 308 ------- leveldb/db/write_batch.cc | 148 ---- leveldb/db/write_batch_internal.h | 69 -- leveldb/db/write_batch_test.cc | 87 -- leveldb/doc/doc.css | 89 -- leveldb/doc/impl.html | 217 ----- leveldb/doc/index.html | 498 ------------ leveldb/doc/log_format.txt | 75 -- leveldb/doc/table_format.txt | 61 -- leveldb/include/leveldb/cache.h | 99 --- leveldb/include/leveldb/comparator.h | 61 -- leveldb/include/leveldb/db.h | 142 ---- leveldb/include/leveldb/env.h | 290 ------- leveldb/include/leveldb/iterator.h | 95 --- leveldb/include/leveldb/options.h | 198 ----- leveldb/include/leveldb/slice.h | 104 --- leveldb/include/leveldb/status.h | 86 -- leveldb/include/leveldb/table.h | 69 -- leveldb/include/leveldb/table_builder.h | 86 -- leveldb/include/leveldb/write_batch.h | 49 -- leveldb/leveldb.gyp | 315 -------- leveldb/port/README | 10 - leveldb/port/port.h | 21 - leveldb/port/port_android.cc | 64 -- leveldb/port/port_android.h | 150 ---- leveldb/port/port_chromium.cc | 80 -- leveldb/port/port_chromium.h | 97 --- leveldb/port/port_example.h | 115 --- leveldb/port/port_posix.cc | 50 -- leveldb/port/port_posix.h | 94 --- leveldb/port/win/stdint.h | 24 - leveldb/table/block.cc | 263 ------ leveldb/table/block.h | 43 - leveldb/table/block_builder.cc | 109 --- leveldb/table/block_builder.h | 57 -- leveldb/table/format.cc | 131 --- leveldb/table/format.h | 103 --- leveldb/table/iterator.cc | 68 -- leveldb/table/iterator_wrapper.h | 64 -- leveldb/table/merger.cc | 197 ----- leveldb/table/merger.h | 26 - leveldb/table/table.cc | 175 ---- leveldb/table/table_builder.cc | 227 ------ leveldb/table/table_test.cc | 841 ------------------- leveldb/table/two_level_iterator.cc | 182 ----- leveldb/table/two_level_iterator.h | 34 - leveldb/util/arena.cc | 68 -- leveldb/util/arena.h | 68 -- leveldb/util/arena_test.cc | 68 -- leveldb/util/cache.cc | 253 ------ leveldb/util/cache_test.cc | 169 ---- leveldb/util/coding.cc | 194 ----- leveldb/util/coding.h | 104 --- leveldb/util/coding_test.cc | 173 ---- leveldb/util/comparator.cc | 72 -- leveldb/util/crc32c.cc | 332 -------- leveldb/util/crc32c.h | 45 -- leveldb/util/crc32c_test.cc | 72 -- leveldb/util/env.cc | 77 -- leveldb/util/env_chromium.cc | 603 -------------- leveldb/util/env_posix.cc | 599 -------------- leveldb/util/env_test.cc | 102 --- leveldb/util/hash.cc | 45 -- leveldb/util/hash.h | 19 - leveldb/util/histogram.cc | 128 --- leveldb/util/histogram.h | 41 - leveldb/util/logging.cc | 81 -- leveldb/util/logging.h | 47 -- leveldb/util/mutexlock.h | 39 - leveldb/util/options.cc | 28 - leveldb/util/random.h | 59 -- leveldb/util/status.cc | 59 -- leveldb/util/testharness.cc | 65 -- leveldb/util/testharness.h | 129 --- leveldb/util/testutil.cc | 51 -- leveldb/util/testutil.h | 53 -- port/README | 10 + port/port.h | 21 + port/port_android.cc | 64 ++ port/port_android.h | 158 ++++ port/port_chromium.cc | 80 ++ port/port_chromium.h | 104 +++ port/port_example.h | 120 +++ port/port_posix.cc | 50 ++ port/port_posix.h | 99 +++ port/sha1_portable.cc | 298 +++++++ port/sha1_portable.h | 25 + port/sha1_test.cc | 39 + port/win/stdint.h | 24 + table/block.cc | 261 ++++++ table/block.h | 43 + table/block_builder.cc | 109 +++ table/block_builder.h | 57 ++ table/format.cc | 131 +++ table/format.h | 103 +++ table/iterator.cc | 68 ++ table/iterator_wrapper.h | 64 ++ table/merger.cc | 197 +++++ table/merger.h | 26 + table/table.cc | 175 ++++ table/table_builder.cc | 227 ++++++ table/table_test.cc | 841 +++++++++++++++++++ table/two_level_iterator.cc | 182 +++++ table/two_level_iterator.h | 34 + util/arena.cc | 68 ++ util/arena.h | 68 ++ util/arena_test.cc | 68 ++ util/cache.cc | 253 ++++++ util/cache_test.cc | 169 ++++ util/coding.cc | 194 +++++ util/coding.h | 104 +++ util/coding_test.cc | 173 ++++ util/comparator.cc | 72 ++ util/crc32c.cc | 332 ++++++++ util/crc32c.h | 45 ++ util/crc32c_test.cc | 72 ++ util/env.cc | 77 ++ util/env_chromium.cc | 603 ++++++++++++++ util/env_posix.cc | 599 ++++++++++++++ util/env_test.cc | 102 +++ util/hash.cc | 45 ++ util/hash.h | 19 + util/histogram.cc | 128 +++ util/histogram.h | 41 + util/logging.cc | 81 ++ util/logging.h | 47 ++ util/mutexlock.h | 39 + util/options.cc | 29 + util/random.h | 59 ++ util/status.cc | 59 ++ util/testharness.cc | 65 ++ util/testharness.h | 129 +++ util/testutil.cc | 51 ++ util/testutil.h | 53 ++ 231 files changed, 20097 insertions(+), 18722 deletions(-) create mode 100644 AUTHORS create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README create mode 100644 TODO create mode 100644 db/builder.cc create mode 100644 db/builder.h create mode 100644 db/corruption_test.cc create mode 100644 db/db_bench.cc create mode 100644 db/db_impl.cc create mode 100644 db/db_impl.h create mode 100644 db/db_iter.cc create mode 100644 db/db_iter.h create mode 100644 db/db_test.cc create mode 100644 db/dbformat.cc create mode 100644 db/dbformat.h create mode 100644 db/dbformat_test.cc create mode 100644 db/filename.cc create mode 100644 db/filename.h create mode 100644 db/filename_test.cc create mode 100644 db/log_format.h create mode 100644 db/log_reader.cc create mode 100644 db/log_reader.h create mode 100644 db/log_test.cc create mode 100644 db/log_writer.cc create mode 100644 db/log_writer.h create mode 100644 db/memtable.cc create mode 100644 db/memtable.h create mode 100644 db/repair.cc create mode 100644 db/skiplist.h create mode 100644 db/skiplist_test.cc create mode 100644 db/snapshot.h create mode 100644 db/table_cache.cc create mode 100644 db/table_cache.h create mode 100644 db/version_edit.cc create mode 100644 db/version_edit.h create mode 100644 db/version_edit_test.cc create mode 100644 db/version_set.cc create mode 100644 db/version_set.h create mode 100644 db/write_batch.cc create mode 100644 db/write_batch_internal.h create mode 100644 db/write_batch_test.cc create mode 100644 doc/doc.css create mode 100644 doc/impl.html create mode 100644 doc/index.html create mode 100644 doc/log_format.txt create mode 100644 doc/table_format.txt create mode 100644 include/leveldb/cache.h create mode 100644 include/leveldb/comparator.h create mode 100644 include/leveldb/db.h create mode 100644 include/leveldb/env.h create mode 100644 include/leveldb/iterator.h create mode 100644 include/leveldb/options.h create mode 100644 include/leveldb/slice.h create mode 100644 include/leveldb/status.h create mode 100644 include/leveldb/table.h create mode 100644 include/leveldb/table_builder.h create mode 100644 include/leveldb/write_batch.h create mode 100644 leveldb.gyp delete mode 100644 leveldb/AUTHORS delete mode 100644 leveldb/LICENSE delete mode 100644 leveldb/Makefile delete mode 100644 leveldb/README delete mode 100644 leveldb/TODO delete mode 100644 leveldb/db/builder.cc delete mode 100644 leveldb/db/builder.h delete mode 100644 leveldb/db/corruption_test.cc delete mode 100644 leveldb/db/db_bench.cc delete mode 100644 leveldb/db/db_impl.cc delete mode 100644 leveldb/db/db_impl.h delete mode 100644 leveldb/db/db_iter.cc delete mode 100644 leveldb/db/db_iter.h delete mode 100644 leveldb/db/db_test.cc delete mode 100644 leveldb/db/dbformat.cc delete mode 100644 leveldb/db/dbformat.h delete mode 100644 leveldb/db/dbformat_test.cc delete mode 100644 leveldb/db/filename.cc delete mode 100644 leveldb/db/filename.h delete mode 100644 leveldb/db/filename_test.cc delete mode 100644 leveldb/db/log_format.h delete mode 100644 leveldb/db/log_reader.cc delete mode 100644 leveldb/db/log_reader.h delete mode 100644 leveldb/db/log_test.cc delete mode 100644 leveldb/db/log_writer.cc delete mode 100644 leveldb/db/log_writer.h delete mode 100644 leveldb/db/memtable.cc delete mode 100644 leveldb/db/memtable.h delete mode 100644 leveldb/db/repair.cc delete mode 100644 leveldb/db/skiplist.h delete mode 100644 leveldb/db/skiplist_test.cc delete mode 100644 leveldb/db/snapshot.h delete mode 100644 leveldb/db/table_cache.cc delete mode 100644 leveldb/db/table_cache.h delete mode 100644 leveldb/db/version_edit.cc delete mode 100644 leveldb/db/version_edit.h delete mode 100644 leveldb/db/version_edit_test.cc delete mode 100644 leveldb/db/version_set.cc delete mode 100644 leveldb/db/version_set.h delete mode 100644 leveldb/db/write_batch.cc delete mode 100644 leveldb/db/write_batch_internal.h delete mode 100644 leveldb/db/write_batch_test.cc delete mode 100644 leveldb/doc/doc.css delete mode 100644 leveldb/doc/impl.html delete mode 100644 leveldb/doc/index.html delete mode 100644 leveldb/doc/log_format.txt delete mode 100644 leveldb/doc/table_format.txt delete mode 100644 leveldb/include/leveldb/cache.h delete mode 100644 leveldb/include/leveldb/comparator.h delete mode 100644 leveldb/include/leveldb/db.h delete mode 100644 leveldb/include/leveldb/env.h delete mode 100644 leveldb/include/leveldb/iterator.h delete mode 100644 leveldb/include/leveldb/options.h delete mode 100644 leveldb/include/leveldb/slice.h delete mode 100644 leveldb/include/leveldb/status.h delete mode 100644 leveldb/include/leveldb/table.h delete mode 100644 leveldb/include/leveldb/table_builder.h delete mode 100644 leveldb/include/leveldb/write_batch.h delete mode 100644 leveldb/leveldb.gyp delete mode 100644 leveldb/port/README delete mode 100644 leveldb/port/port.h delete mode 100644 leveldb/port/port_android.cc delete mode 100644 leveldb/port/port_android.h delete mode 100644 leveldb/port/port_chromium.cc delete mode 100644 leveldb/port/port_chromium.h delete mode 100644 leveldb/port/port_example.h delete mode 100644 leveldb/port/port_posix.cc delete mode 100644 leveldb/port/port_posix.h delete mode 100644 leveldb/port/win/stdint.h delete mode 100644 leveldb/table/block.cc delete mode 100644 leveldb/table/block.h delete mode 100644 leveldb/table/block_builder.cc delete mode 100644 leveldb/table/block_builder.h delete mode 100644 leveldb/table/format.cc delete mode 100644 leveldb/table/format.h delete mode 100644 leveldb/table/iterator.cc delete mode 100644 leveldb/table/iterator_wrapper.h delete mode 100644 leveldb/table/merger.cc delete mode 100644 leveldb/table/merger.h delete mode 100644 leveldb/table/table.cc delete mode 100644 leveldb/table/table_builder.cc delete mode 100644 leveldb/table/table_test.cc delete mode 100644 leveldb/table/two_level_iterator.cc delete mode 100644 leveldb/table/two_level_iterator.h delete mode 100644 leveldb/util/arena.cc delete mode 100644 leveldb/util/arena.h delete mode 100644 leveldb/util/arena_test.cc delete mode 100644 leveldb/util/cache.cc delete mode 100644 leveldb/util/cache_test.cc delete mode 100644 leveldb/util/coding.cc delete mode 100644 leveldb/util/coding.h delete mode 100644 leveldb/util/coding_test.cc delete mode 100644 leveldb/util/comparator.cc delete mode 100644 leveldb/util/crc32c.cc delete mode 100644 leveldb/util/crc32c.h delete mode 100644 leveldb/util/crc32c_test.cc delete mode 100644 leveldb/util/env.cc delete mode 100644 leveldb/util/env_chromium.cc delete mode 100644 leveldb/util/env_posix.cc delete mode 100644 leveldb/util/env_test.cc delete mode 100644 leveldb/util/hash.cc delete mode 100644 leveldb/util/hash.h delete mode 100644 leveldb/util/histogram.cc delete mode 100644 leveldb/util/histogram.h delete mode 100644 leveldb/util/logging.cc delete mode 100644 leveldb/util/logging.h delete mode 100644 leveldb/util/mutexlock.h delete mode 100644 leveldb/util/options.cc delete mode 100644 leveldb/util/random.h delete mode 100644 leveldb/util/status.cc delete mode 100644 leveldb/util/testharness.cc delete mode 100644 leveldb/util/testharness.h delete mode 100644 leveldb/util/testutil.cc delete mode 100644 leveldb/util/testutil.h create mode 100644 port/README create mode 100644 port/port.h create mode 100644 port/port_android.cc create mode 100644 port/port_android.h create mode 100644 port/port_chromium.cc create mode 100644 port/port_chromium.h create mode 100644 port/port_example.h create mode 100644 port/port_posix.cc create mode 100644 port/port_posix.h create mode 100644 port/sha1_portable.cc create mode 100644 port/sha1_portable.h create mode 100644 port/sha1_test.cc create mode 100644 port/win/stdint.h create mode 100644 table/block.cc create mode 100644 table/block.h create mode 100644 table/block_builder.cc create mode 100644 table/block_builder.h create mode 100644 table/format.cc create mode 100644 table/format.h create mode 100644 table/iterator.cc create mode 100644 table/iterator_wrapper.h create mode 100644 table/merger.cc create mode 100644 table/merger.h create mode 100644 table/table.cc create mode 100644 table/table_builder.cc create mode 100644 table/table_test.cc create mode 100644 table/two_level_iterator.cc create mode 100644 table/two_level_iterator.h create mode 100644 util/arena.cc create mode 100644 util/arena.h create mode 100644 util/arena_test.cc create mode 100644 util/cache.cc create mode 100644 util/cache_test.cc create mode 100644 util/coding.cc create mode 100644 util/coding.h create mode 100644 util/coding_test.cc create mode 100644 util/comparator.cc create mode 100644 util/crc32c.cc create mode 100644 util/crc32c.h create mode 100644 util/crc32c_test.cc create mode 100644 util/env.cc create mode 100644 util/env_chromium.cc create mode 100644 util/env_posix.cc create mode 100644 util/env_test.cc create mode 100644 util/hash.cc create mode 100644 util/hash.h create mode 100644 util/histogram.cc create mode 100644 util/histogram.h create mode 100644 util/logging.cc create mode 100644 util/logging.h create mode 100644 util/mutexlock.h create mode 100644 util/options.cc create mode 100644 util/random.h create mode 100644 util/status.cc create mode 100644 util/testharness.cc create mode 100644 util/testharness.h create mode 100644 util/testutil.cc create mode 100644 util/testutil.h diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..27a9407 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,8 @@ +# Names should be added to this file like so: +# Name or Organization + +Google Inc. + +# Initial version authors: +Jeffrey Dean +Sanjay Ghemawat diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8e80208 --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7569701 --- /dev/null +++ b/Makefile @@ -0,0 +1,134 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +CC = g++ + +# Uncomment one of the following to switch between debug and opt mode +#OPT = -O2 -DNDEBUG +OPT = -g2 + +CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -I./include -std=c++0x $(OPT) + +LDFLAGS=-lpthread + +LIBOBJECTS = \ + ./db/builder.o \ + ./db/db_impl.o \ + ./db/db_iter.o \ + ./db/filename.o \ + ./db/dbformat.o \ + ./db/log_reader.o \ + ./db/log_writer.o \ + ./db/memtable.o \ + ./db/repair.o \ + ./db/table_cache.o \ + ./db/version_edit.o \ + ./db/version_set.o \ + ./db/write_batch.o \ + ./port/port_posix.o \ + ./port/sha1_portable.o \ + ./table/block.o \ + ./table/block_builder.o \ + ./table/format.o \ + ./table/iterator.o \ + ./table/merger.o \ + ./table/table.o \ + ./table/table_builder.o \ + ./table/two_level_iterator.o \ + ./util/arena.o \ + ./util/cache.o \ + ./util/coding.o \ + ./util/comparator.o \ + ./util/crc32c.o \ + ./util/env.o \ + ./util/env_posix.o \ + ./util/hash.o \ + ./util/histogram.o \ + ./util/logging.o \ + ./util/options.o \ + ./util/status.o + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) + +TESTS = \ + arena_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + db_test \ + dbformat_test \ + env_test \ + filename_test \ + log_test \ + sha1_test \ + skiplist_test \ + table_test \ + version_edit_test \ + write_batch_test + +PROGRAMS = db_bench $(TESTS) + +all: $(PROGRAMS) + +check: $(TESTS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + +clean: + rm -f $(PROGRAMS) */*.o + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +.cc.o: + $(CC) $(CFLAGS) $< -o $@ + +# TODO(gabor): dependencies for .o files +# TODO(gabor): Build library diff --git a/README b/README new file mode 100644 index 0000000..c97e43c --- /dev/null +++ b/README @@ -0,0 +1,51 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html for more explanation. +See doc/db_layout.txt for a brief overview of the implementation. + +The public interface is in include/*.h. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Guide to header files: + +include/db.h + Main interface to the DB: Start here + +include/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/table.h +include/table_builder.h + Lower-level modules that most clients probably won't use directly diff --git a/TODO b/TODO new file mode 100644 index 0000000..2f848b8 --- /dev/null +++ b/TODO @@ -0,0 +1,14 @@ +ss +- Stats + +db +- Maybe implement DB::BulkDeleteForRange(start_key, end_key) + that would blow away files whose ranges are entirely contained + within [start_key..end_key]? For Chrome, deletion of obsolete + object stores, etc. can be done in the background anyway, so + probably not that important. + +api changes? +- Efficient large value reading and writing + +Faster Get implementation diff --git a/db/builder.cc b/db/builder.cc new file mode 100644 index 0000000..6c8e6b8 --- /dev/null +++ b/db/builder.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/builder.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" + +namespace leveldb { + +Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit) { + Status s; + meta->file_size = 0; + iter->SeekToFirst(); + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + WritableFile* file; + s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = new TableBuilder(options, file); + meta->smallest.DecodeFrom(iter->key()); + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + if (ExtractValueType(key) == kTypeLargeValueRef) { + if (iter->value().size() != LargeValueRef::ByteSize()) { + s = Status::Corruption("invalid indirect reference hash value (L0)"); + break; + } + edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + meta->number, + iter->key()); + } + builder->Add(key, iter->value()); + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok()) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; + file = NULL; + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), + meta->number, + meta->file_size); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + edit->AddFile(0, meta->number, meta->file_size, + meta->smallest, meta->largest); + } else { + env->DeleteFile(fname); + } + return s; +} + +} diff --git a/db/builder.h b/db/builder.h new file mode 100644 index 0000000..4efcb04 --- /dev/null +++ b/db/builder.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ +#define STORAGE_LEVELDB_DB_BUILDER_H_ + +#include "leveldb/status.h" + +namespace leveldb { + +struct Options; +struct FileMetaData; + +class Env; +class Iterator; +class TableCache; +class VersionEdit; + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table, and +// large value refs and the added file information will be added to +// *edit. If no data is present in *iter, meta->file_size will be set +// to zero, and no Table file will be produced. +extern Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit); + +} + +#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/db/corruption_test.cc b/db/corruption_test.cc new file mode 100644 index 0000000..63d8d8b --- /dev/null +++ b/db/corruption_test.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/db.h" + +#include +#include +#include +#include +#include "leveldb/cache.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "leveldb/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + Random rnd_; + std::string dbname_; + Cache* tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() : rnd_(test::RandomSeed()) { + tiny_cache_ = NewLRUCache(100); + options_.env = &env_; + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, options_); + + db_ = NULL; + options_.create_if_missing = true; + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + delete tiny_cache_; + } + + Status TryReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + Options opt = (options ? *options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = NULL; + ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + LargeValueRef large_ref; + FileType type; + std::vector candidates; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type) && + type == filetype) { + candidates.push_back(dbname_ + "/" + filenames[i]); + } + } + ASSERT_TRUE(!candidates.empty()) << filetype; + std::string fname = candidates[rnd_.Uniform(candidates.size())]; + + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + for (int i = 0; s.ok() && i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, LargeValueRecovery) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + std::string big; + ASSERT_OK(db_->Put(WriteOptions(), + "foo", test::RandomString(&rnd, 100000, &big))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + RepairDB(); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + dbi->TEST_CompactRange(0, "", "~"); + ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 1048576; + Reopen(&options); + + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_CompactMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 0000000..849ebfa --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,635 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "leveldb/cache.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/write_batch.h" +#include "port/port.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/random.h" +#include "util/testutil.h" + +// Comma-separated list of operations to run in the specified order +// Actual benchmarks: +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillsync -- write N/100 values in random key order in sync mode +// fill100K -- write N/1000 100K values in random order in async mode +// readseq -- read N values sequentially +// readreverse -- read N values in reverse order +// readrandom -- read N values in random order +// crc32c -- repeated crc32c of 4K of data +// sha1 -- repeated SHA1 computation over 4K of data +// Meta operations: +// compact -- Compact the entire DB +// stats -- Print DB stats +// heapprofile -- Dump a heap profile (if supported by this port) +static const char* FLAGS_benchmarks = + "fillseq," + "fillsync," + "fillrandom," + "overwrite," + "readrandom," + "readrandom," // Extra run to allow previous compactions to quiesce + "readseq," + "readreverse," + "compact," + "readrandom," + "readseq," + "readreverse," + "fill100K," + "crc32c," + "sha1," + "snappycomp," + "snappyuncomp," + ; + +// Number of key/values to place in database +static int FLAGS_num = 1000000; + +// Size of each value +static int FLAGS_value_size = 100; + +// Arrange to generate values that shrink to this fraction of +// their original size after compression +static double FLAGS_compression_ratio = 0.5; + +// Print histogram of operation timings +static bool FLAGS_histogram = false; + +// Number of bytes to buffer in memtable before compacting +// (initialized to default value by "main") +static int FLAGS_write_buffer_size = 0; + +// Number of bytes to use as a cache of uncompressed data. +// Negative means use default settings. +static int FLAGS_cache_size = -1; + +namespace leveldb { + +// Helper for quickly generating random data. +namespace { +class RandomGenerator { + private: + std::string data_; + int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < 1048576) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. + test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + +} + +class Benchmark { + private: + Cache* cache_; + DB* db_; + int num_; + int heap_counter_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + std::string post_message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(kKeySize + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + + // See if snappy is working by attempting to compress a compressible string + const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; + std::string compressed; + if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { + fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); + } else if (compressed.size() >= sizeof(text)) { + fprintf(stdout, "WARNING: Snappy compression is not effective\n"); + } + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double micros = (now - last_op_finish_) * 1e6; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %d ops%30s\r", done_, ""); + fflush(stderr); + } + } + + void Stop(const Slice& name) { + double finish = Env::Default()->NowMicros() * 1e-6; + + // Pretend at least one op was done in case we are running a benchmark + // that does nto call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + if (bytes_ > 0) { + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / (finish - start_)); + if (!message_.empty()) { + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; + } + } + + fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", + name.ToString().c_str(), + (finish - start_) * 1e6 / done_, + (message_.empty() ? "" : " "), + message_.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + + if (!post_message_.empty()) { + fprintf(stdout, "\n%s\n", post_message_.c_str()); + post_message_.clear(); + } + } + + public: + enum Order { + SEQUENTIAL, + RANDOM + }; + enum DBState { + FRESH, + EXISTING + }; + + Benchmark() + : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), + db_(NULL), + num_(FLAGS_num), + heap_counter_(0), + bytes_(0), + rand_(301) { + std::vector files; + Env::Default()->GetChildren("/tmp/dbbench", &files); + for (int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); + } + } + DestroyDB("/tmp/dbbench", Options()); + } + + ~Benchmark() { + delete db_; + delete cache_; + } + + void Run() { + PrintHeader(); + Open(); + + const char* benchmarks = FLAGS_benchmarks; + while (benchmarks != NULL) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == NULL) { + name = benchmarks; + benchmarks = NULL; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + Start(); + + WriteOptions write_options; + bool known = true; + if (name == Slice("fillseq")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); + } else if (name == Slice("fillbatch")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); + } else if (name == Slice("fillrandom")) { + Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1); + } else if (name == Slice("overwrite")) { + Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); + } else if (name == Slice("fillsync")) { + write_options.sync = true; + Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); + } else if (name == Slice("fill100K")) { + Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); + } else if (name == Slice("readseq")) { + ReadSequential(); + } else if (name == Slice("readreverse")) { + ReadReverse(); + } else if (name == Slice("readrandom")) { + ReadRandom(); + } else if (name == Slice("readrandomsmall")) { + int n = num_; + num_ /= 1000; + ReadRandom(); + num_ = n; + } else if (name == Slice("compact")) { + Compact(); + } else if (name == Slice("crc32c")) { + Crc32c(4096, "(4K per op)"); + } else if (name == Slice("sha1")) { + SHA1(4096, "(4K per op)"); + } else if (name == Slice("snappycomp")) { + SnappyCompress(); + } else if (name == Slice("snappyuncomp")) { + SnappyUncompress(); + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("stats")) { + PrintStats(); + } else { + known = false; + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + if (known) { + Stop(name); + } + } + } + + private: + void Crc32c(int size, const char* label) { + // Checksum about 500MB of data total + std::string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + FinishedSingleOp(); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + bytes_ = bytes; + message_ = label; + } + + void SHA1(int size, const char* label) { + // SHA1 about 100MB of data total + std::string data(size, 'x'); + int64_t bytes = 0; + char sha1[20]; + while (bytes < 100 * 1048576) { + port::SHA1_Hash(data.data(), size, sha1); + FinishedSingleOp(); + bytes += size; + } + + // Print so result is not dead + fprintf(stderr, "... sha1=%02x...\r", static_cast(sha1[0])); + + bytes_ = bytes; + message_ = label; + } + + void SnappyCompress() { + Slice input = gen_.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + produced += compressed.size(); + bytes += input.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + message_ = buf; + bytes_ = bytes; + } + } + + void SnappyUncompress() { + Slice input = gen_.Generate(Options().block_size); + std::string compressed; + bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + int64_t bytes = 0; + std::string uncompressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + &uncompressed); + bytes += uncompressed.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + bytes_ = bytes; + } + } + + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size, int entries_per_batch) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + + WriteBatch batch; + Status s; + std::string val; + for (int i = 0; i < num_entries; i += entries_per_batch) { + batch.Clear(); + for (int j = 0; j < entries_per_batch; j++) { + const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + batch.Put(key, gen_.Generate(value_size)); + bytes_ += value_size + strlen(key); + FinishedSingleOp(); + } + s = db_->Write(options, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + } + } + + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { + ReadOptions options; + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + + void Compact() { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + std::string property; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbi->TEST_CompactRange(level, "", "~"); + } + } + + void PrintStats() { + std::string stats; + if (!db_->GetProperty("leveldb.stats", &stats)) { + message_ = "(failed)"; + } else { + post_message_ = stats; + } + } + + static void WriteToFile(void* arg, const char* buf, int n) { + reinterpret_cast(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); + WritableFile* file; + Status s = Env::Default()->NewWritableFile(fname, &file); + if (!s.ok()) { + message_ = s.ToString(); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file); + delete file; + if (!ok) { + message_ = "not supported"; + Env::Default()->DeleteFile(fname); + } + } +}; + +} + +int main(int argc, char** argv) { + FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { + FLAGS_cache_size = n; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/db/db_impl.cc b/db/db_impl.cc new file mode 100644 index 0000000..d012236 --- /dev/null +++ b/db/db_impl.cc @@ -0,0 +1,1345 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/status.h" +#include "leveldb/table.h" +#include "leveldb/table_builder.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +struct DBImpl::CompactionState { + Compaction* const compaction; + + // Sequence numbers < smallest_snapshot are not significant since we + // will never have to service a snapshot below smallest_snapshot. + // Therefore if we have seen a sequence number S <= smallest_snapshot, + // we can drop all entries for the same key with sequence numbers < S. + SequenceNumber smallest_snapshot; + + // Files produced by compaction + struct Output { + uint64_t number; + uint64_t file_size; + InternalKey smallest, largest; + }; + std::vector outputs; + + // State kept for output being generated + WritableFile* outfile; + TableBuilder* builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size()-1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + outfile(NULL), + builder(NULL), + total_bytes(0) { + } +}; + +namespace { +class NullWritableFile : public WritableFile { + public: + virtual Status Append(const Slice& data) { return Status::OK(); } + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } +}; +} + +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (*ptr > maxvalue) *ptr = maxvalue; + if (*ptr < minvalue) *ptr = minvalue; +} +Options SanitizeOptions(const std::string& dbname, + const InternalKeyComparator* icmp, + const Options& src) { + Options result = src; + result.comparator = icmp; + ClipToRange(&result.max_open_files, 20, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.large_value_threshold, 16<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + if (result.info_log == NULL) { + // Open a log file in the same directory as the db + src.env->CreateDir(dbname); // In case it does not exist + src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); + Status s = src.env->NewWritableFile(InfoLogFileName(dbname), + &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = new NullWritableFile; + } + } + if (result.block_cache == NULL) { + result.block_cache = NewLRUCache(8 << 20); + } + return result; +} + +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : env_(options.env), + internal_comparator_(options.comparator), + options_(SanitizeOptions(dbname, &internal_comparator_, options)), + owns_info_log_(options_.info_log != options.info_log), + owns_cache_(options_.block_cache != options.block_cache), + dbname_(dbname), + db_lock_(NULL), + shutting_down_(NULL), + bg_cv_(&mutex_), + compacting_cv_(&mutex_), + mem_(new MemTable(internal_comparator_)), + imm_(NULL), + logfile_(NULL), + log_(NULL), + bg_compaction_scheduled_(false), + compacting_(false) { + has_imm_.Release_Store(NULL); + + // Reserve ten files or so for other uses and give the rest to TableCache. + const int table_cache_size = options.max_open_files - 10; + table_cache_ = new TableCache(dbname_, &options_, table_cache_size); + + versions_ = new VersionSet(dbname_, &options_, table_cache_, + &internal_comparator_); +} + +DBImpl::~DBImpl() { + // Wait for background work to finish + mutex_.Lock(); + shutting_down_.Release_Store(this); // Any non-NULL value is ok + if (bg_compaction_scheduled_) { + while (bg_compaction_scheduled_) { + bg_cv_.Wait(); + } + } + mutex_.Unlock(); + + if (db_lock_ != NULL) { + env_->UnlockFile(db_lock_); + } + + delete versions_; + delete mem_; + delete imm_; + delete log_; + delete logfile_; + delete table_cache_; + + if (owns_info_log_) { + delete options_.info_log; + } + if (owns_cache_) { + delete options_.block_cache; + } +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + new_db.SetComparatorName(user_comparator()->Name()); + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + WritableFile* file; + Status s = env_->NewWritableFile(manifest, &file); + if (!s.ok()) { + return s; + } + { + log::Writer log(file); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = file->Close(); + } + } + delete file; + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(manifest); + } + return s; +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || options_.paranoid_checks) { + // No change needed + } else { + Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); + *s = Status::OK(); + } +} + +void DBImpl::DeleteObsoleteFiles() { + // Make a set of all of the live files + std::set live = pending_outputs_; + versions_->AddLiveFiles(&live); + + versions_->CleanupLargeValueRefs(live); + + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number == versions_->LogNumber()) || + (number == versions_->PrevLogNumber())); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= versions_->ManifestFileNumber()); + break; + case kTableFile: + keep = (live.find(number) != live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (live.find(number) != live.end()); + break; + case kLargeValueFile: + keep = versions_->LargeValueIsLive(large_ref); + break; + case kCurrentFile: + case kDBLockFile: + case kInfoLogFile: + keep = true; + break; + } + + if (!keep) { + if (type == kTableFile) { + table_cache_->Evict(number); + } + Log(env_, options_.info_log, "Delete type=%d #%lld\n", + int(type), + static_cast(number)); + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + } + } +} + +Status DBImpl::Recover(VersionEdit* edit) { + mutex_.AssertHeld(); + + // Ignore error from CreateDir since the creation of the DB is + // committed only when the descriptor is created, and this directory + // may already exist from a previous failed creation attempt. + env_->CreateDir(dbname_); + assert(db_lock_ == NULL); + Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + + s = versions_->Recover(); + if (s.ok()) { + // Recover from the log files named in the descriptor + SequenceNumber max_sequence(0); + if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log + s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); + } + if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state + s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); + } + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + const char* fname; + Status* status; // NULL if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(env, info_log, "%s%s: dropping %d bytes; %s", + (this->status == NULL ? "(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != NULL && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + // Open the log file + std::string fname = LogFileName(dbname_, log_number); + SequentialFile* file; + Status status = env_->NewSequentialFile(fname, &file); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks ? &status : NULL); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(file, &reporter, true/*checksum*/); + Log(env_, options_.info_log, "Recovering log #%llu", + (unsigned long long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = NULL; + while (reader.ReadRecord(&record, &scratch) && + status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + if (mem == NULL) { + mem = new MemTable(internal_comparator_); + } + status = WriteBatchInternal::InsertInto(&batch, mem); + MaybeIgnoreError(&status); + if (!status.ok()) { + break; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + status = WriteLevel0Table(mem, edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + break; + } + delete mem; + mem = NULL; + } + } + + if (status.ok() && mem != NULL) { + status = WriteLevel0Table(mem, edit); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + delete mem; + delete file; + return status; +} + +Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + Log(env_, options_.info_log, "Level-0 table #%llu: started", + (unsigned long long) meta.number); + + Status s; + { + mutex_.Unlock(); + s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); + mutex_.Lock(); + } + + Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", + (unsigned long long) meta.number, + (unsigned long long) meta.file_size, + s.ToString().c_str()); + delete iter; + pending_outputs_.erase(meta.number); + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats_[0].Add(stats); + return s; +} + +Status DBImpl::CompactMemTable() { + mutex_.AssertHeld(); + assert(imm_ != NULL); + assert(compacting_); + + // Save the contents of the memtable as a new Table + VersionEdit edit; + Status s = WriteLevel0Table(imm_, &edit); + + // Replace immutable memtable with the generated Table + if (s.ok()) { + edit.SetPrevLogNumber(0); + s = versions_->LogAndApply(&edit, imm_); + } + + if (s.ok()) { + // Commit to the new state + imm_ = NULL; + has_imm_.Release_Store(NULL); + DeleteObsoleteFiles(); + } + + compacting_cv_.SignalAll(); // Wake up waiter even if there was an error + return s; +} + +void DBImpl::TEST_CompactRange( + int level, + const std::string& begin, + const std::string& end) { + MutexLock l(&mutex_); + while (compacting_) { + compacting_cv_.Wait(); + } + Compaction* c = versions_->CompactRange( + level, + InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), + InternalKey(end, 0, static_cast(0))); + + if (c != NULL) { + CompactionState* compact = new CompactionState(c); + DoCompactionWork(compact); // Ignore error in test compaction + CleanupCompaction(compact); + } + + // Start any background compaction that may have been delayed by this thread + MaybeScheduleCompaction(); +} + +Status DBImpl::TEST_CompactMemTable() { + MutexLock l(&mutex_); + Status s = MakeRoomForWrite(true /* force compaction */); + if (s.ok()) { + // Wait until the compaction completes + while (imm_ != NULL && bg_error_.ok()) { + compacting_cv_.Wait(); + } + if (imm_ != NULL) { + s = bg_error_; + } + } + return s; +} + +void DBImpl::MaybeScheduleCompaction() { + mutex_.AssertHeld(); + if (bg_compaction_scheduled_) { + // Already scheduled + } else if (compacting_) { + // Some other thread is running a compaction. Do not conflict with it. + } else if (shutting_down_.Acquire_Load()) { + // DB is being deleted; no more background compactions + } else if (imm_ == NULL && !versions_->NeedsCompaction()) { + // No work to be done + } else { + bg_compaction_scheduled_ = true; + env_->Schedule(&DBImpl::BGWork, this); + } +} + +void DBImpl::BGWork(void* db) { + reinterpret_cast(db)->BackgroundCall(); +} + +void DBImpl::BackgroundCall() { + MutexLock l(&mutex_); + assert(bg_compaction_scheduled_); + if (!shutting_down_.Acquire_Load() && + !compacting_) { + BackgroundCompaction(); + } + bg_compaction_scheduled_ = false; + bg_cv_.SignalAll(); + + // Previous compaction may have produced too many files in a level, + // so reschedule another compaction if needed. + MaybeScheduleCompaction(); +} + +void DBImpl::BackgroundCompaction() { + mutex_.AssertHeld(); + assert(!compacting_); + + if (imm_ != NULL) { + compacting_ = true; + CompactMemTable(); + compacting_ = false; + compacting_cv_.SignalAll(); + return; + } + + Compaction* c = versions_->PickCompaction(); + if (c == NULL) { + // Nothing to do + return; + } + + Status status; + if (c->IsTrivialMove()) { + // Move file to next level + assert(c->num_input_files(0) == 1); + FileMetaData* f = c->input(0, 0); + c->edit()->DeleteFile(c->level(), f->number); + c->edit()->AddFile(c->level() + 1, f->number, f->file_size, + f->smallest, f->largest); + status = versions_->LogAndApply(c->edit(), NULL); + Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", + static_cast(f->number), + c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str()); + } else { + CompactionState* compact = new CompactionState(c); + status = DoCompactionWork(compact); + CleanupCompaction(compact); + } + delete c; + + if (status.ok()) { + // Done + } else if (shutting_down_.Acquire_Load()) { + // Ignore compaction errors found during shutting down + } else { + Log(env_, options_.info_log, + "Compaction error: %s", status.ToString().c_str()); + if (options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } +} + +void DBImpl::CleanupCompaction(CompactionState* compact) { + mutex_.AssertHeld(); + if (compact->builder != NULL) { + // May happen if we get a shutdown call in the middle of compaction + compact->builder->Abandon(); + delete compact->builder; + } else { + assert(compact->outfile == NULL); + } + delete compact->outfile; + for (int i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + pending_outputs_.erase(out.number); + } + delete compact; +} + +Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { + assert(compact != NULL); + assert(compact->builder == NULL); + uint64_t file_number; + { + mutex_.Lock(); + file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + CompactionState::Output out; + out.number = file_number; + out.smallest.Clear(); + out.largest.Clear(); + compact->outputs.push_back(out); + mutex_.Unlock(); + } + + // Make the output file + std::string fname = TableFileName(dbname_, file_number); + Status s = env_->NewWritableFile(fname, &compact->outfile); + if (s.ok()) { + compact->builder = new TableBuilder(options_, compact->outfile); + } + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != NULL); + assert(compact->outfile != NULL); + assert(compact->builder != NULL); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + delete compact->builder; + compact->builder = NULL; + + // Finish and check for file errors + if (s.ok()) { + s = compact->outfile->Sync(); + } + if (s.ok()) { + s = compact->outfile->Close(); + } + delete compact->outfile; + compact->outfile = NULL; + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + Iterator* iter = table_cache_->NewIterator(ReadOptions(), + output_number, + current_bytes); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(env_, options_.info_log, + "Generated table #%llu: %lld keys, %lld bytes", + (unsigned long long) output_number, + (unsigned long long) current_entries, + (unsigned long long) current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact) { + mutex_.AssertHeld(); + Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1, + static_cast(compact->total_bytes)); + + // Add compaction outputs + compact->compaction->AddInputDeletions(compact->compaction->edit()); + const int level = compact->compaction->level(); + for (int i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + compact->compaction->edit()->AddFile( + level + 1, + out.number, out.file_size, out.smallest, out.largest); + pending_outputs_.erase(out.number); + } + compact->outputs.clear(); + + Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); + if (s.ok()) { + compact->compaction->ReleaseInputs(); + DeleteObsoleteFiles(); + } else { + // Discard any files we may have created during this failed compaction + for (int i = 0; i < compact->outputs.size(); i++) { + env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); + } + } + return s; +} + +Status DBImpl::DoCompactionWork(CompactionState* compact) { + const uint64_t start_micros = env_->NowMicros(); + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + + Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + + assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(compact->builder == NULL); + assert(compact->outfile == NULL); + if (snapshots_.empty()) { + compact->smallest_snapshot = versions_->LastSequence(); + } else { + compact->smallest_snapshot = snapshots_.oldest()->number_; + } + + // Release mutex while we're actually doing the compaction work + compacting_ = true; + mutex_.Unlock(); + + Iterator* input = versions_->MakeInputIterator(compact->compaction); + input->SeekToFirst(); + Status status; + ParsedInternalKey ikey; + std::string current_user_key; + bool has_current_user_key = false; + SequenceNumber last_sequence_for_key = kMaxSequenceNumber; + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Prioritize immutable compaction work + if (has_imm_.NoBarrier_Load() != NULL) { + const uint64_t imm_start = env_->NowMicros(); + mutex_.Lock(); + if (imm_ != NULL) { + CompactMemTable(); + compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + mutex_.Unlock(); + imm_micros += (env_->NowMicros() - imm_start); + } + + Slice key = input->key(); + InternalKey tmp_internal_key; + tmp_internal_key.DecodeFrom(key); + if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + current_user_key.clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + user_comparator()->Compare(ikey.user_key, + Slice(current_user_key)) != 0) { + // First occurrence of this user key + current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + } + + if (last_sequence_for_key <= compact->smallest_snapshot) { + // Hidden by an newer entry for same user key + drop = true; // (A) + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= compact->smallest_snapshot && + compact->compaction->IsBaseLevelForKey(ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + } + + last_sequence_for_key = ikey.sequence; + } +#if 0 + Log(env_, options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)compact->smallest_snapshot); +#endif + + if (!drop) { + // Open output file if necessary + if (compact->builder == NULL) { + status = OpenCompactionOutputFile(compact); + if (!status.ok()) { + break; + } + } + if (compact->builder->NumEntries() == 0) { + compact->current_output()->smallest.DecodeFrom(key); + } + compact->current_output()->largest.DecodeFrom(key); + + if (ikey.type == kTypeLargeValueRef) { + if (input->value().size() != LargeValueRef::ByteSize()) { + if (options_.paranoid_checks) { + status = Status::Corruption("invalid large value ref"); + break; + } else { + Log(env_, options_.info_log, + "compaction found invalid large value ref"); + } + } else { + compact->compaction->edit()->AddLargeValueRef( + LargeValueRef::FromRef(input->value()), + compact->current_output()->number, + input->key()); + compact->builder->Add(key, input->value()); + } + } else { + compact->builder->Add(key, input->value()); + } + + // Close output file if it is big enough + if (compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + } + + input->Next(); + } + + if (status.ok() && shutting_down_.Acquire_Load()) { + status = Status::IOError("Deleting DB during compaction"); + } + if (status.ok() && compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + } + if (status.ok()) { + status = input->status(); + } + delete input; + input = NULL; + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros - imm_micros; + for (int which = 0; which < 2; which++) { + for (int i = 0; i < compact->compaction->num_input_files(which); i++) { + stats.bytes_read += compact->compaction->input(which, i)->file_size; + } + } + for (int i = 0; i < compact->outputs.size(); i++) { + stats.bytes_written += compact->outputs[i].file_size; + } + + mutex_.Lock(); + stats_[compact->compaction->level() + 1].Add(stats); + + if (status.ok()) { + status = InstallCompactionResults(compact); + } + compacting_ = false; + compacting_cv_.SignalAll(); + return status; +} + +Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, + SequenceNumber* latest_snapshot) { + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); + + // Collect together all needed child iterators + std::vector list; + list.push_back(mem_->NewIterator()); + if (imm_ != NULL) { + list.push_back(imm_->NewIterator()); + } + versions_->current()->AddIterators(options, &list); + Iterator* internal_iter = + NewMergingIterator(&internal_comparator_, &list[0], list.size()); + versions_->current()->Ref(); + internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); + + mutex_.Unlock(); + return internal_iter; +} + +Iterator* DBImpl::TEST_NewInternalIterator() { + SequenceNumber ignored; + return NewInternalIterator(ReadOptions(), &ignored); +} + +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->MaxNextLevelOverlappingBytes(); +} + +Status DBImpl::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + // TODO(opt): faster implementation + Iterator* iter = NewIterator(options); + iter->Seek(key); + bool found = false; + if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { + Slice v = iter->value(); + value->assign(v.data(), v.size()); + found = true; + } + // Non-OK iterator status trumps everything else + Status result = iter->status(); + if (result.ok() && !found) { + result = Status::NotFound(Slice()); // Use an empty error message for speed + } + delete iter; + return result; +} + +Iterator* DBImpl::NewIterator(const ReadOptions& options) { + SequenceNumber latest_snapshot; + Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + SequenceNumber sequence = + (options.snapshot ? options.snapshot->number_ : latest_snapshot); + return NewDBIterator(&dbname_, env_, + user_comparator(), internal_iter, sequence); +} + +void DBImpl::Unref(void* arg1, void* arg2) { + DBImpl* impl = reinterpret_cast(arg1); + Version* v = reinterpret_cast(arg2); + MutexLock l(&impl->mutex_); + v->Unref(); +} + +const Snapshot* DBImpl::GetSnapshot() { + MutexLock l(&mutex_); + return snapshots_.New(versions_->LastSequence()); +} + +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + MutexLock l(&mutex_); + snapshots_.Delete(s); +} + +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { + return DB::Put(o, key, val); +} + +Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { + return DB::Delete(options, key); +} + +Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { + Status status; + + WriteBatch* final = NULL; + { + MutexLock l(&mutex_); + status = MakeRoomForWrite(false); // May temporarily release lock and wait + + uint64_t last_sequence = versions_->LastSequence(); + if (status.ok()) { + status = HandleLargeValues(last_sequence + 1, updates, &final); + } + if (status.ok()) { + WriteBatchInternal::SetSequence(final, last_sequence + 1); + last_sequence += WriteBatchInternal::Count(final); + versions_->SetLastSequence(last_sequence); + + // Add to log and apply to memtable + status = log_->AddRecord(WriteBatchInternal::Contents(final)); + if (status.ok() && options.sync) { + status = logfile_->Sync(); + } + if (status.ok()) { + status = WriteBatchInternal::InsertInto(final, mem_); + } + } + + if (options.post_write_snapshot != NULL) { + *options.post_write_snapshot = + status.ok() ? snapshots_.New(last_sequence) : NULL; + } + } + if (final != updates) { + delete final; + } + + return status; +} + +Status DBImpl::MakeRoomForWrite(bool force) { + mutex_.AssertHeld(); + Status s; + while (true) { + if (!bg_error_.ok()) { + // Yield previous error + s = bg_error_; + break; + } else if (!force && + (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { + // There is room in current memtable + break; + } else if (imm_ != NULL) { + // We have filled up the current memtable, but the previous + // one is still being compacted, so we wait. + compacting_cv_.Wait(); + } else { + // Attempt to switch to a new memtable and trigger compaction of old + assert(versions_->PrevLogNumber() == 0); + uint64_t new_log_number = versions_->NewFileNumber(); + WritableFile* lfile = NULL; + s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + if (!s.ok()) { + break; + } + VersionEdit edit; + edit.SetPrevLogNumber(versions_->LogNumber()); + edit.SetLogNumber(new_log_number); + s = versions_->LogAndApply(&edit, NULL); + if (!s.ok()) { + delete lfile; + env_->DeleteFile(LogFileName(dbname_, new_log_number)); + break; + } + delete log_; + delete logfile_; + logfile_ = lfile; + log_ = new log::Writer(lfile); + imm_ = mem_; + has_imm_.Release_Store(imm_); + mem_ = new MemTable(internal_comparator_); + force = false; // Do not force another compaction if have room + MaybeScheduleCompaction(); + } + } + return s; +} + +bool DBImpl::HasLargeValues(const WriteBatch& batch) const { + if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) { + for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) { + if (it.op() == kTypeValue && + it.value().size() >= options_.large_value_threshold) { + return true; + } + } + } + return false; +} + +// Given "raw_value", determines the appropriate compression format to use +// and stores the data that should be written to the large value file in +// "*file_bytes", and sets "*ref" to the appropriate large value reference. +// May use "*scratch" as backing store for "*file_bytes". +void DBImpl::MaybeCompressLargeValue( + const Slice& raw_value, + Slice* file_bytes, + std::string* scratch, + LargeValueRef* ref) { + switch (options_.compression) { + case kSnappyCompression: { + if (port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch) && + (scratch->size() < (raw_value.size() / 8) * 7)) { + *file_bytes = *scratch; + *ref = LargeValueRef::Make(raw_value, kSnappyCompression); + return; + } + + // Less than 12.5% compression: just leave as uncompressed data + break; + } + case kNoCompression: + // Use default code outside of switch + break; + } + // Store as uncompressed data + *file_bytes = raw_value; + *ref = LargeValueRef::Make(raw_value, kNoCompression); +} + +Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq, + WriteBatch* updates, + WriteBatch** final) { + if (!HasLargeValues(*updates)) { + // Fast path: no large values found + *final = updates; + } else { + // Copy *updates to a new WriteBatch, replacing the references to + *final = new WriteBatch; + SequenceNumber seq = assigned_seq; + for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeValue: + if (it.value().size() < options_.large_value_threshold) { + (*final)->Put(it.key(), it.value()); + } else { + std::string scratch; + Slice file_bytes; + LargeValueRef large_ref; + MaybeCompressLargeValue( + it.value(), &file_bytes, &scratch, &large_ref); + InternalKey ikey(it.key(), seq, kTypeLargeValueRef); + if (versions_->RegisterLargeValueRef( + large_ref, versions_->LogNumber(), ikey)) { + // TODO(opt): avoid holding the lock here (but be careful about + // another thread doing a Write and switching logs or + // having us get a different "assigned_seq" value). + + uint64_t tmp_number = versions_->NewFileNumber(); + pending_outputs_.insert(tmp_number); + std::string tmp = TempFileName(dbname_, tmp_number); + WritableFile* file; + Status s = env_->NewWritableFile(tmp, &file); + if (!s.ok()) { + return s; // Caller will delete *final + } + + file->Append(file_bytes); + + s = file->Close(); + delete file; + + if (s.ok()) { + const std::string fname = + LargeValueFileName(dbname_, large_ref); + s = env_->RenameFile(tmp, fname); + } else { + Log(env_, options_.info_log, "Write large value: %s", + s.ToString().c_str()); + } + pending_outputs_.erase(tmp_number); + + if (!s.ok()) { + env_->DeleteFile(tmp); // Cleanup; intentionally ignoring error + return s; // Caller will delete *final + } + } + + // Put an indirect reference in the write batch in place + // of large value + WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref); + } + break; + case kTypeLargeValueRef: + return Status::Corruption("Corrupted write batch"); + break; + case kTypeDeletion: + (*final)->Delete(it.key()); + break; + } + seq = seq + 1; + } + } + return Status::OK(); +} + +bool DBImpl::GetProperty(const Slice& property, std::string* value) { + value->clear(); + + MutexLock l(&mutex_); + Slice in = property; + Slice prefix("leveldb."); + if (!in.starts_with(prefix)) return false; + in.remove_prefix(prefix.size()); + + if (in.starts_with("num-files-at-level")) { + in.remove_prefix(strlen("num-files-at-level")); + uint64_t level; + bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); + if (!ok || level < 0 || level >= config::kNumLevels) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level)); + *value = buf; + return true; + } + } else if (in == "stats") { + char buf[200]; + snprintf(buf, sizeof(buf), + " Compactions\n" + "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" + "--------------------------------------------------\n" + ); + value->append(buf); + for (int level = 0; level < config::kNumLevels; level++) { + int files = versions_->NumLevelFiles(level); + if (stats_[level].micros > 0 || files > 0) { + snprintf( + buf, sizeof(buf), + "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", + level, + files, + versions_->NumLevelBytes(level) / 1048576.0, + stats_[level].micros / 1e6, + stats_[level].bytes_read / 1048576.0, + stats_[level].bytes_written / 1048576.0); + value->append(buf); + } + } + return true; + } + + return false; +} + +void DBImpl::GetApproximateSizes( + const Range* range, int n, + uint64_t* sizes) { + // TODO(opt): better implementation + Version* v; + { + MutexLock l(&mutex_); + versions_->current()->Ref(); + v = versions_->current(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + WriteBatch batch; + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, + DB** dbptr) { + *dbptr = NULL; + + DBImpl* impl = new DBImpl(options, dbname); + impl->mutex_.Lock(); + VersionEdit edit; + Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + WritableFile* lfile; + s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), + &lfile); + if (s.ok()) { + edit.SetLogNumber(new_log_number); + impl->logfile_ = lfile; + impl->log_ = new log::Writer(lfile); + s = impl->versions_->LogAndApply(&edit, NULL); + } + if (s.ok()) { + impl->DeleteObsoleteFiles(); + } + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + Env* env = options.env; + std::vector filenames; + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + Status result = env->LockFile(LockFileName(dbname), &lock); + if (result.ok()) { + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + Status del = env->DeleteFile(dbname + "/" + filenames[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(LockFileName(dbname)); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + } + return result; +} + +} diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 0000000..1f685f0 --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,207 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ +#define STORAGE_LEVELDB_DB_DB_IMPL_H_ + +#include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "port/port.h" + +namespace leveldb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +class DBImpl : public DB { + public: + DBImpl(const Options& options, const std::string& dbname); + virtual ~DBImpl(); + + // Implementations of the DB interface + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); + virtual Status Delete(const WriteOptions&, const Slice& key); + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value); + virtual Iterator* NewIterator(const ReadOptions&); + virtual const Snapshot* GetSnapshot(); + virtual void ReleaseSnapshot(const Snapshot* snapshot); + virtual bool GetProperty(const Slice& property, std::string* value); + virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); + + // Extra methods (for testing) that are not in the public DB interface + + // Compact any files in the named level that overlap [begin,end] + void TEST_CompactRange( + int level, + const std::string& begin, + const std::string& end); + + // Force current memtable contents to be compacted. + Status TEST_CompactMemTable(); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + Iterator* TEST_NewInternalIterator(); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes(); + + private: + friend class DB; + + Iterator* NewInternalIterator(const ReadOptions&, + SequenceNumber* latest_snapshot); + + Status NewDB(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + Status Recover(VersionEdit* edit); + + void MaybeIgnoreError(Status* s) const; + + // Delete any unneeded files and stale in-memory entries. + void DeleteObsoleteFiles(); + + // Called when an iterator over a particular version of the + // descriptor goes away. + static void Unref(void* arg1, void* arg2); + + // Compact the in-memory write buffer to disk. Switches to a new + // log-file/memtable and writes a new descriptor iff successful. + Status CompactMemTable(); + + Status RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence); + + Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); + + Status MakeRoomForWrite(bool force /* compact even if there is room? */); + bool HasLargeValues(const WriteBatch& batch) const; + + // Process data in "*updates" and return a status. "assigned_seq" + // is the sequence number assigned to the first mod in "*updates". + // If no large values are encountered, "*final" is set to "updates". + // If large values were encountered, registers the references of the + // large values with the VersionSet, writes the large values to + // files (if appropriate), and allocates a new WriteBatch with the + // large values replaced with indirect references and stores a + // pointer to the new WriteBatch in *final. If *final != updates on + // return, then the client should delete *final when no longer + // needed. Returns OK on success, and an appropriate error + // otherwise. + Status HandleLargeValues(SequenceNumber assigned_seq, + WriteBatch* updates, + WriteBatch** final); + + // Helper routine for HandleLargeValues + void MaybeCompressLargeValue( + const Slice& raw_value, + Slice* file_bytes, + std::string* scratch, + LargeValueRef* ref); + + struct CompactionState; + + void MaybeScheduleCompaction(); + static void BGWork(void* db); + void BackgroundCall(); + void BackgroundCompaction(); + void CleanupCompaction(CompactionState* compact); + Status DoCompactionWork(CompactionState* compact); + + Status OpenCompactionOutputFile(CompactionState* compact); + Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); + Status InstallCompactionResults(CompactionState* compact); + + // Constant after construction + Env* const env_; + const InternalKeyComparator internal_comparator_; + const Options options_; // options_.comparator == &internal_comparator_ + bool owns_info_log_; + bool owns_cache_; + const std::string dbname_; + + // table_cache_ provides its own synchronization + TableCache* table_cache_; + + // Lock over the persistent DB state. Non-NULL iff successfully acquired. + FileLock* db_lock_; + + // State below is protected by mutex_ + port::Mutex mutex_; + port::AtomicPointer shutting_down_; + port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_ + port::CondVar compacting_cv_; // Signalled when !compacting_ + MemTable* mem_; + MemTable* imm_; // Memtable being compacted + port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ + WritableFile* logfile_; + log::Writer* log_; + SnapshotList snapshots_; + + // Set of table files to protect from deletion because they are + // part of ongoing compactions. + std::set pending_outputs_; + + // Has a background compaction been scheduled or is running? + bool bg_compaction_scheduled_; + + // Is there a compaction running? + bool compacting_; + + VersionSet* versions_; + + // Have we encountered a background error in paranoid mode? + Status bg_error_; + + // Per level compaction stats. stats_[level] stores the stats for + // compactions that produced data for the specified "level". + struct CompactionStats { + int64_t micros; + int64_t bytes_read; + int64_t bytes_written; + + CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { } + + void Add(const CompactionStats& c) { + this->micros += c.micros; + this->bytes_read += c.bytes_read; + this->bytes_written += c.bytes_written; + } + }; + CompactionStats stats_[config::kNumLevels]; + + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + const Comparator* user_comparator() const { + return internal_comparator_.user_comparator(); + } +}; + +// Sanitize db options. The caller should delete result.info_log if +// it is not equal to src.info_log. +extern Options SanitizeOptions(const std::string& db, + const InternalKeyComparator* icmp, + const Options& src); + +} + +#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ diff --git a/db/db_iter.cc b/db/db_iter.cc new file mode 100644 index 0000000..31c2a38 --- /dev/null +++ b/db/db_iter.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_iter.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + + DBIter(const std::string* dbname, Env* env, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + user_comparator_(cmp), + iter_(iter), + sequence_(s), + large_(NULL), + direction_(kForward), + valid_(false) { + } + virtual ~DBIter() { + delete iter_; + delete large_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; + } + virtual Slice value() const { + assert(valid_); + Slice raw_value = (direction_ == kForward) ? iter_->value() : saved_value_; + if (large_ == NULL) { + return raw_value; + } else { + MutexLock l(&large_->mutex); + if (!large_->produced) { + ReadIndirectValue(raw_value); + } + return large_->value; + } + } + virtual Status status() const { + if (status_.ok()) { + if (large_ != NULL && !large_->status.ok()) return large_->status; + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + struct Large { + port::Mutex mutex; + std::string value; + bool produced; + Status status; + }; + + void FindNextUserEntry(bool skipping, std::string* skip); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void ReadIndirectValue(Slice ref) const; + + inline void SaveKey(const Slice& k, std::string* dst) { + dst->assign(k.data(), k.size()); + } + + inline void ForgetLargeValue() { + if (large_ != NULL) { + delete large_; + large_ = NULL; + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + const std::string* const dbname_; + Env* const env_; + const Comparator* const user_comparator_; + Iterator* const iter_; + SequenceNumber const sequence_; + + Status status_; + std::string saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + Large* large_; // Non-NULL if value is an indirect reference + Direction direction_; + bool valid_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + ForgetLargeValue(); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + return; + } + } + + // Temporarily use saved_key_ as storage for key to skip. + std::string* skip = &saved_key_; + SaveKey(ExtractUserKey(iter_->key()), skip); + FindNextUserEntry(true, skip); +} + +void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + assert(large_ == NULL); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + SaveKey(ikey.user_key, skip); + skipping = true; + break; + case kTypeValue: + case kTypeLargeValueRef: + if (skipping && + user_comparator_->Compare(ikey.user_key, *skip) <= 0) { + // Entry hidden + } else { + valid_ = true; + saved_key_.clear(); + if (ikey.type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; + } + return; + } + break; + } + } + iter_->Next(); + } while (iter_->Valid()); + saved_key_.clear(); + valid_ = false; +} + +void DBIter::Prev() { + assert(valid_); + ForgetLargeValue(); + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + assert(large_ == NULL); + + ValueType value_type = kTypeDeletion; + if (iter_->Valid()) { + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + ClearSavedValue(); + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } + iter_->Prev(); + } while (iter_->Valid()); + } + + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + if (value_type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; + } + } +} + +void DBIter::Seek(const Slice& target) { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + saved_key_.clear(); + AppendInternalKey( + &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + iter_->Seek(saved_key_); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToFirst(); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + direction_ = kReverse; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToLast(); + FindPrevUserEntry(); +} + +void DBIter::ReadIndirectValue(Slice ref) const { + assert(!large_->produced); + large_->produced = true; + LargeValueRef large_ref; + if (ref.size() != LargeValueRef::ByteSize()) { + large_->status = Status::Corruption("malformed large value reference"); + return; + } + memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize()); + std::string fname = LargeValueFileName(*dbname_, large_ref); + RandomAccessFile* file; + Status s = env_->NewRandomAccessFile(fname, &file); + uint64_t file_size = 0; + if (s.ok()) { + s = env_->GetFileSize(fname, &file_size); + } + if (s.ok()) { + uint64_t value_size = large_ref.ValueSize(); + large_->value.resize(value_size); + Slice result; + s = file->Read(0, file_size, &result, + const_cast(large_->value.data())); + if (s.ok()) { + if (result.size() == file_size) { + switch (large_ref.compression_type()) { + case kNoCompression: { + if (result.data() != large_->value.data()) { + large_->value.assign(result.data(), result.size()); + } + break; + } + case kSnappyCompression: { + std::string uncompressed; + if (port::Snappy_Uncompress(result.data(), result.size(), + &uncompressed) && + uncompressed.size() == large_ref.ValueSize()) { + swap(uncompressed, large_->value); + } else { + s = Status::Corruption( + "Unable to read entire compressed large value file"); + } + } + } + } else { + s = Status::Corruption("Unable to read entire large value file"); + } + } + delete file; // Ignore errors on closing + } + if (!s.ok()) { + large_->value.clear(); + large_->status = s; + } +} + +} // anonymous namespace + +Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Comparator* user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence) { + return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); +} + +} diff --git a/db/db_iter.h b/db/db_iter.h new file mode 100644 index 0000000..195f3d3 --- /dev/null +++ b/db/db_iter.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ +#define STORAGE_LEVELDB_DB_DB_ITER_H_ + +#include +#include "leveldb/db.h" +#include "db/dbformat.h" + +namespace leveldb { + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified "sequence" number +// into appropriate user keys. +extern Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Comparator* user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence); + +} + +#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/db/db_test.cc b/db/db_test.cc new file mode 100644 index 0000000..04de331 --- /dev/null +++ b/db/db_test.cc @@ -0,0 +1,1211 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/db.h" + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +class DBTest { + public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + DBTest() : env_(Env::Default()) { + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, Options()); + db_ = NULL; + Reopen(); + } + + ~DBTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void DestroyAndReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + DestroyDB(dbname_, Options()); + ASSERT_OK(TryReopen(options)); + } + + Status TryReopen(Options* options) { + delete db_; + db_ = NULL; + Options opts; + if (options != NULL) { + opts = *options; + } else { + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare( + ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeLargeValueRef: + result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; + break; + case kTypeDeletion: + result += "DEL"; + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + std::set LargeValueFiles() const { + // Return the set of large value files that exist in the database + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + LargeValueRef large_ref; + FileType type; + std::set live; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type) && + type == kLargeValueFile) { + fprintf(stderr, " live: %s\n", + LargeValueRefToFilenameString(large_ref).c_str()); + live.insert(large_ref); + } + } + fprintf(stderr, "Found %d live large value files\n", (int)live.size()); + return live; + } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + if (NumTableFilesAtLevel(level) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(DBTest, Empty) { + ASSERT_TRUE(db_ != NULL); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, ReadWrite) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); +} + +TEST(DBTest, PutDeleteGet) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, IterEmpty) { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSingle) { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterMulti) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, Recover) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); +} + +TEST(DBTest, MinorCompactionsHappen) { + Options options; + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 500; + + int starting_num_tables = NumTableFilesAtLevel(0); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = NumTableFilesAtLevel(0); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } +} + +TEST(DBTest, RecoverWithLargeLog) { + { + Options options; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. + Options options; + options.write_buffer_size = 100000; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options; + options.write_buffer_size = 100000000; // Large write buffer + options.large_value_threshold = 1048576; + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, "", Key(100000)); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + for (int test = 0; test < 2; test++) { + // test==0: default large_value_threshold + // test==1: 1 MB large_value_threshold + Options options; + options.large_value_threshold = (test == 0) ? 65536 : 1048576; + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); + } + if (test == 1) { + // 0 because GetApproximateSizes() does not account for memtable space for + // non-large values + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + } else { + ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); + ASSERT_TRUE(Between(Size(Key(20), Key(30)), + 100000*10, 100000*10 + 10000)); + } + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), + 100000 * (i+1), 100000 * (i+1) + 10000)); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), + 100000 * 10, 100000 * 10 + 10000)); + } + ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); + + dbfull()->TEST_CompactRange(0, + Key(compact_start), + Key(compact_start + 9)); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + } +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + Options options; + options.large_value_threshold = 65536; + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + } +} + +TEST(DBTest, IteratorPinsRef) { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; +} + +TEST(DBTest, Snapshot) { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + Random rnd(301); + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + dbfull()->TEST_CompactRange(0, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); +} + +TEST(DBTest, DeletionMarkers2) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL kept: L2 file overlaps + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); +} + +TEST(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return "leveldb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + NewComparator cmp; + Options new_options; + new_options.comparator = &cmp; + Status s = TryReopen(&new_options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); +} + +static bool LargeValuesOK(DBTest* db, + const std::set& expected) { + std::set actual = db->LargeValueFiles(); + if (actual.size() != expected.size()) { + fprintf(stderr, "Sets differ in size: %d vs %d\n", + (int)actual.size(), (int)expected.size()); + return false; + } + for (std::set::const_iterator it = expected.begin(); + it != expected.end(); + ++it) { + if (actual.count(*it) != 1) { + fprintf(stderr, " key '%s' not found in actual set\n", + LargeValueRefToFilenameString(*it).c_str()); + return false; + } + } + return true; +} + +TEST(DBTest, LargeValues1) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1; + test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible + std::set expected; + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + // No handling of deletion markers on memtable compactions, so big1 remains + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + +TEST(DBTest, LargeValues2) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1, big2; + test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible + test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible + std::set expected; + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_EQ(big1, Get("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_EQ(big2, Get("big2")); + if (SnappyCompressionSupported()) { + expected.insert(LargeValueRef::Make(big2, kSnappyCompression)); + } else { + expected.insert(LargeValueRef::Make(big2, kNoCompression)); + } + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_OK(Put("big2_b", big2)); + ASSERT_EQ(big1, Get("big1")); + ASSERT_EQ(big2, Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_EQ("NOT_FOUND", Get("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(1, "", "z"); + + ASSERT_OK(Delete("big2")); + ASSERT_EQ("NOT_FOUND", Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + // Make sure the large value refs survive a reload and compactions after + // the reload. + Reopen(); + ASSERT_TRUE(LargeValuesOK(this, expected)); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + +TEST(DBTest, LargeValues3) { + // Make sure we don't compress values if + Options options; + options.large_value_threshold = 10000; + options.compression = kNoCompression; + Reopen(&options); + + Random rnd(301); + + std::string big1 = std::string(100000, 'x'); // Very compressible + std::set expected; + + ASSERT_OK(Put("big1", big1)); + ASSERT_EQ(big1, Get("big1")); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + + +TEST(DBTest, DBOpen_Options) { + std::string dbname = test::TmpDir() + "/db_options_test"; + DestroyDB(dbname, Options()); + + // Does not exist, and create_if_missing == false: error + DB* db = NULL; + Options opts; + opts.create_if_missing = false; + Status s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); + ASSERT_TRUE(db == NULL); + + // Does not exist, and create_if_missing == true: OK + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; + + // Does exist, and error_if_exists == true: error + opts.create_if_missing = false; + opts.error_if_exists = true; + s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); + ASSERT_TRUE(db == NULL); + + // Does exist, and error_if_exists == false: OK + opts.create_if_missing = true; + opts.error_if_exists = false; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; +} + +class ModelDB: public DB { + public: + explicit ModelDB(const Options& options): options_(options) { } + ~ModelDB() { } + virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Put(o, k, v); + } + virtual Status Delete(const WriteOptions& o, const Slice& key) { + return DB::Delete(o, key); + } + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) { + assert(false); // Not implemented + return Status::NotFound(key); + } + virtual Iterator* NewIterator(const ReadOptions& options) { + if (options.snapshot == NULL) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + reinterpret_cast(options.snapshot->number_); + return new ModelIter(snapshot_state, false); + } + } + virtual const Snapshot* GetSnapshot() { + KVMap* saved = new KVMap; + *saved = map_; + return snapshots_.New( + reinterpret_cast(saved)); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) { + const KVMap* saved = reinterpret_cast(snapshot->number_); + delete saved; + snapshots_.Delete(snapshot); + } + virtual Status Write(const WriteOptions& options, WriteBatch* batch) { + assert(options.post_write_snapshot == NULL); // Not supported + for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeValue: + map_[it.key().ToString()] = it.value().ToString(); + break; + case kTypeLargeValueRef: + assert(false); // Should not occur + break; + case kTypeDeletion: + map_.erase(it.key().ToString()); + break; + } + } + return Status::OK(); + } + + virtual bool GetProperty(const Slice& property, std::string* value) { + return false; + } + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + private: + typedef std::map KVMap; + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + SnapshotList snapshots_; +}; + +static std::string RandomKey(Random* rnd) { + int len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + ModelDB model(last_options_); + const int N = 10000; + const Snapshot* model_snap = NULL; + const Snapshot* db_snap = NULL; + std::string k, v; + for (int step = 0; step < N; step++) { + if (step % 100 == 0) { + fprintf(stderr, "Step %d of %d\n", step, N); + } + int p = rnd.Uniform(100); + if (p < 45) { // Put + k = RandomKey(&rnd); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 0000000..2664eb4 --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace leveldb { + +static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString() const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(); + result += buf; + return result; +} + +const char* InternalKeyComparator::Name() const { + return "leveldb.InternalKeyComparator"; +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (user_comparator_->Compare(*start, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) { + LargeValueRef result; + port::SHA1_Hash(value.data(), value.size(), &result.data[0]); + EncodeFixed64(&result.data[20], value.size()); + result.data[28] = static_cast(ctype); + return result; +} + +std::string LargeValueRefToFilenameString(const LargeValueRef& h) { + assert(sizeof(h.data) == LargeValueRef::ByteSize()); + assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf + static const char tohex[] = "0123456789abcdef"; + char buf[20*2]; + for (int i = 0; i < 20; i++) { + buf[2*i] = tohex[(h.data[i] >> 4) & 0xf]; + buf[2*i+1] = tohex[h.data[i] & 0xf]; + } + std::string result = std::string(buf, sizeof(buf)); + result += "-"; + result += NumberToString(h.ValueSize()); + result += "-"; + result += NumberToString(static_cast(h.compression_type())); + return result; +} + +static uint32_t hexvalue(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'A' && c <= 'F') { + return 10 + c - 'A'; + } else { + assert(c >= 'a' && c <= 'f'); + return 10 + c - 'a'; + } +} + +bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { + Slice in = s; + if (in.size() < 40) { + return false; + } + for (int i = 0; i < 20; i++) { + if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) { + return false; + } + unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]); + h->data[i] = c; + } + in.remove_prefix(40); + uint64_t value_size, ctype; + + if (ConsumeChar(&in, '-') && + ConsumeDecimalNumber(&in, &value_size) && + ConsumeChar(&in, '-') && + ConsumeDecimalNumber(&in, &ctype) && + in.empty() && + (ctype <= kSnappyCompression)) { + EncodeFixed64(&h->data[20], value_size); + h->data[28] = static_cast(ctype); + return true; + } else { + return false; + } +} + +} diff --git a/db/dbformat.h b/db/dbformat.h new file mode 100644 index 0000000..5f117f9 --- /dev/null +++ b/db/dbformat.h @@ -0,0 +1,204 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ + +#include +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/slice.h" +#include "leveldb/table_builder.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +// Grouping of constants. We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; +} + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeLargeValueRef = 0x2, +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeLargeValueRef; + +typedef uint64_t SequenceNumber; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString() const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte +// uncompressed size, and a 1 byte CompressionType code. An +// encoded form of it is embedded in the filenames of large value +// files stored in the database, and the raw binary form is stored as +// the iter->value() result for values of type kTypeLargeValueRef in +// the table and log files that make up the database. +struct LargeValueRef { + char data[29]; + + // Initialize a large value ref for the given data + static LargeValueRef Make(const Slice& data, + CompressionType compression_type); + + // Initialize a large value ref from a serialized, 29-byte reference value + static LargeValueRef FromRef(const Slice& ref) { + LargeValueRef result; + assert(ref.size() == sizeof(result.data)); + memcpy(result.data, ref.data(), sizeof(result.data)); + return result; + } + + // Return the number of bytes in a LargeValueRef (not the + // number of bytes in the value referenced). + static size_t ByteSize() { return sizeof(LargeValueRef().data); } + + // Return the number of bytes in the value referenced by "*this". + uint64_t ValueSize() const { return DecodeFixed64(&data[20]); } + + CompressionType compression_type() const { + return static_cast(data[28]); + } + + bool operator==(const LargeValueRef& b) const { + return memcmp(data, b.data, sizeof(data)) == 0; + } + bool operator<(const LargeValueRef& b) const { + return memcmp(data, b.data, sizeof(data)) < 0; + } +}; + +// Convert the large value ref to a human-readable string suitable +// for embedding in a large value filename. +extern std::string LargeValueRefToFilenameString(const LargeValueRef& h); + +// Parse the large value filename string in "input" and store it in +// "*h". If successful, returns true. Otherwise returns false. +extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref); + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast(kTypeLargeValueRef)); +} + +} + +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc new file mode 100644 index 0000000..702cbb4 --- /dev/null +++ b/db/dbformat_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeLargeValueRef))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +TEST(FormatTest, SHA1) { + // Check that we are computing the same value as sha1. + // Note that the last two numbers are the length of the input and the + // compression type. + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kNoCompression))); + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kSnappyCompression))); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/filename.cc b/db/filename.cc new file mode 100644 index 0000000..d21918c --- /dev/null +++ b/db/filename.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "util/logging.h" + +namespace leveldb { + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string TableFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "sst"); +} + +std::string LargeValueFileName(const std::string& name, + const LargeValueRef& large_ref) { + std::string result = name + "/"; + result += LargeValueRefToFilenameString(large_ref); + result += ".val"; + return result; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + return MakeFileName(dbname, number, "dbtmp"); +} + +std::string InfoLogFileName(const std::string& dbname) { + return dbname + "/LOG"; +} + +// Return the name of the old info log file for "dbname". +std::string OldInfoLogFileName(const std::string& dbname) { + return dbname + "/LOG.old"; +} + + +// Owned filenames have the form: +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val +// dbname/[0-9]+.(log|sst) +bool ParseFileName(const std::string& fname, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type) { + Slice rest(fname); + if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.size() >= 4 && + Slice(rest.data() + rest.size() - 4, 4) == ".val") { + LargeValueRef h; + if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), + &h)) { + return false; + } + *large_ref = h; + *type = kLargeValueFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 0000000..81ab2fc --- /dev/null +++ b/db/filename.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ +#define STORAGE_LEVELDB_DB_FILENAME_H_ + +#include +#include +#include "leveldb/slice.h" +#include "leveldb/status.h" +#include "port/port.h" + +namespace leveldb { + +class Env; +struct LargeValueRef; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kLargeValueFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the large value file with the specified large +// value reference in the db named by "dbname". The result will be +// prefixed with "dbname". +extern std::string LargeValueFileName(const std::string& dbname, + const LargeValueRef& large_ref); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname); + +// If filename is a leveldb file, store the type of the file in *type. +// If *type is kLargeValueFile, then the large value reference data +// from the filename is stored in "*large_ref. For all other types of +// files, the number encoded in the filename is stored in *number. If +// the filename was successfully parsed, returns true. Else return +// false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + + +} + +#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 0000000..4d2a91e --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + LargeValueRef large_ref; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + const char* large_ref; + FileType type; + } cases[] = { + { "100.log", 100, "", kLogFile }, + { "0.log", 0, "", kLogFile }, + { "0.sst", 0, "", kTableFile }, + { "CURRENT", 0, "", kCurrentFile }, + { "LOCK", 0, "", kDBLockFile }, + { "MANIFEST-2", 2, "", kDescriptorFile }, + { "MANIFEST-7", 7, "", kDescriptorFile }, + { "LOG", 0, "", kInfoLogFile }, + { "LOG.old", 0, "", kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, "", + kLogFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0", + kLargeValueFile }, + }; + for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + if (type == kLargeValueFile) { + ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref)) + << f; + } else { + ASSERT_EQ(cases[i].number, number) << f; + } + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop", + "100.val", + ".val", + "123456789012345678901234567890123456789-12340.val", + "1234567890123456789012345678901234567-123-0.val", + "12345678901234567890123456789012345678902-100-1-.val", + // Overflow on value size + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val", + // '03.val' is a bad compression type + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" }; + for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + LargeValueRef large_ref; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(192, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(200, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(100, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(999, number); + ASSERT_EQ(kTempFile, type); + + for (int i = 0; i <= kSnappyCompression; i++) { + CompressionType ctype = static_cast(i); + std::string value = "abcdef"; + LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype); + fname = LargeValueFileName("tmp", real_large_ref); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(real_large_ref == large_ref); + ASSERT_EQ(kLargeValueFile, type); + ASSERT_EQ(large_ref.compression_type(), ctype); + } +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/log_format.h b/db/log_format.h new file mode 100644 index 0000000..137cd4a --- /dev/null +++ b/db/log_format.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ +#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ + +namespace leveldb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4, +}; +static const int kMaxRecordType = kLastType; + +static const int kBlockSize = 32768; + +// Header is checksum (4 bytes), type (1 byte), length (2 bytes). +static const int kHeaderSize = 4 + 1 + 2; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/db/log_reader.cc b/db/log_reader.cc new file mode 100644 index 0000000..75e1d28 --- /dev/null +++ b/db/log_reader.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" + +#include +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) + : file_(file), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + + Slice fragment; + while (true) { + switch (ReadPhysicalRecord(&fragment)) { + case kFullType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->clear(); + *record = fragment; + return true; + + case kFirstType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: + ReportDrop( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + "unknown record type"); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + return false; +} + +void Reader::ReportDrop(size_t bytes, const char* reason) { + if (reporter_ != NULL) { + reporter_->Corruption(bytes, Status::Corruption(reason)); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() < kHeaderSize) { + if (!eof_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + if (!status.ok()) { + if (reporter_ != NULL) { + reporter_->Corruption(kBlockSize, status); + } + buffer_.clear(); + eof_ = true; + return kEof; + } else if (buffer_.size() < kBlockSize) { + eof_ = true; + } + continue; + } else if (buffer_.size() == 0) { + // End of file + return kEof; + } else { + ReportDrop(buffer_.size(), "truncated record at end of file"); + buffer_.clear(); + return kEof; + } + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + if (kHeaderSize + length > buffer_.size()) { + ReportDrop(buffer_.size(), "bad record length"); + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + buffer_.clear(); + return kBadRecord; + } + + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + ReportDrop(buffer_.size(), "checksum mismatch"); + buffer_.clear(); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} +} diff --git a/db/log_reader.h b/db/log_reader.h new file mode 100644 index 0000000..baf1475 --- /dev/null +++ b/db/log_reader.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ +#define STORAGE_LEVELDB_DB_LOG_READER_H_ + +#include "db/log_format.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class SequentialFile; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-NULL, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + Reader(SequentialFile* file, Reporter* reporter, bool checksum); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + private: + SequentialFile* const file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + kBadRecord = kMaxRecordType + 2 + }; + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + void ReportDrop(size_t bytes, const char* reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/db/log_test.cc b/db/log_test.cc new file mode 100644 index 0000000..025a5ff --- /dev/null +++ b/db/log_test.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string. +static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + }; + + class StringSource : public SequentialFile { + public: + Slice contents_; + bool force_error_; + bool returned_partial_; + StringSource() : force_error_(false), returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + ASSERT_EQ(kBlockSize, n); + + if (force_error_) { + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + *result = Slice(contents_.data(), n); + contents_.remove_prefix(n); + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + StringDest dest_; + StringSource source_; + ReportCollector report_; + bool reading_; + Writer writer_; + Reader reader_; + + public: + LogTest() : reading_(false), + writer_(&dest_), + reader_(&source_, &report_, true/*checksum*/) { + } + + void Write(const std::string& msg) { + ASSERT_TRUE(!reading_) << "Write() after starting to read"; + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_.contents_.size(); + } + + std::string Read() { + if (!reading_) { + reading_ = true; + source_.contents_ = Slice(dest_.contents_); + } + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_.contents_[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_.contents_[offset] = new_byte; + } + + void ShrinkSize(int bytes) { + dest_.contents_.resize(dest_.contents_.size() - bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_.contents_[header_offset], crc); + } + + void ForceError() { + source_.force_error_ = true; + } + + size_t DroppedBytes() const { + return report_.dropped_bytes_; + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } +}; + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecord) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); + ASSERT_EQ("OK", MatchError("truncated record at end of file")); +} + +TEST(LogTest, BadLength) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/log_writer.cc b/db/log_writer.cc new file mode 100644 index 0000000..18ca37a --- /dev/null +++ b/db/log_writer.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Writer::Writer(WritableFile* dest) + : dest_(dest), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + const int avail = kBlockSize - block_offset_ - kHeaderSize; + assert(avail >= 0); + + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool begin = (ptr == slice.data()); + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + } while (s.ok() && left > 0); + return s; +} + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + // Format the header + char buf[kHeaderSize]; + buf[4] = static_cast(n & 0xff); + buf[5] = static_cast(n >> 8); + buf[6] = static_cast(t); + + // Compute the crc of the record type and the payload. + uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage + EncodeFixed32(buf, crc); + + // Write the header and the payload + Status s = dest_->Append(Slice(buf, kHeaderSize)); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n)); + if (s.ok()) { + s = dest_->Flush(); + } + } + block_offset_ += kHeaderSize + n; + return s; +} + +} +} diff --git a/db/log_writer.h b/db/log_writer.h new file mode 100644 index 0000000..d3cf27d --- /dev/null +++ b/db/log_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ +#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ + +#include +#include "db/log_format.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class WritableFile; + +namespace log { + +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(WritableFile* dest); + ~Writer(); + + Status AddRecord(const Slice& slice); + + private: + WritableFile* dest_; + int block_offset_; // Current offset in block + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // No copying allowed + Writer(const Writer&); + void operator=(const Writer&); +}; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/db/memtable.cc b/db/memtable.cc new file mode 100644 index 0000000..a3b618a --- /dev/null +++ b/db/memtable.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/memtable.h" +#include "db/dbformat.h" +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "util/coding.h" + +namespace leveldb { + +static Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len; + const char* p = data; + p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + return Slice(p, len); +} + +MemTable::MemTable(const InternalKeyComparator& cmp) + : comparator_(cmp), + table_(comparator_, &arena_) { +} + +MemTable::~MemTable() { +} + +size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } + +int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(aptr); + Slice b = GetLengthPrefixedSlice(bptr); + return comparator.Compare(a, b); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. +static const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, target.size()); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator: public Iterator { + public: + explicit MemTableIterator(MemTable::Table* table) { + iter_ = new MemTable::Table::Iterator(table); + } + virtual ~MemTableIterator() { delete iter_; } + + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } + virtual Slice value() const { + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + virtual Status status() const { return Status::OK(); } + + private: + MemTable::Table::Iterator* iter_; + std::string tmp_; // For passing to EncodeKey + + // No copying allowed + MemTableIterator(const MemTableIterator&); + void operator=(const MemTableIterator&); +}; + +Iterator* MemTable::NewIterator() { + return new MemTableIterator(&table_); +} + +void MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, + const Slice& value) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + size_t key_size = key.size(); + size_t val_size = value.size(); + size_t internal_key_size = key_size + 8; + const size_t encoded_len = + VarintLength(internal_key_size) + internal_key_size + + VarintLength(val_size) + val_size; + char* buf = arena_.Allocate(encoded_len); + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + p += key_size; + EncodeFixed64(p, (s << 8) | type); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((p + val_size) - buf == encoded_len); + table_.Insert(buf); +} + +} diff --git a/db/memtable.h b/db/memtable.h new file mode 100644 index 0000000..45b3342 --- /dev/null +++ b/db/memtable.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_ +#define STORAGE_LEVELDB_DB_MEMTABLE_H_ + +#include +#include "leveldb/db.h" +#include "db/dbformat.h" +#include "db/skiplist.h" +#include "util/arena.h" + +namespace leveldb { + +class InternalKeyComparator; +class Mutex; +class MemTableIterator; + +class MemTable { + public: + explicit MemTable(const InternalKeyComparator& comparator); + ~MemTable(); + + // Returns an estimate of the number of bytes of data in use by this + // data structure. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + size_t ApproximateMemoryUsage(); + + // Return an iterator that yields the contents of the memtable. + // + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. The keys returned by this + // iterator are internal keys encoded by AppendInternalKey in the + // db/format.{h,cc} module. + Iterator* NewIterator(); + + // Add an entry into memtable that maps key to value at the + // specified sequence number and with the specified type. + // Typically value will be empty if type==kTypeDeletion. + void Add(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value); + + private: + struct KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + int operator()(const char* a, const char* b) const; + }; + friend class MemTableIterator; + friend class MemTableBackwardIterator; + + typedef SkipList Table; + + KeyComparator comparator_; + Arena arena_; + Table table_; + + // No copying allowed + MemTable(const MemTable&); + void operator=(const MemTable&); +}; + +} + +#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_ diff --git a/db/repair.cc b/db/repair.cc new file mode 100644 index 0000000..014e00e --- /dev/null +++ b/db/repair.cc @@ -0,0 +1,396 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// We recover the contents of the descriptor from the other files we find. +// (1) Any log files are first converted to tables +// (2) We scan every table to compute +// (a) smallest/largest for the table +// (b) large value refs from the table +// (c) largest sequence number in the table +// (3) We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2c) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, +// large-value-refs, ...) in the table's meta section to speed up +// ScanTable. + +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/env.h" + +namespace leveldb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + options_(SanitizeOptions(dbname, &icmp_, options)), + owns_info_log_(options_.info_log != options.info_log), + next_file_number_(1) { + // TableCache can be small since we expect each table to be opened once. + table_cache_ = new TableCache(dbname_, &options_, 10); + } + + ~Repairer() { + delete table_cache_; + if (owns_info_log_) { + delete options_.info_log; + } + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (int i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(env_, options_.info_log, + "**** Repaired leveldb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. " + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + Options const options_; + bool owns_info_log_; + TableCache* table_cache_; + VersionEdit edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::IOError(dbname_, "repair found no files"); + } + + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + if (type == kLargeValueFile) { + // Will be picked up when we process a Table that points to it + } else if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (int i = 0; i < logs_.size(); i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. + Log(env, info_log, "Log #%llu: dropping %d bytes; %s", + (unsigned long long) lognum, + static_cast(bytes), + s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(dbname_, log); + SequentialFile* lfile; + Status status = env_->NewSequentialFile(logname, &lfile); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.lognum = log; + // We intentially make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). + log::Reader reader(lfile, &reporter, false/*do not checksum*/); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable mem(icmp_); + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::InsertInto(&batch, &mem); + if (status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + Log(env_, options_.info_log, "Log #%llu: ignoring %s", + (unsigned long long) log, + status.ToString().c_str()); + status = Status::OK(); // Keep going with rest of file + } + } + delete lfile; + + // We ignore any version edits generated by the conversion to a Table + // since ExtractMetaData() will also generate edits. + VersionEdit skipped; + FileMetaData meta; + meta.number = next_file_number_++; + Iterator* iter = mem.NewIterator(); + status = BuildTable(dbname_, env_, options_, table_cache_, iter, + &meta, &skipped); + delete iter; + if (status.ok()) { + if (meta.file_size > 0) { + table_numbers_.push_back(meta.number); + } + } + Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + (unsigned long long) log, + counter, + (unsigned long long) meta.number, + status.ToString().c_str()); + return status; + } + + void ExtractMetaData() { + std::vector kept; + for (int i = 0; i < table_numbers_.size(); i++) { + TableInfo t; + t.meta.number = table_numbers_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName(dbname_, table_numbers_[i]); + Log(env_, options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) table_numbers_[i], + status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName(dbname_, t->meta.number); + int counter = 0; + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), t->meta.number, t->meta.file_size); + bool empty = true; + ParsedInternalKey parsed; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(env_, options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t->meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence > t->max_sequence) { + t->max_sequence = parsed.sequence; + } + + if (ExtractValueType(key) == kTypeLargeValueRef) { + if (iter->value().size() != LargeValueRef::ByteSize()) { + Log(env_, options_.info_log, "Table #%llu: bad large value ref", + (unsigned long long) t->meta.number); + } else { + edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + t->meta.number, + key); + } + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(env_, options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + WritableFile* file; + Status status = env_->NewWritableFile(tmp, &file); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (int i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_.SetComparatorName(icmp_.user_comparator()->Name()); + edit_.SetLogNumber(0); + edit_.SetNextFile(next_file_number_); + edit_.SetLastSequence(max_sequence); + + for (int i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_.AddFile(0, t.meta.number, t.meta.file_size, + t.meta.smallest, t.meta.largest); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(file); + std::string record; + edit_.EncodeTo(&record); + status = log.AddRecord(record); + } + if (status.ok()) { + status = file->Close(); + } + delete file; + file = NULL; + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (int i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != NULL) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == NULL) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(env_, options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 0000000..be39354 --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... + +#include +#include +#include "port/port.h" +#include "util/arena.h" +#include "util/random.h" + +namespace leveldb { + +class Arena; + +template +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*arena". Objects allocated in the arena + // must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Arena* arena); + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + enum { kMaxHeight = 12 }; + + // Immutable after construction + Comparator const compare_; + Arena* const arena_; // Arena used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + port::AtomicPointer max_height_; // Height of the entire list + + inline int GetMaxHeight() const { + return reinterpret_cast(max_height_.NoBarrier_Load()); + } + + // Read/written only by Insert(). + Random rnd_; + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Return the earliest node that comes at or after key. + // Return NULL if there is no such node. + // + // If prev is non-NULL, fills prev[level] with pointer to previous + // node at "level" for every level in [0..max_height_-1]. + Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + Node* FindLessThan(const Key& key) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // No copying allowed + SkipList(const SkipList&); + void operator=(const SkipList&); +}; + +// Implementation details follow +template +struct SkipList::Node { + explicit Node(const Key& k) : key(k) { } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_[n].Acquire_Load()); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return reinterpret_cast(next_[n].NoBarrier_Load()); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].NoBarrier_Store(x); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. + port::AtomicPointer next_[1]; +}; + +template +typename SkipList::Node* +SkipList::NewNode(const Key& key, int height) { + char* mem = arena_->AllocateAligned( + sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + return new (mem) Node(key); +} + +template +inline SkipList::Iterator::Iterator(const SkipList* list) { + list_ = list; + node_ = NULL; +} + +template +inline bool SkipList::Iterator::Valid() const { + return node_ != NULL; +} + +template +inline const Key& SkipList::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template +inline void SkipList::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template +inline void SkipList::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template +inline void SkipList::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target, NULL); +} + +template +inline void SkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void SkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = NULL; + } +} + +template +int SkipList::RandomHeight() { + // Increase height with probability 1 in kBranching + static const unsigned int kBranching = 4; + int height = 1; + while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight); + return height; +} + +template +bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { + // NULL n is considered infinite + return (n != NULL) && (compare_(n->key, key) < 0); +} + +template +typename SkipList::Node* SkipList::FindGreaterOrEqual(const Key& key, Node** prev) + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != NULL) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + +template +typename SkipList::Node* +SkipList::FindLessThan(const Key& key) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == NULL || compare_(next->key, key) >= 0) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +typename SkipList::Node* SkipList::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == NULL) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +SkipList::SkipList(Comparator cmp, Arena* arena) + : compare_(cmp), + arena_(arena), + head_(NewNode(0 /* any key will do */, kMaxHeight)), + max_height_(reinterpret_cast(1)), + rnd_(0xdeadbeef) { + for (int i = 0; i < kMaxHeight; i++) { + head_->SetNext(i, NULL); + } +} + +template +void SkipList::Insert(const Key& key) { + // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // here since Insert() is externally synchronized. + Node* prev[kMaxHeight]; + Node* x = FindGreaterOrEqual(key, prev); + + // Our data structure does not allow duplicate insertion + assert(x == NULL || !Equal(key, x->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev[i] = head_; + } + //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (NULL), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since NULL sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.NoBarrier_Store(reinterpret_cast(height)); + } + + x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); + prev[i]->SetNext(i, x); + } +} + +template +bool SkipList::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key, NULL); + if (x != NULL && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc new file mode 100644 index 0000000..5f9ec0d --- /dev/null +++ b/db/skiplist_test.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/skiplist.h" +#include +#include "leveldb/env.h" +#include "util/arena.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { + +typedef uint64_t Key; + +struct Comparator { + int operator()(const Key& a, const Key& b) const { + if (a < b) { + return -1; + } else if (a > b) { + return +1; + } else { + return 0; + } + } +}; + +class SkipTest { }; + +TEST(SkipTest, Empty) { + Arena arena; + Comparator cmp; + SkipList list(cmp, &arena); + ASSERT_TRUE(!list.Contains(10)); + + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToFirst(); + ASSERT_TRUE(!iter.Valid()); + iter.Seek(100); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToLast(); + ASSERT_TRUE(!iter.Valid()); +} + +TEST(SkipTest, InsertAndLookup) { + const int N = 2000; + const int R = 5000; + Random rnd(1000); + std::set keys; + Arena arena; + Comparator cmp; + SkipList list(cmp, &arena); + for (int i = 0; i < N; i++) { + Key key = rnd.Next() % R; + if (keys.insert(key).second) { + list.Insert(key); + } + } + + for (int i = 0; i < R; i++) { + if (list.Contains(i)) { + ASSERT_EQ(keys.count(i), 1); + } else { + ASSERT_EQ(keys.count(i), 0); + } + } + + // Simple iterator tests + { + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + + iter.Seek(0); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToFirst(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToLast(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), iter.key()); + } + + // Forward iteration test + for (int i = 0; i < R; i++) { + SkipList::Iterator iter(&list); + iter.Seek(i); + + // Compare against model iterator + std::set::iterator model_iter = keys.lower_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.end()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + ++model_iter; + iter.Next(); + } + } + } + + // Backward iteration test + { + SkipList::Iterator iter(&list); + iter.SeekToLast(); + + // Compare against model iterator + for (std::set::reverse_iterator model_iter = keys.rbegin(); + model_iter != keys.rend(); + ++model_iter) { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + iter.Prev(); + } + ASSERT_TRUE(!iter.Valid()); + } +} + +// We want to make sure that with a single writer and multiple +// concurrent readers (with no synchronization other than when a +// reader's iterator is created), the reader always observes all the +// data that was present in the skip list when the iterator was +// constructor. Because insertions are happening concurrently, we may +// also observe new values that were inserted since the iterator was +// constructed, but we should never miss any values that were present +// at iterator construction time. +// +// We generate multi-part keys: +// +// where: +// key is in range [0..K-1] +// gen is a generation number for key +// hash is hash(key,gen) +// +// The insertion code picks a random key, sets gen to be 1 + the last +// generation number inserted for that key, and sets hash to Hash(key,gen). +// +// At the beginning of a read, we snapshot the last inserted +// generation number for each key. We then iterate, including random +// calls to Next() and Seek(). For every key we encounter, we +// check that it is either expected given the initial snapshot or has +// been concurrently added since the iterator started. +class ConcurrentTest { + private: + static const uint32_t K = 4; + + static uint64_t key(Key key) { return (key >> 40); } + static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } + static uint64_t hash(Key key) { return key & 0xff; } + + static uint64_t HashNumbers(uint64_t k, uint64_t g) { + uint64_t data[2] = { k, g }; + return Hash(reinterpret_cast(data), sizeof(data), 0); + } + + static Key MakeKey(uint64_t k, uint64_t g) { + assert(sizeof(Key) == sizeof(uint64_t)); + assert(k <= K); // We sometimes pass K to seek to the end of the skiplist + assert(g <= 0xffffffffu); + return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); + } + + static bool IsValidKey(Key k) { + return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); + } + + static Key RandomTarget(Random* rnd) { + switch (rnd->Next() % 10) { + case 0: + // Seek to beginning + return MakeKey(0, 0); + case 1: + // Seek to end + return MakeKey(K, 0); + default: + // Seek to middle + return MakeKey(rnd->Next() % K, 0); + } + } + + // Per-key generation + struct State { + port::AtomicPointer generation[K]; + void Set(int k, intptr_t v) { + generation[k].Release_Store(reinterpret_cast(v)); + } + intptr_t Get(int k) { + return reinterpret_cast(generation[k].Acquire_Load()); + } + + State() { + for (int k = 0; k < K; k++) { + Set(k, 0); + } + } + }; + + // Current state of the test + State current_; + + Arena arena_; + + // SkipList is not protected by mu_. We just use a single writer + // thread to modify it. + SkipList list_; + + public: + ConcurrentTest() : list_(Comparator(), &arena_) { } + + // REQUIRES: External synchronization + void WriteStep(Random* rnd) { + const uint32_t k = rnd->Next() % K; + const intptr_t g = current_.Get(k) + 1; + const Key key = MakeKey(k, g); + list_.Insert(key); + current_.Set(k, g); + } + + void ReadStep(Random* rnd) { + // Remember the initial committed state of the skiplist. + State initial_state; + for (int k = 0; k < K; k++) { + initial_state.Set(k, current_.Get(k)); + } + + Key pos = RandomTarget(rnd); + SkipList::Iterator iter(&list_); + iter.Seek(pos); + while (true) { + Key current; + if (!iter.Valid()) { + current = MakeKey(K, 0); + } else { + current = iter.key(); + ASSERT_TRUE(IsValidKey(current)) << std::hex << current; + } + ASSERT_LE(pos, current) << "should not go backwards"; + + // Verify that everything in [pos,current) was not present in + // initial_state. + while (pos < current) { + ASSERT_LT(key(pos), K) << std::hex << pos; + + // Note that generation 0 is never inserted, so it is ok if + // <*,0,*> is missing. + ASSERT_TRUE((gen(pos) == 0) || + (gen(pos) > initial_state.Get(key(pos))) + ) << "key: " << key(pos) + << "; gen: " << gen(pos) + << "; initgen: " + << initial_state.Get(key(pos)); + + // Advance to next key in the valid key space + if (key(pos) < key(current)) { + pos = MakeKey(key(pos) + 1, 0); + } else { + pos = MakeKey(key(pos), gen(pos) + 1); + } + } + + if (!iter.Valid()) { + break; + } + + if (rnd->Next() % 2) { + iter.Next(); + pos = MakeKey(key(pos), gen(pos) + 1); + } else { + Key new_target = RandomTarget(rnd); + if (new_target > pos) { + pos = new_target; + iter.Seek(new_target); + } + } + } + } +}; +const uint32_t ConcurrentTest::K; + +// Simple test that does single-threaded testing of the ConcurrentTest +// scaffolding. +TEST(SkipTest, ConcurrentWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + test.WriteStep(&rnd); + } +} + +class TestState { + public: + ConcurrentTest t_; + int seed_; + port::AtomicPointer quit_flag_; + + enum ReaderState { + STARTING, + RUNNING, + DONE + }; + + explicit TestState(int s) + : seed_(s), + quit_flag_(NULL), + state_(STARTING), + state_cv_(&mu_) {} + + void Wait(ReaderState s) { + mu_.Lock(); + while (state_ != s) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + void Change(ReaderState s) { + mu_.Lock(); + state_ = s; + state_cv_.Signal(); + mu_.Unlock(); + } + + private: + port::Mutex mu_; + ReaderState state_; + port::CondVar state_cv_; +}; + +static void ConcurrentReader(void* arg) { + TestState* state = reinterpret_cast(arg); + Random rnd(state->seed_); + int64_t reads = 0; + state->Change(TestState::RUNNING); + while (!state->quit_flag_.Acquire_Load()) { + state->t_.ReadStep(&rnd); + ++reads; + } + state->Change(TestState::DONE); +} + +static void RunConcurrent(int run) { + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int i = 0; i < kSize; i++) { + state.t_.WriteStep(&rnd); + } + state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do + state.Wait(TestState::DONE); + } +} + +TEST(SkipTest, Concurrent1) { RunConcurrent(1); } +TEST(SkipTest, Concurrent2) { RunConcurrent(2); } +TEST(SkipTest, Concurrent3) { RunConcurrent(3); } +TEST(SkipTest, Concurrent4) { RunConcurrent(4); } +TEST(SkipTest, Concurrent5) { RunConcurrent(5); } + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/snapshot.h b/db/snapshot.h new file mode 100644 index 0000000..9a90756 --- /dev/null +++ b/db/snapshot.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ +#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ + +#include "leveldb/db.h" + +namespace leveldb { + +class SnapshotList; + +// Snapshots are kept in a doubly-linked list in the DB. +// Each Snapshot corresponds to a particular sequence number. +class Snapshot { + public: + SequenceNumber number_; // const after creation + + private: + friend class SnapshotList; + + // Snapshot is kept in a doubly-linked circular list + Snapshot* prev_; + Snapshot* next_; + + SnapshotList* list_; // just for sanity checks +}; + +class SnapshotList { + public: + SnapshotList() { + list_.prev_ = &list_; + list_.next_ = &list_; + } + + bool empty() const { return list_.next_ == &list_; } + Snapshot* oldest() const { assert(!empty()); return list_.next_; } + Snapshot* newest() const { assert(!empty()); return list_.prev_; } + + const Snapshot* New(SequenceNumber seq) { + Snapshot* s = new Snapshot; + s->number_ = seq; + s->list_ = this; + s->next_ = &list_; + s->prev_ = list_.prev_; + s->prev_->next_ = s; + s->next_->prev_ = s; + return s; + } + + void Delete(const Snapshot* s) { + assert(s->list_ == this); + s->prev_->next_ = s->next_; + s->next_->prev_ = s->prev_; + delete s; + } + + private: + // Dummy head of doubly-linked list of snapshots + Snapshot list_; +}; + +} + +#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ diff --git a/db/table_cache.cc b/db/table_cache.cc new file mode 100644 index 0000000..325d707 --- /dev/null +++ b/db/table_cache.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/table_cache.h" + +#include "db/filename.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "util/coding.h" + +namespace leveldb { + +struct TableAndFile { + RandomAccessFile* file; + Table* table; +}; + +static void DeleteEntry(const Slice& key, void* value) { + TableAndFile* tf = reinterpret_cast(value); + delete tf->table; + delete tf->file; + delete tf; +} + +static void UnrefEntry(void* arg1, void* arg2) { + Cache* cache = reinterpret_cast(arg1); + Cache::Handle* h = reinterpret_cast(arg2); + cache->Release(h); +} + +TableCache::TableCache(const std::string& dbname, + const Options* options, + int entries) + : env_(options->env), + dbname_(dbname), + options_(options), + cache_(NewLRUCache(entries)) { +} + +TableCache::~TableCache() { + delete cache_; +} + +Iterator* TableCache::NewIterator(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + Table** tableptr) { + if (tableptr != NULL) { + *tableptr = NULL; + } + + char buf[sizeof(file_number)]; + EncodeFixed64(buf, file_number); + Slice key(buf, sizeof(buf)); + Cache::Handle* handle = cache_->Lookup(key); + if (handle == NULL) { + std::string fname = TableFileName(dbname_, file_number); + RandomAccessFile* file = NULL; + Table* table = NULL; + Status s = env_->NewRandomAccessFile(fname, &file); + if (s.ok()) { + s = Table::Open(*options_, file, file_size, &table); + } + + if (!s.ok()) { + assert(table == NULL); + delete file; + // We do not cache error results so that if the error is transient, + // or somebody repairs the file, we recover automatically. + return NewErrorIterator(s); + } + + TableAndFile* tf = new TableAndFile; + tf->file = file; + tf->table = table; + handle = cache_->Insert(key, tf, 1, &DeleteEntry); + } + + Table* table = reinterpret_cast(cache_->Value(handle))->table; + Iterator* result = table->NewIterator(options); + result->RegisterCleanup(&UnrefEntry, cache_, handle); + if (tableptr != NULL) { + *tableptr = table; + } + return result; +} + +void TableCache::Evict(uint64_t file_number) { + char buf[sizeof(file_number)]; + EncodeFixed64(buf, file_number); + cache_->Erase(Slice(buf, sizeof(buf))); +} + +} diff --git a/db/table_cache.h b/db/table_cache.h new file mode 100644 index 0000000..5376194 --- /dev/null +++ b/db/table_cache.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ +#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ + +#include +#include +#include "db/dbformat.h" +#include "leveldb/cache.h" +#include "leveldb/table.h" +#include "port/port.h" + +namespace leveldb { + +class Env; + +class TableCache { + public: + TableCache(const std::string& dbname, const Options* options, int entries); + ~TableCache(); + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). If "tableptr" is + // non-NULL, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or NULL if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + Iterator* NewIterator(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + Table** tableptr = NULL); + + // Evict any entry for the specified file number + void Evict(uint64_t file_number); + + private: + Env* const env_; + const std::string dbname_; + const Options* options_; + Cache* cache_; +}; + +} + +#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/db/version_edit.cc b/db/version_edit.cc new file mode 100644 index 0000000..689dbe0 --- /dev/null +++ b/db/version_edit.cc @@ -0,0 +1,301 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/version_set.h" +#include "util/coding.h" + +namespace leveldb { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. +enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + kLargeValueRef = 8, + kPrevLogNumber = 9, +}; + +void VersionEdit::Clear() { + comparator_.clear(); + log_number_ = 0; + prev_log_number_ = 0; + last_sequence_ = 0; + next_file_number_ = 0; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_last_sequence_ = false; + deleted_files_.clear(); + new_files_.clear(); + large_refs_added_.clear(); +} + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32(dst, kLogNumber); + PutVarint64(dst, log_number_); + } + if (has_prev_log_number_) { + PutVarint32(dst, kPrevLogNumber); + PutVarint64(dst, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32(dst, kNextFileNumber); + PutVarint64(dst, next_file_number_); + } + if (has_last_sequence_) { + PutVarint32(dst, kLastSequence); + PutVarint64(dst, last_sequence_); + } + + for (int i = 0; i < compact_pointers_.size(); i++) { + PutVarint32(dst, kCompactPointer); + PutVarint32(dst, compact_pointers_[i].first); // level + PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); + } + + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + PutVarint32(dst, kDeletedFile); + PutVarint32(dst, iter->first); // level + PutVarint64(dst, iter->second); // file number + } + + for (int i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + PutVarint32(dst, kNewFile); + PutVarint32(dst, new_files_[i].first); // level + PutVarint64(dst, f.number); + PutVarint64(dst, f.file_size); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + } + + for (int i = 0; i < large_refs_added_.size(); i++) { + const VersionEdit::Large& l = large_refs_added_[i]; + PutVarint32(dst, kLargeValueRef); + PutLengthPrefixedSlice(dst, + Slice(l.large_ref.data, LargeValueRef::ByteSize())); + PutVarint64(dst, l.fnum); + PutLengthPrefixedSlice(dst, l.internal_key.Encode()); + } +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return true; + } else { + return false; + } +} + +static bool GetLevel(Slice* input, int* level) { + uint32_t v; + if (GetVarint32(input, &v) && + v < config::kNumLevels) { + *level = v; + return true; + } else { + return false; + } +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = NULL; + uint32_t tag; + + // Temporary storage for parsing + int level; + uint64_t number; + FileMetaData f; + Slice str; + Large large; + InternalKey key; + + while (msg == NULL && GetVarint32(&input, &tag)) { + switch (tag) { + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kLastSequence: + if (GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case kCompactPointer: + if (GetLevel(&input, &level) && + GetInternalKey(&input, &key)) { + compact_pointers_.push_back(std::make_pair(level, key)); + } else { + msg = "compaction pointer"; + } + break; + + case kDeletedFile: + if (GetLevel(&input, &level) && + GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + msg = "deleted file"; + } + break; + + case kNewFile: + if (GetLevel(&input, &level) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + new_files_.push_back(std::make_pair(level, f)); + } else { + msg = "new-file entry"; + } + break; + + case kLargeValueRef: + if (GetLengthPrefixedSlice(&input, &str) && + (str.size() == LargeValueRef::ByteSize()) && + GetVarint64(&input, &large.fnum) && + GetInternalKey(&input, &large.internal_key)) { + large.large_ref = LargeValueRef::FromRef(str); + large_refs_added_.push_back(large); + } else { + msg = "large ref"; + } + break; + + default: + msg = "unknown tag"; + break; + } + } + + if (msg == NULL && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != NULL) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString() const { + std::string r; + r.append("VersionEdit {"); + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { + r.append("\n NextFile: "); + AppendNumberTo(&r, next_file_number_); + } + if (has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (int i = 0; i < compact_pointers_.size(); i++) { + r.append("\n CompactPointer: "); + AppendNumberTo(&r, compact_pointers_[i].first); + r.append(" '"); + AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); + r.append("'"); + } + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, iter->first); + r.append(" "); + AppendNumberTo(&r, iter->second); + } + for (int i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.number); + r.append(" "); + AppendNumberTo(&r, f.file_size); + r.append(" '"); + AppendEscapedStringTo(&r, f.smallest.Encode()); + r.append("' .. '"); + AppendEscapedStringTo(&r, f.largest.Encode()); + r.append("'"); + } + for (int i = 0; i < large_refs_added_.size(); i++) { + const VersionEdit::Large& l = large_refs_added_[i]; + r.append("\n LargeRef: "); + AppendNumberTo(&r, l.fnum); + r.append(" "); + r.append(LargeValueRefToFilenameString(l.large_ref)); + r.append(" '"); + AppendEscapedStringTo(&r, l.internal_key.Encode()); + r.append("'"); + } + r.append("\n}\n"); + return r; +} + +} diff --git a/db/version_edit.h b/db/version_edit.h new file mode 100644 index 0000000..7e417b5 --- /dev/null +++ b/db/version_edit.h @@ -0,0 +1,124 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ +#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ + +#include +#include +#include +#include "db/dbformat.h" + +namespace leveldb { + +class VersionSet; + +struct FileMetaData { + int refs; + uint64_t number; + uint64_t file_size; // File size in bytes + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + + FileMetaData() : refs(0), file_size(0) { } +}; + +class VersionEdit { + public: + VersionEdit() { Clear(); } + ~VersionEdit() { } + + void Clear(); + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + void SetCompactPointer(int level, const InternalKey& key) { + compact_pointers_.push_back(std::make_pair(level, key)); + } + + // Add the specified file at the specified number. + // REQUIRES: This version has not been saved (see VersionSet::SaveTo) + // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + void AddFile(int level, uint64_t file, + uint64_t file_size, + const InternalKey& smallest, + const InternalKey& largest) { + FileMetaData f; + f.number = file; + f.file_size = file_size; + f.smallest = smallest; + f.largest = largest; + new_files_.push_back(std::make_pair(level, f)); + } + + // Delete the specified "file" from the specified "level". + void DeleteFile(int level, uint64_t file) { + deleted_files_.insert(std::make_pair(level, file)); + } + + // Record that a large value with the specified large_ref was + // written to the output file numbered "fnum" + void AddLargeValueRef(const LargeValueRef& large_ref, + uint64_t fnum, + const Slice& internal_key) { + large_refs_added_.resize(large_refs_added_.size() + 1); + Large* large = &(large_refs_added_.back()); + large->large_ref = large_ref; + large->fnum = fnum; + large->internal_key.DecodeFrom(internal_key); + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString() const; + + private: + friend class VersionSet; + + typedef std::set< std::pair > DeletedFileSet; + + std::string comparator_; + uint64_t log_number_; + uint64_t prev_log_number_; + uint64_t next_file_number_; + SequenceNumber last_sequence_; + bool has_comparator_; + bool has_log_number_; + bool has_prev_log_number_; + bool has_next_file_number_; + bool has_last_sequence_; + + std::vector< std::pair > compact_pointers_; + DeletedFileSet deleted_files_; + std::vector< std::pair > new_files_; + struct Large { + LargeValueRef large_ref; + uint64_t fnum; + InternalKey internal_key; + }; + std::vector large_refs_added_; +}; + +} + +#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc new file mode 100644 index 0000000..6906ec3 --- /dev/null +++ b/db/version_edit_test.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace leveldb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); + edit.DeleteFile(4, kBig + 700 + i); + edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), + kBig + 800 + i, "foobar"); + edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression), + kBig + 801 + i, "baz"); + edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 0000000..31f79bb --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,1120 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_set.h" + +#include +#include +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "leveldb/env.h" +#include "leveldb/table_builder.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +static const int kTargetFileSize = 2 * 1048576; + +// Maximum bytes of overlaps in grandparent (i.e., level+2) before we +// stop building a single file in a level->level+1 compaction. +static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; + +static double MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + double result = 10 * 1048576.0; // Result for both level-0 and level-1 + while (level > 1) { + result *= 10; + level--; + } + return result; +} + +static uint64_t MaxFileSizeForLevel(int level) { + return kTargetFileSize; // We could vary per level to reduce number of files? +} + +namespace { +std::string IntSetToString(const std::set& s) { + std::string result = "{"; + for (std::set::const_iterator it = s.begin(); + it != s.end(); + ++it) { + result += (result.size() > 1) ? "," : ""; + result += NumberToString(*it); + } + result += "}"; + return result; +} +} + +Version::~Version() { + assert(refs_ == 0); + for (int level = 0; level < config::kNumLevels; level++) { + for (int i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + assert(f->refs >= 0); + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + delete cleanup_mem_; +} + +// An internal iterator. For a given version/level pair, yields +// information about the files in the level. For a given entry, key() +// is the largest key that occurs in the file, and value() is an +// 16-byte value containing the file number and file size, both +// encoded using EncodeFixed64. +class Version::LevelFileNumIterator : public Iterator { + public: + LevelFileNumIterator(const Version* version, + const std::vector* flist) + : icmp_(version->vset_->icmp_.user_comparator()), + flist_(flist), + index_(flist->size()) { // Marks as invalid + } + virtual bool Valid() const { + return index_ < flist_->size(); + } + virtual void Seek(const Slice& target) { + uint32_t left = 0; + uint32_t right = flist_->size() - 1; + while (left < right) { + uint32_t mid = (left + right) / 2; + int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); + if (cmp < 0) { + // Key at "mid.largest" is < than "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. + right = mid; + } + } + index_ = left; + } + virtual void SeekToFirst() { index_ = 0; } + virtual void SeekToLast() { + index_ = flist_->empty() ? 0 : flist_->size() - 1; + } + virtual void Next() { + assert(Valid()); + index_++; + } + virtual void Prev() { + assert(Valid()); + if (index_ == 0) { + index_ = flist_->size(); // Marks as invalid + } else { + index_--; + } + } + Slice key() const { + assert(Valid()); + return (*flist_)[index_]->largest.Encode(); + } + Slice value() const { + assert(Valid()); + EncodeFixed64(value_buf_, (*flist_)[index_]->number); + EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); + return Slice(value_buf_, sizeof(value_buf_)); + } + virtual Status status() const { return Status::OK(); } + private: + const InternalKeyComparator icmp_; + const std::vector* const flist_; + int index_; + + // Backing store for value(). Holds the file number and size. + mutable char value_buf_[16]; +}; + +static Iterator* GetFileIterator(void* arg, + const ReadOptions& options, + const Slice& file_value) { + TableCache* cache = reinterpret_cast(arg); + if (file_value.size() != 16) { + return NewErrorIterator( + Status::Corruption("FileReader invoked with unexpected value")); + } else { + return cache->NewIterator(options, + DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); + } +} + +Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, + int level) const { + return NewTwoLevelIterator( + new LevelFileNumIterator(this, &files_[level]), + &GetFileIterator, vset_->table_cache_, options); +} + +void Version::AddIterators(const ReadOptions& options, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (int i = 0; i < files_[0].size(); i++) { + iters->push_back( + vset_->table_cache_->NewIterator( + options, files_[0][i]->number, files_[0][i]->file_size)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. + for (int level = 1; level < config::kNumLevels; level++) { + if (!files_[level].empty()) { + iters->push_back(NewConcatenatingIterator(options, level)); + } + } +} + +void Version::Ref() { + ++refs_; +} + +void Version::Unref() { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + vset_->MaybeDeleteOldVersions(); + // TODO: try to delete obsolete files + } +} + +std::string Version::DebugString() const { + std::string r; + for (int level = 0; level < config::kNumLevels; level++) { + // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g'] + r.append("level "); + AppendNumberTo(&r, level); + r.push_back(':'); + const std::vector& files = files_[level]; + for (int i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->number); + r.push_back(':'); + AppendNumberTo(&r, files[i]->file_size); + r.append("['"); + AppendEscapedStringTo(&r, files[i]->smallest.Encode()); + r.append("' .. '"); + AppendEscapedStringTo(&r, files[i]->largest.Encode()); + r.append("']"); + } + r.push_back('\n'); + } + return r; +} + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. +class VersionSet::Builder { + private: + typedef std::map FileMap; + VersionSet* vset_; + FileMap files_[config::kNumLevels]; + + public: + // Initialize a builder with the files from *base and other info from *vset + Builder(VersionSet* vset, Version* base) + : vset_(vset) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = base->files_[level]; + for (int i = 0; i < files.size(); i++) { + FileMetaData* f = files[i]; + f->refs++; + files_[level].insert(std::make_pair(f->number, f)); + } + } + } + + ~Builder() { + for (int level = 0; level < config::kNumLevels; level++) { + const FileMap& fmap = files_[level]; + for (FileMap::const_iterator iter = fmap.begin(); + iter != fmap.end(); + ++iter) { + FileMetaData* f = iter->second; + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + } + + // Apply all of the edits in *edit to the current state. + void Apply(VersionEdit* edit) { + // Update compaction pointers + for (int i = 0; i < edit->compact_pointers_.size(); i++) { + const int level = edit->compact_pointers_[i].first; + vset_->compact_pointer_[level] = + edit->compact_pointers_[i].second.Encode().ToString(); + } + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); + iter != del.end(); + ++iter) { + const int level = iter->first; + const uint64_t number = iter->second; + FileMap::iterator fiter = files_[level].find(number); + assert(fiter != files_[level].end()); // Sanity check for debug mode + if (fiter != files_[level].end()) { + FileMetaData* f = fiter->second; + f->refs--; + if (f->refs <= 0) { + delete f; + } + files_[level].erase(fiter); + } + } + + // Add new files + for (int i = 0; i < edit->new_files_.size(); i++) { + const int level = edit->new_files_[i].first; + FileMetaData* f = new FileMetaData(edit->new_files_[i].second); + f->refs = 1; + assert(files_[level].count(f->number) == 0); + files_[level].insert(std::make_pair(f->number, f)); + } + + // Add large value refs + for (int i = 0; i < edit->large_refs_added_.size(); i++) { + const VersionEdit::Large& l = edit->large_refs_added_[i]; + vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key); + } + } + + // Save the current state in *v. + void SaveTo(Version* v) { + for (int level = 0; level < config::kNumLevels; level++) { + const FileMap& fmap = files_[level]; + for (FileMap::const_iterator iter = fmap.begin(); + iter != fmap.end(); + ++iter) { + FileMetaData* f = iter->second; + f->refs++; + v->files_[level].push_back(f); + } + } + } +}; + +VersionSet::VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator* cmp) + : env_(options->env), + dbname_(dbname), + options_(options), + table_cache_(table_cache), + icmp_(*cmp), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + last_sequence_(0), + log_number_(0), + prev_log_number_(0), + descriptor_file_(NULL), + descriptor_log_(NULL), + current_(new Version(this)), + oldest_(current_) { +} + +VersionSet::~VersionSet() { + for (Version* v = oldest_; v != NULL; ) { + Version* next = v->next_; + assert(v->refs_ == 0); + delete v; + v = next; + } + delete descriptor_log_; + delete descriptor_file_; +} + +Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { + if (edit->has_log_number_) { + assert(edit->log_number_ >= log_number_); + assert(edit->log_number_ < next_file_number_); + } else { + edit->SetLogNumber(log_number_); + } + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + + Version* v = new Version(this); + { + Builder builder(this, current_); + builder.Apply(edit); + builder.SaveTo(v); + } + + std::string new_manifest_file; + Status s = Finalize(v); + + // Initialize new descriptor log file if necessary by creating + // a temporary file that contains a snapshot of the current version. + if (s.ok()) { + if (descriptor_log_ == NULL) { + assert(descriptor_file_ == NULL); + new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + edit->SetNextFile(next_file_number_); + s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); + if (s.ok()) { + descriptor_log_ = new log::Writer(descriptor_file_); + s = WriteSnapshot(descriptor_log_); + } + } + } + + // Write new record to MANIFEST log + if (s.ok()) { + std::string record; + edit->EncodeTo(&record); + s = descriptor_log_->AddRecord(record); + if (s.ok()) { + s = descriptor_file_->Sync(); + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. + if (s.ok() && !new_manifest_file.empty()) { + s = SetCurrentFile(env_, dbname_, manifest_file_number_); + } + + // Install the new version + if (s.ok()) { + assert(current_->next_ == NULL); + assert(current_->cleanup_mem_ == NULL); + current_->cleanup_mem_ = cleanup_mem; + v->next_ = NULL; + current_->next_ = v; + current_ = v; + log_number_ = edit->log_number_; + prev_log_number_ = edit->prev_log_number_; + } else { + delete v; + if (!new_manifest_file.empty()) { + delete descriptor_log_; + delete descriptor_file_; + descriptor_log_ = NULL; + descriptor_file_ = NULL; + env_->DeleteFile(new_manifest_file); + } + } + + return s; +} + +Status VersionSet::Recover() { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string current; + Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + if (!s.ok()) { + return s; + } + if (current.empty() || current[current.size()-1] != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + current.resize(current.size() - 1); + + std::string dscname = dbname_ + "/" + current; + SequentialFile* file; + s = env_->NewSequentialFile(dscname, &file); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(file, &reporter, true/*checksum*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (s.ok()) { + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument( + edit.comparator_ + "does not match existing comparator ", + icmp_.user_comparator()->Name()); + } + } + + if (s.ok()) { + builder.Apply(&edit); + } + + if (edit.has_log_number_) { + log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + delete file; + file = NULL; + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + } + + if (s.ok()) { + Version* v = new Version(this); + builder.SaveTo(v); + s = Finalize(v); + if (!s.ok()) { + delete v; + } else { + // Install recovered version + v->next_ = NULL; + current_->next_ = v; + current_ = v; + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + } + } + + return s; +} + +static int64_t TotalFileSize(const std::vector& files) { + int64_t sum = 0; + for (int i = 0; i < files.size(); i++) { + sum += files[i]->file_size; + } + return sum; +} + +Status VersionSet::Finalize(Version* v) { + // Precomputed best level for next compaction + int best_level = -1; + double best_score = -1; + + Status s; + for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { + s = SortLevel(v, level); + + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + score = v->files_[level].size() / 4.0; + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = TotalFileSize(v->files_[level]); + score = static_cast(level_bytes) / MaxBytesForLevel(level); + } + + if (score > best_score) { + best_level = level; + best_score = score; + } + } + + v->compaction_level_ = best_level; + v->compaction_score_ = best_score; + return s; +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? + + // Save metadata + VersionEdit edit; + edit.SetComparatorName(icmp_.user_comparator()->Name()); + + // Save compaction pointers + for (int level = 0; level < config::kNumLevels; level++) { + if (!compact_pointer_[level].empty()) { + InternalKey key; + key.DecodeFrom(compact_pointer_[level]); + edit.SetCompactPointer(level, key); + } + } + + // Save files + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = current_->files_[level]; + for (int i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); + } + } + + // Save large value refs + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& ref = it->first; + const LargeReferencesSet& pointers = it->second; + for (LargeReferencesSet::const_iterator j = pointers.begin(); + j != pointers.end(); + ++j) { + edit.AddLargeValueRef(ref, j->first, j->second); + } + } + + std::string record; + edit.EncodeTo(&record); + return log->AddRecord(record); +} + +// Helper to sort by tables_[file_number].smallest +struct VersionSet::BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; + } +}; + +Status VersionSet::SortLevel(Version* v, uint64_t level) { + Status result; + BySmallestKey cmp; + cmp.internal_comparator = &icmp_; + std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); + + if (result.ok() && level > 0) { + // There should be no overlap + for (int i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (icmp_.Compare(prev_end, this_begin) >= 0) { + result = Status::Corruption( + "overlapping ranges in same level", + (EscapeString(prev_end.Encode()) + " vs. " + + EscapeString(this_begin.Encode()))); + break; + } + } + } + return result; +} + +int VersionSet::NumLevelFiles(int level) const { + assert(level >= 0); + assert(level < config::kNumLevels); + return current_->files_[level].size(); +} + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + Table* tableptr; + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); + if (tableptr != NULL) { + result += tableptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + + // Add in large value files which are references from internal keys + // stored in the table files + // + // TODO(opt): this is O(# large values in db). If this becomes too slow, + // we could store an auxiliary data structure indexed by internal key + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& lref = it->first; + for (LargeReferencesSet::const_iterator it2 = it->second.begin(); + it2 != it->second.end(); + ++it2) { + if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) { + // Internal key for large value is before our key of interest + result += lref.ValueSize(); + } + } + } + + + return result; +} + +bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t fnum, + const InternalKey& internal_key) { + LargeReferencesSet* refs = &large_value_refs_[large_ref]; + bool is_first = refs->empty(); + refs->insert(make_pair(fnum, internal_key.Encode().ToString())); + return is_first; +} + +void VersionSet::CleanupLargeValueRefs(const std::set& live_tables) { + for (LargeValueMap::iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ) { + LargeReferencesSet* refs = &it->second; + for (LargeReferencesSet::iterator ref_it = refs->begin(); + ref_it != refs->end(); + ) { + if (ref_it->first != log_number_ && // Not in log file + ref_it->first != prev_log_number_ && // Not in prev log + live_tables.count(ref_it->first) == 0) { // Not in a live table + // No longer live: erase + LargeReferencesSet::iterator to_erase = ref_it; + ++ref_it; + refs->erase(to_erase); + } else { + // Still live: leave this reference alone + ++ref_it; + } + } + if (refs->empty()) { + // No longer any live references to this large value: remove from + // large_value_refs + Log(env_, options_->info_log, "large value is dead: '%s'", + LargeValueRefToFilenameString(it->first).c_str()); + LargeValueMap::iterator to_erase = it; + ++it; + large_value_refs_.erase(to_erase); + } else { + ++it; + } + } +} + +bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) { + LargeValueMap::iterator it = large_value_refs_.find(large_ref); + if (it == large_value_refs_.end()) { + return false; + } else { + assert(!it->second.empty()); + return true; + } +} + +void VersionSet::MaybeDeleteOldVersions() { + // Note: it is important to delete versions in order since a newer + // version with zero refs may be holding a pointer to a memtable + // that is used by somebody who has a ref on an older version. + while (oldest_ != current_ && oldest_->refs_ == 0) { + Version* next = oldest_->next_; + delete oldest_; + oldest_ = next; + } +} + +void VersionSet::AddLiveFiles(std::set* live) { + for (Version* v = oldest_; v != NULL; v = v->next_) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + live->insert(files[i]->number); + } + } + } +} + +int64_t VersionSet::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < config::kNumLevels); + return TotalFileSize(current_->files_[level]); +} + +int64_t VersionSet::MaxNextLevelOverlappingBytes() { + int64_t result = 0; + std::vector overlaps; + for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int i = 0; i < current_->files_[level].size(); i++) { + const FileMetaData* f = current_->files_[level][i]; + GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +void VersionSet::GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs) { + inputs->clear(); + Slice user_begin = begin.user_key(); + Slice user_end = end.user_key(); + const Comparator* user_cmp = icmp_.user_comparator(); + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || + user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { + // Either completely before or after range; skip it + } else { + inputs->push_back(f); + } + } +} + +// Stores the minimal range that covers all entries in inputs in +// *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (int i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_.Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_.Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->paranoid_checks; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs_[which].empty()) { + if (c->level() + which == 0) { + const std::vector& files = c->inputs_[which]; + for (int i = 0; i < files.size(); i++) { + list[num++] = table_cache_->NewIterator( + options, files[i]->number, files[i]->file_size); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator( + c->input_version_, &c->inputs_[which]), + &GetFileIterator, table_cache_, options); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(&icmp_, list, num); + delete[] list; + return result; +} + +Compaction* VersionSet::PickCompaction() { + if (!NeedsCompaction()) { + return NULL; + } + const int level = current_->compaction_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + + // Pick the first file that comes after compact_pointer_[level] + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + + // Files in level 0 may overlap each other, so pick up all overlapping ones + if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + assert(!c->inputs_[0].empty()); + } + + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. + if (!c->inputs_[1].empty()) { + std::vector expanded0; + GetOverlappingInputs(level, all_start, all_limit, &expanded0); + if (expanded0.size() > c->inputs_[0].size()) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + if (expanded1.size() == c->inputs_[1].size()) { + Log(env_, options_->info_log, + "Expanding@%d %d+%d to %d+%d\n", + level, + int(c->inputs_[0].size()), + int(c->inputs_[1].size()), + int(expanded0.size()), + int(expanded1.size())); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + + if (false) { + Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + level, + EscapeString(smallest.Encode()).c_str(), + EscapeString(largest.Encode()).c_str()); + } + + // Update the place where we will do the next compaction for this level. + // We update this immediately instead of waiting for the VersionEdit + // to be applied so that if the compaction fails, we will try a different + // key range next time. + compact_pointer_[level] = largest.Encode().ToString(); + c->edit_.SetCompactPointer(level, largest); +} + +Compaction* VersionSet::CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + GetOverlappingInputs(level, begin, end, &inputs); + if (inputs.empty()) { + return NULL; + } + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + c->inputs_[0] = inputs; + SetupOtherInputs(c); + return c; +} + +Compaction::Compaction(int level) + : level_(level), + max_output_file_size_(MaxFileSizeForLevel(level)), + input_version_(NULL), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0) { + for (int i = 0; i < config::kNumLevels; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + if (input_version_ != NULL) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + return (num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (int i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != NULL) { + input_version_->Unref(); + input_version_ = NULL; + } +} + +} diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 0000000..e1c5a4b --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,332 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ +#define STORAGE_LEVELDB_DB_VERSION_SET_H_ + +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" + +namespace leveldb { + +namespace log { class Writer; } + +class Compaction; +class Iterator; +class MemTable; +class TableBuilder; +class TableCache; +class Version; +class VersionSet; +class WritableFile; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, std::vector* iters); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + void Unref(); + + // Return a human readable string that describes this version's contents. + std::string DebugString() const; + + private: + friend class Compaction; + friend class VersionSet; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + int refs_; // Number of live refs to this version + MemTable* cleanup_mem_; // NULL, or table to delete when version dropped + + // List of files per level + std::vector files_[config::kNumLevels]; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + double compaction_score_; + int compaction_level_; + + explicit Version(VersionSet* vset) + : vset_(vset), next_(NULL), refs_(0), + cleanup_mem_(NULL), + compaction_score_(-1), + compaction_level_(-1) { + } + + ~Version(); + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Iff Apply() returns OK, arrange to delete + // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed + // by older versions. + Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + + // Recover the last saved descriptor from persistent storage. + Status Recover(); + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + // Return the current version. + Version* current() const { return current_; } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Return the number of Table files at the specified level. + int NumLevelFiles(int level) const; + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return the last sequence number. + uint64_t LastSequence() const { return last_sequence_; } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + last_sequence_ = s; + } + + // Return the current log file number. + uint64_t LogNumber() const { return log_number_; } + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t PrevLogNumber() const { return prev_log_number_; } + + // Pick level and inputs for a new compaction. + // Returns NULL if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns NULL if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + Compaction* CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + Iterator* MakeInputIterator(Compaction* c); + + // Returns true iff some level needs a compaction. + bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } + + // Add all files listed in any live version to *live. + // May also mutate some internal state. + void AddLiveFiles(std::set* live); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + // Register a reference to a large value with the specified + // large_ref from the specified file number. Returns "true" if this + // is the first recorded reference to the "large_ref" value in the + // database, and false otherwise. + bool RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t filenum, + const InternalKey& internal_key); + + // Cleanup the large value reference state by eliminating any + // references from files that are not includes in either "live_tables" + // or the current log. + void CleanupLargeValueRefs(const std::set& live_tables); + + // Returns true if a large value with the given reference is live. + bool LargeValueIsLive(const LargeValueRef& large_ref); + + private: + class Builder; + + friend class Compaction; + friend class Version; + + Status Finalize(Version* v); + + // Delete any old versions that are no longer needed. + void MaybeDeleteOldVersions(); + + struct BySmallestKey; + Status SortLevel(Version* v, uint64_t level); + + void GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs); + + void GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest); + + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + uint64_t last_sequence_; + uint64_t log_number_; + uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + + // Opened lazily + WritableFile* descriptor_file_; + log::Writer* descriptor_log_; + + // Versions are kept in a singly linked list that is never empty + Version* current_; // Pointer to the last (newest) list entry + Version* oldest_; // Pointer to the first (oldest) list entry + + // Map from large value reference to the set of + // values containing references to the value. We keep the + // internal key as a std::string rather than as an InternalKey because + // we want to be able to easily use a set. + typedef std::set > LargeReferencesSet; + typedef std::map LargeValueMap; + LargeValueMap large_value_refs_; + + // Per-level key at which the next compaction at that level should start. + // Either an empty string, or a valid InternalKey. + std::string compact_pointer_[config::kNumLevels]; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); +}; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // and "level+1" will be merged to produce a set of "level+1" files. + int level() const { return level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return &edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "key". + bool ShouldStopBefore(const InternalKey& key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + private: + friend class Version; + friend class VersionSet; + + explicit Compaction(int level); + + int level_; + uint64_t max_output_file_size_; + Version* input_version_; + VersionEdit edit_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + int grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + int64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + + // State for implementing IsBaseLevelForKey + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + int level_ptrs_[config::kNumLevels]; +}; + +} + +#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/db/write_batch.cc b/db/write_batch.cc new file mode 100644 index 0000000..e84e548 --- /dev/null +++ b/db/write_batch.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring | +// kTypeLargeValueRef varstring varstring | +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "leveldb/write_batch.h" + +#include "leveldb/db.h" +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" + +namespace leveldb { + +WriteBatch::WriteBatch() { + Clear(); +} + +WriteBatch::~WriteBatch() { } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(12); +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatchInternal::PutLargeValueRef(WriteBatch* b, + const Slice& key, + const LargeValueRef& large_ref) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + b->rep_.push_back(static_cast(kTypeLargeValueRef)); + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, + Slice(large_ref.data, sizeof(large_ref.data))); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + MemTable* memtable) { + const int count = WriteBatchInternal::Count(b); + int found = 0; + Iterator it(*b); + for (; !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeDeletion: + memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); + break; + case kTypeValue: + memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); + break; + case kTypeLargeValueRef: + memtable->Add(it.sequence_number(), kTypeLargeValueRef, + it.key(), it.value()); + break; + } + found++; + } + if (!it.status().ok()) { + return it.status(); + } else if (found != count) { + return Status::Corruption("wrong count in WriteBatch"); + } + return Status::OK(); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= 12); + b->rep_.assign(contents.data(), contents.size()); +} + +WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) + : input_(WriteBatchInternal::Contents(&batch)), + done_(false) { + if (input_.size() < 12) { + done_ = true; + } else { + seq_ = WriteBatchInternal::Sequence(&batch), + input_.remove_prefix(12); + GetNextEntry(); + } +} + +void WriteBatchInternal::Iterator::Next() { + assert(!done_); + seq_++; + GetNextEntry(); +} + +void WriteBatchInternal::Iterator::GetNextEntry() { + if (input_.empty()) { + done_ = true; + return; + } + char tag = input_[0]; + input_.remove_prefix(1); + switch (tag) { + case kTypeValue: + case kTypeLargeValueRef: + if (GetLengthPrefixedSlice(&input_, &key_) && + GetLengthPrefixedSlice(&input_, &value_)) { + op_ = static_cast(tag); + } else { + status_ = Status::Corruption("bad WriteBatch Put"); + done_ = true; + input_.clear(); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input_, &key_)) { + op_ = kTypeDeletion; + } else { + status_ = Status::Corruption("bad WriteBatch Delete"); + done_ = true; + input_.clear(); + } + break; + default: + status_ = Status::Corruption("unknown WriteBatch tag"); + done_ = true; + input_.clear(); + break; + } +} + +} diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h new file mode 100644 index 0000000..ea28e2d --- /dev/null +++ b/db/write_batch_internal.h @@ -0,0 +1,73 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ + +#include "leveldb/write_batch.h" + +namespace leveldb { + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + static void PutLargeValueRef(WriteBatch* batch, + const Slice& key, + const LargeValueRef& large_ref); + + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + + // Iterate over the contents of a write batch. + class Iterator { + public: + explicit Iterator(const WriteBatch& batch); + bool Done() const { return done_; } + void Next(); + ValueType op() const { return op_; } + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + SequenceNumber sequence_number() const { return seq_; } + Status status() const { return status_; } + + private: + void GetNextEntry(); + + Slice input_; + bool done_; + ValueType op_; + Slice key_; + Slice value_; + SequenceNumber seq_; + Status status_; + }; +}; + +} + + +#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc new file mode 100644 index 0000000..deb8411 --- /dev/null +++ b/db/write_batch_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/db.h" + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/env.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable mem(cmp); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, &mem); + Iterator* iter = mem.NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeLargeValueRef: + state.append("PutRef("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append("ParseError()"); + } + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, PutIndirect) { + WriteBatch batch; + batch.Put(Slice("baz"), Slice("boo")); + LargeValueRef h; + for (int i = 0; i < LargeValueRef::ByteSize(); i++) { + h.data[i] = (i < 20) ? 'a' : 'b'; + } + WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@100" + "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "ParseError()", + PrintContents(&batch)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 0000000..700c564 --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/impl.html b/doc/impl.html new file mode 100644 index 0000000..b190d2c --- /dev/null +++ b/doc/impl.html @@ -0,0 +1,228 @@ + + + + +Leveldb file layout and compactions + + + + +

Files

+ +The implementation of leveldb is similar in spirit to the +representation of a single + +Bigtable tablet (section 5.3). +However the organization of the files that make up the representation +is somewhat different and is explained below. + +

+Each database is represented by a set of file stored in a directory. +There are several different types of files as documented below: +

+

Log files

+

+A log file (*.log) stores a sequence of recent updates. Each update +is appended to the current log file. When the log file reaches a +pre-determined size (approximately 1MB by default), it is converted +to a sorted table (see below) and a new log file is created for future +updates. +

+A copy of the current log file is kept in an in-memory structure (the +memtable). This copy is consulted on every read so that read +operations reflect all logged updates. +

+

Sorted tables

+

+A sorted table (*.sst) stores a sequence of entries sorted by key. +Each entry is either a value for the key, or a deletion marker for the +key. (Deletion markers are kept around to hide obsolete values +present in older sorted tables). +

+The set of sorted tables are organized into a sequence of levels. The +sorted table generated from a log file is placed in a special young +level (also called level-0). When the number of young files exceeds a +certain threshold (currently four), all of the young files are merged +together with all of the overlapping level-1 files to produce a +sequence of new level-1 files (we create a new level-1 file for every +2MB of data.) +

+Files in the young level may contain overlapping keys. However files +in other levels have distinct non-overlapping key ranges. Consider +level number L where L >= 1. When the combined size of files in +level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, +...), one file in level-L, and all of the overlapping files in +level-(L+1) are merged to form a set of new files for level-(L+1). +These merges have the effect of gradually migrating new updates from +the young level to the largest level using only bulk reads and writes +(i.e., minimizing expensive seeks). + +

Large value files

+

+Each large value (greater than 64KB by default) is placed in a large +value file (*.val) of its own. An entry is maintained in the log +and/or sorted tables that maps from the corresponding key to the +name of this large value file. The name of the large value file +is derived from a SHA1 hash of the value and its length so that +identical values share the same file. +

+

Manifest

+

+A MANIFEST file lists the set of sorted tables that make up each +level, the corresponding key ranges, and other important metadata. +A new MANIFEST file (with a new number embedded in the file name) +is created whenever the database is reopened. The MANIFEST file is +formatted as a log, and changes made to the serving state (as files +are added or removed) are appended to this log. +

+

Current

+

+CURRENT is a simple text file that contains the name of the latest +MANIFEST file. +

+

Info logs

+

+Informational messages are printed to files named LOG and LOG.old. +

+

Others

+

+Other files used for miscellaneous purposes may also be present +(LOCK, *.dbtmp). + +

Level 0

+When the log file grows above a certain size (1MB by default): +
    +
  • Write the contents of the current memtable to an sstable +
  • Replace the current memtable by a brand new empty memtable +
  • Switch to a new log file +
  • Delete the old log file and the old memtable +
+Experimental measurements show that generating an sstable from a 1MB +log file takes ~12ms, which seems like an acceptable latency hiccup to +add infrequently to a log write. + +

+The new sstable is added to a special level-0 level. level-0 contains +a set of files (up to 4 by default). However unlike other levels, +these files do not cover disjoint ranges, but may overlap each other. + +

Compactions

+ +

+When the size of level L exceeds its limit, we compact it in a +background thread. The compaction picks a file from level L and all +overlapping files from the next level L+1. Note that if a level-L +file overlaps only part of a level-(L+1) file, the entire file at +level-(L+1) is used as an input to the compaction and will be +discarded after the compaction. Aside: because level-0 is special +(files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than +one level-0 file in case some of these files overlap each other. + +

+A compaction merges the contents of the picked files to produce a +sequence of level-(L+1) files. We switch to producing a new +level-(L+1) file after the current output file has reached the target +file size (2MB). We also switch to a new output file when the key +range of the current output file has grown enough to overlap more then +ten level-(L+2) files. This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). + +

+The old files are discarded and the new files are added to the serving +state. + +

+Compactions for a particular level rotate through the key space. In +more detail, for each level L, we remember the ending key of the last +compaction at level L. The next compaction for level L will pick the +first file that starts after this key (wrapping around to the +beginning of the key space if there is no such file). + +

+Compactions drop overwritten values. They also drop deletion markers +if there are no higher numbered levels that contain a file whose range +overlaps the current key. + +

Timing

+ +Level-0 compactions will read up to four 1MB files from level-0, and +at worst all the level-1 files (10MB). I.e., we will read 14MB and +write 14MB. + +

+Other than the special level-0 compactions, we will pick one 2MB file +from level L. In the worst case, this will overlap ~ 12 files from +level L+1 (10 because level-(L+1) is ten times the size of level-L, +and another two at the boundaries since the file ranges at level-L +will usually not be aligned with the file ranges at level-L+1). The +compaction will therefore read 26MB and write 26MB. Assuming a disk +IO rate of 100MB/s (ballpark range for modern drives), the worst +compaction cost will be approximately 0.5 second. + +

+If we throttle the background writing to something small, say 10% of +the full 100MB/s speed, a compaction may take up to 5 seconds. If the +user is writing at 10MB/s, we might build up lots of level-0 files +(~50 to hold the 5*10MB). This may signficantly increase the cost of +reads due to the overhead of merging more files together on every +read. + +

+Solution 1: To reduce this problem, we might want to increase the log +switching threshold when the number of level-0 files is large. Though +the downside is that the larger this threshold, the larger the delay +that we will add to write latency when a write triggers a log switch. + +

+Solution 2: We might want to decrease write rate artificially when the +number of level-0 files goes up. + +

+Solution 3: We work on reducing the cost of very wide merges. +Perhaps most of the level-0 files will have their blocks sitting +uncompressed in the cache and we will only need to worry about the +O(N) complexity in the merging iterator. + +

Number of files

+ +Instead of always making 2MB files, we could make larger files for +larger levels to reduce the total file count, though at the expense of +more bursty compactions. Alternatively, we could shard the set of +files into multiple directories. + +

+An experiment on an ext3 filesystem on Feb 04, 2011 shows +the following timings to do 100K file opens in directories with +varying number of files: + + + + + +
Files in directoryMicroseconds to open a file
10009
1000010
10000016
+So maybe even the sharding is not necessary on modern filesystems? + +

Recovery

+ +
    +
  • Read CURRENT to find name of the latest committed MANIFEST +
  • Read the named MANIFEST file +
  • Clean up stale files +
  • We could open all sstables here, but it is probably better to be lazy... +
  • Convert log chunk to a new level-0 sstable +
  • Start directing new writes to a new log file with recovered sequence# +
+ +

Garbage collection of files

+ +DeleteObsoleteFiles() is called at the end of every +compaction and at the end of recovery. It finds the names of all +files in the database. It deletes all log files that are not the +current log file. It deletes all table files that are not referenced +from some level and are not the output of an active compaction. It +deletes all large value files that are not referenced from any live +table or log file. + + + diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..2a83fc3 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,509 @@ + + + + +Leveldb + + + +

Leveldb

+
Jeff Dean, Sanjay Ghemawat
+

+The leveldb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A leveldb database has a name which corresponds to a file system +directory. All of the contents of database are stored in this +directory. The following example shows how to open a database, +creating it if necessary: +

+

+  #include <assert>
+  #include "leveldb/include/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the leveldb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the leveldb::Status type above. Values of this +type are returned by most functions in leveldb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   leveldb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "leveldb/include/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(leveldb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to leveldb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On Posix systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.) +
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+

Concurrency

+

+A database may only be opened by one process at a time. The leveldb +implementation acquires a lock from the operating system to prevent +misuse. Within a single process, the same leveldb::DB object may +be safely used by multiple concurrent threads. +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots typically are created by the DB::GetSnapshot() method: +

+

+  leveldb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

+A Write operation can also return a snapshot that +represents the state of the database just after applying a particular +set of updates: +

+

+  leveldb::Snapshot* snapshot;
+  leveldb::WriteOptions write_options;
+  write_options.post_write_snapshot = &snapshot;
+  leveldb::Status status = db->Write(write_options, ...);
+  ... perform other mutations to db ...
+
+  leveldb::ReadOptions read_options;
+  read_options.snapshot = snapshot;
+  leveldb::Iterator* iter = db->NewIterator(read_options);
+  ... read as of the state just after the Write call returned ...
+  delete iter;
+
+  db->ReleaseSnapshot(snapshot);
+
+

Slice

+

+The return value of the it->key() and it->value() calls above +are instances of the leveldb::Slice type. Slice is a simple +structure that contains a length and a pointer to an external byte +array. Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values. In addition, leveldb methods do not return null-terminated +C-style strings since leveldb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for key, +which orders bytes lexicographically. You can however supply a custom +comparator when opening a database. For example, suppose each +database key consists of two numbers and we should sort by the first +number, breaking ties by the second number. First, define a proper +subclass of leveldb::Comparator that expresses these rules: +

+

+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the leveldb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can however still gradually evolve your key format over time with +a little bit of pre-planning. For example, you could store a version +number at the end of each key (one byte should suffice for most uses). +When you wish to switch to a new key format (e.g., adding an optional +third part to the keys processed by TwoPartComparator), +(a) keep the same comparator name (b) increment the version number +for new keys (c) change the comparator function so it uses the +version numbers found in the keys to decide how to interpret them. +

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in leveldb/include/options.h. + +

+

Block size

+

+leveldb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. +

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.cache is non-NULL, it is used to cache frequently used +uncompressed block contents. +

+

+  #include "leveldb/include/cache.h"
+
+  leveldb::Options options;
+  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db
+  delete options.cache;
+
+Note that the cache holds uncompressed data, and therefore it should +be sized according to application level data sizes, without any +reduction from compression. (Caching of compressed blocks is left to +the operating system buffer cache, or any custom Env +implementation provided by the client.) +

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of leveldb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +

+

Large Values

+

+leveldb has special treatment of large values (by default, a value +of length greater than or equal to 64K is considered large, though a +field in Options can be used to adjust this threshold). Each such +large value is placed in a separate operating system file, and the +normal database blocks just contain pointers to such files. +

+Furthermore, if the same large value occurs multiple times in a single +database, it will be stored just once. +

+

Checksums

+

+leveldb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the leveldb::RepairDB function + may be used to recover as much of the data as possible +

    +

+

Approximate Sizes

+

+The GetApproximateSizes method can used to get the approximate +number of bytes of file system space used by one or more key ranges. +

+

+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +leveldb implementation are routed through a leveldb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of leveldb on other activities in the system. +

+

+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &env;
+  Status s = leveldb::DB::Open(options, ...);
+
+

Porting

+

+leveldb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +leveldb/port/port.h. See leveldb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default leveldb::Env +implementation. See leveldb/util/env_posix.h for an example. + +

Other Information

+ +

+Details about the leveldb implementation may be found in +the following documents: +

+ + + diff --git a/doc/log_format.txt b/doc/log_format.txt new file mode 100644 index 0000000..3a0414b --- /dev/null +++ b/doc/log_format.txt @@ -0,0 +1,75 @@ +The log file contents are a sequence of 32KB blocks. The only +exception is that the tail of the file may contain a partial block. + +Each block consists of a sequence of records: + block := record* trailer? + record := + checksum: uint32 // crc32c of type and data[] + length: uint16 + type: uint8 // One of FULL, FIRST, MIDDLE, LAST + data: uint8[length] + +A record never starts within the last six bytes of a block (since it +won't fit). Any leftover bytes here form the trailer, which must +consist entirely of zero bytes and must be skipped by readers. + +Aside: if exactly seven bytes are left in the current block, and a new +non-zero length record is added, the writer must emit a FIRST record +(which contains zero bytes of user data) to fill up the trailing seven +bytes of the block and then emit all of the user data in subsequent +blocks. + +More types may be added in the future. Some Readers may skip record +types they do not understand, others may report that some data was +skipped. + +FULL == 1 +FIRST == 2 +MIDDLE == 3 +LAST == 4 + +The FULL record contains the contents of an entire user record. + +FIRST, MIDDLE, LAST are types used for user records that have been +split into multiple fragments (typically because of block boundaries). +FIRST is the type of the first fragment of a user record, LAST is the +type of the last fragment of a user record, and MID is the type of all +interior fragments of a user record. + +Example: consider a sequence of user records: + A: length 1000 + B: length 97270 + C: length 8000 +A will be stored as a FULL record in the first block. + +B will be split into three fragments: first fragment occupies the rest +of the first block, second fragment occupies the entirety of the +second block, and the third fragment occupies a prefix of the third +block. This will leave six bytes free in the third block, which will +be left empty as the trailer. + +C will be stored as a FULL record in the fourth block. + +=================== + +Some benefits over the recordio format: + +(1) We do not need any heuristics for resyncing - just go to next +block boundary and scan. If there is a corruption, skip to the next +block. As a side-benefit, we do not get confused when part of the +contents of one log file are embedded as a record inside another log +file. + +(2) Splitting at approximate boundaries (e.g., for mapreduce) is +simple: find the next block boundary and skip records until we +hit a FULL or FIRST record. + +(3) We do not need extra buffering for large records. + +Some downsides compared to recordio format: + +(1) No packing of tiny records. This could be fixed by adding a new +record type, so it is a shortcoming of the current implementation, +not necessarily the format. + +(2) No compression. Again, this could be fixed by adding new record types. diff --git a/doc/table_format.txt b/doc/table_format.txt new file mode 100644 index 0000000..ad5aa4b --- /dev/null +++ b/doc/table_format.txt @@ -0,0 +1,61 @@ +File format +=========== + + + [data block 1] + [data block 2] + ... + [data block N] + [meta block 1] + ... + [meta block K] + [metaindex block] + [index block] + [Footer] (fixed size; starts at file_size - sizeof(Footer)) + + +The file contains internal pointers. Each such pointer is called +a BlockHandle and contains the following information: + offset: varint64 + size: varint64 + +(1) The sequence of key/value pairs in the file are stored in sorted +order and partitioned into a sequence of data blocks. These blocks +come one after another at the beginning of the file. Each data block +is formatted according to the code in block_builder.cc, and then +optionally compressed. + +(2) After the data blocks we store a bunch of meta blocks. The +supported meta block types are described below. More meta block types +may be added in the future. Each meta block is again formatted using +block_builder.cc and then optionally compressed. + +(3) A "metaindex" block. It contains one entry for every other meta +block where the key is the name of the meta block and the value is a +BlockHandle pointing to that meta block. + +(4) An "index" block. This block contains one entry per data block, +where the key is a string >= last key in that data block and before +the first key in the successive data block. The value is the +BlockHandle for the data block. + +(6) At the very end of the file is a fixed length footer that contains +the BlockHandle of the metaindex and index blocks as well as a magic number. + metaindex_handle: char[p]; // Block handle for metaindex + index_handle: char[q]; // Block handle for index + padding: char[40-p-q]; // 0 bytes to make fixed length + // (40==2*BlockHandle::kMaxEncodedLength) + magic: fixed64; // == 0xdb4775248b80fb57 + +"stats" Meta Block +------------------ + +This meta block contains a bunch of stats. The key is the name +of the statistic. The value contains the statistic. +TODO(postrelease): record following stats. + data size + index size + key size (uncompressed) + value size (uncompressed) + number of entries + number of data blocks diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h new file mode 100644 index 0000000..79196d1 --- /dev/null +++ b/include/leveldb/cache.h @@ -0,0 +1,99 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ +#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ + +#include +#include "leveldb/slice.h" + +namespace leveldb { + +class Cache; + +// Create a new cache with a fixed size capacity. This implementation +// of Cache uses a least-recently-used eviction policy. +extern Cache* NewLRUCache(size_t capacity); + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. + virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. + struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns NULL. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h new file mode 100644 index 0000000..4e00e4d --- /dev/null +++ b/include/leveldb/comparator.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ + +#include + +namespace leveldb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/include/leveldb/db.h b/include/leveldb/db.h new file mode 100644 index 0000000..f18ded3 --- /dev/null +++ b/include/leveldb/db.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ +#define STORAGE_LEVELDB_INCLUDE_DB_H_ + +#include +#include +#include "leveldb/iterator.h" +#include "leveldb/options.h" + +namespace leveldb { + +static const int kMajorVersion = 1; +static const int kMinorVersion = 1; + +struct Options; +struct ReadOptions; +struct WriteOptions; + +class Snapshot; +class WriteBatch; + +// Some internal types. Clients should ignore. +class WriteBatchInternal; + +struct Range { + Slice start; + Slice limit; + + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +// A DB is a persistent ordered map from keys to values. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores NULL in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". Returns OK on success, + // and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) = 0; + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "leveldb.num-files-at-level" - return the number of files at level , + // where is an ASCII representation of a level number (e.g. "0"). + // "leveldb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + virtual bool GetProperty(const Slice& property, std::string* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Possible extensions: + // (1) Add a method to compact a range of keys + + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +Status RepairDB(const std::string& dbname, const Options& options); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/include/leveldb/env.h b/include/leveldb/env.h new file mode 100644 index 0000000..4b6e712 --- /dev/null +++ b/include/leveldb/env.h @@ -0,0 +1,290 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the leveldb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. + +#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ +#define STORAGE_LEVELDB_INCLUDE_ENV_H_ + +#include +#include +#include +#include +#include "leveldb/status.h" + +namespace leveldb { + +class FileLock; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to leveldb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores NULL in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores NULL in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Arrange to run "(*function)(arg)" once in a background thread. + // + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Write an entry to the log file with the specified format. + virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). If an error was encountered, returns a + // non-OK status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() { } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; + + private: + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// Identifies a locked file. +class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + +// Log the specified data to *info_log if info_log is non-NULL. +extern void Log(Env* env, WritableFile* info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 3, 4))) +# endif + ; + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *target + explicit EnvWrapper(Env* target) : target_(target) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, SequentialFile** r) { + return target_->NewSequentialFile(f, r); + } + Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { + return target_->NewRandomAccessFile(f, r); + } + Status NewWritableFile(const std::string& f, WritableFile** r) { + return target_->NewWritableFile(f, r); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void Schedule(void (*f)(void*), void* a) { + return target_->Schedule(f, a); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual void Logv(WritableFile* log, const char* format, va_list ap) { + return target_->Logv(log, format, ap); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + private: + Env* target_; +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h new file mode 100644 index 0000000..1866fb5 --- /dev/null +++ b/include/leveldb/iterator.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. + +#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ + +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/include/leveldb/options.h b/include/leveldb/options.h new file mode 100644 index 0000000..87d388e --- /dev/null +++ b/include/leveldb/options.h @@ -0,0 +1,208 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ +#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ + +#include + +namespace leveldb { + +class Cache; +class Comparator; +class Env; +class Snapshot; +class WritableFile; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be to written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + WritableFile* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Handle values larger than "large_value_threshold" bytes + // specially, by writing them into their own files (to avoid + // compaction overhead) and doing content-based elimination of + // duplicate values to save space. + // + // We recommend against changing this value. + // + // Default: 64K + size_t large_value_threshold; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + int block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // Create an Options object with default values for all fields. + Options(); +}; + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: false + bool verify_checksums; + + // Should the data read for this iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If "snapshot" is non-NULL, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is NULL, use an impliicit + // snapshot of the state at the beginning of this read operation. + // Default: NULL + const Snapshot* snapshot; + + ReadOptions() + : verify_checksums(false), + fill_cache(true), + snapshot(NULL) { + } +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fsync()". + // + // Default: false + bool sync; + + // If "post_write_snapshot" is non-NULL, and the write succeeds, + // *post_write_snapshot will be modified to point to a snapshot of + // the DB state immediately after this write. The caller must call + // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the + // snapshot is no longer needed. + // + // If "post_write_snapshot" is non-NULL, and the write fails, + // *post_write_snapshot will be set to NULL. + // + // Default: NULL + const Snapshot** post_write_snapshot; + + WriteOptions() + : sync(false), + post_write_snapshot(NULL) { + } +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h new file mode 100644 index 0000000..62cb894 --- /dev/null +++ b/include/leveldb/slice.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to data[0,n-1]. + Slice(const char* data, size_t n) : data_(data), size_(n) { } + + // Create a slice that refers to the contents of "s" + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + + private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} + + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/include/leveldb/status.h b/include/leveldb/status.h new file mode 100644 index 0000000..47e3edf --- /dev/null +++ b/include/leveldb/status.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. + +#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ +#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ + +#include +#include +#include "leveldb/slice.h" + +namespace leveldb { + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, Slice()); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + }; + Code code() const { return (state_ == NULL) ? kOk : state_->first; } + + Status(Code code, const Slice& msg, const Slice& msg2); + + typedef std::pair State; + State* state_; +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); +} +inline void Status::operator=(const Status& s) { + if (this != &s) { + delete state_; + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + } +} + +} + +#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/include/leveldb/table.h b/include/leveldb/table.h new file mode 100644 index 0000000..bd99176 --- /dev/null +++ b/include/leveldb/table.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ + +#include +#include "leveldb/iterator.h" + +namespace leveldb { + +class Block; +class BlockHandle; +struct Options; +class RandomAccessFile; +struct ReadOptions; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. +class Table { + public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to NULL and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, + RandomAccessFile* file, + uint64_t file_size, + Table** table); + + ~Table(); + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + Iterator* NewIterator(const ReadOptions&) const; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) const; + + private: + struct Rep; + Rep* rep_; + + explicit Table(Rep* rep) { rep_ = rep; } + static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); + + // No copying allowed + Table(const Table&); + void operator=(const Table&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h new file mode 100644 index 0000000..49d2d51 --- /dev/null +++ b/include/leveldb/table_builder.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ + +#include +#include "leveldb/options.h" +#include "leveldb/status.h" + +namespace leveldb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; + +class TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + TableBuilder(const Options& options, WritableFile* file); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~TableBuilder(); + + // Change the options used by this builder. Note: only some of the + // option fields can be changed after construction. If a field is + // not allowed to change dynamically and its value in the structure + // passed to the constructor is different from its value in the + // structure passed to this method, this method will return an error + // without changing any fields. + Status ChangeOptions(const Options& options); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value); + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Return non-ok iff some error has been detected. + Status status() const; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish(); + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon(); + + // Number of calls to Add() so far. + uint64_t NumEntries() const; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const; + + private: + bool ok() const { return status().ok(); } + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + + struct Rep; + Rep* rep_; + + // No copying allowed + TableBuilder(const TableBuilder&); + void operator=(const TableBuilder&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h new file mode 100644 index 0000000..3411952 --- /dev/null +++ b/include/leveldb/write_batch.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); + +#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ + +#include + +namespace leveldb { + +class Slice; + +class WriteBatch { + public: + WriteBatch(); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Clear all updates buffered in this batch. + void Clear(); + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb.gyp b/leveldb.gyp new file mode 100644 index 0000000..d10ac33 --- /dev/null +++ b/leveldb.gyp @@ -0,0 +1,327 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +{ + 'variables': { + 'use_snappy%': 0, + }, + 'target_defaults': { + 'defines': [ + 'LEVELDB_PLATFORM_CHROMIUM=1', + ], + 'include_dirs': [ + '.', + 'include/', + ], + 'conditions': [ + ['OS == "win"', { + 'include_dirs': [ + 'port/win', + ], + }], + ['use_snappy', { + 'defines': [ + 'USE_SNAPPY=1', + ], + }], + ], + }, + 'targets': [ + { + 'target_name': 'leveldb', + 'type': '<(library)', + 'dependencies': [ + # The base libary is a lightweight abstraction layer for things like + # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ + '../../base/base.gyp:base', + ], + 'conditions': [ + ['use_snappy', { + 'dependencies': [ + '../../third_party/snappy/snappy.gyp:snappy', + ], + }], + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'include/', + ], + }, + 'sources': [ + # Include and then exclude so that all files show up in IDEs, even if + # they don't build. + 'db/builder.cc', + 'db/builder.h', + 'db/db_impl.cc', + 'db/db_impl.h', + 'db/db_iter.cc', + 'db/db_iter.h', + 'db/filename.cc', + 'db/filename.h', + 'db/dbformat.cc', + 'db/dbformat.h', + 'db/log_format.h', + 'db/log_reader.cc', + 'db/log_reader.h', + 'db/log_writer.cc', + 'db/log_writer.h', + 'db/memtable.cc', + 'db/memtable.h', + 'db/repair.cc', + 'db/skiplist.h', + 'db/snapshot.h', + 'db/table_cache.cc', + 'db/table_cache.h', + 'db/version_edit.cc', + 'db/version_edit.h', + 'db/version_set.cc', + 'db/version_set.h', + 'db/write_batch.cc', + 'db/write_batch_internal.h', + 'include/leveldb/cache.h', + 'include/leveldb/comparator.h', + 'include/leveldb/db.h', + 'include/leveldb/env.h', + 'include/leveldb/iterator.h', + 'include/leveldb/options.h', + 'include/leveldb/slice.h', + 'include/leveldb/status.h', + 'include/leveldb/table.h', + 'include/leveldb/table_builder.h', + 'include/leveldb/write_batch.h', + 'port/port.h', + 'port/port_chromium.cc', + 'port/port_chromium.h', + 'port/port_example.h', + 'port/port_posix.cc', + 'port/port_posix.h', + 'port/sha1_portable.cc', + 'port/sha1_portable.h', + 'table/block.cc', + 'table/block.h', + 'table/block_builder.cc', + 'table/block_builder.h', + 'table/format.cc', + 'table/format.h', + 'table/iterator.cc', + 'table/iterator_wrapper.h', + 'table/merger.cc', + 'table/merger.h', + 'table/table.cc', + 'table/table_builder.cc', + 'table/two_level_iterator.cc', + 'table/two_level_iterator.h', + 'util/arena.cc', + 'util/arena.h', + 'util/cache.cc', + 'util/coding.cc', + 'util/coding.h', + 'util/comparator.cc', + 'util/crc32c.cc', + 'util/crc32c.h', + 'util/env.cc', + 'util/env_chromium.cc', + 'util/env_posix.cc', + 'util/hash.cc', + 'util/hash.h', + 'util/logging.cc', + 'util/logging.h', + 'util/mutexlock.h', + 'util/options.cc', + 'util/random.h', + 'util/status.cc', + ], + 'sources/': [ + ['exclude', '_(android|example|portable|posix)\\.cc$'], + ], + }, + { + 'target_name': 'leveldb_testutil', + 'type': '<(library)', + 'dependencies': [ + '../../base/base.gyp:base', + 'leveldb', + ], + 'export_dependent_settings': [ + # The tests use include directories from these projects. + '../../base/base.gyp:base', + 'leveldb', + ], + 'sources': [ + 'util/histogram.cc', + 'util/histogram.h', + 'util/testharness.cc', + 'util/testharness.h', + 'util/testutil.cc', + 'util/testutil.h', + ], + }, + { + 'target_name': 'leveldb_arena_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/arena_test.cc', + ], + }, + { + 'target_name': 'leveldb_cache_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/cache_test.cc', + ], + }, + { + 'target_name': 'leveldb_coding_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/coding_test.cc', + ], + }, + { + 'target_name': 'leveldb_corruption_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/corruption_test.cc', + ], + }, + { + 'target_name': 'leveldb_crc32c_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/crc32c_test.cc', + ], + }, + { + 'target_name': 'leveldb_db_bench', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_bench.cc', + ], + }, + { + 'target_name': 'leveldb_db_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_test.cc', + ], + }, + { + 'target_name': 'leveldb_dbformat_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/dbformat_test.cc', + ], + }, + { + 'target_name': 'leveldb_env_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/env_test.cc', + ], + }, + { + 'target_name': 'leveldb_filename_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/filename_test.cc', + ], + }, + { + 'target_name': 'leveldb_log_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/log_test.cc', + ], + }, + { + 'target_name': 'leveldb_sha1_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'port/sha1_test.cc', + ], + }, + { + 'target_name': 'leveldb_skiplist_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/skiplist_test.cc', + ], + }, + { + 'target_name': 'leveldb_table_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'table/table_test.cc', + ], + }, + { + 'target_name': 'leveldb_version_edit_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/version_edit_test.cc', + ], + }, + { + 'target_name': 'leveldb_write_batch_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/write_batch_test.cc', + ], + }, + ], +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/AUTHORS b/leveldb/AUTHORS deleted file mode 100644 index 27a9407..0000000 --- a/leveldb/AUTHORS +++ /dev/null @@ -1,8 +0,0 @@ -# Names should be added to this file like so: -# Name or Organization - -Google Inc. - -# Initial version authors: -Jeffrey Dean -Sanjay Ghemawat diff --git a/leveldb/LICENSE b/leveldb/LICENSE deleted file mode 100644 index 8e80208..0000000 --- a/leveldb/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The LevelDB Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/leveldb/Makefile b/leveldb/Makefile deleted file mode 100644 index 43ac23d..0000000 --- a/leveldb/Makefile +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -CC = g++ - -# Uncomment one of the following to switch between debug and opt mode -#OPT = -O2 -DNDEBUG -OPT = -g2 - -CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -I./include -std=c++0x $(OPT) - -LDFLAGS=-lpthread - -LIBOBJECTS = \ - ./db/builder.o \ - ./db/db_impl.o \ - ./db/db_iter.o \ - ./db/filename.o \ - ./db/dbformat.o \ - ./db/log_reader.o \ - ./db/log_writer.o \ - ./db/memtable.o \ - ./db/repair.o \ - ./db/table_cache.o \ - ./db/version_edit.o \ - ./db/version_set.o \ - ./db/write_batch.o \ - ./port/port_posix.o \ - ./table/block.o \ - ./table/block_builder.o \ - ./table/format.o \ - ./table/iterator.o \ - ./table/merger.o \ - ./table/table.o \ - ./table/table_builder.o \ - ./table/two_level_iterator.o \ - ./util/arena.o \ - ./util/cache.o \ - ./util/coding.o \ - ./util/comparator.o \ - ./util/crc32c.o \ - ./util/env.o \ - ./util/env_posix.o \ - ./util/hash.o \ - ./util/histogram.o \ - ./util/logging.o \ - ./util/options.o \ - ./util/status.o - -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) - -TESTS = \ - arena_test \ - cache_test \ - coding_test \ - corruption_test \ - crc32c_test \ - db_test \ - dbformat_test \ - env_test \ - filename_test \ - log_test \ - skiplist_test \ - table_test \ - version_edit_test \ - write_batch_test - -PROGRAMS = db_bench $(TESTS) - -all: $(PROGRAMS) - -check: $(TESTS) - for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done - -clean: - rm -f $(PROGRAMS) */*.o - -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ - -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -.cc.o: - $(CC) $(CFLAGS) $< -o $@ - -# TODO(gabor): dependencies for .o files -# TODO(gabor): Build library diff --git a/leveldb/README b/leveldb/README deleted file mode 100644 index 3618ade..0000000 --- a/leveldb/README +++ /dev/null @@ -1,51 +0,0 @@ -leveldb: A key-value store -Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) - -The code under this directory implements a system for maintaining a -persistent key/value store. - -See doc/index.html for more explanation. -See doc/impl.html for a brief overview of the implementation. - -The public interface is in include/*.h. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. - -Guide to header files: - -include/db.h - Main interface to the DB: Start here - -include/options.h - Control over the behavior of an entire database, and also - control over the behavior of individual reads and writes. - -include/comparator.h - Abstraction for user-specified comparison function. If you want - just bytewise comparison of keys, you can use the default comparator, - but clients can write their own comparator implementations if they - want custom ordering (e.g. to handle different character - encodings, etc.) - -include/iterator.h - Interface for iterating over data. You can get an iterator - from a DB object. - -include/write_batch.h - Interface for atomically applying multiple updates to a database. - -include/slice.h - A simple module for maintaining a pointer and a length into some - other byte array. - -include/status.h - Status is returned from many of the public interfaces and is used - to report success and various kinds of errors. - -include/env.h - Abstraction of the OS environment. A posix implementation of - this interface is in util/env_posix.cc - -include/table.h -include/table_builder.h - Lower-level modules that most clients probably won't use directly diff --git a/leveldb/TODO b/leveldb/TODO deleted file mode 100644 index ce81439..0000000 --- a/leveldb/TODO +++ /dev/null @@ -1,14 +0,0 @@ -ss -- Stats - -db -- Maybe implement DB::BulkDeleteForRange(start_key, end_key) - that would blow away files whose ranges are entirely contained - within [start_key..end_key]? For Chrome, deletion of obsolete - object stores, etc. can be done in the background anyway, so - probably not that important. - -api changes: -- Make it wrappable - -Faster Get implementation diff --git a/leveldb/db/builder.cc b/leveldb/db/builder.cc deleted file mode 100644 index 9f132d7..0000000 --- a/leveldb/db/builder.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/builder.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" - -namespace leveldb { - -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit) { - Status s; - meta->file_size = 0; - iter->SeekToFirst(); - - std::string fname = TableFileName(dbname, meta->number); - if (iter->Valid()) { - WritableFile* file; - s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - - TableBuilder* builder = new TableBuilder(options, file); - meta->smallest.DecodeFrom(iter->key()); - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - meta->largest.DecodeFrom(key); - builder->Add(key, iter->value()); - } - - // Finish and check for builder errors - if (s.ok()) { - s = builder->Finish(); - if (s.ok()) { - meta->file_size = builder->FileSize(); - assert(meta->file_size > 0); - } - } else { - builder->Abandon(); - } - delete builder; - - // Finish and check for file errors - if (s.ok()) { - s = file->Sync(); - } - if (s.ok()) { - s = file->Close(); - } - delete file; - file = NULL; - - if (s.ok()) { - // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - meta->number, - meta->file_size); - s = it->status(); - delete it; - } - } - - // Check for input iterator errors - if (!iter->status().ok()) { - s = iter->status(); - } - - if (s.ok() && meta->file_size > 0) { - edit->AddFile(0, meta->number, meta->file_size, - meta->smallest, meta->largest); - } else { - env->DeleteFile(fname); - } - return s; -} - -} diff --git a/leveldb/db/builder.h b/leveldb/db/builder.h deleted file mode 100644 index 5dd17b6..0000000 --- a/leveldb/db/builder.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ -#define STORAGE_LEVELDB_DB_BUILDER_H_ - -#include "leveldb/status.h" - -namespace leveldb { - -struct Options; -struct FileMetaData; - -class Env; -class Iterator; -class TableCache; -class VersionEdit; - -// Build a Table file from the contents of *iter. The generated file -// will be named according to meta->number. On success, the rest of -// *meta will be filled with metadata about the generated table, and -// the file information will be added to *edit. If no data is present -// in *iter, meta->file_size will be set to zero, and no Table file -// will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit); - -} - -#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/leveldb/db/corruption_test.cc b/leveldb/db/corruption_test.cc deleted file mode 100644 index 12d176e..0000000 --- a/leveldb/db/corruption_test.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include -#include -#include -#include -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "leveldb/write_batch.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/log_format.h" -#include "db/version_set.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static const int kValueSize = 1000; - -class CorruptionTest { - public: - test::ErrorEnv env_; - Random rnd_; - std::string dbname_; - Cache* tiny_cache_; - Options options_; - DB* db_; - - CorruptionTest() : rnd_(test::RandomSeed()) { - tiny_cache_ = NewLRUCache(100); - options_.env = &env_; - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, options_); - - db_ = NULL; - options_.create_if_missing = true; - Reopen(); - options_.create_if_missing = false; - } - - ~CorruptionTest() { - delete db_; - DestroyDB(dbname_, Options()); - delete tiny_cache_; - } - - Status TryReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - Options opt = (options ? *options : options_); - opt.env = &env_; - opt.block_cache = tiny_cache_; - return DB::Open(opt, dbname_, &db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void RepairDB() { - delete db_; - db_ = NULL; - ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); - } - - void Build(int n) { - std::string key_space, value_space; - WriteBatch batch; - for (int i = 0; i < n; i++) { - //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); - Slice key = Key(i, &key_space); - batch.Clear(); - batch.Put(key, Value(i, &value_space)); - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - } - } - - void Check(int min_expected, int max_expected) { - int next_expected = 0; - int missed = 0; - int bad_keys = 0; - int bad_values = 0; - int correct = 0; - std::string value_space; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - uint64_t key; - Slice in(iter->key()); - if (!ConsumeDecimalNumber(&in, &key) || - !in.empty() || - key < next_expected) { - bad_keys++; - continue; - } - missed += (key - next_expected); - next_expected = key + 1; - if (iter->value() != Value(key, &value_space)) { - bad_values++; - } else { - correct++; - } - } - delete iter; - - fprintf(stderr, - "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", - min_expected, max_expected, correct, bad_keys, bad_values, missed); - ASSERT_LE(min_expected, correct); - ASSERT_GE(max_expected, correct); - } - - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { - // Pick file to corrupt - std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - std::vector candidates; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == filetype) { - candidates.push_back(dbname_ + "/" + filenames[i]); - } - } - ASSERT_TRUE(!candidates.empty()) << filetype; - std::string fname = candidates[rnd_.Uniform(candidates.size())]; - - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - ASSERT_TRUE(false) << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = sbuf.st_size + offset; - } - } - if (offset > sbuf.st_size) { - offset = sbuf.st_size; - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = sbuf.st_size - offset; - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - int Property(const std::string& name) { - std::string property; - int result; - if (db_->GetProperty(name, &property) && - sscanf(property.c_str(), "%d", &result) == 1) { - return result; - } else { - return -1; - } - } - - // Return the ith key - Slice Key(int i, std::string* storage) { - char buf[100]; - snprintf(buf, sizeof(buf), "%016d", i); - storage->assign(buf, strlen(buf)); - return Slice(*storage); - } - - // Return the value to associate with the specified key - Slice Value(int k, std::string* storage) { - Random r(k); - return test::RandomString(&r, kValueSize, storage); - } -}; - -TEST(CorruptionTest, Recovery) { - Build(100); - Check(100, 100); - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); - - // The 64 records in the first two log blocks are completely lost. - Check(36, 36); -} - -TEST(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); -} - -TEST(CorruptionTest, NewFileErrorDuringWrite) { - // Do enough writing to force minor compaction - env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); - std::string value_storage; - Status s; - for (int i = 0; s.ok() && i < num; i++) { - WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); - s = db_->Write(WriteOptions(), &batch); - } - ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; - Reopen(); -} - -TEST(CorruptionTest, TableFile) { - Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, 100, 1); - Check(99, 99); -} - -TEST(CorruptionTest, TableFileIndexData) { - Build(10000); // Enough to build multiple Tables - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, -2000, 500); - Reopen(); - Check(5000, 9999); -} - -TEST(CorruptionTest, MissingDescriptor) { - Build(1000); - RepairDB(); - Reopen(); - Check(1000, 1000); -} - -TEST(CorruptionTest, SequenceNumberRecovery) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v5", v); - // Write something. If sequence number was not recovered properly, - // it will be hidden by an earlier write. - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); -} - -TEST(CorruptionTest, CorruptedDescriptor) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - - Corrupt(kDescriptorFile, 0, 1000); - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); - - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("hello", v); -} - -TEST(CorruptionTest, CompactionInputError) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Force compactions by writing lots of values - Build(10000); - Check(10000, 10000); - dbi->TEST_CompactRange(0, "", "~"); - ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); -} - -TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); - - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; - std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } - ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; -} - -TEST(CorruptionTest, UnrelatedKeys) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); - - std::string tmp1, tmp2; - ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_CompactMemTable(); - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/db_bench.cc b/leveldb/db/db_bench.cc deleted file mode 100644 index d1cbdc0..0000000 --- a/leveldb/db/db_bench.cc +++ /dev/null @@ -1,613 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include -#include "db/db_impl.h" -#include "db/version_set.h" -#include "leveldb/cache.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/write_batch.h" -#include "port/port.h" -#include "util/crc32c.h" -#include "util/histogram.h" -#include "util/random.h" -#include "util/testutil.h" - -// Comma-separated list of operations to run in the specified order -// Actual benchmarks: -// fillseq -- write N values in sequential key order in async mode -// fillrandom -- write N values in random key order in async mode -// overwrite -- overwrite N values in random key order in async mode -// fillsync -- write N/100 values in random key order in sync mode -// fill100K -- write N/1000 100K values in random order in async mode -// readseq -- read N values sequentially -// readreverse -- read N values in reverse order -// readrandom -- read N values in random order -// crc32c -- repeated crc32c of 4K of data -// Meta operations: -// compact -- Compact the entire DB -// stats -- Print DB stats -// heapprofile -- Dump a heap profile (if supported by this port) -static const char* FLAGS_benchmarks = - "fillseq," - "fillsync," - "fillrandom," - "overwrite," - "readrandom," - "readrandom," // Extra run to allow previous compactions to quiesce - "readseq," - "readreverse," - "compact," - "readrandom," - "readseq," - "readreverse," - "fill100K," - "crc32c," - "snappycomp," - "snappyuncomp," - ; - -// Number of key/values to place in database -static int FLAGS_num = 1000000; - -// Size of each value -static int FLAGS_value_size = 100; - -// Arrange to generate values that shrink to this fraction of -// their original size after compression -static double FLAGS_compression_ratio = 0.5; - -// Print histogram of operation timings -static bool FLAGS_histogram = false; - -// Number of bytes to buffer in memtable before compacting -// (initialized to default value by "main") -static int FLAGS_write_buffer_size = 0; - -// Number of bytes to use as a cache of uncompressed data. -// Negative means use default settings. -static int FLAGS_cache_size = -1; - -namespace leveldb { - -// Helper for quickly generating random data. -namespace { -class RandomGenerator { - private: - std::string data_; - int pos_; - - public: - RandomGenerator() { - // We use a limited amount of data over and over again and ensure - // that it is larger than the compression window (32KB), and also - // large enough to serve all typical value sizes we want to write. - Random rnd(301); - std::string piece; - while (data_.size() < 1048576) { - // Add a short fragment that is as compressible as specified - // by FLAGS_compression_ratio. - test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); - data_.append(piece); - } - pos_ = 0; - } - - Slice Generate(int len) { - if (pos_ + len > data_.size()) { - pos_ = 0; - assert(len < data_.size()); - } - pos_ += len; - return Slice(data_.data() + pos_ - len, len); - } -}; - -static Slice TrimSpace(Slice s) { - int start = 0; - while (start < s.size() && isspace(s[start])) { - start++; - } - int limit = s.size(); - while (limit > start && isspace(s[limit-1])) { - limit--; - } - return Slice(s.data() + start, limit - start); -} - -} - -class Benchmark { - private: - Cache* cache_; - DB* db_; - int num_; - int heap_counter_; - double start_; - double last_op_finish_; - int64_t bytes_; - std::string message_; - std::string post_message_; - Histogram hist_; - RandomGenerator gen_; - Random rand_; - - // State kept for progress messages - int done_; - int next_report_; // When to report next - - void PrintHeader() { - const int kKeySize = 16; - PrintEnvironment(); - fprintf(stdout, "Keys: %d bytes each\n", kKeySize); - fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", - FLAGS_value_size, - static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); - fprintf(stdout, "Entries: %d\n", num_); - fprintf(stdout, "RawSize: %.1f MB (estimated)\n", - ((static_cast(kKeySize + FLAGS_value_size) * num_) - / 1048576.0)); - fprintf(stdout, "FileSize: %.1f MB (estimated)\n", - (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) - / 1048576.0)); - PrintWarnings(); - fprintf(stdout, "------------------------------------------------\n"); - } - - void PrintWarnings() { -#if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" - ); -#endif -#ifndef NDEBUG - fprintf(stdout, - "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); -#endif - - // See if snappy is working by attempting to compress a compressible string - const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - std::string compressed; - if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { - fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); - } else if (compressed.size() >= sizeof(text)) { - fprintf(stdout, "WARNING: Snappy compression is not effective\n"); - } - } - - void PrintEnvironment() { - fprintf(stderr, "LevelDB: version %d.%d\n", - kMajorVersion, kMinorVersion); - -#if defined(__linux) - time_t now = time(NULL); - fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline - - FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); - if (cpuinfo != NULL) { - char line[1000]; - int num_cpus = 0; - std::string cpu_type; - std::string cache_size; - while (fgets(line, sizeof(line), cpuinfo) != NULL) { - const char* sep = strchr(line, ':'); - if (sep == NULL) { - continue; - } - Slice key = TrimSpace(Slice(line, sep - 1 - line)); - Slice val = TrimSpace(Slice(sep + 1)); - if (key == "model name") { - ++num_cpus; - cpu_type = val.ToString(); - } else if (key == "cache size") { - cache_size = val.ToString(); - } - } - fclose(cpuinfo); - fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); - fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); - } -#endif - } - - void Start() { - start_ = Env::Default()->NowMicros() * 1e-6; - bytes_ = 0; - message_.clear(); - last_op_finish_ = start_; - hist_.Clear(); - done_ = 0; - next_report_ = 100; - } - - void FinishedSingleOp() { - if (FLAGS_histogram) { - double now = Env::Default()->NowMicros() * 1e-6; - double micros = (now - last_op_finish_) * 1e6; - hist_.Add(micros); - if (micros > 20000) { - fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); - fflush(stderr); - } - last_op_finish_ = now; - } - - done_++; - if (done_ >= next_report_) { - if (next_report_ < 1000) next_report_ += 100; - else if (next_report_ < 5000) next_report_ += 500; - else if (next_report_ < 10000) next_report_ += 1000; - else if (next_report_ < 50000) next_report_ += 5000; - else if (next_report_ < 100000) next_report_ += 10000; - else if (next_report_ < 500000) next_report_ += 50000; - else next_report_ += 100000; - fprintf(stderr, "... finished %d ops%30s\r", done_, ""); - fflush(stderr); - } - } - - void Stop(const Slice& name) { - double finish = Env::Default()->NowMicros() * 1e-6; - - // Pretend at least one op was done in case we are running a benchmark - // that does nto call FinishedSingleOp(). - if (done_ < 1) done_ = 1; - - if (bytes_ > 0) { - char rate[100]; - snprintf(rate, sizeof(rate), "%6.1f MB/s", - (bytes_ / 1048576.0) / (finish - start_)); - if (!message_.empty()) { - message_ = std::string(rate) + " " + message_; - } else { - message_ = rate; - } - } - - fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", - name.ToString().c_str(), - (finish - start_) * 1e6 / done_, - (message_.empty() ? "" : " "), - message_.c_str()); - if (FLAGS_histogram) { - fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); - } - fflush(stdout); - - if (!post_message_.empty()) { - fprintf(stdout, "\n%s\n", post_message_.c_str()); - post_message_.clear(); - } - } - - public: - enum Order { - SEQUENTIAL, - RANDOM - }; - enum DBState { - FRESH, - EXISTING - }; - - Benchmark() - : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), - db_(NULL), - num_(FLAGS_num), - heap_counter_(0), - bytes_(0), - rand_(301) { - std::vector files; - Env::Default()->GetChildren("/tmp/dbbench", &files); - for (int i = 0; i < files.size(); i++) { - if (Slice(files[i]).starts_with("heap-")) { - Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); - } - } - DestroyDB("/tmp/dbbench", Options()); - } - - ~Benchmark() { - delete db_; - delete cache_; - } - - void Run() { - PrintHeader(); - Open(); - - const char* benchmarks = FLAGS_benchmarks; - while (benchmarks != NULL) { - const char* sep = strchr(benchmarks, ','); - Slice name; - if (sep == NULL) { - name = benchmarks; - benchmarks = NULL; - } else { - name = Slice(benchmarks, sep - benchmarks); - benchmarks = sep + 1; - } - - Start(); - - WriteOptions write_options; - bool known = true; - if (name == Slice("fillseq")) { - Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); - } else if (name == Slice("fillbatch")) { - Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); - } else if (name == Slice("fillrandom")) { - Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1); - } else if (name == Slice("overwrite")) { - Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); - } else if (name == Slice("fillsync")) { - write_options.sync = true; - Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); - } else if (name == Slice("fill100K")) { - Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); - } else if (name == Slice("readseq")) { - ReadSequential(); - } else if (name == Slice("readreverse")) { - ReadReverse(); - } else if (name == Slice("readrandom")) { - ReadRandom(); - } else if (name == Slice("readrandomsmall")) { - int n = num_; - num_ /= 1000; - ReadRandom(); - num_ = n; - } else if (name == Slice("compact")) { - Compact(); - } else if (name == Slice("crc32c")) { - Crc32c(4096, "(4K per op)"); - } else if (name == Slice("snappycomp")) { - SnappyCompress(); - } else if (name == Slice("snappyuncomp")) { - SnappyUncompress(); - } else if (name == Slice("heapprofile")) { - HeapProfile(); - } else if (name == Slice("stats")) { - PrintStats(); - } else { - known = false; - if (name != Slice()) { // No error message for empty name - fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); - } - } - if (known) { - Stop(name); - } - } - } - - private: - void Crc32c(int size, const char* label) { - // Checksum about 500MB of data total - std::string data(size, 'x'); - int64_t bytes = 0; - uint32_t crc = 0; - while (bytes < 500 * 1048576) { - crc = crc32c::Value(data.data(), size); - FinishedSingleOp(); - bytes += size; - } - // Print so result is not dead - fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); - - bytes_ = bytes; - message_ = label; - } - - void SnappyCompress() { - Slice input = gen_.Generate(Options().block_size); - int64_t bytes = 0; - int64_t produced = 0; - bool ok = true; - std::string compressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - produced += compressed.size(); - bytes += input.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "(output: %.1f%%)", - (produced * 100.0) / bytes); - message_ = buf; - bytes_ = bytes; - } - } - - void SnappyUncompress() { - Slice input = gen_.Generate(Options().block_size); - std::string compressed; - bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - int64_t bytes = 0; - std::string uncompressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - &uncompressed); - bytes += uncompressed.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - bytes_ = bytes; - } - } - - void Open() { - assert(db_ == NULL); - Options options; - options.create_if_missing = true; - options.block_cache = cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - Status s = DB::Open(options, "/tmp/dbbench", &db_); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - } - - void Write(const WriteOptions& options, Order order, DBState state, - int num_entries, int value_size, int entries_per_batch) { - if (state == FRESH) { - delete db_; - db_ = NULL; - DestroyDB("/tmp/dbbench", Options()); - Open(); - Start(); // Do not count time taken to destroy/open - } - - if (num_entries != num_) { - char msg[100]; - snprintf(msg, sizeof(msg), "(%d ops)", num_entries); - message_ = msg; - } - - WriteBatch batch; - Status s; - std::string val; - for (int i = 0; i < num_entries; i += entries_per_batch) { - batch.Clear(); - for (int j = 0; j < entries_per_batch; j++) { - const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num); - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - batch.Put(key, gen_.Generate(value_size)); - bytes_ += value_size + strlen(key); - FinishedSingleOp(); - } - s = db_->Write(options, &batch); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - } - } - - void ReadSequential() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadReverse() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadRandom() { - ReadOptions options; - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = rand_.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - } - - void Compact() { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - std::string property; - char name[100]; - snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); - if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbi->TEST_CompactRange(level, "", "~"); - } - } - - void PrintStats() { - std::string stats; - if (!db_->GetProperty("leveldb.stats", &stats)) { - message_ = "(failed)"; - } else { - post_message_ = stats; - } - } - - static void WriteToFile(void* arg, const char* buf, int n) { - reinterpret_cast(arg)->Append(Slice(buf, n)); - } - - void HeapProfile() { - char fname[100]; - snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); - WritableFile* file; - Status s = Env::Default()->NewWritableFile(fname, &file); - if (!s.ok()) { - message_ = s.ToString(); - return; - } - bool ok = port::GetHeapProfile(WriteToFile, file); - delete file; - if (!ok) { - message_ = "not supported"; - Env::Default()->DeleteFile(fname); - } - } -}; - -} - -int main(int argc, char** argv) { - FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; - for (int i = 1; i < argc; i++) { - double d; - int n; - char junk; - if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { - FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); - } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { - FLAGS_compression_ratio = d; - } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_histogram = n; - } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { - FLAGS_num = n; - } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { - FLAGS_value_size = n; - } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { - FLAGS_write_buffer_size = n; - } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { - FLAGS_cache_size = n; - } else { - fprintf(stderr, "Invalid flag '%s'\n", argv[i]); - exit(1); - } - } - - leveldb::Benchmark benchmark; - benchmark.Run(); - return 0; -} diff --git a/leveldb/db/db_impl.cc b/leveldb/db/db_impl.cc deleted file mode 100644 index 3b9e04e..0000000 --- a/leveldb/db/db_impl.cc +++ /dev/null @@ -1,1188 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_impl.h" - -#include -#include -#include -#include -#include -#include -#include "db/builder.h" -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/status.h" -#include "leveldb/table.h" -#include "leveldb/table_builder.h" -#include "port/port.h" -#include "table/block.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -struct DBImpl::CompactionState { - Compaction* const compaction; - - // Sequence numbers < smallest_snapshot are not significant since we - // will never have to service a snapshot below smallest_snapshot. - // Therefore if we have seen a sequence number S <= smallest_snapshot, - // we can drop all entries for the same key with sequence numbers < S. - SequenceNumber smallest_snapshot; - - // Files produced by compaction - struct Output { - uint64_t number; - uint64_t file_size; - InternalKey smallest, largest; - }; - std::vector outputs; - - // State kept for output being generated - WritableFile* outfile; - TableBuilder* builder; - - uint64_t total_bytes; - - Output* current_output() { return &outputs[outputs.size()-1]; } - - explicit CompactionState(Compaction* c) - : compaction(c), - outfile(NULL), - builder(NULL), - total_bytes(0) { - } -}; - -namespace { -class NullWritableFile : public WritableFile { - public: - virtual Status Append(const Slice& data) { return Status::OK(); } - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } -}; -} - -// Fix user-supplied options to be reasonable -template -static void ClipToRange(T* ptr, V minvalue, V maxvalue) { - if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; - if (static_cast(*ptr) < minvalue) *ptr = minvalue; -} -Options SanitizeOptions(const std::string& dbname, - const InternalKeyComparator* icmp, - const Options& src) { - Options result = src; - result.comparator = icmp; - ClipToRange(&result.max_open_files, 20, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); - if (result.info_log == NULL) { - // Open a log file in the same directory as the db - src.env->CreateDir(dbname); // In case it does not exist - src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); - Status s = src.env->NewWritableFile(InfoLogFileName(dbname), - &result.info_log); - if (!s.ok()) { - // No place suitable for logging - result.info_log = new NullWritableFile; - } - } - if (result.block_cache == NULL) { - result.block_cache = NewLRUCache(8 << 20); - } - return result; -} - -DBImpl::DBImpl(const Options& options, const std::string& dbname) - : env_(options.env), - internal_comparator_(options.comparator), - options_(SanitizeOptions(dbname, &internal_comparator_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), - dbname_(dbname), - db_lock_(NULL), - shutting_down_(NULL), - bg_cv_(&mutex_), - compacting_cv_(&mutex_), - mem_(new MemTable(internal_comparator_)), - imm_(NULL), - logfile_(NULL), - log_(NULL), - bg_compaction_scheduled_(false), - compacting_(false) { - has_imm_.Release_Store(NULL); - - // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - 10; - table_cache_ = new TableCache(dbname_, &options_, table_cache_size); - - versions_ = new VersionSet(dbname_, &options_, table_cache_, - &internal_comparator_); -} - -DBImpl::~DBImpl() { - // Wait for background work to finish - mutex_.Lock(); - shutting_down_.Release_Store(this); // Any non-NULL value is ok - if (bg_compaction_scheduled_) { - while (bg_compaction_scheduled_) { - bg_cv_.Wait(); - } - } - mutex_.Unlock(); - - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } - - delete versions_; - delete mem_; - delete imm_; - delete log_; - delete logfile_; - delete table_cache_; - - if (owns_info_log_) { - delete options_.info_log; - } - if (owns_cache_) { - delete options_.block_cache; - } -} - -Status DBImpl::NewDB() { - VersionEdit new_db; - new_db.SetComparatorName(user_comparator()->Name()); - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); - - const std::string manifest = DescriptorFileName(dbname_, 1); - WritableFile* file; - Status s = env_->NewWritableFile(manifest, &file); - if (!s.ok()) { - return s; - } - { - log::Writer log(file); - std::string record; - new_db.EncodeTo(&record); - s = log.AddRecord(record); - if (s.ok()) { - s = file->Close(); - } - } - delete file; - if (s.ok()) { - // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(manifest); - } - return s; -} - -void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { - // No change needed - } else { - Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); - *s = Status::OK(); - } -} - -void DBImpl::DeleteObsoleteFiles() { - // Make a set of all of the live files - std::set live = pending_outputs_; - versions_->AddLiveFiles(&live); - - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number == versions_->LogNumber()) || - (number == versions_->PrevLogNumber())); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= versions_->ManifestFileNumber()); - break; - case kTableFile: - keep = (live.find(number) != live.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; - case kCurrentFile: - case kDBLockFile: - case kInfoLogFile: - keep = true; - break; - } - - if (!keep) { - if (type == kTableFile) { - table_cache_->Evict(number); - } - Log(env_, options_.info_log, "Delete type=%d #%lld\n", - int(type), - static_cast(number)); - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - } - } -} - -Status DBImpl::Recover(VersionEdit* edit) { - mutex_.AssertHeld(); - - // Ignore error from CreateDir since the creation of the DB is - // committed only when the descriptor is created, and this directory - // may already exist from a previous failed creation attempt. - env_->CreateDir(dbname_); - assert(db_lock_ == NULL); - Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); - if (!s.ok()) { - return s; - } - - if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { - s = NewDB(); - if (!s.ok()) { - return s; - } - } else { - return Status::InvalidArgument( - dbname_, "does not exist (create_if_missing is false)"); - } - } else { - if (options_.error_if_exists) { - return Status::InvalidArgument( - dbname_, "exists (error_if_exists is true)"); - } - } - - s = versions_->Recover(); - if (s.ok()) { - // Recover from the log files named in the descriptor - SequenceNumber max_sequence(0); - if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log - s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); - } - if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state - s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); - } - if (s.ok()) { - if (versions_->LastSequence() < max_sequence) { - versions_->SetLastSequence(max_sequence); - } - } - } - - return s; -} - -Status DBImpl::RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - WritableFile* info_log; - const char* fname; - Status* status; // NULL if options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) { - Log(env, info_log, "%s%s: dropping %d bytes; %s", - (this->status == NULL ? "(ignoring error) " : ""), - fname, static_cast(bytes), s.ToString().c_str()); - if (this->status != NULL && this->status->ok()) *this->status = s; - } - }; - - mutex_.AssertHeld(); - - // Open the log file - std::string fname = LogFileName(dbname_, log_number); - SequentialFile* file; - Status status = env_->NewSequentialFile(fname, &file); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks ? &status : NULL); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). - log::Reader reader(file, &reporter, true/*checksum*/); - Log(env_, options_.info_log, "Recovering log #%llu", - (unsigned long long) log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable* mem = NULL; - while (reader.ReadRecord(&record, &scratch) && - status.ok()) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - - if (mem == NULL) { - mem = new MemTable(internal_comparator_); - } - status = WriteBatchInternal::InsertInto(&batch, mem); - MaybeIgnoreError(&status); - if (!status.ok()) { - break; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } - - if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - status = WriteLevel0Table(mem, edit); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - break; - } - delete mem; - mem = NULL; - } - } - - if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit); - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - } - - delete mem; - delete file; - return status; -} - -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - meta.number = versions_->NewFileNumber(); - pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); - Log(env_, options_.info_log, "Level-0 table #%llu: started", - (unsigned long long) meta.number); - - Status s; - { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); - mutex_.Lock(); - } - - Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", - (unsigned long long) meta.number, - (unsigned long long) meta.file_size, - s.ToString().c_str()); - delete iter; - pending_outputs_.erase(meta.number); - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; - stats_[0].Add(stats); - return s; -} - -Status DBImpl::CompactMemTable() { - mutex_.AssertHeld(); - assert(imm_ != NULL); - assert(compacting_); - - // Save the contents of the memtable as a new Table - VersionEdit edit; - Status s = WriteLevel0Table(imm_, &edit); - - // Replace immutable memtable with the generated Table - if (s.ok()) { - edit.SetPrevLogNumber(0); - s = versions_->LogAndApply(&edit, imm_); - } - - if (s.ok()) { - // Commit to the new state - imm_ = NULL; - has_imm_.Release_Store(NULL); - DeleteObsoleteFiles(); - } - - compacting_cv_.SignalAll(); // Wake up waiter even if there was an error - return s; -} - -void DBImpl::TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end) { - MutexLock l(&mutex_); - while (compacting_) { - compacting_cv_.Wait(); - } - Compaction* c = versions_->CompactRange( - level, - InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(end, 0, static_cast(0))); - - if (c != NULL) { - CompactionState* compact = new CompactionState(c); - DoCompactionWork(compact); // Ignore error in test compaction - CleanupCompaction(compact); - } - - // Start any background compaction that may have been delayed by this thread - MaybeScheduleCompaction(); -} - -Status DBImpl::TEST_CompactMemTable() { - MutexLock l(&mutex_); - Status s = MakeRoomForWrite(true /* force compaction */); - if (s.ok()) { - // Wait until the compaction completes - while (imm_ != NULL && bg_error_.ok()) { - compacting_cv_.Wait(); - } - if (imm_ != NULL) { - s = bg_error_; - } - } - return s; -} - -void DBImpl::MaybeScheduleCompaction() { - mutex_.AssertHeld(); - if (bg_compaction_scheduled_) { - // Already scheduled - } else if (compacting_) { - // Some other thread is running a compaction. Do not conflict with it. - } else if (shutting_down_.Acquire_Load()) { - // DB is being deleted; no more background compactions - } else if (imm_ == NULL && !versions_->NeedsCompaction()) { - // No work to be done - } else { - bg_compaction_scheduled_ = true; - env_->Schedule(&DBImpl::BGWork, this); - } -} - -void DBImpl::BGWork(void* db) { - reinterpret_cast(db)->BackgroundCall(); -} - -void DBImpl::BackgroundCall() { - MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load() && - !compacting_) { - BackgroundCompaction(); - } - bg_compaction_scheduled_ = false; - bg_cv_.SignalAll(); - - // Previous compaction may have produced too many files in a level, - // so reschedule another compaction if needed. - MaybeScheduleCompaction(); -} - -void DBImpl::BackgroundCompaction() { - mutex_.AssertHeld(); - assert(!compacting_); - - if (imm_ != NULL) { - compacting_ = true; - CompactMemTable(); - compacting_ = false; - compacting_cv_.SignalAll(); - return; - } - - Compaction* c = versions_->PickCompaction(); - if (c == NULL) { - // Nothing to do - return; - } - - Status status; - if (c->IsTrivialMove()) { - // Move file to next level - assert(c->num_input_files(0) == 1); - FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), NULL); - Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", - static_cast(f->number), - c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str()); - } else { - CompactionState* compact = new CompactionState(c); - status = DoCompactionWork(compact); - CleanupCompaction(compact); - } - delete c; - - if (status.ok()) { - // Done - } else if (shutting_down_.Acquire_Load()) { - // Ignore compaction errors found during shutting down - } else { - Log(env_, options_.info_log, - "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } - } -} - -void DBImpl::CleanupCompaction(CompactionState* compact) { - mutex_.AssertHeld(); - if (compact->builder != NULL) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - delete compact->builder; - } else { - assert(compact->outfile == NULL); - } - delete compact->outfile; - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - } - delete compact; -} - -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { - assert(compact != NULL); - assert(compact->builder == NULL); - uint64_t file_number; - { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_.insert(file_number); - CompactionState::Output out; - out.number = file_number; - out.smallest.Clear(); - out.largest.Clear(); - compact->outputs.push_back(out); - mutex_.Unlock(); - } - - // Make the output file - std::string fname = TableFileName(dbname_, file_number); - Status s = env_->NewWritableFile(fname, &compact->outfile); - if (s.ok()) { - compact->builder = new TableBuilder(options_, compact->outfile); - } - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != NULL); - assert(compact->outfile != NULL); - assert(compact->builder != NULL); - - const uint64_t output_number = compact->current_output()->number; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - delete compact->builder; - compact->builder = NULL; - - // Finish and check for file errors - if (s.ok()) { - s = compact->outfile->Sync(); - } - if (s.ok()) { - s = compact->outfile->Close(); - } - delete compact->outfile; - compact->outfile = NULL; - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - Iterator* iter = table_cache_->NewIterator(ReadOptions(), - output_number, - current_bytes); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(env_, options_.info_log, - "Generated table #%llu: %lld keys, %lld bytes", - (unsigned long long) output_number, - (unsigned long long) current_entries, - (unsigned long long) current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact) { - mutex_.AssertHeld(); - Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1, - static_cast(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile( - level + 1, - out.number, out.file_size, out.smallest, out.largest); - pending_outputs_.erase(out.number); - } - compact->outputs.clear(); - - Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); - if (s.ok()) { - compact->compaction->ReleaseInputs(); - DeleteObsoleteFiles(); - } else { - // Discard any files we may have created during this failed compaction - for (size_t i = 0; i < compact->outputs.size(); i++) { - env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); - } - } - return s; -} - -Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1); - - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == NULL); - assert(compact->outfile == NULL); - if (snapshots_.empty()) { - compact->smallest_snapshot = versions_->LastSequence(); - } else { - compact->smallest_snapshot = snapshots_.oldest()->number_; - } - - // Release mutex while we're actually doing the compaction work - compacting_ = true; - mutex_.Unlock(); - - Iterator* input = versions_->MakeInputIterator(compact->compaction); - input->SeekToFirst(); - Status status; - ParsedInternalKey ikey; - std::string current_user_key; - bool has_current_user_key = false; - SequenceNumber last_sequence_for_key = kMaxSequenceNumber; - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - if (has_imm_.NoBarrier_Load() != NULL) { - const uint64_t imm_start = env_->NowMicros(); - mutex_.Lock(); - if (imm_ != NULL) { - CompactMemTable(); - compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); - } - - Slice key = input->key(); - InternalKey tmp_internal_key; - tmp_internal_key.DecodeFrom(key); - if (compact->compaction->ShouldStopBefore(tmp_internal_key) && - compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // Handle key/value, add to state, etc. - bool drop = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - current_user_key.clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - user_comparator()->Compare(ikey.user_key, - Slice(current_user_key)) != 0) { - // First occurrence of this user key - current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - } - - if (last_sequence_for_key <= compact->smallest_snapshot) { - // Hidden by an newer entry for same user key - drop = true; // (A) - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= compact->smallest_snapshot && - compact->compaction->IsBaseLevelForKey(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. - drop = true; - } - - last_sequence_for_key = ikey.sequence; - } -#if 0 - Log(env_, options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeValue, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)compact->smallest_snapshot); -#endif - - if (!drop) { - // Open output file if necessary - if (compact->builder == NULL) { - status = OpenCompactionOutputFile(compact); - if (!status.ok()) { - break; - } - } - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(key); - } - compact->current_output()->largest.DecodeFrom(key); - compact->builder->Add(key, input->value()); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - } - - input->Next(); - } - - if (status.ok() && shutting_down_.Acquire_Load()) { - status = Status::IOError("Deleting DB during compaction"); - } - if (status.ok() && compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - } - if (status.ok()) { - status = input->status(); - } - delete input; - input = NULL; - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; - for (int which = 0; which < 2; which++) { - for (int i = 0; i < compact->compaction->num_input_files(which); i++) { - stats.bytes_read += compact->compaction->input(which, i)->file_size; - } - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - mutex_.Lock(); - stats_[compact->compaction->level() + 1].Add(stats); - - if (status.ok()) { - status = InstallCompactionResults(compact); - } - compacting_ = false; - compacting_cv_.SignalAll(); - return status; -} - -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); - - // Collect together all needed child iterators - std::vector list; - list.push_back(mem_->NewIterator()); - if (imm_ != NULL) { - list.push_back(imm_->NewIterator()); - } - versions_->current()->AddIterators(options, &list); - Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); - - mutex_.Unlock(); - return internal_iter; -} - -Iterator* DBImpl::TEST_NewInternalIterator() { - SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); -} - -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { - MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); -} - -Status DBImpl::Get(const ReadOptions& options, - const Slice& key, - std::string* value) { - // TODO(opt): faster implementation - Iterator* iter = NewIterator(options); - iter->Seek(key); - bool found = false; - if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { - Slice v = iter->value(); - value->assign(v.data(), v.size()); - found = true; - } - // Non-OK iterator status trumps everything else - Status result = iter->status(); - if (result.ok() && !found) { - result = Status::NotFound(Slice()); // Use an empty error message for speed - } - delete iter; - return result; -} - -Iterator* DBImpl::NewIterator(const ReadOptions& options) { - SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - SequenceNumber sequence = - (options.snapshot ? options.snapshot->number_ : latest_snapshot); - return NewDBIterator(&dbname_, env_, - user_comparator(), internal_iter, sequence); -} - -void DBImpl::Unref(void* arg1, void* arg2) { - DBImpl* impl = reinterpret_cast(arg1); - Version* v = reinterpret_cast(arg2); - MutexLock l(&impl->mutex_); - v->Unref(); -} - -const Snapshot* DBImpl::GetSnapshot() { - MutexLock l(&mutex_); - return snapshots_.New(versions_->LastSequence()); -} - -void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); - snapshots_.Delete(s); -} - -// Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); -} - -Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - return DB::Delete(options, key); -} - -Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { - Status status; - MutexLock l(&mutex_); - status = MakeRoomForWrite(false); // May temporarily release lock and wait - uint64_t last_sequence = versions_->LastSequence(); - if (status.ok()) { - WriteBatchInternal::SetSequence(updates, last_sequence + 1); - last_sequence += WriteBatchInternal::Count(updates); - versions_->SetLastSequence(last_sequence); - - // Add to log and apply to memtable - status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - if (status.ok() && options.sync) { - status = logfile_->Sync(); - } - if (status.ok()) { - status = WriteBatchInternal::InsertInto(updates, mem_); - } - } - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? snapshots_.New(last_sequence) : NULL; - } - return status; -} - -Status DBImpl::MakeRoomForWrite(bool force) { - mutex_.AssertHeld(); - Status s; - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (!force && - (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { - // There is room in current memtable - break; - } else if (imm_ != NULL) { - // We have filled up the current memtable, but the previous - // one is still being compacted, so we wait. - compacting_cv_.Wait(); - } else { - // Attempt to switch to a new memtable and trigger compaction of old - assert(versions_->PrevLogNumber() == 0); - uint64_t new_log_number = versions_->NewFileNumber(); - WritableFile* lfile = NULL; - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); - if (!s.ok()) { - break; - } - VersionEdit edit; - edit.SetPrevLogNumber(versions_->LogNumber()); - edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit, NULL); - if (!s.ok()) { - delete lfile; - env_->DeleteFile(LogFileName(dbname_, new_log_number)); - break; - } - delete log_; - delete logfile_; - logfile_ = lfile; - log_ = new log::Writer(lfile); - imm_ = mem_; - has_imm_.Release_Store(imm_); - mem_ = new MemTable(internal_comparator_); - force = false; // Do not force another compaction if have room - MaybeScheduleCompaction(); - } - } - return s; -} - -bool DBImpl::GetProperty(const Slice& property, std::string* value) { - value->clear(); - - MutexLock l(&mutex_); - Slice in = property; - Slice prefix("leveldb."); - if (!in.starts_with(prefix)) return false; - in.remove_prefix(prefix.size()); - - if (in.starts_with("num-files-at-level")) { - in.remove_prefix(strlen("num-files-at-level")); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level < 0 || level >= config::kNumLevels) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); - *value = buf; - return true; - } - } else if (in == "stats") { - char buf[200]; - snprintf(buf, sizeof(buf), - " Compactions\n" - "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" - "--------------------------------------------------\n" - ); - value->append(buf); - for (int level = 0; level < config::kNumLevels; level++) { - int files = versions_->NumLevelFiles(level); - if (stats_[level].micros > 0 || files > 0) { - snprintf( - buf, sizeof(buf), - "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", - level, - files, - versions_->NumLevelBytes(level) / 1048576.0, - stats_[level].micros / 1e6, - stats_[level].bytes_read / 1048576.0, - stats_[level].bytes_written / 1048576.0); - value->append(buf); - } - } - return true; - } - - return false; -} - -void DBImpl::GetApproximateSizes( - const Range* range, int n, - uint64_t* sizes) { - // TODO(opt): better implementation - Version* v; - { - MutexLock l(&mutex_); - versions_->current()->Ref(); - v = versions_->current(); - } - - for (int i = 0; i < n; i++) { - // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); - uint64_t start = versions_->ApproximateOffsetOf(v, k1); - uint64_t limit = versions_->ApproximateOffsetOf(v, k2); - sizes[i] = (limit >= start ? limit - start : 0); - } - - { - MutexLock l(&mutex_); - v->Unref(); - } -} - -// Default implementations of convenience methods that subclasses of DB -// can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; - batch.Put(key, value); - return Write(opt, &batch); -} - -Status DB::Delete(const WriteOptions& opt, const Slice& key) { - WriteBatch batch; - batch.Delete(key); - return Write(opt, &batch); -} - -DB::~DB() { } - -Status DB::Open(const Options& options, const std::string& dbname, - DB** dbptr) { - *dbptr = NULL; - - DBImpl* impl = new DBImpl(options, dbname); - impl->mutex_.Lock(); - VersionEdit edit; - Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists - if (s.ok()) { - uint64_t new_log_number = impl->versions_->NewFileNumber(); - WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), - &lfile); - if (s.ok()) { - edit.SetLogNumber(new_log_number); - impl->logfile_ = lfile; - impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, NULL); - } - if (s.ok()) { - impl->DeleteObsoleteFiles(); - } - } - impl->mutex_.Unlock(); - if (s.ok()) { - *dbptr = impl; - } else { - delete impl; - } - return s; -} - -Status DestroyDB(const std::string& dbname, const Options& options) { - Env* env = options.env; - std::vector filenames; - // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); - if (filenames.empty()) { - return Status::OK(); - } - - FileLock* lock; - Status result = env->LockFile(LockFileName(dbname), &lock); - if (result.ok()) { - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - Status del = env->DeleteFile(dbname + "/" + filenames[i]); - if (result.ok() && !del.ok()) { - result = del; - } - } - } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(LockFileName(dbname)); - env->DeleteDir(dbname); // Ignore error in case dir contains other files - } - return result; -} - -} diff --git a/leveldb/db/db_impl.h b/leveldb/db/db_impl.h deleted file mode 100644 index 7699d8c..0000000 --- a/leveldb/db/db_impl.h +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ -#define STORAGE_LEVELDB_DB_DB_IMPL_H_ - -#include -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "port/port.h" - -namespace leveldb { - -class MemTable; -class TableCache; -class Version; -class VersionEdit; -class VersionSet; - -class DBImpl : public DB { - public: - DBImpl(const Options& options, const std::string& dbname); - virtual ~DBImpl(); - - // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); - virtual Status Delete(const WriteOptions&, const Slice& key); - virtual Status Write(const WriteOptions& options, WriteBatch* updates); - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); - virtual Iterator* NewIterator(const ReadOptions&); - virtual const Snapshot* GetSnapshot(); - virtual void ReleaseSnapshot(const Snapshot* snapshot); - virtual bool GetProperty(const Slice& property, std::string* value); - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); - - // Extra methods (for testing) that are not in the public DB interface - - // Compact any files in the named level that overlap [begin,end] - void TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end); - - // Force current memtable contents to be compacted. - Status TEST_CompactMemTable(); - - // Return an internal iterator over the current state of the database. - // The keys of this iterator are internal keys (see format.h). - // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(); - - private: - friend class DB; - - Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); - - Status NewDB(); - - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. Any changes to - // be made to the descriptor are added to *edit. - Status Recover(VersionEdit* edit); - - void MaybeIgnoreError(Status* s) const; - - // Delete any unneeded files and stale in-memory entries. - void DeleteObsoleteFiles(); - - // Called when an iterator over a particular version of the - // descriptor goes away. - static void Unref(void* arg1, void* arg2); - - // Compact the in-memory write buffer to disk. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable(); - - Status RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence); - - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); - - Status MakeRoomForWrite(bool force /* compact even if there is room? */); - - struct CompactionState; - - void MaybeScheduleCompaction(); - static void BGWork(void* db); - void BackgroundCall(); - void BackgroundCompaction(); - void CleanupCompaction(CompactionState* compact); - Status DoCompactionWork(CompactionState* compact); - - Status OpenCompactionOutputFile(CompactionState* compact); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact); - - // Constant after construction - Env* const env_; - const InternalKeyComparator internal_comparator_; - const Options options_; // options_.comparator == &internal_comparator_ - bool owns_info_log_; - bool owns_cache_; - const std::string dbname_; - - // table_cache_ provides its own synchronization - TableCache* table_cache_; - - // Lock over the persistent DB state. Non-NULL iff successfully acquired. - FileLock* db_lock_; - - // State below is protected by mutex_ - port::Mutex mutex_; - port::AtomicPointer shutting_down_; - port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_ - port::CondVar compacting_cv_; // Signalled when !compacting_ - MemTable* mem_; - MemTable* imm_; // Memtable being compacted - port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ - WritableFile* logfile_; - log::Writer* log_; - SnapshotList snapshots_; - - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - std::set pending_outputs_; - - // Has a background compaction been scheduled or is running? - bool bg_compaction_scheduled_; - - // Is there a compaction running? - bool compacting_; - - VersionSet* versions_; - - // Have we encountered a background error in paranoid mode? - Status bg_error_; - - // Per level compaction stats. stats_[level] stores the stats for - // compactions that produced data for the specified "level". - struct CompactionStats { - int64_t micros; - int64_t bytes_read; - int64_t bytes_written; - - CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { } - - void Add(const CompactionStats& c) { - this->micros += c.micros; - this->bytes_read += c.bytes_read; - this->bytes_written += c.bytes_written; - } - }; - CompactionStats stats_[config::kNumLevels]; - - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - const Comparator* user_comparator() const { - return internal_comparator_.user_comparator(); - } -}; - -// Sanitize db options. The caller should delete result.info_log if -// it is not equal to src.info_log. -extern Options SanitizeOptions(const std::string& db, - const InternalKeyComparator* icmp, - const Options& src); - -} - -#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ diff --git a/leveldb/db/db_iter.cc b/leveldb/db/db_iter.cc deleted file mode 100644 index 0be18ff..0000000 --- a/leveldb/db/db_iter.cc +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_iter.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -#if 0 -static void DumpInternalIter(Iterator* iter) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey k; - if (!ParseInternalKey(iter->key(), &k)) { - fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); - } else { - fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); - } - } -} -#endif - -namespace { - -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter: public Iterator { - public: - // Which direction is the iterator currently moving? - // (1) When moving forward, the internal iterator is positioned at - // the exact entry that yields this->key(), this->value() - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), - user_comparator_(cmp), - iter_(iter), - sequence_(s), - direction_(kForward), - valid_(false) { - } - virtual ~DBIter() { - delete iter_; - } - virtual bool Valid() const { return valid_; } - virtual Slice key() const { - assert(valid_); - return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; - } - virtual Slice value() const { - assert(valid_); - return (direction_ == kForward) ? iter_->value() : saved_value_; - } - virtual Status status() const { - if (status_.ok()) { - return iter_->status(); - } else { - return status_; - } - } - - virtual void Next(); - virtual void Prev(); - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - - private: - void FindNextUserEntry(bool skipping, std::string* skip); - void FindPrevUserEntry(); - bool ParseKey(ParsedInternalKey* key); - - inline void SaveKey(const Slice& k, std::string* dst) { - dst->assign(k.data(), k.size()); - } - - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } - } - - const std::string* const dbname_; - Env* const env_; - const Comparator* const user_comparator_; - Iterator* const iter_; - SequenceNumber const sequence_; - - Status status_; - std::string saved_key_; // == current key when direction_==kReverse - std::string saved_value_; // == current raw value when direction_==kReverse - Direction direction_; - bool valid_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); - return false; - } else { - return true; - } -} - -void DBIter::Next() { - assert(valid_); - - if (direction_ == kReverse) { // Switch directions? - direction_ = kForward; - // iter_ is pointing just before the entries for this->key(), - // so advance into the range of entries for this->key() and then - // use the normal skipping code below. - if (!iter_->Valid()) { - iter_->SeekToFirst(); - } else { - iter_->Next(); - } - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - return; - } - } - - // Temporarily use saved_key_ as storage for key to skip. - std::string* skip = &saved_key_; - SaveKey(ExtractUserKey(iter_->key()), skip); - FindNextUserEntry(true, skip); -} - -void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { - // Loop until we hit an acceptable entry to yield - assert(iter_->Valid()); - assert(direction_ == kForward); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - SaveKey(ikey.user_key, skip); - skipping = true; - break; - case kTypeValue: - if (skipping && - user_comparator_->Compare(ikey.user_key, *skip) <= 0) { - // Entry hidden - } else { - valid_ = true; - saved_key_.clear(); - return; - } - break; - } - } - iter_->Next(); - } while (iter_->Valid()); - saved_key_.clear(); - valid_ = false; -} - -void DBIter::Prev() { - assert(valid_); - - if (direction_ == kForward) { // Switch directions? - // iter_ is pointing at the current entry. Scan backwards until - // the key changes so we can use the normal reverse scanning code. - assert(iter_->Valid()); // Otherwise valid_ would have been false - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - while (true) { - iter_->Prev(); - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - return; - } - if (user_comparator_->Compare(ExtractUserKey(iter_->key()), - saved_key_) < 0) { - break; - } - } - direction_ = kReverse; - } - - FindPrevUserEntry(); -} - -void DBIter::FindPrevUserEntry() { - assert(direction_ == kReverse); - - ValueType value_type = kTypeDeletion; - if (iter_->Valid()) { - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if ((value_type != kTypeDeletion) && - user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { - // We encountered a non-deleted value in entries for previous keys, - break; - } - value_type = ikey.type; - if (value_type == kTypeDeletion) { - ClearSavedValue(); - } else { - Slice raw_value = iter_->value(); - if (saved_value_.capacity() > raw_value.size() + 1048576) { - std::string empty; - swap(empty, saved_value_); - } - saved_value_.assign(raw_value.data(), raw_value.size()); - } - } - iter_->Prev(); - } while (iter_->Valid()); - } - - if (value_type == kTypeDeletion) { - // End - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - direction_ = kForward; - } else { - valid_ = true; - } -} - -void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ClearSavedValue(); - saved_key_.clear(); - AppendInternalKey( - &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); - iter_->Seek(saved_key_); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToFirst() { - direction_ = kForward; - ClearSavedValue(); - iter_->SeekToFirst(); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToLast() { - direction_ = kReverse; - ClearSavedValue(); - iter_->SeekToLast(); - FindPrevUserEntry(); -} - -} // anonymous namespace - -Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); -} - -} diff --git a/leveldb/db/db_iter.h b/leveldb/db/db_iter.h deleted file mode 100644 index 195f3d3..0000000 --- a/leveldb/db/db_iter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ -#define STORAGE_LEVELDB_DB_DB_ITER_H_ - -#include -#include "leveldb/db.h" -#include "db/dbformat.h" - -namespace leveldb { - -// Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number -// into appropriate user keys. -extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence); - -} - -#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/leveldb/db/db_test.cc b/leveldb/db/db_test.cc deleted file mode 100644 index f828e3d..0000000 --- a/leveldb/db/db_test.cc +++ /dev/null @@ -1,1030 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -class DBTest { - public: - std::string dbname_; - Env* env_; - DB* db_; - - Options last_options_; - - DBTest() : env_(Env::Default()) { - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, Options()); - db_ = NULL; - Reopen(); - } - - ~DBTest() { - delete db_; - DestroyDB(dbname_, Options()); - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void DestroyAndReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - DestroyDB(dbname_, Options()); - ASSERT_OK(TryReopen(options)); - } - - Status TryReopen(Options* options) { - delete db_; - db_ = NULL; - Options opts; - if (options != NULL) { - opts = *options; - } else { - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const std::string& k, const std::string& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - std::string AllEntriesFor(const Slice& user_key) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); - iter->Seek(target.Encode()); - std::string result; - if (!iter->status().ok()) { - result = iter->status().ToString(); - } else { - result = "[ "; - bool first = true; - while (iter->Valid()) { - ParsedInternalKey ikey; - if (!ParseInternalKey(iter->key(), &ikey)) { - result += "CORRUPTED"; - } else { - if (last_options_.comparator->Compare( - ikey.user_key, user_key) != 0) { - break; - } - if (!first) { - result += ", "; - } - first = false; - switch (ikey.type) { - case kTypeValue: - result += iter->value().ToString(); - break; - case kTypeDeletion: - result += "DEL"; - break; - } - } - iter->Next(); - } - if (!first) { - result += " "; - } - result += "]"; - } - delete iter; - return result; - } - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - uint64_t Size(const Slice& start, const Slice& limit) { - Range r(start, limit); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - void Compact(const Slice& start, const Slice& limit) { - dbfull()->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - if (NumTableFilesAtLevel(level) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbfull()->TEST_CompactRange(level, "", "~"); - } - } - - void DumpFileCounts(const char* label) { - fprintf(stderr, "---\n%s:\n", label); - fprintf(stderr, "maxoverlap: %lld\n", - static_cast( - dbfull()->TEST_MaxNextLevelOverlappingBytes())); - for (int level = 0; level < config::kNumLevels; level++) { - int num = NumTableFilesAtLevel(level); - if (num > 0) { - fprintf(stderr, " level %3d : %d files\n", level, num); - } - } - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(DBTest, Empty) { - ASSERT_TRUE(db_ != NULL); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, ReadWrite) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); -} - -TEST(DBTest, PutDeleteGet) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, IterEmpty) { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSingle) { - ASSERT_OK(Put("a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterMulti) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put("a", "va2")); - ASSERT_OK(Put("a2", "va3")); - ASSERT_OK(Put("b", "vb2")); - ASSERT_OK(Put("c", "vc2")); - ASSERT_OK(Delete("b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSmallAndLargeMix) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", std::string(100000, 'b'))); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Put("d", std::string(100000, 'd'))); - ASSERT_OK(Put("e", std::string(100000, 'e'))); - - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, Recover) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); - - Reopen(); - ASSERT_EQ("v1", Get("foo")); - - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); -} - -TEST(DBTest, RecoveryWithEmptyLog) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key%06d", i); - return std::string(buf); -} - -TEST(DBTest, MinorCompactionsHappen) { - Options options; - options.write_buffer_size = 10000; - Reopen(&options); - - const int N = 500; - - int starting_num_tables = NumTableFilesAtLevel(0); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); - } - int ending_num_tables = NumTableFilesAtLevel(0); - ASSERT_GT(ending_num_tables, starting_num_tables); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } - - Reopen(); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } -} - -TEST(DBTest, RecoverWithLargeLog) { - { - Options options; - Reopen(&options); - ASSERT_OK(Put("big1", std::string(200000, '1'))); - ASSERT_OK(Put("big2", std::string(200000, '2'))); - ASSERT_OK(Put("small3", std::string(10, '3'))); - ASSERT_OK(Put("small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - } - - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options; - options.write_buffer_size = 100000; - Reopen(&options); - ASSERT_EQ(NumTableFilesAtLevel(0), 3); - ASSERT_EQ(std::string(200000, '1'), Get("big1")); - ASSERT_EQ(std::string(200000, '2'), Get("big2")); - ASSERT_EQ(std::string(10, '3'), Get("small3")); - ASSERT_EQ(std::string(10, '4'), Get("small4")); - ASSERT_GT(NumTableFilesAtLevel(0), 1); -} - -TEST(DBTest, CompactionsGenerateMultipleFiles) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - Reopen(&options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - std::vector values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); - } - - // Reopening moves updates to level-0 - Reopen(&options); - dbfull()->TEST_CompactRange(0, "", Key(100000)); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); - for (int i = 0; i < 80; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); - } -} - -TEST(DBTest, SparseMerge) { - Options options; - options.compression = kNoCompression; - Reopen(&options); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put("A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); - } - Put("C", "vc"); - Compact("", "z"); - - // Make sparse update - Put("A", "va2"); - Put("B100", "bvalue2"); - Put("C", "vc2"); - dbfull()->TEST_CompactMemTable(); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -TEST(DBTest, ApproximateSizes) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); - - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); - } - - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), - 100000 * (i+1), 100000 * (i+1) + 10000)); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), - 100000 * 10, 100000 * 10 + 10000)); - } - ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); - } - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); - } -} - -TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - Options options; - options.compression = kNoCompression; - Reopen(); - - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); - - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - - dbfull()->TEST_CompactRange(0, Key(0), Key(100)); - } -} - -TEST(DBTest, IteratorPinsRef) { - Put("foo", "hello"); - - // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions()); - - // Write to force compactions - Put("foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values - } - Put("foo", "newvalue2"); - - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; -} - -TEST(DBTest, Snapshot) { - Put("foo", "v1"); - const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); - const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); - const Snapshot* s3 = db_->GetSnapshot(); - - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); -} - -TEST(DBTest, HiddenValuesAreRemoved) { - Random rnd(301); - std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - dbfull()->TEST_CompactRange(0, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); -} - -TEST(DBTest, DeletionMarkers1) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); -} - -TEST(DBTest, DeletionMarkers2) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL kept: L2 file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); -} - -TEST(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const { return "leveldb.NewComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - NewComparator cmp; - Options new_options; - new_options.comparator = &cmp; - Status s = TryReopen(&new_options); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); -} - -TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - DestroyDB(dbname, Options()); - - // Does not exist, and create_if_missing == false: error - DB* db = NULL; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); - ASSERT_TRUE(db == NULL); - - // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; - - // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); - ASSERT_TRUE(db == NULL); - - // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; -} - -class ModelDB: public DB { - public: - explicit ModelDB(const Options& options): options_(options) { } - ~ModelDB() { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); - } - virtual Status Delete(const WriteOptions& o, const Slice& key) { - return DB::Delete(o, key); - } - virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) { - assert(false); // Not implemented - return Status::NotFound(key); - } - virtual Iterator* NewIterator(const ReadOptions& options) { - if (options.snapshot == NULL) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - reinterpret_cast(options.snapshot->number_); - return new ModelIter(snapshot_state, false); - } - } - virtual const Snapshot* GetSnapshot() { - KVMap* saved = new KVMap; - *saved = map_; - return snapshots_.New( - reinterpret_cast(saved)); - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) { - const KVMap* saved = reinterpret_cast(snapshot->number_); - delete saved; - snapshots_.Delete(snapshot); - } - virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - assert(options.post_write_snapshot == NULL); // Not supported - for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - map_[it.key().ToString()] = it.value().ToString(); - break; - case kTypeDeletion: - map_.erase(it.key().ToString()); - break; - } - } - return Status::OK(); - } - - virtual bool GetProperty(const Slice& property, std::string* value) { - return false; - } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - private: - typedef std::map KVMap; - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool Valid() const { return iter_ != map_->end(); } - virtual void SeekToFirst() { iter_ = map_->begin(); } - virtual void SeekToLast() { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() { ++iter_; } - virtual void Prev() { --iter_; } - virtual Slice key() const { return iter_->first; } - virtual Slice value() const { return iter_->second; } - virtual Status status() const { return Status::OK(); } - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; - SnapshotList snapshots_; -}; - -static std::string RandomKey(Random* rnd) { - int len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); - delete miter; - delete dbiter; - return ok; -} - -TEST(DBTest, Randomized) { - Random rnd(test::RandomSeed()); - ModelDB model(last_options_); - const int N = 10000; - const Snapshot* model_snap = NULL; - const Snapshot* db_snap = NULL; - std::string k, v; - for (int step = 0; step < N; step++) { - if (step % 100 == 0) { - fprintf(stderr, "Step %d of %d\n", step, N); - } - int p = rnd.Uniform(100); - if (p < 45) { // Put - k = RandomKey(&rnd); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - - Reopen(); - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - } - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/dbformat.cc b/leveldb/db/dbformat.cc deleted file mode 100644 index c12c138..0000000 --- a/leveldb/db/dbformat.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "db/dbformat.h" -#include "port/port.h" -#include "util/coding.h" - -namespace leveldb { - -static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(t <= kValueTypeForSeek); - return (seq << 8) | t; -} - -void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { - result->append(key.user_key.data(), key.user_key.size()); - PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); -} - -std::string ParsedInternalKey::DebugString() const { - char buf[50]; - snprintf(buf, sizeof(buf), "' @ %llu : %d", - (unsigned long long) sequence, - int(type)); - std::string result = "'"; - result += user_key.ToString(); - result += buf; - return result; -} - -const char* InternalKeyComparator::Name() const { - return "leveldb.InternalKeyComparator"; -} - -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - -void InternalKeyComparator::FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Attempt to shorten the user portion of the key - Slice user_start = ExtractUserKey(*start); - Slice user_limit = ExtractUserKey(limit); - std::string tmp(user_start.data(), user_start.size()); - user_comparator_->FindShortestSeparator(&tmp, user_limit); - if (user_comparator_->Compare(*start, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*start, tmp) < 0); - assert(this->Compare(tmp, limit) < 0); - start->swap(tmp); - } -} - -void InternalKeyComparator::FindShortSuccessor(std::string* key) const { - Slice user_key = ExtractUserKey(*key); - std::string tmp(user_key.data(), user_key.size()); - user_comparator_->FindShortSuccessor(&tmp); - if (user_comparator_->Compare(user_key, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*key, tmp) < 0); - key->swap(tmp); - } -} - -} diff --git a/leveldb/db/dbformat.h b/leveldb/db/dbformat.h deleted file mode 100644 index d583665..0000000 --- a/leveldb/db/dbformat.h +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ -#define STORAGE_LEVELDB_DB_FORMAT_H_ - -#include -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/slice.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -// Grouping of constants. We may want to make some of these -// parameters set via options. -namespace config { -static const int kNumLevels = 7; -} - -class InternalKey; - -// Value types encoded as the last component of internal keys. -// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk -// data structures. -enum ValueType { - kTypeDeletion = 0x0, - kTypeValue = 0x1, -}; -// kValueTypeForSeek defines the ValueType that should be passed when -// constructing a ParsedInternalKey object for seeking to a particular -// sequence number (since we sort sequence numbers in decreasing order -// and the value type is embedded as the low 8 bits in the sequence -// number in internal keys, we need to use the highest-numbered -// ValueType, not the lowest). -static const ValueType kValueTypeForSeek = kTypeValue; - -typedef uint64_t SequenceNumber; - -// We leave eight bits empty at the bottom so a type and sequence# -// can be packed together into 64-bits. -static const SequenceNumber kMaxSequenceNumber = - ((0x1ull << 56) - 1); - -struct ParsedInternalKey { - Slice user_key; - SequenceNumber sequence; - ValueType type; - - ParsedInternalKey() { } // Intentionally left uninitialized (for speed) - ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } - std::string DebugString() const; -}; - -// Return the length of the encoding of "key". -inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; -} - -// Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); - -// Attempt to parse an internal key from "internal_key". On success, -// stores the parsed data in "*result", and returns true. -// -// On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); - -// Returns the user key portion of an internal key. -inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); -} - -inline ValueType ExtractValueType(const Slice& internal_key) { - assert(internal_key.size() >= 8); - const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - return static_cast(c); -} - -// A comparator for internal keys that uses a specified comparator for -// the user key portion and breaks ties by decreasing sequence number. -class InternalKeyComparator : public Comparator { - private: - const Comparator* user_comparator_; - public: - explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } - virtual const char* Name() const; - virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const; - virtual void FindShortSuccessor(std::string* key) const; - - const Comparator* user_comparator() const { return user_comparator_; } - - int Compare(const InternalKey& a, const InternalKey& b) const; -}; - -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. -class InternalKey { - private: - std::string rep_; - public: - InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); - } - - void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } - Slice Encode() const { - assert(!rep_.empty()); - return rep_; - } - - Slice user_key() const { return ExtractUserKey(rep_); } - - void SetFrom(const ParsedInternalKey& p) { - rep_.clear(); - AppendInternalKey(&rep_, p); - } - - void Clear() { rep_.clear(); } -}; - -inline int InternalKeyComparator::Compare( - const InternalKey& a, const InternalKey& b) const { - return Compare(a.Encode(), b.Encode()); -} - -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { - const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - result->sequence = num >> 8; - result->type = static_cast(c); - result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast(kTypeValue)); -} - -} - -#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/leveldb/db/dbformat_test.cc b/leveldb/db/dbformat_test.cc deleted file mode 100644 index 57c5578..0000000 --- a/leveldb/db/dbformat_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string IKey(const std::string& user_key, - uint64_t seq, - ValueType vt) { - std::string encoded; - AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); - return encoded; -} - -static std::string Shorten(const std::string& s, const std::string& l) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); - return result; -} - -static std::string ShortSuccessor(const std::string& s) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); - return result; -} - -static void TestKey(const std::string& key, - uint64_t seq, - ValueType vt) { - std::string encoded = IKey(key, seq, vt); - - Slice in(encoded); - ParsedInternalKey decoded("", 0, kTypeValue); - - ASSERT_TRUE(ParseInternalKey(in, &decoded)); - ASSERT_EQ(key, decoded.user_key.ToString()); - ASSERT_EQ(seq, decoded.sequence); - ASSERT_EQ(vt, decoded.type); - - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); -} - -class FormatTest { }; - -TEST(FormatTest, InternalKey_EncodeDecode) { - const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; - const uint64_t seq[] = { - 1, 2, 3, - (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, - (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, - (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 - }; - for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { - for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { - TestKey(keys[k], seq[s], kTypeValue); - TestKey("hello", 1, kTypeDeletion); - } - } -} - -TEST(FormatTest, InternalKeyShortSeparator) { - // When user keys are same - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); - - // When user keys are misordered - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); - - // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); - - // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); - - // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); -} - -TEST(FormatTest, InternalKeyShortestSuccessor) { - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - ShortSuccessor(IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), - ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/filename.cc b/leveldb/db/filename.cc deleted file mode 100644 index b3a917c..0000000 --- a/leveldb/db/filename.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "util/logging.h" - -namespace leveldb { - -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast(number), - suffix); - return name + buf; -} - -std::string LogFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "log"); -} - -std::string TableFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "sst"); -} - -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", - static_cast(number)); - return dbname + buf; -} - -std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; -} - -std::string LockFileName(const std::string& dbname) { - return dbname + "/LOCK"; -} - -std::string TempFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - return MakeFileName(dbname, number, "dbtmp"); -} - -std::string InfoLogFileName(const std::string& dbname) { - return dbname + "/LOG"; -} - -// Return the name of the old info log file for "dbname". -std::string OldInfoLogFileName(const std::string& dbname) { - return dbname + "/LOG.old"; -} - - -// Owned filenames have the form: -// dbname/CURRENT -// dbname/LOCK -// dbname/LOG -// dbname/LOG.old -// dbname/MANIFEST-[0-9]+ -// dbname/[0-9]+.(log|sst) -bool ParseFileName(const std::string& fname, - uint64_t* number, - FileType* type) { - Slice rest(fname); - if (rest == "CURRENT") { - *number = 0; - *type = kCurrentFile; - } else if (rest == "LOCK") { - *number = 0; - *type = kDBLockFile; - } else if (rest == "LOG" || rest == "LOG.old") { - *number = 0; - *type = kInfoLogFile; - } else if (rest.starts_with("MANIFEST-")) { - rest.remove_prefix(strlen("MANIFEST-")); - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - if (!rest.empty()) { - return false; - } - *type = kDescriptorFile; - *number = num; - } else { - // Avoid strtoull() to keep filename format independent of the - // current locale - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - Slice suffix = rest; - if (suffix == Slice(".log")) { - *type = kLogFile; - } else if (suffix == Slice(".sst")) { - *type = kTableFile; - } else if (suffix == Slice(".dbtmp")) { - *type = kTempFile; - } else { - return false; - } - *number = num; - } - return true; -} - -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number) { - // Remove leading "dbname/" and add newline to manifest file name - std::string manifest = DescriptorFileName(dbname, descriptor_number); - Slice contents = manifest; - assert(contents.starts_with(dbname + "/")); - contents.remove_prefix(dbname.size() + 1); - std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); - if (s.ok()) { - s = env->RenameFile(tmp, CurrentFileName(dbname)); - } - if (!s.ok()) { - env->DeleteFile(tmp); - } - return s; -} - -} diff --git a/leveldb/db/filename.h b/leveldb/db/filename.h deleted file mode 100644 index 6a99744..0000000 --- a/leveldb/db/filename.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// File names used by DB code - -#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ -#define STORAGE_LEVELDB_DB_FILENAME_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one -}; - -// Return the name of the log file with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string LogFileName(const std::string& dbname, uint64_t number); - -// Return the name of the sstable with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string TableFileName(const std::string& dbname, uint64_t number); - -// Return the name of the descriptor file for the db named by -// "dbname" and the specified incarnation number. The result will be -// prefixed with "dbname". -extern std::string DescriptorFileName(const std::string& dbname, - uint64_t number); - -// Return the name of the current file. This file contains the name -// of the current manifest file. The result will be prefixed with -// "dbname". -extern std::string CurrentFileName(const std::string& dbname); - -// Return the name of the lock file for the db named by -// "dbname". The result will be prefixed with "dbname". -extern std::string LockFileName(const std::string& dbname); - -// Return the name of a temporary file owned by the db named "dbname". -// The result will be prefixed with "dbname". -extern std::string TempFileName(const std::string& dbname, uint64_t number); - -// Return the name of the info log file for "dbname". -extern std::string InfoLogFileName(const std::string& dbname); - -// Return the name of the old info log file for "dbname". -extern std::string OldInfoLogFileName(const std::string& dbname); - -// If filename is a leveldb file, store the type of the file in *type. -// The number encoded in the filename is stored in *number. If the -// filename was successfully parsed, returns true. Else return false. -extern bool ParseFileName(const std::string& filename, - uint64_t* number, - FileType* type); - -// Make the CURRENT file point to the descriptor file with the -// specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number); - - -} - -#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/leveldb/db/filename_test.cc b/leveldb/db/filename_test.cc deleted file mode 100644 index 2f61e8d..0000000 --- a/leveldb/db/filename_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/filename.h" - -#include "db/dbformat.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -class FileNameTest { }; - -TEST(FileNameTest, Parse) { - Slice db; - FileType type; - uint64_t number; - - // Successful parses - static struct { - const char* fname; - uint64_t number; - FileType type; - } cases[] = { - { "100.log", 100, kLogFile }, - { "0.log", 0, kLogFile }, - { "0.sst", 0, kTableFile }, - { "CURRENT", 0, kCurrentFile }, - { "LOCK", 0, kDBLockFile }, - { "MANIFEST-2", 2, kDescriptorFile }, - { "MANIFEST-7", 7, kDescriptorFile }, - { "LOG", 0, kInfoLogFile }, - { "LOG.old", 0, kInfoLogFile }, - { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, - }; - for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { - std::string f = cases[i].fname; - ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; - ASSERT_EQ(cases[i].type, type) << f; - ASSERT_EQ(cases[i].number, number) << f; - } - - // Errors - static const char* errors[] = { - "", - "foo", - "foo-dx-100.log", - ".log", - "", - "manifest", - "CURREN", - "CURRENTX", - "MANIFES", - "MANIFEST", - "MANIFEST-", - "XMANIFEST-3", - "MANIFEST-3x", - "LOC", - "LOCKx", - "LO", - "LOGx", - "18446744073709551616.log", - "184467440737095516150.log", - "100", - "100.", - "100.lop" - }; - for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { - std::string f = errors[i]; - ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; - }; -} - -TEST(FileNameTest, Construction) { - uint64_t number; - FileType type; - std::string fname; - - fname = CurrentFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kCurrentFile, type); - - fname = LockFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kDBLockFile, type); - - fname = LogFileName("foo", 192); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(192, number); - ASSERT_EQ(kLogFile, type); - - fname = TableFileName("bar", 200); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(200, number); - ASSERT_EQ(kTableFile, type); - - fname = DescriptorFileName("bar", 100); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(100, number); - ASSERT_EQ(kDescriptorFile, type); - - fname = TempFileName("tmp", 999); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(999, number); - ASSERT_EQ(kTempFile, type); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/log_format.h b/leveldb/db/log_format.h deleted file mode 100644 index 137cd4a..0000000 --- a/leveldb/db/log_format.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Log format information shared by reader and writer. -// See ../doc/log_format.txt for more detail. - -#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ -#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ - -namespace leveldb { -namespace log { - -enum RecordType { - // Zero is reserved for preallocated files - kZeroType = 0, - - kFullType = 1, - - // For fragments - kFirstType = 2, - kMiddleType = 3, - kLastType = 4, -}; -static const int kMaxRecordType = kLastType; - -static const int kBlockSize = 32768; - -// Header is checksum (4 bytes), type (1 byte), length (2 bytes). -static const int kHeaderSize = 4 + 1 + 2; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/leveldb/db/log_reader.cc b/leveldb/db/log_reader.cc deleted file mode 100644 index 75e1d28..0000000 --- a/leveldb/db/log_reader.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_reader.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Reader::Reporter::~Reporter() { -} - -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) - : file_(file), - reporter_(reporter), - checksum_(checksum), - backing_store_(new char[kBlockSize]), - buffer_(), - eof_(false) { -} - -Reader::~Reader() { - delete[] backing_store_; -} - -bool Reader::ReadRecord(Slice* record, std::string* scratch) { - scratch->clear(); - record->clear(); - bool in_fragmented_record = false; - - Slice fragment; - while (true) { - switch (ReadPhysicalRecord(&fragment)) { - case kFullType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->clear(); - *record = fragment; - return true; - - case kFirstType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->assign(fragment.data(), fragment.size()); - in_fragmented_record = true; - break; - - case kMiddleType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - } - break; - - case kLastType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - *record = Slice(*scratch); - return true; - } - break; - - case kEof: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - scratch->clear(); - } - return false; - - case kBadRecord: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "error in middle of record"); - in_fragmented_record = false; - scratch->clear(); - } - break; - - default: - ReportDrop( - (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), - "unknown record type"); - in_fragmented_record = false; - scratch->clear(); - break; - } - } - return false; -} - -void Reader::ReportDrop(size_t bytes, const char* reason) { - if (reporter_ != NULL) { - reporter_->Corruption(bytes, Status::Corruption(reason)); - } -} - -unsigned int Reader::ReadPhysicalRecord(Slice* result) { - while (true) { - if (buffer_.size() < kHeaderSize) { - if (!eof_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - if (!status.ok()) { - if (reporter_ != NULL) { - reporter_->Corruption(kBlockSize, status); - } - buffer_.clear(); - eof_ = true; - return kEof; - } else if (buffer_.size() < kBlockSize) { - eof_ = true; - } - continue; - } else if (buffer_.size() == 0) { - // End of file - return kEof; - } else { - ReportDrop(buffer_.size(), "truncated record at end of file"); - buffer_.clear(); - return kEof; - } - } - - // Parse the header - const char* header = buffer_.data(); - const uint32_t a = static_cast(header[4]) & 0xff; - const uint32_t b = static_cast(header[5]) & 0xff; - const unsigned int type = header[6]; - const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { - ReportDrop(buffer_.size(), "bad record length"); - buffer_.clear(); - return kBadRecord; - } - - // Check crc - if (checksum_) { - if (type == kZeroType && length == 0) { - // Skip zero length record without reporting any drops since - // such records are produced by the mmap based writing code in - // env_posix.cc that preallocates file regions. - buffer_.clear(); - return kBadRecord; - } - - uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); - if (actual_crc != expected_crc) { - // Drop the rest of the buffer since "length" itself may have - // been corrupted and if we trust it, we could find some - // fragment of a real log record that just happens to look - // like a valid log record. - ReportDrop(buffer_.size(), "checksum mismatch"); - buffer_.clear(); - return kBadRecord; - } - } - - buffer_.remove_prefix(kHeaderSize + length); - *result = Slice(header + kHeaderSize, length); - return type; - } -} - -} -} diff --git a/leveldb/db/log_reader.h b/leveldb/db/log_reader.h deleted file mode 100644 index baf1475..0000000 --- a/leveldb/db/log_reader.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ -#define STORAGE_LEVELDB_DB_LOG_READER_H_ - -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class SequentialFile; - -namespace log { - -class Reader { - public: - // Interface for reporting errors. - class Reporter { - public: - virtual ~Reporter(); - - // Some corruption was detected. "size" is the approximate number - // of bytes dropped due to the corruption. - virtual void Corruption(size_t bytes, const Status& status) = 0; - }; - - // Create a reader that will return log records from "*file". - // "*file" must remain live while this Reader is in use. - // - // If "reporter" is non-NULL, it is notified whenever some data is - // dropped due to a detected corruption. "*reporter" must remain - // live while this Reader is in use. - // - // If "checksum" is true, verify checksums if available. - Reader(SequentialFile* file, Reporter* reporter, bool checksum); - - ~Reader(); - - // Read the next record into *record. Returns true if read - // successfully, false if we hit end of the input. May use - // "*scratch" as temporary storage. The contents filled in *record - // will only be valid until the next mutating operation on this - // reader or the next mutation to *scratch. - bool ReadRecord(Slice* record, std::string* scratch); - - private: - SequentialFile* const file_; - Reporter* const reporter_; - bool const checksum_; - char* const backing_store_; - Slice buffer_; - bool eof_; // Last Read() indicated EOF by returning < kBlockSize - - // Extend record types with the following special values - enum { - kEof = kMaxRecordType + 1, - kBadRecord = kMaxRecordType + 2 - }; - - // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result); - void ReportDrop(size_t bytes, const char* reason); - - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/leveldb/db/log_test.cc b/leveldb/db/log_test.cc deleted file mode 100644 index 025a5ff..0000000 --- a/leveldb/db/log_test.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { -namespace log { - -// Construct a string of the specified length made out of the supplied -// partial string. -static std::string BigString(const std::string& partial_string, size_t n) { - std::string result; - while (result.size() < n) { - result.append(partial_string); - } - result.resize(n); - return result; -} - -// Construct a string from a number -static std::string NumberString(int n) { - char buf[50]; - snprintf(buf, sizeof(buf), "%d.", n); - return std::string(buf); -} - -// Return a skewed potentially long string -static std::string RandomSkewedString(int i, Random* rnd) { - return BigString(NumberString(i), rnd->Skewed(17)); -} - -class LogTest { - private: - class StringDest : public WritableFile { - public: - std::string contents_; - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - virtual Status Append(const Slice& slice) { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public SequentialFile { - public: - Slice contents_; - bool force_error_; - bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) { } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - ASSERT_EQ(kBlockSize, n); - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return Status::Corruption("read error"); - } - - if (contents_.size() < n) { - n = contents_.size(); - returned_partial_ = true; - } - *result = Slice(contents_.data(), n); - contents_.remove_prefix(n); - return Status::OK(); - } - }; - - class ReportCollector : public Reader::Reporter { - public: - size_t dropped_bytes_; - std::string message_; - - ReportCollector() : dropped_bytes_(0) { } - virtual void Corruption(size_t bytes, const Status& status) { - dropped_bytes_ += bytes; - message_.append(status.ToString()); - } - }; - - StringDest dest_; - StringSource source_; - ReportCollector report_; - bool reading_; - Writer writer_; - Reader reader_; - - public: - LogTest() : reading_(false), - writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/) { - } - - void Write(const std::string& msg) { - ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_.AddRecord(Slice(msg)); - } - - size_t WrittenBytes() const { - return dest_.contents_.size(); - } - - std::string Read() { - if (!reading_) { - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - } - std::string scratch; - Slice record; - if (reader_.ReadRecord(&record, &scratch)) { - return record.ToString(); - } else { - return "EOF"; - } - } - - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } - - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } - - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } - - void FixChecksum(int header_offset, int len) { - // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); - crc = crc32c::Mask(crc); - EncodeFixed32(&dest_.contents_[header_offset], crc); - } - - void ForceError() { - source_.force_error_ = true; - } - - size_t DroppedBytes() const { - return report_.dropped_bytes_; - } - - // Returns OK iff recorded error message contains "msg" - std::string MatchError(const std::string& msg) const { - if (report_.message_.find(msg) == std::string::npos) { - return report_.message_; - } else { - return "OK"; - } - } -}; - -TEST(LogTest, Empty) { - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ReadWrite) { - Write("foo"); - Write("bar"); - Write(""); - Write("xxxx"); - ASSERT_EQ("foo", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("xxxx", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("EOF", Read()); // Make sure reads at eof work -} - -TEST(LogTest, ManyBlocks) { - for (int i = 0; i < 100000; i++) { - Write(NumberString(i)); - } - for (int i = 0; i < 100000; i++) { - ASSERT_EQ(NumberString(i), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, Fragmentation) { - Write("small"); - Write(BigString("medium", 50000)); - Write(BigString("large", 100000)); - ASSERT_EQ("small", Read()); - ASSERT_EQ(BigString("medium", 50000), Read()); - ASSERT_EQ(BigString("large", 100000), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, MarginalTrailer) { - // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, RandomRead) { - const int N = 500; - Random write_rnd(301); - for (int i = 0; i < N; i++) { - Write(RandomSkewedString(i, &write_rnd)); - } - Random read_rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -// Tests of all the error paths in log_reader.cc follow: - -TEST(LogTest, ReadError) { - Write("foo"); - ForceError(); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("read error")); -} - -TEST(LogTest, BadRecordType) { - Write("foo"); - // Type is stored in header[6] - IncrementByte(6, 100); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("unknown record type")); -} - -TEST(LogTest, TruncatedTrailingRecord) { - Write("foo"); - ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); - ASSERT_EQ("OK", MatchError("truncated record at end of file")); -} - -TEST(LogTest, BadLength) { - Write("foo"); - ShrinkSize(1); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); -} - -TEST(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10, DroppedBytes()); - ASSERT_EQ("OK", MatchError("checksum mismatch")); -} - -TEST(LogTest, UnexpectedMiddleType) { - Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedLastType) { - Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedFullType) { - Write("foo"); - Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, UnexpectedFirstType) { - Write("foo"); - Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ(BigString("bar", 100000), Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, ErrorJoinsRecords) { - // Consider two fragmented records: - // first(R1) last(R1) first(R2) last(R2) - // where the middle two fragments disappear. We do not want - // first(R1),last(R2) to get joined and returned as a valid record. - - // Write records that span two blocks - Write(BigString("foo", kBlockSize)); - Write(BigString("bar", kBlockSize)); - Write("correct"); - - // Wipe the middle block - for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { - SetByte(offset, 'x'); - } - - ASSERT_EQ("correct", Read()); - ASSERT_EQ("EOF", Read()); - const int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/log_writer.cc b/leveldb/db/log_writer.cc deleted file mode 100644 index 1696851..0000000 --- a/leveldb/db/log_writer.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_writer.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Writer::Writer(WritableFile* dest) - : dest_(dest), - block_offset_(0) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc_[i] = crc32c::Value(&t, 1); - } -} - -Writer::~Writer() { -} - -Status Writer::AddRecord(const Slice& slice) { - const char* ptr = slice.data(); - size_t left = slice.size(); - - // Fragment the record if necessary and emit it. Note that if slice - // is empty, we still want to iterate once to emit a single - // zero-length record - Status s; - do { - const int leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < kHeaderSize) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); - } - block_offset_ = 0; - } - - // Invariant: we never leave < kHeaderSize bytes in a block. - assert(kBlockSize - block_offset_ - kHeaderSize >= 0); - - const size_t avail = kBlockSize - block_offset_ - kHeaderSize; - const size_t fragment_length = (left < avail) ? left : avail; - - RecordType type; - const bool begin = (ptr == slice.data()); - const bool end = (left == fragment_length); - if (begin && end) { - type = kFullType; - } else if (begin) { - type = kFirstType; - } else if (end) { - type = kLastType; - } else { - type = kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length); - ptr += fragment_length; - left -= fragment_length; - } while (s.ok() && left > 0); - return s; -} - -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { - assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); - - // Format the header - char buf[kHeaderSize]; - buf[4] = static_cast(n & 0xff); - buf[5] = static_cast(n >> 8); - buf[6] = static_cast(t); - - // Compute the crc of the record type and the payload. - uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage - EncodeFixed32(buf, crc); - - // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); - if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - s = dest_->Flush(); - } - } - block_offset_ += kHeaderSize + n; - return s; -} - -} -} diff --git a/leveldb/db/log_writer.h b/leveldb/db/log_writer.h deleted file mode 100644 index d3cf27d..0000000 --- a/leveldb/db/log_writer.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ -#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ - -#include -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class WritableFile; - -namespace log { - -class Writer { - public: - // Create a writer that will append data to "*dest". - // "*dest" must be initially empty. - // "*dest" must remain live while this Writer is in use. - explicit Writer(WritableFile* dest); - ~Writer(); - - Status AddRecord(const Slice& slice); - - private: - WritableFile* dest_; - int block_offset_; // Current offset in block - - // crc32c values for all supported record types. These are - // pre-computed to reduce the overhead of computing the crc of the - // record type stored in the header. - uint32_t type_crc_[kMaxRecordType + 1]; - - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/leveldb/db/memtable.cc b/leveldb/db/memtable.cc deleted file mode 100644 index a3b618a..0000000 --- a/leveldb/db/memtable.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/memtable.h" -#include "db/dbformat.h" -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "util/coding.h" - -namespace leveldb { - -static Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -MemTable::MemTable(const InternalKeyComparator& cmp) - : comparator_(cmp), - table_(comparator_, &arena_) { -} - -MemTable::~MemTable() { -} - -size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } - -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) - const { - // Internal keys are encoded as length-prefixed strings. - Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); -} - -// Encode a suitable internal key target for "target" and return it. -// Uses *scratch as scratch space, and the returned pointer will point -// into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { - scratch->clear(); - PutVarint32(scratch, target.size()); - scratch->append(target.data(), target.size()); - return scratch->data(); -} - -class MemTableIterator: public Iterator { - public: - explicit MemTableIterator(MemTable::Table* table) { - iter_ = new MemTable::Table::Iterator(table); - } - virtual ~MemTableIterator() { delete iter_; } - - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } - virtual Slice value() const { - Slice key_slice = GetLengthPrefixedSlice(iter_->key()); - return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); - } - - virtual Status status() const { return Status::OK(); } - - private: - MemTable::Table::Iterator* iter_; - std::string tmp_; // For passing to EncodeKey - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); -}; - -Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); -} - -void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, - const Slice& value) { - // Format of an entry is concatenation of: - // key_size : varint32 of internal_key.size() - // key bytes : char[internal_key.size()] - // value_size : varint32 of value.size() - // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; - char* buf = arena_.Allocate(encoded_len); - char* p = EncodeVarint32(buf, internal_key_size); - memcpy(p, key.data(), key_size); - p += key_size; - EncodeFixed64(p, (s << 8) | type); - p += 8; - p = EncodeVarint32(p, val_size); - memcpy(p, value.data(), val_size); - assert((p + val_size) - buf == encoded_len); - table_.Insert(buf); -} - -} diff --git a/leveldb/db/memtable.h b/leveldb/db/memtable.h deleted file mode 100644 index 45b3342..0000000 --- a/leveldb/db/memtable.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_ -#define STORAGE_LEVELDB_DB_MEMTABLE_H_ - -#include -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/skiplist.h" -#include "util/arena.h" - -namespace leveldb { - -class InternalKeyComparator; -class Mutex; -class MemTableIterator; - -class MemTable { - public: - explicit MemTable(const InternalKeyComparator& comparator); - ~MemTable(); - - // Returns an estimate of the number of bytes of data in use by this - // data structure. - // - // REQUIRES: external synchronization to prevent simultaneous - // operations on the same MemTable. - size_t ApproximateMemoryUsage(); - - // Return an iterator that yields the contents of the memtable. - // - // The caller must ensure that the underlying MemTable remains live - // while the returned iterator is live. The keys returned by this - // iterator are internal keys encoded by AppendInternalKey in the - // db/format.{h,cc} module. - Iterator* NewIterator(); - - // Add an entry into memtable that maps key to value at the - // specified sequence number and with the specified type. - // Typically value will be empty if type==kTypeDeletion. - void Add(SequenceNumber seq, ValueType type, - const Slice& key, - const Slice& value); - - private: - struct KeyComparator { - const InternalKeyComparator comparator; - explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } - int operator()(const char* a, const char* b) const; - }; - friend class MemTableIterator; - friend class MemTableBackwardIterator; - - typedef SkipList Table; - - KeyComparator comparator_; - Arena arena_; - Table table_; - - // No copying allowed - MemTable(const MemTable&); - void operator=(const MemTable&); -}; - -} - -#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_ diff --git a/leveldb/db/repair.cc b/leveldb/db/repair.cc deleted file mode 100644 index c8e7b9e..0000000 --- a/leveldb/db/repair.cc +++ /dev/null @@ -1,380 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// We recover the contents of the descriptor from the other files we find. -// (1) Any log files are first converted to tables -// (2) We scan every table to compute -// (a) smallest/largest for the table -// (b) largest sequence number in the table -// (3) We generate descriptor contents: -// - log number is set to zero -// - next-file-number is set to 1 + largest file number we found -// - last-sequence-number is set to largest sequence# found across -// all tables (see 2c) -// - compaction pointers are cleared -// - every table file is added at level 0 -// -// Possible optimization 1: -// (a) Compute total size and use to pick appropriate max-level M -// (b) Sort tables by largest sequence# in the table -// (c) For each table: if it overlaps earlier table, place in level-0, -// else place in level-M. -// Possible optimization 2: -// Store per-table metadata (smallest, largest, largest-seq#, ...) -// in the table's meta section to speed up ScanTable. - -#include "db/builder.h" -#include "db/db_impl.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "db/write_batch_internal.h" -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/env.h" - -namespace leveldb { - -namespace { - -class Repairer { - public: - Repairer(const std::string& dbname, const Options& options) - : dbname_(dbname), - env_(options.env), - icmp_(options.comparator), - options_(SanitizeOptions(dbname, &icmp_, options)), - owns_info_log_(options_.info_log != options.info_log), - next_file_number_(1) { - // TableCache can be small since we expect each table to be opened once. - table_cache_ = new TableCache(dbname_, &options_, 10); - } - - ~Repairer() { - delete table_cache_; - if (owns_info_log_) { - delete options_.info_log; - } - } - - Status Run() { - Status status = FindFiles(); - if (status.ok()) { - ConvertLogFilesToTables(); - ExtractMetaData(); - status = WriteDescriptor(); - } - if (status.ok()) { - unsigned long long bytes = 0; - for (size_t i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; - } - Log(env_, options_.info_log, - "**** Repaired leveldb %s; " - "recovered %d files; %llu bytes. " - "Some data may have been lost. " - "****", - dbname_.c_str(), - static_cast(tables_.size()), - bytes); - } - return status; - } - - private: - struct TableInfo { - FileMetaData meta; - SequenceNumber max_sequence; - }; - - std::string const dbname_; - Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; - bool owns_info_log_; - TableCache* table_cache_; - VersionEdit edit_; - - std::vector manifests_; - std::vector table_numbers_; - std::vector logs_; - std::vector tables_; - uint64_t next_file_number_; - - Status FindFiles() { - std::vector filenames; - Status status = env_->GetChildren(dbname_, &filenames); - if (!status.ok()) { - return status; - } - if (filenames.empty()) { - return Status::IOError(dbname_, "repair found no files"); - } - - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - if (type == kDescriptorFile) { - manifests_.push_back(filenames[i]); - } else { - if (number + 1 > next_file_number_) { - next_file_number_ = number + 1; - } - if (type == kLogFile) { - logs_.push_back(number); - } else if (type == kTableFile) { - table_numbers_.push_back(number); - } else { - // Ignore other files - } - } - } - } - return status; - } - - void ConvertLogFilesToTables() { - for (size_t i = 0; i < logs_.size(); i++) { - std::string logname = LogFileName(dbname_, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); - if (!status.ok()) { - Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", - (unsigned long long) logs_[i], - status.ToString().c_str()); - } - ArchiveFile(logname); - } - } - - Status ConvertLogToTable(uint64_t log) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - WritableFile* info_log; - uint64_t lognum; - virtual void Corruption(size_t bytes, const Status& s) { - // We print error messages for corruption, but continue repairing. - Log(env, info_log, "Log #%llu: dropping %d bytes; %s", - (unsigned long long) lognum, - static_cast(bytes), - s.ToString().c_str()); - } - }; - - // Open the log file - std::string logname = LogFileName(dbname_, log); - SequentialFile* lfile; - Status status = env_->NewSequentialFile(logname, &lfile); - if (!status.ok()) { - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.lognum = log; - // We intentially make log::Reader do checksumming so that - // corruptions cause entire commits to be skipped instead of - // propagating bad information (like overly large sequence - // numbers). - log::Reader reader(lfile, &reporter, false/*do not checksum*/); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable mem(icmp_); - int counter = 0; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, &mem); - if (status.ok()) { - counter += WriteBatchInternal::Count(&batch); - } else { - Log(env_, options_.info_log, "Log #%llu: ignoring %s", - (unsigned long long) log, - status.ToString().c_str()); - status = Status::OK(); // Keep going with rest of file - } - } - delete lfile; - - // We ignore any version edits generated by the conversion to a Table - // since ExtractMetaData() will also generate edits. - VersionEdit skipped; - FileMetaData meta; - meta.number = next_file_number_++; - Iterator* iter = mem.NewIterator(); - status = BuildTable(dbname_, env_, options_, table_cache_, iter, - &meta, &skipped); - delete iter; - if (status.ok()) { - if (meta.file_size > 0) { - table_numbers_.push_back(meta.number); - } - } - Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", - (unsigned long long) log, - counter, - (unsigned long long) meta.number, - status.ToString().c_str()); - return status; - } - - void ExtractMetaData() { - std::vector kept; - for (size_t i = 0; i < table_numbers_.size(); i++) { - TableInfo t; - t.meta.number = table_numbers_[i]; - Status status = ScanTable(&t); - if (!status.ok()) { - std::string fname = TableFileName(dbname_, table_numbers_[i]); - Log(env_, options_.info_log, "Table #%llu: ignoring %s", - (unsigned long long) table_numbers_[i], - status.ToString().c_str()); - ArchiveFile(fname); - } else { - tables_.push_back(t); - } - } - } - - Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(dbname_, t->meta.number); - int counter = 0; - Status status = env_->GetFileSize(fname, &t->meta.file_size); - if (status.ok()) { - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), t->meta.number, t->meta.file_size); - bool empty = true; - ParsedInternalKey parsed; - t->max_sequence = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { - Log(env_, options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t->meta.number, - EscapeString(key).c_str()); - continue; - } - - counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } - } - if (!iter->status().ok()) { - status = iter->status(); - } - delete iter; - } - Log(env_, options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, - counter, - status.ToString().c_str()); - return status; - } - - Status WriteDescriptor() { - std::string tmp = TempFileName(dbname_, 1); - WritableFile* file; - Status status = env_->NewWritableFile(tmp, &file); - if (!status.ok()) { - return status; - } - - SequenceNumber max_sequence = 0; - for (size_t i = 0; i < tables_.size(); i++) { - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; - } - } - - edit_.SetComparatorName(icmp_.user_comparator()->Name()); - edit_.SetLogNumber(0); - edit_.SetNextFile(next_file_number_); - edit_.SetLastSequence(max_sequence); - - for (size_t i = 0; i < tables_.size(); i++) { - // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_.AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest); - } - - //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); - { - log::Writer log(file); - std::string record; - edit_.EncodeTo(&record); - status = log.AddRecord(record); - } - if (status.ok()) { - status = file->Close(); - } - delete file; - file = NULL; - - if (!status.ok()) { - env_->DeleteFile(tmp); - } else { - // Discard older manifests - for (size_t i = 0; i < manifests_.size(); i++) { - ArchiveFile(dbname_ + "/" + manifests_[i]); - } - - // Install new manifest - status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); - if (status.ok()) { - status = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(tmp); - } - } - return status; - } - - void ArchiveFile(const std::string& fname) { - // Move into another directory. E.g., for - // dir/foo - // rename to - // dir/lost/foo - const char* slash = strrchr(fname.c_str(), '/'); - std::string new_dir; - if (slash != NULL) { - new_dir.assign(fname.data(), slash - fname.data()); - } - new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error - std::string new_file = new_dir; - new_file.append("/"); - new_file.append((slash == NULL) ? fname.c_str() : slash + 1); - Status s = env_->RenameFile(fname, new_file); - Log(env_, options_.info_log, "Archiving %s: %s\n", - fname.c_str(), s.ToString().c_str()); - } -}; -} - -Status RepairDB(const std::string& dbname, const Options& options) { - Repairer repairer(dbname, options); - return repairer.Run(); -} - -} diff --git a/leveldb/db/skiplist.h b/leveldb/db/skiplist.h deleted file mode 100644 index be39354..0000000 --- a/leveldb/db/skiplist.h +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread safety -// ------------- -// -// Writes require external synchronization, most likely a mutex. -// Reads require a guarantee that the SkipList will not be destroyed -// while the read is in progress. Apart from that, reads progress -// without any internal locking or synchronization. -// -// Invariants: -// -// (1) Allocated nodes are never deleted until the SkipList is -// destroyed. This is trivially guaranteed by the code since we -// never delete any skip list nodes. -// -// (2) The contents of a Node except for the next/prev pointers are -// immutable after the Node has been linked into the SkipList. -// Only Insert() modifies the list, and it is careful to initialize -// a node and use release-stores to publish the nodes in one or -// more lists. -// -// ... prev vs. next pointer ordering ... - -#include -#include -#include "port/port.h" -#include "util/arena.h" -#include "util/random.h" - -namespace leveldb { - -class Arena; - -template -class SkipList { - private: - struct Node; - - public: - // Create a new SkipList object that will use "cmp" for comparing keys, - // and will allocate memory using "*arena". Objects allocated in the arena - // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena); - - // Insert key into the list. - // REQUIRES: nothing that compares equal to key is currently in the list. - void Insert(const Key& key); - - // Returns true iff an entry that compares equal to key is in the list. - bool Contains(const Key& key) const; - - // Iteration over the contents of a skip list - class Iterator { - public: - // Initialize an iterator over the specified list. - // The returned iterator is not valid. - explicit Iterator(const SkipList* list); - - // Returns true iff the iterator is positioned at a valid node. - bool Valid() const; - - // Returns the key at the current position. - // REQUIRES: Valid() - const Key& key() const; - - // Advances to the next position. - // REQUIRES: Valid() - void Next(); - - // Advances to the previous position. - // REQUIRES: Valid() - void Prev(); - - // Advance to the first entry with a key >= target - void Seek(const Key& target); - - // Position at the first entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToFirst(); - - // Position at the last entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToLast(); - - private: - const SkipList* list_; - Node* node_; - // Intentionally copyable - }; - - private: - enum { kMaxHeight = 12 }; - - // Immutable after construction - Comparator const compare_; - Arena* const arena_; // Arena used for allocations of nodes - - Node* const head_; - - // Modified only by Insert(). Read racily by readers, but stale - // values are ok. - port::AtomicPointer max_height_; // Height of the entire list - - inline int GetMaxHeight() const { - return reinterpret_cast(max_height_.NoBarrier_Load()); - } - - // Read/written only by Insert(). - Random rnd_; - - Node* NewNode(const Key& key, int height); - int RandomHeight(); - bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } - - // Return true if key is greater than the data stored in "n" - bool KeyIsAfterNode(const Key& key, Node* n) const; - - // Return the earliest node that comes at or after key. - // Return NULL if there is no such node. - // - // If prev is non-NULL, fills prev[level] with pointer to previous - // node at "level" for every level in [0..max_height_-1]. - Node* FindGreaterOrEqual(const Key& key, Node** prev) const; - - // Return the latest node with a key < key. - // Return head_ if there is no such node. - Node* FindLessThan(const Key& key) const; - - // Return the last node in the list. - // Return head_ if list is empty. - Node* FindLast() const; - - // No copying allowed - SkipList(const SkipList&); - void operator=(const SkipList&); -}; - -// Implementation details follow -template -struct SkipList::Node { - explicit Node(const Key& k) : key(k) { } - - Key const key; - - // Accessors/mutators for links. Wrapped in methods so we can - // add the appropriate barriers as necessary. - Node* Next(int n) { - assert(n >= 0); - // Use an 'acquire load' so that we observe a fully initialized - // version of the returned Node. - return reinterpret_cast(next_[n].Acquire_Load()); - } - void SetNext(int n, Node* x) { - assert(n >= 0); - // Use a 'release store' so that anybody who reads through this - // pointer observes a fully initialized version of the inserted node. - next_[n].Release_Store(x); - } - - // No-barrier variants that can be safely used in a few locations. - Node* NoBarrier_Next(int n) { - assert(n >= 0); - return reinterpret_cast(next_[n].NoBarrier_Load()); - } - void NoBarrier_SetNext(int n, Node* x) { - assert(n >= 0); - next_[n].NoBarrier_Store(x); - } - - private: - // Array of length equal to the node height. next_[0] is lowest level link. - port::AtomicPointer next_[1]; -}; - -template -typename SkipList::Node* -SkipList::NewNode(const Key& key, int height) { - char* mem = arena_->AllocateAligned( - sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); - return new (mem) Node(key); -} - -template -inline SkipList::Iterator::Iterator(const SkipList* list) { - list_ = list; - node_ = NULL; -} - -template -inline bool SkipList::Iterator::Valid() const { - return node_ != NULL; -} - -template -inline const Key& SkipList::Iterator::key() const { - assert(Valid()); - return node_->key; -} - -template -inline void SkipList::Iterator::Next() { - assert(Valid()); - node_ = node_->Next(0); -} - -template -inline void SkipList::Iterator::Prev() { - // Instead of using explicit "prev" links, we just search for the - // last node that falls before key. - assert(Valid()); - node_ = list_->FindLessThan(node_->key); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -inline void SkipList::Iterator::Seek(const Key& target) { - node_ = list_->FindGreaterOrEqual(target, NULL); -} - -template -inline void SkipList::Iterator::SeekToFirst() { - node_ = list_->head_->Next(0); -} - -template -inline void SkipList::Iterator::SeekToLast() { - node_ = list_->FindLast(); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -int SkipList::RandomHeight() { - // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; - int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { - height++; - } - assert(height > 0); - assert(height <= kMaxHeight); - return height; -} - -template -bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { - // NULL n is considered infinite - return (n != NULL) && (compare_(n->key, key) < 0); -} - -template -typename SkipList::Node* SkipList::FindGreaterOrEqual(const Key& key, Node** prev) - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (KeyIsAfterNode(key, next)) { - // Keep searching in this list - x = next; - } else { - if (prev != NULL) prev[level] = x; - if (level == 0) { - return next; - } else { - // Switch to next list - level--; - } - } - } -} - -template -typename SkipList::Node* -SkipList::FindLessThan(const Key& key) const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - assert(x == head_ || compare_(x->key, key) < 0); - Node* next = x->Next(level); - if (next == NULL || compare_(next->key, key) >= 0) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template -typename SkipList::Node* SkipList::FindLast() - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (next == NULL) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), - arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), - max_height_(reinterpret_cast(1)), - rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { - head_->SetNext(i, NULL); - } -} - -template -void SkipList::Insert(const Key& key) { - // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() - // here since Insert() is externally synchronized. - Node* prev[kMaxHeight]; - Node* x = FindGreaterOrEqual(key, prev); - - // Our data structure does not allow duplicate insertion - assert(x == NULL || !Equal(key, x->key)); - - int height = RandomHeight(); - if (height > GetMaxHeight()) { - for (int i = GetMaxHeight(); i < height; i++) { - prev[i] = head_; - } - //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); - - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (NULL), or a new value set in - // the loop below. In the former case the reader will - // immediately drop to the next level since NULL sorts after all - // keys. In the latter case the reader will use the new node. - max_height_.NoBarrier_Store(reinterpret_cast(height)); - } - - x = NewNode(key, height); - for (int i = 0; i < height; i++) { - // NoBarrier_SetNext() suffices since we will add a barrier when - // we publish a pointer to "x" in prev[i]. - x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); - prev[i]->SetNext(i, x); - } -} - -template -bool SkipList::Contains(const Key& key) const { - Node* x = FindGreaterOrEqual(key, NULL); - if (x != NULL && Equal(key, x->key)) { - return true; - } else { - return false; - } -} - -} diff --git a/leveldb/db/skiplist_test.cc b/leveldb/db/skiplist_test.cc deleted file mode 100644 index 5f9ec0d..0000000 --- a/leveldb/db/skiplist_test.cc +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/skiplist.h" -#include -#include "leveldb/env.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -typedef uint64_t Key; - -struct Comparator { - int operator()(const Key& a, const Key& b) const { - if (a < b) { - return -1; - } else if (a > b) { - return +1; - } else { - return 0; - } - } -}; - -class SkipTest { }; - -TEST(SkipTest, Empty) { - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - ASSERT_TRUE(!list.Contains(10)); - - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToFirst(); - ASSERT_TRUE(!iter.Valid()); - iter.Seek(100); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToLast(); - ASSERT_TRUE(!iter.Valid()); -} - -TEST(SkipTest, InsertAndLookup) { - const int N = 2000; - const int R = 5000; - Random rnd(1000); - std::set keys; - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - for (int i = 0; i < N; i++) { - Key key = rnd.Next() % R; - if (keys.insert(key).second) { - list.Insert(key); - } - } - - for (int i = 0; i < R; i++) { - if (list.Contains(i)) { - ASSERT_EQ(keys.count(i), 1); - } else { - ASSERT_EQ(keys.count(i), 0); - } - } - - // Simple iterator tests - { - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - - iter.Seek(0); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToFirst(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToLast(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.rbegin()), iter.key()); - } - - // Forward iteration test - for (int i = 0; i < R; i++) { - SkipList::Iterator iter(&list); - iter.Seek(i); - - // Compare against model iterator - std::set::iterator model_iter = keys.lower_bound(i); - for (int j = 0; j < 3; j++) { - if (model_iter == keys.end()) { - ASSERT_TRUE(!iter.Valid()); - break; - } else { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - ++model_iter; - iter.Next(); - } - } - } - - // Backward iteration test - { - SkipList::Iterator iter(&list); - iter.SeekToLast(); - - // Compare against model iterator - for (std::set::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); - ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - iter.Prev(); - } - ASSERT_TRUE(!iter.Valid()); - } -} - -// We want to make sure that with a single writer and multiple -// concurrent readers (with no synchronization other than when a -// reader's iterator is created), the reader always observes all the -// data that was present in the skip list when the iterator was -// constructor. Because insertions are happening concurrently, we may -// also observe new values that were inserted since the iterator was -// constructed, but we should never miss any values that were present -// at iterator construction time. -// -// We generate multi-part keys: -// -// where: -// key is in range [0..K-1] -// gen is a generation number for key -// hash is hash(key,gen) -// -// The insertion code picks a random key, sets gen to be 1 + the last -// generation number inserted for that key, and sets hash to Hash(key,gen). -// -// At the beginning of a read, we snapshot the last inserted -// generation number for each key. We then iterate, including random -// calls to Next() and Seek(). For every key we encounter, we -// check that it is either expected given the initial snapshot or has -// been concurrently added since the iterator started. -class ConcurrentTest { - private: - static const uint32_t K = 4; - - static uint64_t key(Key key) { return (key >> 40); } - static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } - static uint64_t hash(Key key) { return key & 0xff; } - - static uint64_t HashNumbers(uint64_t k, uint64_t g) { - uint64_t data[2] = { k, g }; - return Hash(reinterpret_cast(data), sizeof(data), 0); - } - - static Key MakeKey(uint64_t k, uint64_t g) { - assert(sizeof(Key) == sizeof(uint64_t)); - assert(k <= K); // We sometimes pass K to seek to the end of the skiplist - assert(g <= 0xffffffffu); - return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); - } - - static bool IsValidKey(Key k) { - return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); - } - - static Key RandomTarget(Random* rnd) { - switch (rnd->Next() % 10) { - case 0: - // Seek to beginning - return MakeKey(0, 0); - case 1: - // Seek to end - return MakeKey(K, 0); - default: - // Seek to middle - return MakeKey(rnd->Next() % K, 0); - } - } - - // Per-key generation - struct State { - port::AtomicPointer generation[K]; - void Set(int k, intptr_t v) { - generation[k].Release_Store(reinterpret_cast(v)); - } - intptr_t Get(int k) { - return reinterpret_cast(generation[k].Acquire_Load()); - } - - State() { - for (int k = 0; k < K; k++) { - Set(k, 0); - } - } - }; - - // Current state of the test - State current_; - - Arena arena_; - - // SkipList is not protected by mu_. We just use a single writer - // thread to modify it. - SkipList list_; - - public: - ConcurrentTest() : list_(Comparator(), &arena_) { } - - // REQUIRES: External synchronization - void WriteStep(Random* rnd) { - const uint32_t k = rnd->Next() % K; - const intptr_t g = current_.Get(k) + 1; - const Key key = MakeKey(k, g); - list_.Insert(key); - current_.Set(k, g); - } - - void ReadStep(Random* rnd) { - // Remember the initial committed state of the skiplist. - State initial_state; - for (int k = 0; k < K; k++) { - initial_state.Set(k, current_.Get(k)); - } - - Key pos = RandomTarget(rnd); - SkipList::Iterator iter(&list_); - iter.Seek(pos); - while (true) { - Key current; - if (!iter.Valid()) { - current = MakeKey(K, 0); - } else { - current = iter.key(); - ASSERT_TRUE(IsValidKey(current)) << std::hex << current; - } - ASSERT_LE(pos, current) << "should not go backwards"; - - // Verify that everything in [pos,current) was not present in - // initial_state. - while (pos < current) { - ASSERT_LT(key(pos), K) << std::hex << pos; - - // Note that generation 0 is never inserted, so it is ok if - // <*,0,*> is missing. - ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); - - // Advance to next key in the valid key space - if (key(pos) < key(current)) { - pos = MakeKey(key(pos) + 1, 0); - } else { - pos = MakeKey(key(pos), gen(pos) + 1); - } - } - - if (!iter.Valid()) { - break; - } - - if (rnd->Next() % 2) { - iter.Next(); - pos = MakeKey(key(pos), gen(pos) + 1); - } else { - Key new_target = RandomTarget(rnd); - if (new_target > pos) { - pos = new_target; - iter.Seek(new_target); - } - } - } - } -}; -const uint32_t ConcurrentTest::K; - -// Simple test that does single-threaded testing of the ConcurrentTest -// scaffolding. -TEST(SkipTest, ConcurrentWithoutThreads) { - ConcurrentTest test; - Random rnd(test::RandomSeed()); - for (int i = 0; i < 10000; i++) { - test.ReadStep(&rnd); - test.WriteStep(&rnd); - } -} - -class TestState { - public: - ConcurrentTest t_; - int seed_; - port::AtomicPointer quit_flag_; - - enum ReaderState { - STARTING, - RUNNING, - DONE - }; - - explicit TestState(int s) - : seed_(s), - quit_flag_(NULL), - state_(STARTING), - state_cv_(&mu_) {} - - void Wait(ReaderState s) { - mu_.Lock(); - while (state_ != s) { - state_cv_.Wait(); - } - mu_.Unlock(); - } - - void Change(ReaderState s) { - mu_.Lock(); - state_ = s; - state_cv_.Signal(); - mu_.Unlock(); - } - - private: - port::Mutex mu_; - ReaderState state_; - port::CondVar state_cv_; -}; - -static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast(arg); - Random rnd(state->seed_); - int64_t reads = 0; - state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { - state->t_.ReadStep(&rnd); - ++reads; - } - state->Change(TestState::DONE); -} - -static void RunConcurrent(int run) { - const int seed = test::RandomSeed() + (run * 100); - Random rnd(seed); - const int N = 1000; - const int kSize = 1000; - for (int i = 0; i < N; i++) { - if ((i % 100) == 0) { - fprintf(stderr, "Run %d of %d\n", i, N); - } - TestState state(seed + 1); - Env::Default()->Schedule(ConcurrentReader, &state); - state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { - state.t_.WriteStep(&rnd); - } - state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do - state.Wait(TestState::DONE); - } -} - -TEST(SkipTest, Concurrent1) { RunConcurrent(1); } -TEST(SkipTest, Concurrent2) { RunConcurrent(2); } -TEST(SkipTest, Concurrent3) { RunConcurrent(3); } -TEST(SkipTest, Concurrent4) { RunConcurrent(4); } -TEST(SkipTest, Concurrent5) { RunConcurrent(5); } - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/snapshot.h b/leveldb/db/snapshot.h deleted file mode 100644 index 9a90756..0000000 --- a/leveldb/db/snapshot.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ -#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ - -#include "leveldb/db.h" - -namespace leveldb { - -class SnapshotList; - -// Snapshots are kept in a doubly-linked list in the DB. -// Each Snapshot corresponds to a particular sequence number. -class Snapshot { - public: - SequenceNumber number_; // const after creation - - private: - friend class SnapshotList; - - // Snapshot is kept in a doubly-linked circular list - Snapshot* prev_; - Snapshot* next_; - - SnapshotList* list_; // just for sanity checks -}; - -class SnapshotList { - public: - SnapshotList() { - list_.prev_ = &list_; - list_.next_ = &list_; - } - - bool empty() const { return list_.next_ == &list_; } - Snapshot* oldest() const { assert(!empty()); return list_.next_; } - Snapshot* newest() const { assert(!empty()); return list_.prev_; } - - const Snapshot* New(SequenceNumber seq) { - Snapshot* s = new Snapshot; - s->number_ = seq; - s->list_ = this; - s->next_ = &list_; - s->prev_ = list_.prev_; - s->prev_->next_ = s; - s->next_->prev_ = s; - return s; - } - - void Delete(const Snapshot* s) { - assert(s->list_ == this); - s->prev_->next_ = s->next_; - s->next_->prev_ = s->prev_; - delete s; - } - - private: - // Dummy head of doubly-linked list of snapshots - Snapshot list_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ diff --git a/leveldb/db/table_cache.cc b/leveldb/db/table_cache.cc deleted file mode 100644 index 325d707..0000000 --- a/leveldb/db/table_cache.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/table_cache.h" - -#include "db/filename.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/coding.h" - -namespace leveldb { - -struct TableAndFile { - RandomAccessFile* file; - Table* table; -}; - -static void DeleteEntry(const Slice& key, void* value) { - TableAndFile* tf = reinterpret_cast(value); - delete tf->table; - delete tf->file; - delete tf; -} - -static void UnrefEntry(void* arg1, void* arg2) { - Cache* cache = reinterpret_cast(arg1); - Cache::Handle* h = reinterpret_cast(arg2); - cache->Release(h); -} - -TableCache::TableCache(const std::string& dbname, - const Options* options, - int entries) - : env_(options->env), - dbname_(dbname), - options_(options), - cache_(NewLRUCache(entries)) { -} - -TableCache::~TableCache() { - delete cache_; -} - -Iterator* TableCache::NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr) { - if (tableptr != NULL) { - *tableptr = NULL; - } - - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); - Cache::Handle* handle = cache_->Lookup(key); - if (handle == NULL) { - std::string fname = TableFileName(dbname_, file_number); - RandomAccessFile* file = NULL; - Table* table = NULL; - Status s = env_->NewRandomAccessFile(fname, &file); - if (s.ok()) { - s = Table::Open(*options_, file, file_size, &table); - } - - if (!s.ok()) { - assert(table == NULL); - delete file; - // We do not cache error results so that if the error is transient, - // or somebody repairs the file, we recover automatically. - return NewErrorIterator(s); - } - - TableAndFile* tf = new TableAndFile; - tf->file = file; - tf->table = table; - handle = cache_->Insert(key, tf, 1, &DeleteEntry); - } - - Table* table = reinterpret_cast(cache_->Value(handle))->table; - Iterator* result = table->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); - if (tableptr != NULL) { - *tableptr = table; - } - return result; -} - -void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); -} - -} diff --git a/leveldb/db/table_cache.h b/leveldb/db/table_cache.h deleted file mode 100644 index 5376194..0000000 --- a/leveldb/db/table_cache.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread-safe (provides internal synchronization) - -#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ -#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ - -#include -#include -#include "db/dbformat.h" -#include "leveldb/cache.h" -#include "leveldb/table.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -class TableCache { - public: - TableCache(const std::string& dbname, const Options* options, int entries); - ~TableCache(); - - // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-NULL, also sets "*tableptr" to point to the Table object - // underlying the returned iterator, or NULL if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the - // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr = NULL); - - // Evict any entry for the specified file number - void Evict(uint64_t file_number); - - private: - Env* const env_; - const std::string dbname_; - const Options* options_; - Cache* cache_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/leveldb/db/version_edit.cc b/leveldb/db/version_edit.cc deleted file mode 100644 index 3941271..0000000 --- a/leveldb/db/version_edit.cc +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" - -#include "db/version_set.h" -#include "util/coding.h" - -namespace leveldb { - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. -enum Tag { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9, -}; - -void VersionEdit::Clear() { - comparator_.clear(); - log_number_ = 0; - prev_log_number_ = 0; - last_sequence_ = 0; - next_file_number_ = 0; - has_comparator_ = false; - has_log_number_ = false; - has_prev_log_number_ = false; - has_next_file_number_ = false; - has_last_sequence_ = false; - deleted_files_.clear(); - new_files_.clear(); -} - -void VersionEdit::EncodeTo(std::string* dst) const { - if (has_comparator_) { - PutVarint32(dst, kComparator); - PutLengthPrefixedSlice(dst, comparator_); - } - if (has_log_number_) { - PutVarint32(dst, kLogNumber); - PutVarint64(dst, log_number_); - } - if (has_prev_log_number_) { - PutVarint32(dst, kPrevLogNumber); - PutVarint64(dst, prev_log_number_); - } - if (has_next_file_number_) { - PutVarint32(dst, kNextFileNumber); - PutVarint64(dst, next_file_number_); - } - if (has_last_sequence_) { - PutVarint32(dst, kLastSequence); - PutVarint64(dst, last_sequence_); - } - - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number - } - - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - PutVarint32(dst, kNewFile); - PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.number); - PutVarint64(dst, f.file_size); - PutLengthPrefixedSlice(dst, f.smallest.Encode()); - PutLengthPrefixedSlice(dst, f.largest.Encode()); - } -} - -static bool GetInternalKey(Slice* input, InternalKey* dst) { - Slice str; - if (GetLengthPrefixedSlice(input, &str)) { - dst->DecodeFrom(str); - return true; - } else { - return false; - } -} - -static bool GetLevel(Slice* input, int* level) { - uint32_t v; - if (GetVarint32(input, &v) && - v < config::kNumLevels) { - *level = v; - return true; - } else { - return false; - } -} - -Status VersionEdit::DecodeFrom(const Slice& src) { - Clear(); - Slice input = src; - const char* msg = NULL; - uint32_t tag; - - // Temporary storage for parsing - int level; - uint64_t number; - FileMetaData f; - Slice str; - InternalKey key; - - while (msg == NULL && GetVarint32(&input, &tag)) { - switch (tag) { - case kComparator: - if (GetLengthPrefixedSlice(&input, &str)) { - comparator_ = str.ToString(); - has_comparator_ = true; - } else { - msg = "comparator name"; - } - break; - - case kLogNumber: - if (GetVarint64(&input, &log_number_)) { - has_log_number_ = true; - } else { - msg = "log number"; - } - break; - - case kPrevLogNumber: - if (GetVarint64(&input, &prev_log_number_)) { - has_prev_log_number_ = true; - } else { - msg = "previous log number"; - } - break; - - case kNextFileNumber: - if (GetVarint64(&input, &next_file_number_)) { - has_next_file_number_ = true; - } else { - msg = "next file number"; - } - break; - - case kLastSequence: - if (GetVarint64(&input, &last_sequence_)) { - has_last_sequence_ = true; - } else { - msg = "last sequence number"; - } - break; - - case kCompactPointer: - if (GetLevel(&input, &level) && - GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); - } else { - msg = "compaction pointer"; - } - break; - - case kDeletedFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &number)) { - deleted_files_.insert(std::make_pair(level, number)); - } else { - msg = "deleted file"; - } - break; - - case kNewFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && - GetInternalKey(&input, &f.smallest) && - GetInternalKey(&input, &f.largest)) { - new_files_.push_back(std::make_pair(level, f)); - } else { - msg = "new-file entry"; - } - break; - - default: - msg = "unknown tag"; - break; - } - } - - if (msg == NULL && !input.empty()) { - msg = "invalid tag"; - } - - Status result; - if (msg != NULL) { - result = Status::Corruption("VersionEdit", msg); - } - return result; -} - -std::string VersionEdit::DebugString() const { - std::string r; - r.append("VersionEdit {"); - if (has_comparator_) { - r.append("\n Comparator: "); - r.append(comparator_); - } - if (has_log_number_) { - r.append("\n LogNumber: "); - AppendNumberTo(&r, log_number_); - } - if (has_prev_log_number_) { - r.append("\n PrevLogNumber: "); - AppendNumberTo(&r, prev_log_number_); - } - if (has_next_file_number_) { - r.append("\n NextFile: "); - AppendNumberTo(&r, next_file_number_); - } - if (has_last_sequence_) { - r.append("\n LastSeq: "); - AppendNumberTo(&r, last_sequence_); - } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" '"); - AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); - r.append("'"); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - r.append("\n DeleteFile: "); - AppendNumberTo(&r, iter->first); - r.append(" "); - AppendNumberTo(&r, iter->second); - } - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - r.append("\n AddFile: "); - AppendNumberTo(&r, new_files_[i].first); - r.append(" "); - AppendNumberTo(&r, f.number); - r.append(" "); - AppendNumberTo(&r, f.file_size); - r.append(" '"); - AppendEscapedStringTo(&r, f.smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, f.largest.Encode()); - r.append("'"); - } - r.append("\n}\n"); - return r; -} - -} diff --git a/leveldb/db/version_edit.h b/leveldb/db/version_edit.h deleted file mode 100644 index ab874da..0000000 --- a/leveldb/db/version_edit.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ -#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ - -#include -#include -#include -#include "db/dbformat.h" - -namespace leveldb { - -class VersionSet; - -struct FileMetaData { - int refs; - uint64_t number; - uint64_t file_size; // File size in bytes - InternalKey smallest; // Smallest internal key served by table - InternalKey largest; // Largest internal key served by table - - FileMetaData() : refs(0), file_size(0) { } -}; - -class VersionEdit { - public: - VersionEdit() { Clear(); } - ~VersionEdit() { } - - void Clear(); - - void SetComparatorName(const Slice& name) { - has_comparator_ = true; - comparator_ = name.ToString(); - } - void SetLogNumber(uint64_t num) { - has_log_number_ = true; - log_number_ = num; - } - void SetPrevLogNumber(uint64_t num) { - has_prev_log_number_ = true; - prev_log_number_ = num; - } - void SetNextFile(uint64_t num) { - has_next_file_number_ = true; - next_file_number_ = num; - } - void SetLastSequence(SequenceNumber seq) { - has_last_sequence_ = true; - last_sequence_ = seq; - } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } - - // Add the specified file at the specified number. - // REQUIRES: This version has not been saved (see VersionSet::SaveTo) - // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, - uint64_t file_size, - const InternalKey& smallest, - const InternalKey& largest) { - FileMetaData f; - f.number = file; - f.file_size = file_size; - f.smallest = smallest; - f.largest = largest; - new_files_.push_back(std::make_pair(level, f)); - } - - // Delete the specified "file" from the specified "level". - void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(const Slice& src); - - std::string DebugString() const; - - private: - friend class VersionSet; - - typedef std::set< std::pair > DeletedFileSet; - - std::string comparator_; - uint64_t log_number_; - uint64_t prev_log_number_; - uint64_t next_file_number_; - SequenceNumber last_sequence_; - bool has_comparator_; - bool has_log_number_; - bool has_prev_log_number_; - bool has_next_file_number_; - bool has_last_sequence_; - - std::vector< std::pair > compact_pointers_; - DeletedFileSet deleted_files_; - std::vector< std::pair > new_files_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/leveldb/db/version_edit_test.cc b/leveldb/db/version_edit_test.cc deleted file mode 100644 index 67959f7..0000000 --- a/leveldb/db/version_edit_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" -#include "util/testharness.h" - -namespace leveldb { - -static void TestEncodeDecode(const VersionEdit& edit) { - std::string encoded, encoded2; - edit.EncodeTo(&encoded); - VersionEdit parsed; - Status s = parsed.DecodeFrom(encoded); - ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); - ASSERT_EQ(encoded, encoded2); -} - -class VersionEditTest { }; - -TEST(VersionEditTest, EncodeDecode) { - static const uint64_t kBig = 1ull << 50; - - VersionEdit edit; - for (int i = 0; i < 4; i++) { - TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeValue), - InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); - edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); - } - - edit.SetComparatorName("foo"); - edit.SetLogNumber(kBig + 100); - edit.SetNextFile(kBig + 200); - edit.SetLastSequence(kBig + 1000); - TestEncodeDecode(edit); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/version_set.cc b/leveldb/db/version_set.cc deleted file mode 100644 index c439f49..0000000 --- a/leveldb/db/version_set.cc +++ /dev/null @@ -1,1027 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_set.h" - -#include -#include -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "leveldb/env.h" -#include "leveldb/table_builder.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -static const int kTargetFileSize = 2 * 1048576; - -// Maximum bytes of overlaps in grandparent (i.e., level+2) before we -// stop building a single file in a level->level+1 compaction. -static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; - -static double MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - double result = 10 * 1048576.0; // Result for both level-0 and level-1 - while (level > 1) { - result *= 10; - level--; - } - return result; -} - -static uint64_t MaxFileSizeForLevel(int level) { - return kTargetFileSize; // We could vary per level to reduce number of files? -} - -namespace { -std::string IntSetToString(const std::set& s) { - std::string result = "{"; - for (std::set::const_iterator it = s.begin(); - it != s.end(); - ++it) { - result += (result.size() > 1) ? "," : ""; - result += NumberToString(*it); - } - result += "}"; - return result; -} -} - -Version::~Version() { - assert(refs_ == 0); - for (int level = 0; level < config::kNumLevels; level++) { - for (size_t i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; - assert(f->refs >= 0); - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - delete cleanup_mem_; -} - -// An internal iterator. For a given version/level pair, yields -// information about the files in the level. For a given entry, key() -// is the largest key that occurs in the file, and value() is an -// 16-byte value containing the file number and file size, both -// encoded using EncodeFixed64. -class Version::LevelFileNumIterator : public Iterator { - public: - LevelFileNumIterator(const Version* version, - const std::vector* flist) - : icmp_(version->vset_->icmp_.user_comparator()), - flist_(flist), - index_(flist->size()) { // Marks as invalid - } - virtual bool Valid() const { - return index_ < flist_->size(); - } - virtual void Seek(const Slice& target) { - uint32_t left = 0; - uint32_t right = flist_->size() - 1; - while (left < right) { - uint32_t mid = (left + right) / 2; - int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); - if (cmp < 0) { - // Key at "mid.largest" is < than "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - index_ = left; - } - virtual void SeekToFirst() { index_ = 0; } - virtual void SeekToLast() { - index_ = flist_->empty() ? 0 : flist_->size() - 1; - } - virtual void Next() { - assert(Valid()); - index_++; - } - virtual void Prev() { - assert(Valid()); - if (index_ == 0) { - index_ = flist_->size(); // Marks as invalid - } else { - index_--; - } - } - Slice key() const { - assert(Valid()); - return (*flist_)[index_]->largest.Encode(); - } - Slice value() const { - assert(Valid()); - EncodeFixed64(value_buf_, (*flist_)[index_]->number); - EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); - return Slice(value_buf_, sizeof(value_buf_)); - } - virtual Status status() const { return Status::OK(); } - private: - const InternalKeyComparator icmp_; - const std::vector* const flist_; - uint32_t index_; - - // Backing store for value(). Holds the file number and size. - mutable char value_buf_[16]; -}; - -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, - const Slice& file_value) { - TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != 16) { - return NewErrorIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } else { - return cache->NewIterator(options, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); - } -} - -Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, - int level) const { - return NewTwoLevelIterator( - new LevelFileNumIterator(this, &files_[level]), - &GetFileIterator, vset_->table_cache_, options); -} - -void Version::AddIterators(const ReadOptions& options, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < files_[0].size(); i++) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, files_[0][i]->number, files_[0][i]->file_size)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < config::kNumLevels; level++) { - if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, level)); - } - } -} - -void Version::Ref() { - ++refs_; -} - -void Version::Unref() { - assert(refs_ >= 1); - --refs_; - if (refs_ == 0) { - vset_->MaybeDeleteOldVersions(); - // TODO: try to delete obsolete files - } -} - -std::string Version::DebugString() const { - std::string r; - for (int level = 0; level < config::kNumLevels; level++) { - // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g'] - r.append("level "); - AppendNumberTo(&r, level); - r.push_back(':'); - const std::vector& files = files_[level]; - for (size_t i = 0; i < files.size(); i++) { - r.push_back(' '); - AppendNumberTo(&r, files[i]->number); - r.push_back(':'); - AppendNumberTo(&r, files[i]->file_size); - r.append("['"); - AppendEscapedStringTo(&r, files[i]->smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']"); - } - r.push_back('\n'); - } - return r; -} - -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. -class VersionSet::Builder { - private: - typedef std::map FileMap; - VersionSet* vset_; - FileMap files_[config::kNumLevels]; - - public: - // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = base->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - FileMetaData* f = files[i]; - f->refs++; - files_[level].insert(std::make_pair(f->number, f)); - } - } - } - - ~Builder() { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; - FileMap::iterator fiter = files_[level].find(number); - assert(fiter != files_[level].end()); // Sanity check for debug mode - if (fiter != files_[level].end()) { - FileMetaData* f = fiter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - files_[level].erase(fiter); - } - } - - // Add new files - for (size_t i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); - f->refs = 1; - assert(files_[level].count(f->number) == 0); - files_[level].insert(std::make_pair(f->number, f)); - } - } - - // Save the current state in *v. - void SaveTo(Version* v) { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs++; - v->files_[level].push_back(f); - } - } - } -}; - -VersionSet::VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator* cmp) - : env_(options->env), - dbname_(dbname), - options_(options), - table_cache_(table_cache), - icmp_(*cmp), - next_file_number_(2), - manifest_file_number_(0), // Filled by Recover() - last_sequence_(0), - log_number_(0), - prev_log_number_(0), - descriptor_file_(NULL), - descriptor_log_(NULL), - current_(new Version(this)), - oldest_(current_) { -} - -VersionSet::~VersionSet() { - for (Version* v = oldest_; v != NULL; ) { - Version* next = v->next_; - assert(v->refs_ == 0); - delete v; - v = next; - } - delete descriptor_log_; - delete descriptor_file_; -} - -Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { - if (edit->has_log_number_) { - assert(edit->log_number_ >= log_number_); - assert(edit->log_number_ < next_file_number_); - } else { - edit->SetLogNumber(log_number_); - } - - if (!edit->has_prev_log_number_) { - edit->SetPrevLogNumber(prev_log_number_); - } - - edit->SetNextFile(next_file_number_); - edit->SetLastSequence(last_sequence_); - - Version* v = new Version(this); - { - Builder builder(this, current_); - builder.Apply(edit); - builder.SaveTo(v); - } - - std::string new_manifest_file; - Status s = Finalize(v); - - // Initialize new descriptor log file if necessary by creating - // a temporary file that contains a snapshot of the current version. - if (s.ok()) { - if (descriptor_log_ == NULL) { - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } - } - } - - // Write new record to MANIFEST log - if (s.ok()) { - std::string record; - edit->EncodeTo(&record); - s = descriptor_log_->AddRecord(record); - if (s.ok()) { - s = descriptor_file_->Sync(); - } - } - - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - } - - // Install the new version - if (s.ok()) { - assert(current_->next_ == NULL); - assert(current_->cleanup_mem_ == NULL); - current_->cleanup_mem_ = cleanup_mem; - v->next_ = NULL; - current_->next_ = v; - current_ = v; - log_number_ = edit->log_number_; - prev_log_number_ = edit->prev_log_number_; - } else { - delete v; - if (!new_manifest_file.empty()) { - delete descriptor_log_; - delete descriptor_file_; - descriptor_log_ = NULL; - descriptor_file_ = NULL; - env_->DeleteFile(new_manifest_file); - } - } - - return s; -} - -Status VersionSet::Recover() { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; - - // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); - if (!s.ok()) { - return s; - } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname_ + "/" + current; - SequentialFile* file; - s = env_->NewSequentialFile(dscname, &file); - if (!s.ok()) { - return s; - } - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t prev_log_number = 0; - Builder builder(this, current_); - - { - LogReporter reporter; - reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument( - edit.comparator_ + "does not match existing comparator ", - icmp_.user_comparator()->Name()); - } - } - - if (s.ok()) { - builder.Apply(&edit); - } - - if (edit.has_log_number_) { - log_number = edit.log_number_; - have_log_number = true; - } - - if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - } - } - delete file; - file = NULL; - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - prev_log_number = 0; - } - } - - if (s.ok()) { - Version* v = new Version(this); - builder.SaveTo(v); - s = Finalize(v); - if (!s.ok()) { - delete v; - } else { - // Install recovered version - v->next_ = NULL; - current_->next_ = v; - current_ = v; - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } - } - - return s; -} - -static int64_t TotalFileSize(const std::vector& files) { - int64_t sum = 0; - for (size_t i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - -Status VersionSet::Finalize(Version* v) { - // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; - - Status s; - for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { - s = SortLevel(v, level); - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - score = v->files_[level].size() / 4.0; - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); - score = static_cast(level_bytes) / MaxBytesForLevel(level); - } - - if (score > best_score) { - best_level = level; - best_score = score; - } - } - - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; - return s; -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? - - // Save metadata - VersionEdit edit; - edit.SetComparatorName(icmp_.user_comparator()->Name()); - - // Save compaction pointers - for (int level = 0; level < config::kNumLevels; level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - - // Save files - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); - } - } - - std::string record; - edit.EncodeTo(&record); - return log->AddRecord(record); -} - -// Helper to sort by tables_[file_number].smallest -struct VersionSet::BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; - } -}; - -Status VersionSet::SortLevel(Version* v, uint64_t level) { - Status result; - BySmallestKey cmp; - cmp.internal_comparator = &icmp_; - std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); - - if (result.ok() && level > 0) { - // There should be no overlap - for (size_t i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (icmp_.Compare(prev_end, this_begin) >= 0) { - result = Status::Corruption( - "overlapping ranges in same level", - (EscapeString(prev_end.Encode()) + " vs. " + - EscapeString(this_begin.Encode()))); - break; - } - } - } - return result; -} - -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return current_->files_[level].size(); -} - -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { - uint64_t result = 0; - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - if (icmp_.Compare(files[i]->largest, ikey) <= 0) { - // Entire file is before "ikey", so just add the file size - result += files[i]->file_size; - } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { - // Entire file is after "ikey", so ignore - if (level > 0) { - // Files other than level 0 are sorted by meta->smallest, so - // no further files in this level will contain data for - // "ikey". - break; - } - } else { - // "ikey" falls in the range for this table. Add the - // approximate offset of "ikey" within the table. - Table* tableptr; - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); - if (tableptr != NULL) { - result += tableptr->ApproximateOffsetOf(ikey.Encode()); - } - delete iter; - } - } - } - return result; -} - -void VersionSet::MaybeDeleteOldVersions() { - // Note: it is important to delete versions in order since a newer - // version with zero refs may be holding a pointer to a memtable - // that is used by somebody who has a ref on an older version. - while (oldest_ != current_ && oldest_->refs_ == 0) { - Version* next = oldest_->next_; - delete oldest_; - oldest_ = next; - } -} - -void VersionSet::AddLiveFiles(std::set* live) { - for (Version* v = oldest_; v != NULL; v = v->next_) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - int64_t result = 0; - std::vector overlaps; - for (int level = 0; level < config::kNumLevels - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Store in "*inputs" all files in "level" that overlap [begin,end] -void VersionSet::GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs) { - inputs->clear(); - Slice user_begin = begin.user_key(); - Slice user_end = end.user_key(); - const Comparator* user_cmp = icmp_.user_comparator(); - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || - user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { - // Either completely before or after range; skip it - } else { - inputs->push_back(f); - } - } -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (size_t i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); -} - -Iterator* VersionSet::MakeInputIterator(Compaction* c) { - ReadOptions options; - options.verify_checksums = options_->paranoid_checks; - options.fill_cache = false; - - // Level-0 files have to be merged together. For other levels, - // we will make a concatenating iterator per level. - // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < 2; which++) { - if (!c->inputs_[which].empty()) { - if (c->level() + which == 0) { - const std::vector& files = c->inputs_[which]; - for (size_t i = 0; i < files.size(); i++) { - list[num++] = table_cache_->NewIterator( - options, files[i]->number, files[i]->file_size); - } - } else { - // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->input_version_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options); - } - } - } - assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); - delete[] list; - return result; -} - -Compaction* VersionSet::PickCompaction() { - if (!NeedsCompaction()) { - return NULL; - } - const int level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; - } - } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - - // Files in level 0 may overlap each other, so pick up all overlapping ones - if (level == 0) { - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); - } - - SetupOtherInputs(c); - - return c; -} - -void VersionSet::SetupOtherInputs(Compaction* c) { - const int level = c->level(); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - GetOverlappingInputs(level, all_start, all_limit, &expanded0); - if (expanded0.size() > c->inputs_[0].size()) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { - Log(env_, options_->info_log, - "Expanding@%d %d+%d to %d+%d\n", - level, - int(c->inputs_[0].size()), - int(c->inputs_[1].size()), - int(expanded0.size()), - int(expanded1.size())); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); - } - - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_.SetCompactPointer(level, largest); -} - -Compaction* VersionSet::CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end) { - std::vector inputs; - GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return NULL; - } - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - c->inputs_[0] = inputs; - SetupOtherInputs(c); - return c; -} - -Compaction::Compaction(int level) - : level_(level), - max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0) { - for (int i = 0; i < config::kNumLevels; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - if (input_version_ != NULL) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const InternalKey& key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != NULL) { - input_version_->Unref(); - input_version_ = NULL; - } -} - -} diff --git a/leveldb/db/version_set.h b/leveldb/db/version_set.h deleted file mode 100644 index e377513..0000000 --- a/leveldb/db/version_set.h +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// The representation of a DBImpl consists of a set of Versions. The -// newest version is called "current". Older versions may be kept -// around to provide a consistent view to live iterators. -// -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. -// -// Version,VersionSet are thread-compatible, but require external -// synchronization on all accesses. - -#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ -#define STORAGE_LEVELDB_DB_VERSION_SET_H_ - -#include -#include -#include -#include "db/dbformat.h" -#include "db/version_edit.h" -#include "port/port.h" - -namespace leveldb { - -namespace log { class Writer; } - -class Compaction; -class Iterator; -class MemTable; -class TableBuilder; -class TableCache; -class Version; -class VersionSet; -class WritableFile; - -class Version { - public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, std::vector* iters); - - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - void Unref(); - - // Return a human readable string that describes this version's contents. - std::string DebugString() const; - - private: - friend class Compaction; - friend class VersionSet; - - class LevelFileNumIterator; - Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; - - VersionSet* vset_; // VersionSet to which this Version belongs - Version* next_; // Next version in linked list - int refs_; // Number of live refs to this version - MemTable* cleanup_mem_; // NULL, or table to delete when version dropped - - // List of files per level - std::vector files_[config::kNumLevels]; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). - double compaction_score_; - int compaction_level_; - - explicit Version(VersionSet* vset) - : vset_(vset), next_(NULL), refs_(0), - cleanup_mem_(NULL), - compaction_score_(-1), - compaction_level_(-1) { - } - - ~Version(); - - // No copying allowed - Version(const Version&); - void operator=(const Version&); -}; - -class VersionSet { - public: - VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator*); - ~VersionSet(); - - // Apply *edit to the current version to form a new descriptor that - // is both saved to persistent state and installed as the new - // current version. Iff Apply() returns OK, arrange to delete - // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed - // by older versions. - Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); - - // Recover the last saved descriptor from persistent storage. - Status Recover(); - - // Save current contents to *log - Status WriteSnapshot(log::Writer* log); - - // Return the current version. - Version* current() const { return current_; } - - // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } - - // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } - - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - - // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } - - // Set the last sequence number to s. - void SetLastSequence(uint64_t s) { - assert(s >= last_sequence_); - last_sequence_ = s; - } - - // Return the current log file number. - uint64_t LogNumber() const { return log_number_; } - - // Return the log file number for the log file that is currently - // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } - - // Pick level and inputs for a new compaction. - // Returns NULL if there is no compaction to be done. - // Otherwise returns a pointer to a heap-allocated object that - // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(); - - // Return a compaction object for compacting the range [begin,end] in - // the specified level. Returns NULL if there is nothing in that - // level that overlaps the specified range. Caller should delete - // the result. - Compaction* CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); - - // Create an iterator that reads over the compaction inputs for "*c". - // The caller should delete the iterator when no longer needed. - Iterator* MakeInputIterator(Compaction* c); - - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } - - // Add all files listed in any live version to *live. - // May also mutate some internal state. - void AddLiveFiles(std::set* live); - - // Return the approximate offset in the database of the data for - // "key" as of version "v". - uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - - private: - class Builder; - - friend class Compaction; - friend class Version; - - Status Finalize(Version* v); - - // Delete any old versions that are no longer needed. - void MaybeDeleteOldVersions(); - - struct BySmallestKey; - Status SortLevel(Version* v, uint64_t level); - - void GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs); - - void GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void SetupOtherInputs(Compaction* c); - - Env* const env_; - const std::string dbname_; - const Options* const options_; - TableCache* const table_cache_; - const InternalKeyComparator icmp_; - uint64_t next_file_number_; - uint64_t manifest_file_number_; - uint64_t last_sequence_; - uint64_t log_number_; - uint64_t prev_log_number_; // 0 or backing store for memtable being compacted - - // Opened lazily - WritableFile* descriptor_file_; - log::Writer* descriptor_log_; - - // Versions are kept in a singly linked list that is never empty - Version* current_; // Pointer to the last (newest) list entry - Version* oldest_; // Pointer to the first (oldest) list entry - - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. - std::string compact_pointer_[config::kNumLevels]; - - // No copying allowed - VersionSet(const VersionSet&); - void operator=(const VersionSet&); -}; - -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // and "level+1" will be merged to produce a set of "level+1" files. - int level() const { return level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return &edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "key". - bool ShouldStopBefore(const InternalKey& key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - private: - friend class Version; - friend class VersionSet; - - explicit Compaction(int level); - - int level_; - uint64_t max_output_file_size_; - Version* input_version_; - VersionEdit edit_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - int64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - - // State for implementing IsBaseLevelForKey - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - size_t level_ptrs_[config::kNumLevels]; -}; - -} - -#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/leveldb/db/write_batch.cc b/leveldb/db/write_batch.cc deleted file mode 100644 index d561528..0000000 --- a/leveldb/db/write_batch.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBatch::rep_ := -// sequence: fixed64 -// count: fixed32 -// data: record[count] -// record := -// kTypeValue varstring varstring | -// kTypeDeletion varstring -// varstring := -// len: varint32 -// data: uint8[len] - -#include "leveldb/write_batch.h" - -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "util/coding.h" - -namespace leveldb { - -WriteBatch::WriteBatch() { - Clear(); -} - -WriteBatch::~WriteBatch() { } - -void WriteBatch::Clear() { - rep_.clear(); - rep_.resize(12); -} - -int WriteBatchInternal::Count(const WriteBatch* b) { - return DecodeFixed32(b->rep_.data() + 8); -} - -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { - EncodeFixed32(&b->rep_[8], n); -} - -SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { - return SequenceNumber(DecodeFixed64(b->rep_.data())); -} - -void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { - EncodeFixed64(&b->rep_[0], seq); -} - -void WriteBatch::Put(const Slice& key, const Slice& value) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); - PutLengthPrefixedSlice(&rep_, key); - PutLengthPrefixedSlice(&rep_, value); -} - -void WriteBatch::Delete(const Slice& key) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeDeletion)); - PutLengthPrefixedSlice(&rep_, key); -} - -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - const int count = WriteBatchInternal::Count(b); - int found = 0; - Iterator it(*b); - for (; !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeDeletion: - memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); - break; - case kTypeValue: - memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); - break; - } - found++; - } - if (!it.status().ok()) { - return it.status(); - } else if (found != count) { - return Status::Corruption("wrong count in WriteBatch"); - } - return Status::OK(); -} - -void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= 12); - b->rep_.assign(contents.data(), contents.size()); -} - -WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) - : input_(WriteBatchInternal::Contents(&batch)), - done_(false) { - if (input_.size() < 12) { - done_ = true; - } else { - seq_ = WriteBatchInternal::Sequence(&batch), - input_.remove_prefix(12); - GetNextEntry(); - } -} - -void WriteBatchInternal::Iterator::Next() { - assert(!done_); - seq_++; - GetNextEntry(); -} - -void WriteBatchInternal::Iterator::GetNextEntry() { - if (input_.empty()) { - done_ = true; - return; - } - char tag = input_[0]; - input_.remove_prefix(1); - switch (tag) { - case kTypeValue: - if (GetLengthPrefixedSlice(&input_, &key_) && - GetLengthPrefixedSlice(&input_, &value_)) { - op_ = static_cast(tag); - } else { - status_ = Status::Corruption("bad WriteBatch Put"); - done_ = true; - input_.clear(); - } - break; - case kTypeDeletion: - if (GetLengthPrefixedSlice(&input_, &key_)) { - op_ = kTypeDeletion; - } else { - status_ = Status::Corruption("bad WriteBatch Delete"); - done_ = true; - input_.clear(); - } - break; - default: - status_ = Status::Corruption("unknown WriteBatch tag"); - done_ = true; - input_.clear(); - break; - } -} - -} diff --git a/leveldb/db/write_batch_internal.h b/leveldb/db/write_batch_internal.h deleted file mode 100644 index ab0a823..0000000 --- a/leveldb/db/write_batch_internal.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ -#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ - -#include "leveldb/write_batch.h" - -namespace leveldb { - -// WriteBatchInternal provides static methods for manipulating a -// WriteBatch that we don't want in the public WriteBatch interface. -class WriteBatchInternal { - public: - // Return the number of entries in the batch. - static int Count(const WriteBatch* batch); - - // Set the count for the number of entries in the batch. - static void SetCount(WriteBatch* batch, int n); - - // Return the seqeunce number for the start of this batch. - static SequenceNumber Sequence(const WriteBatch* batch); - - // Store the specified number as the seqeunce number for the start of - // this batch. - static void SetSequence(WriteBatch* batch, SequenceNumber seq); - - static Slice Contents(const WriteBatch* batch) { - return Slice(batch->rep_); - } - - static size_t ByteSize(const WriteBatch* batch) { - return batch->rep_.size(); - } - - static void SetContents(WriteBatch* batch, const Slice& contents); - - static Status InsertInto(const WriteBatch* batch, MemTable* memtable); - - // Iterate over the contents of a write batch. - class Iterator { - public: - explicit Iterator(const WriteBatch& batch); - bool Done() const { return done_; } - void Next(); - ValueType op() const { return op_; } - const Slice& key() const { return key_; } - const Slice& value() const { return value_; } - SequenceNumber sequence_number() const { return seq_; } - Status status() const { return status_; } - - private: - void GetNextEntry(); - - Slice input_; - bool done_; - ValueType op_; - Slice key_; - Slice value_; - SequenceNumber seq_; - Status status_; - }; -}; - -} - - -#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/leveldb/db/write_batch_test.cc b/leveldb/db/write_batch_test.cc deleted file mode 100644 index 2bf1134..0000000 --- a/leveldb/db/write_batch_test.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string PrintContents(WriteBatch* b) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable mem(cmp); - std::string state; - Status s = WriteBatchInternal::InsertInto(b, &mem); - Iterator* iter = mem.NewIterator(); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - break; - case kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - break; - } - state.append("@"); - state.append(NumberToString(ikey.sequence)); - } - delete iter; - if (!s.ok()) { - state.append("ParseError()"); - } - return state; -} - -class WriteBatchTest { }; - -TEST(WriteBatchTest, Empty) { - WriteBatch batch; - ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); -} - -TEST(WriteBatchTest, Multiple) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.Put(Slice("baz"), Slice("boo")); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@102" - "Delete(box)@101" - "Put(foo, bar)@100", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, Corruption) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - WriteBatchInternal::SetSequence(&batch, 200); - Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); - ASSERT_EQ("Put(foo, bar)@200" - "ParseError()", - PrintContents(&batch)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/doc/doc.css b/leveldb/doc/doc.css deleted file mode 100644 index 700c564..0000000 --- a/leveldb/doc/doc.css +++ /dev/null @@ -1,89 +0,0 @@ -body { - margin-left: 0.5in; - margin-right: 0.5in; - background: white; - color: black; -} - -h1 { - margin-left: -0.2in; - font-size: 14pt; -} -h2 { - margin-left: -0in; - font-size: 12pt; -} -h3 { - margin-left: -0in; -} -h4 { - margin-left: -0in; -} -hr { - margin-left: -0in; -} - -/* Definition lists: definition term bold */ -dt { - font-weight: bold; -} - -address { - text-align: center; -} -code,samp,var { - color: blue; -} -kbd { - color: #600000; -} -div.note p { - float: right; - width: 3in; - margin-right: 0%; - padding: 1px; - border: 2px solid #6060a0; - background-color: #fffff0; -} - -ul { - margin-top: -0em; - margin-bottom: -0em; -} - -ol { - margin-top: -0em; - margin-bottom: -0em; -} - -UL.nobullets { - list-style-type: none; - list-style-image: none; - margin-left: -1em; -} - -p { - margin: 1em 0 1em 0; - padding: 0 0 0 0; -} - -pre { - line-height: 1.3em; - padding: 0.4em 0 0.8em 0; - margin: 0 0 0 0; - border: 0 0 0 0; - color: blue; -} - -.datatable { - margin-left: auto; - margin-right: auto; - margin-top: 2em; - margin-bottom: 2em; - border: 1px solid; -} - -.datatable td,th { - padding: 0 0.5em 0 0.5em; - text-align: right; -} diff --git a/leveldb/doc/impl.html b/leveldb/doc/impl.html deleted file mode 100644 index dd09fea..0000000 --- a/leveldb/doc/impl.html +++ /dev/null @@ -1,217 +0,0 @@ - - - - -Leveldb file layout and compactions - - - - -

Files

- -The implementation of leveldb is similar in spirit to the -representation of a single - -Bigtable tablet (section 5.3). -However the organization of the files that make up the representation -is somewhat different and is explained below. - -

-Each database is represented by a set of file stored in a directory. -There are several different types of files as documented below: -

-

Log files

-

-A log file (*.log) stores a sequence of recent updates. Each update -is appended to the current log file. When the log file reaches a -pre-determined size (approximately 1MB by default), it is converted -to a sorted table (see below) and a new log file is created for future -updates. -

-A copy of the current log file is kept in an in-memory structure (the -memtable). This copy is consulted on every read so that read -operations reflect all logged updates. -

-

Sorted tables

-

-A sorted table (*.sst) stores a sequence of entries sorted by key. -Each entry is either a value for the key, or a deletion marker for the -key. (Deletion markers are kept around to hide obsolete values -present in older sorted tables). -

-The set of sorted tables are organized into a sequence of levels. The -sorted table generated from a log file is placed in a special young -level (also called level-0). When the number of young files exceeds a -certain threshold (currently four), all of the young files are merged -together with all of the overlapping level-1 files to produce a -sequence of new level-1 files (we create a new level-1 file for every -2MB of data.) -

-Files in the young level may contain overlapping keys. However files -in other levels have distinct non-overlapping key ranges. Consider -level number L where L >= 1. When the combined size of files in -level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, -...), one file in level-L, and all of the overlapping files in -level-(L+1) are merged to form a set of new files for level-(L+1). -These merges have the effect of gradually migrating new updates from -the young level to the largest level using only bulk reads and writes -(i.e., minimizing expensive seeks). - -

Manifest

-

-A MANIFEST file lists the set of sorted tables that make up each -level, the corresponding key ranges, and other important metadata. -A new MANIFEST file (with a new number embedded in the file name) -is created whenever the database is reopened. The MANIFEST file is -formatted as a log, and changes made to the serving state (as files -are added or removed) are appended to this log. -

-

Current

-

-CURRENT is a simple text file that contains the name of the latest -MANIFEST file. -

-

Info logs

-

-Informational messages are printed to files named LOG and LOG.old. -

-

Others

-

-Other files used for miscellaneous purposes may also be present -(LOCK, *.dbtmp). - -

Level 0

-When the log file grows above a certain size (1MB by default): -
    -
  • Write the contents of the current memtable to an sstable -
  • Replace the current memtable by a brand new empty memtable -
  • Switch to a new log file -
  • Delete the old log file and the old memtable -
-Experimental measurements show that generating an sstable from a 1MB -log file takes ~12ms, which seems like an acceptable latency hiccup to -add infrequently to a log write. - -

-The new sstable is added to a special level-0 level. level-0 contains -a set of files (up to 4 by default). However unlike other levels, -these files do not cover disjoint ranges, but may overlap each other. - -

Compactions

- -

-When the size of level L exceeds its limit, we compact it in a -background thread. The compaction picks a file from level L and all -overlapping files from the next level L+1. Note that if a level-L -file overlaps only part of a level-(L+1) file, the entire file at -level-(L+1) is used as an input to the compaction and will be -discarded after the compaction. Aside: because level-0 is special -(files in it may overlap each other), we treat compactions from -level-0 to level-1 specially: a level-0 compaction may pick more than -one level-0 file in case some of these files overlap each other. - -

-A compaction merges the contents of the picked files to produce a -sequence of level-(L+1) files. We switch to producing a new -level-(L+1) file after the current output file has reached the target -file size (2MB). We also switch to a new output file when the key -range of the current output file has grown enough to overlap more then -ten level-(L+2) files. This last rule ensures that a later compaction -of a level-(L+1) file will not pick up too much data from level-(L+2). - -

-The old files are discarded and the new files are added to the serving -state. - -

-Compactions for a particular level rotate through the key space. In -more detail, for each level L, we remember the ending key of the last -compaction at level L. The next compaction for level L will pick the -first file that starts after this key (wrapping around to the -beginning of the key space if there is no such file). - -

-Compactions drop overwritten values. They also drop deletion markers -if there are no higher numbered levels that contain a file whose range -overlaps the current key. - -

Timing

- -Level-0 compactions will read up to four 1MB files from level-0, and -at worst all the level-1 files (10MB). I.e., we will read 14MB and -write 14MB. - -

-Other than the special level-0 compactions, we will pick one 2MB file -from level L. In the worst case, this will overlap ~ 12 files from -level L+1 (10 because level-(L+1) is ten times the size of level-L, -and another two at the boundaries since the file ranges at level-L -will usually not be aligned with the file ranges at level-L+1). The -compaction will therefore read 26MB and write 26MB. Assuming a disk -IO rate of 100MB/s (ballpark range for modern drives), the worst -compaction cost will be approximately 0.5 second. - -

-If we throttle the background writing to something small, say 10% of -the full 100MB/s speed, a compaction may take up to 5 seconds. If the -user is writing at 10MB/s, we might build up lots of level-0 files -(~50 to hold the 5*10MB). This may signficantly increase the cost of -reads due to the overhead of merging more files together on every -read. - -

-Solution 1: To reduce this problem, we might want to increase the log -switching threshold when the number of level-0 files is large. Though -the downside is that the larger this threshold, the larger the delay -that we will add to write latency when a write triggers a log switch. - -

-Solution 2: We might want to decrease write rate artificially when the -number of level-0 files goes up. - -

-Solution 3: We work on reducing the cost of very wide merges. -Perhaps most of the level-0 files will have their blocks sitting -uncompressed in the cache and we will only need to worry about the -O(N) complexity in the merging iterator. - -

Number of files

- -Instead of always making 2MB files, we could make larger files for -larger levels to reduce the total file count, though at the expense of -more bursty compactions. Alternatively, we could shard the set of -files into multiple directories. - -

-An experiment on an ext3 filesystem on Feb 04, 2011 shows -the following timings to do 100K file opens in directories with -varying number of files: - - - - - -
Files in directoryMicroseconds to open a file
10009
1000010
10000016
-So maybe even the sharding is not necessary on modern filesystems? - -

Recovery

- -
    -
  • Read CURRENT to find name of the latest committed MANIFEST -
  • Read the named MANIFEST file -
  • Clean up stale files -
  • We could open all sstables here, but it is probably better to be lazy... -
  • Convert log chunk to a new level-0 sstable -
  • Start directing new writes to a new log file with recovered sequence# -
- -

Garbage collection of files

- -DeleteObsoleteFiles() is called at the end of every -compaction and at the end of recovery. It finds the names of all -files in the database. It deletes all log files that are not the -current log file. It deletes all table files that are not referenced -from some level and are not the output of an active compaction. - - - diff --git a/leveldb/doc/index.html b/leveldb/doc/index.html deleted file mode 100644 index c2312b7..0000000 --- a/leveldb/doc/index.html +++ /dev/null @@ -1,498 +0,0 @@ - - - - -Leveldb - - - -

Leveldb

-
Jeff Dean, Sanjay Ghemawat
-

-The leveldb library provides a persistent key value store. Keys and -values are arbitrary byte arrays. The keys are ordered within the key -value store according to a user-specified comparator function. - -

-

Opening A Database

-

-A leveldb database has a name which corresponds to a file system -directory. All of the contents of database are stored in this -directory. The following example shows how to open a database, -creating it if necessary: -

-

-  #include <assert>
-  #include "leveldb/include/db.h"
-
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  assert(status.ok());
-  ...
-
-If you want to raise an error if the database already exists, add -the following line before the leveldb::DB::Open call: -
-  options.error_if_exists = true;
-
-

Status

-

-You may have noticed the leveldb::Status type above. Values of this -type are returned by most functions in leveldb that may encounter an -error. You can check if such a result is ok, and also print an -associated error message: -

-

-   leveldb::Status s = ...;
-   if (!s.ok()) cerr << s.ToString() << endl;
-
-

Closing A Database

-

-When you are done with a database, just delete the database object. -Example: -

-

-  ... open the db as described above ...
-  ... do something with db ...
-  delete db;
-
-

Reads And Writes

-

-The database provides Put, Delete, and Get methods to -modify/query the database. For example, the following code -moves the value stored under key1 to key2. -

-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
-  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
-
- -

Atomic Updates

-

-Note that if the process dies after the Put of key2 but before the -delete of key1, the same value may be left stored under multiple keys. -Such problems can be avoided by using the WriteBatch class to -atomically apply a set of updates: -

-

-  #include "leveldb/include/write_batch.h"
-  ...
-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) {
-    leveldb::WriteBatch batch;
-    batch.Delete(key1);
-    batch.Put(key2, value);
-    s = db->Write(leveldb::WriteOptions(), &batch);
-  }
-
-The WriteBatch holds a sequence of edits to be made to the database, -and these edits within the batch are applied in order. Note that we -called Delete before Put so that if key1 is identical to key2, -we do not end up erroneously dropping the value entirely. -

-Apart from its atomicity benefits, WriteBatch may also be used to -speed up bulk updates by placing lots of individual mutations into the -same batch. - -

Synchronous Writes

-By default, each write to leveldb is asynchronous: it -returns after pushing the write from the process into the operating -system. The transfer from operating system memory to the underlying -persistent storage happens asynchronously. The sync flag -can be turned on for a particular write to make the write operation -not return until the data being written has been pushed all the way to -persistent storage. (On Posix systems, this is implemented by calling -either fsync(...) or fdatasync(...) or -msync(..., MS_SYNC) before the write operation returns.) -
-  leveldb::WriteOptions write_options;
-  write_options.sync = true;
-  db->Put(write_options, ...);
-
-Asynchronous writes are often more than a thousand times as fast as -synchronous writes. The downside of asynchronous writes is that a -crash of the machine may cause the last few updates to be lost. Note -that a crash of just the writing process (i.e., not a reboot) will not -cause any loss since even when sync is false, an update -is pushed from the process memory into the operating system before it -is considered done. - -

-Asynchronous writes can often be used safely. For example, when -loading a large amount of data into the database you can handle lost -updates by restarting the bulk load after a crash. A hybrid scheme is -also possible where every Nth write is synchronous, and in the event -of a crash, the bulk load is restarted just after the last synchronous -write finished by the previous run. (The synchronous write can update -a marker that describes where to restart on a crash.) - -

-WriteBatch provides an alternative to asynchronous writes. -Multiple updates may be placed in the same WriteBatch and -applied together using a synchronous write (i.e., -write_options.sync is set to true). The extra cost of -the synchronous write will be amortized across all of the writes in -the batch. - -

-

Concurrency

-

-A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent -misuse. Within a single process, the same leveldb::DB object may -be safely used by multiple concurrent threads. -

-

Iteration

-

-The following example demonstrates how to print all key,value pairs -in a database. -

-

-  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
-  }
-  assert(it->status().ok());  // Check for any errors found during the scan
-  delete it;
-
-The following variation shows how to process just the keys in the -range [start,limit): -

-

-  for (it->Seek(start);
-       it->Valid() && it->key().ToString() < limit;
-       it->Next()) {
-    ...
-  }
-
-You can also process entries in reverse order. (Caveat: reverse -iteration may be somewhat slower than forward iteration.) -

-

-  for (it->SeekToLast(); it->Valid(); it->Prev()) {
-    ...
-  }
-
-

Snapshots

-

-Snapshots provide consistent read-only views over the entire state of -the key-value store. ReadOptions::snapshot may be non-NULL to indicate -that a read should operate on a particular version of the DB state. -If ReadOptions::snapshot is NULL, the read will operate on an -implicit snapshot of the current state. -

-Snapshots typically are created by the DB::GetSnapshot() method: -

-

-  leveldb::ReadOptions options;
-  options.snapshot = db->GetSnapshot();
-  ... apply some updates to db ...
-  leveldb::Iterator* iter = db->NewIterator(options);
-  ... read using iter to view the state when the snapshot was created ...
-  delete iter;
-  db->ReleaseSnapshot(options.snapshot);
-
-Note that when a snapshot is no longer needed, it should be released -using the DB::ReleaseSnapshot interface. This allows the -implementation to get rid of state that was being maintained just to -support reading as of that snapshot. -

-A Write operation can also return a snapshot that -represents the state of the database just after applying a particular -set of updates: -

-

-  leveldb::Snapshot* snapshot;
-  leveldb::WriteOptions write_options;
-  write_options.post_write_snapshot = &snapshot;
-  leveldb::Status status = db->Write(write_options, ...);
-  ... perform other mutations to db ...
-
-  leveldb::ReadOptions read_options;
-  read_options.snapshot = snapshot;
-  leveldb::Iterator* iter = db->NewIterator(read_options);
-  ... read as of the state just after the Write call returned ...
-  delete iter;
-
-  db->ReleaseSnapshot(snapshot);
-
-

Slice

-

-The return value of the it->key() and it->value() calls above -are instances of the leveldb::Slice type. Slice is a simple -structure that contains a length and a pointer to an external byte -array. Returning a Slice is a cheaper alternative to returning a -std::string since we do not need to copy potentially large keys and -values. In addition, leveldb methods do not return null-terminated -C-style strings since leveldb keys and values are allowed to -contain '\0' bytes. -

-C++ strings and null-terminated C-style strings can be easily converted -to a Slice: -

-

-   leveldb::Slice s1 = "hello";
-
-   std::string str("world");
-   leveldb::Slice s2 = str;
-
-A Slice can be easily converted back to a C++ string: -
-   std::string str = s1.ToString();
-   assert(str == std::string("hello"));
-
-Be careful when using Slices since it is up to the caller to ensure that -the external byte array into which the Slice points remains live while -the Slice is in use. For example, the following is buggy: -

-

-   leveldb::Slice slice;
-   if (...) {
-     std::string str = ...;
-     slice = str;
-   }
-   Use(slice);
-
-When the if statement goes out of scope, str will be destroyed and the -backing storage for slice will disappear. -

-

Comparators

-

-The preceding examples used the default ordering function for key, -which orders bytes lexicographically. You can however supply a custom -comparator when opening a database. For example, suppose each -database key consists of two numbers and we should sort by the first -number, breaking ties by the second number. First, define a proper -subclass of leveldb::Comparator that expresses these rules: -

-

-  class TwoPartComparator : public leveldb::Comparator {
-   public:
-    // Three-way comparison function:
-    //   if a < b: negative result
-    //   if a > b: positive result
-    //   else: zero result
-    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
-      int a1, a2, b1, b2;
-      ParseKey(a, &a1, &a2);
-      ParseKey(b, &b1, &b2);
-      if (a1 < b1) return -1;
-      if (a1 > b1) return +1;
-      if (a2 < b2) return -1;
-      if (a2 > b2) return +1;
-      return 0;
-    }
-
-    // Ignore the following methods for now:
-    const char* Name() { return "TwoPartComparator"; }
-    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
-    void FindShortSuccessor(std::string*) const { }
-  };
-
-Now create a database using this custom comparator: -

-

-  TwoPartComparator cmp;
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  options.comparator = &cmp;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  ...
-
-

Backwards compatibility

-

-The result of the comparator's Name method is attached to the -database when it is created, and is checked on every subsequent -database open. If the name changes, the leveldb::DB::Open call will -fail. Therefore, change the name if and only if the new key format -and comparison function are incompatible with existing databases, and -it is ok to discard the contents of all existing databases. -

-You can however still gradually evolve your key format over time with -a little bit of pre-planning. For example, you could store a version -number at the end of each key (one byte should suffice for most uses). -When you wish to switch to a new key format (e.g., adding an optional -third part to the keys processed by TwoPartComparator), -(a) keep the same comparator name (b) increment the version number -for new keys (c) change the comparator function so it uses the -version numbers found in the keys to decide how to interpret them. -

-

Performance

-

-Performance can be tuned by changing the default values of the -types defined in leveldb/include/options.h. - -

-

Block size

-

-leveldb groups adjacent keys together into the same block and such a -block is the unit of transfer to and from persistent storage. The -default block size is approximately 4096 uncompressed bytes. -Applications that mostly do bulk scans over the contents of the -database may wish to increase this size. Applications that do a lot -of point reads of small values may wish to switch to a smaller block -size if performance measurements indicate an improvement. There isn't -much benefit in using blocks smaller than one kilobyte, or larger than -a few megabytes. Also note that compression will be more effective -with larger block sizes. -

-

Compression

-

-Each block is individually compressed before being written to -persistent storage. Compression is on by default since the default -compression method is very fast, and is automatically disabled for -uncompressible data. In rare cases, applications may want to disable -compression entirely, but should only do so if benchmarks show a -performance improvement: -

-

-  leveldb::Options options;
-  options.compression = leveldb::kNoCompression;
-  ... leveldb::DB::Open(options, name, ...) ....
-
-

Cache

-

-The contents of the database are stored in a set of files in the -filesystem and each file stores a sequence of compressed blocks. If -options.cache is non-NULL, it is used to cache frequently used -uncompressed block contents. -

-

-  #include "leveldb/include/cache.h"
-
-  leveldb::Options options;
-  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
-  leveldb::DB* db;
-  leveldb::DB::Open(options, name, &db);
-  ... use the db ...
-  delete db
-  delete options.cache;
-
-Note that the cache holds uncompressed data, and therefore it should -be sized according to application level data sizes, without any -reduction from compression. (Caching of compressed blocks is left to -the operating system buffer cache, or any custom Env -implementation provided by the client.) -

-When performing a bulk read, the application may wish to disable -caching so that the data processed by the bulk read does not end up -displacing most of the cached contents. A per-iterator option can be -used to achieve this: -

-

-  leveldb::ReadOptions options;
-  options.fill_cache = false;
-  leveldb::Iterator* it = db->NewIterator(options);
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    ...
-  }
-
-

Key Layout

-

-Note that the unit of disk transfer and caching is a block. Adjacent -keys (according to the database sort order) will usually be placed in -the same block. Therefore the application can improve its performance -by placing keys that are accessed together near each other and placing -infrequently used keys in a separate region of the key space. -

-For example, suppose we are implementing a simple file system on top -of leveldb. The types of entries we might wish to store are: -

-

-   filename -> permission-bits, length, list of file_block_ids
-   file_block_id -> data
-
-We might want to prefix filename keys with one letter (say '/') and the -file_block_id keys with a different letter (say '0') so that scans -over just the metadata do not force us to fetch and cache bulky file -contents. -

-

Checksums

-

-leveldb associates checksums with all data it stores in the file system. -There are two separate controls provided over how aggressively these -checksums are verified: -

-

    -
  • ReadOptions::verify_checksums may be set to true to force - checksum verification of all data that is read from the file system on - behalf of a particular read. By default, no such verification is - done. -

    -

  • Options::paranoid_checks may be set to true before opening a - database to make the database implementation raise an error as soon as - it detects an internal corruption. Depending on which portion of the - database has been corrupted, the error may be raised when the database - is opened, or later by another database operation. By default, - paranoid checking is off so that the database can be used even if - parts of its persistent storage have been corrupted. -

    - If a database is corrupted (perhaps it cannot be opened when - paranoid checking is turned on), the leveldb::RepairDB function - may be used to recover as much of the data as possible -

    -

-

Approximate Sizes

-

-The GetApproximateSizes method can used to get the approximate -number of bytes of file system space used by one or more key ranges. -

-

-   leveldb::Range ranges[2];
-   ranges[0] = leveldb::Range("a", "c");
-   ranges[1] = leveldb::Range("x", "z");
-   uint64_t sizes[2];
-   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
-
-The preceding call will set sizes[0] to the approximate number of -bytes of file system space used by the key range [a..c) and -sizes[1] to the approximate number of bytes used by the key range -[x..z). -

-

Environment

-

-All file operations (and other operating system calls) issued by the -leveldb implementation are routed through a leveldb::Env object. -Sophisticated clients may wish to provide their own Env -implementation to get better control. For example, an application may -introduce artificial delays in the file IO paths to limit the impact -of leveldb on other activities in the system. -

-

-  class SlowEnv : public leveldb::Env {
-    .. implementation of the Env interface ...
-  };
-
-  SlowEnv env;
-  leveldb::Options options;
-  options.env = &env;
-  Status s = leveldb::DB::Open(options, ...);
-
-

Porting

-

-leveldb may be ported to a new platform by providing platform -specific implementations of the types/methods/functions exported by -leveldb/port/port.h. See leveldb/port/port_example.h for more -details. -

-In addition, the new platform may need a new default leveldb::Env -implementation. See leveldb/util/env_posix.h for an example. - -

Other Information

- -

-Details about the leveldb implementation may be found in -the following documents: -

- - - diff --git a/leveldb/doc/log_format.txt b/leveldb/doc/log_format.txt deleted file mode 100644 index 3a0414b..0000000 --- a/leveldb/doc/log_format.txt +++ /dev/null @@ -1,75 +0,0 @@ -The log file contents are a sequence of 32KB blocks. The only -exception is that the tail of the file may contain a partial block. - -Each block consists of a sequence of records: - block := record* trailer? - record := - checksum: uint32 // crc32c of type and data[] - length: uint16 - type: uint8 // One of FULL, FIRST, MIDDLE, LAST - data: uint8[length] - -A record never starts within the last six bytes of a block (since it -won't fit). Any leftover bytes here form the trailer, which must -consist entirely of zero bytes and must be skipped by readers. - -Aside: if exactly seven bytes are left in the current block, and a new -non-zero length record is added, the writer must emit a FIRST record -(which contains zero bytes of user data) to fill up the trailing seven -bytes of the block and then emit all of the user data in subsequent -blocks. - -More types may be added in the future. Some Readers may skip record -types they do not understand, others may report that some data was -skipped. - -FULL == 1 -FIRST == 2 -MIDDLE == 3 -LAST == 4 - -The FULL record contains the contents of an entire user record. - -FIRST, MIDDLE, LAST are types used for user records that have been -split into multiple fragments (typically because of block boundaries). -FIRST is the type of the first fragment of a user record, LAST is the -type of the last fragment of a user record, and MID is the type of all -interior fragments of a user record. - -Example: consider a sequence of user records: - A: length 1000 - B: length 97270 - C: length 8000 -A will be stored as a FULL record in the first block. - -B will be split into three fragments: first fragment occupies the rest -of the first block, second fragment occupies the entirety of the -second block, and the third fragment occupies a prefix of the third -block. This will leave six bytes free in the third block, which will -be left empty as the trailer. - -C will be stored as a FULL record in the fourth block. - -=================== - -Some benefits over the recordio format: - -(1) We do not need any heuristics for resyncing - just go to next -block boundary and scan. If there is a corruption, skip to the next -block. As a side-benefit, we do not get confused when part of the -contents of one log file are embedded as a record inside another log -file. - -(2) Splitting at approximate boundaries (e.g., for mapreduce) is -simple: find the next block boundary and skip records until we -hit a FULL or FIRST record. - -(3) We do not need extra buffering for large records. - -Some downsides compared to recordio format: - -(1) No packing of tiny records. This could be fixed by adding a new -record type, so it is a shortcoming of the current implementation, -not necessarily the format. - -(2) No compression. Again, this could be fixed by adding new record types. diff --git a/leveldb/doc/table_format.txt b/leveldb/doc/table_format.txt deleted file mode 100644 index ad5aa4b..0000000 --- a/leveldb/doc/table_format.txt +++ /dev/null @@ -1,61 +0,0 @@ -File format -=========== - - - [data block 1] - [data block 2] - ... - [data block N] - [meta block 1] - ... - [meta block K] - [metaindex block] - [index block] - [Footer] (fixed size; starts at file_size - sizeof(Footer)) - - -The file contains internal pointers. Each such pointer is called -a BlockHandle and contains the following information: - offset: varint64 - size: varint64 - -(1) The sequence of key/value pairs in the file are stored in sorted -order and partitioned into a sequence of data blocks. These blocks -come one after another at the beginning of the file. Each data block -is formatted according to the code in block_builder.cc, and then -optionally compressed. - -(2) After the data blocks we store a bunch of meta blocks. The -supported meta block types are described below. More meta block types -may be added in the future. Each meta block is again formatted using -block_builder.cc and then optionally compressed. - -(3) A "metaindex" block. It contains one entry for every other meta -block where the key is the name of the meta block and the value is a -BlockHandle pointing to that meta block. - -(4) An "index" block. This block contains one entry per data block, -where the key is a string >= last key in that data block and before -the first key in the successive data block. The value is the -BlockHandle for the data block. - -(6) At the very end of the file is a fixed length footer that contains -the BlockHandle of the metaindex and index blocks as well as a magic number. - metaindex_handle: char[p]; // Block handle for metaindex - index_handle: char[q]; // Block handle for index - padding: char[40-p-q]; // 0 bytes to make fixed length - // (40==2*BlockHandle::kMaxEncodedLength) - magic: fixed64; // == 0xdb4775248b80fb57 - -"stats" Meta Block ------------------- - -This meta block contains a bunch of stats. The key is the name -of the statistic. The value contains the statistic. -TODO(postrelease): record following stats. - data size - index size - key size (uncompressed) - value size (uncompressed) - number of entries - number of data blocks diff --git a/leveldb/include/leveldb/cache.h b/leveldb/include/leveldb/cache.h deleted file mode 100644 index 79196d1..0000000 --- a/leveldb/include/leveldb/cache.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Cache is an interface that maps keys to values. It has internal -// synchronization and may be safely accessed concurrently from -// multiple threads. It may automatically evict entries to make room -// for new entries. Values have a specified charge against the cache -// capacity. For example, a cache where the values are variable -// length strings, may use the length of the string as the charge for -// the string. -// -// A builtin cache implementation with a least-recently-used eviction -// policy is provided. Clients may use their own implementations if -// they want something more sophisticated (like scan-resistance, a -// custom eviction policy, variable cache sizing, etc.) - -#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ -#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ - -#include -#include "leveldb/slice.h" - -namespace leveldb { - -class Cache; - -// Create a new cache with a fixed size capacity. This implementation -// of Cache uses a least-recently-used eviction policy. -extern Cache* NewLRUCache(size_t capacity); - -class Cache { - public: - Cache() { } - - // Destroys all existing entries by calling the "deleter" - // function that was passed to the constructor. - virtual ~Cache(); - - // Opaque handle to an entry stored in the cache. - struct Handle { }; - - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. - // - // Returns a handle that corresponds to the mapping. The caller - // must call this->Release(handle) when the returned mapping is no - // longer needed. - // - // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter". - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) = 0; - - // If the cache has no mapping for "key", returns NULL. - // - // Else return a handle that corresponds to the mapping. The caller - // must call this->Release(handle) when the returned mapping is no - // longer needed. - virtual Handle* Lookup(const Slice& key) = 0; - - // Release a mapping returned by a previous Lookup(). - // REQUIRES: handle must not have been released yet. - // REQUIRES: handle must have been returned by a method on *this. - virtual void Release(Handle* handle) = 0; - - // Return the value encapsulated in a handle returned by a - // successful Lookup(). - // REQUIRES: handle must not have been released yet. - // REQUIRES: handle must have been returned by a method on *this. - virtual void* Value(Handle* handle) = 0; - - // If the cache contains entry for key, erase it. Note that the - // underlying entry will be kept around until all existing handles - // to it have been released. - virtual void Erase(const Slice& key) = 0; - - // Return a new numeric id. May be used by multiple clients who are - // sharing the same cache to partition the key space. Typically the - // client will allocate a new id at startup and prepend the id to - // its cache keys. - virtual uint64_t NewId() = 0; - - private: - void LRU_Remove(Handle* e); - void LRU_Append(Handle* e); - void Unref(Handle* e); - - struct Rep; - Rep* rep_; - - // No copying allowed - Cache(const Cache&); - void operator=(const Cache&); -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/leveldb/include/leveldb/comparator.h b/leveldb/include/leveldb/comparator.h deleted file mode 100644 index 4e00e4d..0000000 --- a/leveldb/include/leveldb/comparator.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ -#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ - -#include - -namespace leveldb { - -class Slice; - -// A Comparator object provides a total order across slices that are -// used as keys in an sstable or a database. -class Comparator { - public: - virtual ~Comparator(); - - // Three-way comparison. Returns value: - // < 0 iff "a" < "b", - // == 0 iff "a" == "b", - // > 0 iff "a" > "b" - virtual int Compare(const Slice& a, const Slice& b) const = 0; - - // The name of the comparator. Used to check for comparator - // mismatches (i.e., a DB created with one comparator is - // accessed using a different comparator. - // - // The client of this package should switch to a new name whenever - // the comparator implementation changes in a way that will cause - // the relative ordering of any two keys to change. - // - // Names starting with "leveldb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; - - // Advanced functions: these are used to reduce the space requirements - // for internal data structures like index blocks. - - // If *start < limit, changes *start to a short string in [start,limit). - // Simple comparator implementations may return with *start unchanged, - // i.e., an implementation of this method that does nothing is correct. - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const = 0; - - // Changes *key to a short string >= *key. - // Simple comparator implementations may return with *key unchanged, - // i.e., an implementation of this method that does nothing is correct. - virtual void FindShortSuccessor(std::string* key) const = 0; -}; - -// Return a builtin comparator that uses lexicographic byte-wise -// ordering. The result remains the property of this module and -// must not be deleted. -extern const Comparator* BytewiseComparator(); - -} - -#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/leveldb/include/leveldb/db.h b/leveldb/include/leveldb/db.h deleted file mode 100644 index f18ded3..0000000 --- a/leveldb/include/leveldb/db.h +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ -#define STORAGE_LEVELDB_INCLUDE_DB_H_ - -#include -#include -#include "leveldb/iterator.h" -#include "leveldb/options.h" - -namespace leveldb { - -static const int kMajorVersion = 1; -static const int kMinorVersion = 1; - -struct Options; -struct ReadOptions; -struct WriteOptions; - -class Snapshot; -class WriteBatch; - -// Some internal types. Clients should ignore. -class WriteBatchInternal; - -struct Range { - Slice start; - Slice limit; - - Range(const Slice& s, const Slice& l) : start(s), limit(l) { } -}; - -// A DB is a persistent ordered map from keys to values. -class DB { - public: - // Open the database with the specified "name". - // Stores a pointer to a heap-allocated database in *dbptr and returns - // OK on success. - // Stores NULL in *dbptr and returns a non-OK status on error. - // Caller should delete *dbptr when it is no longer needed. - static Status Open(const Options& options, - const std::string& name, - DB** dbptr); - - DB() { } - virtual ~DB(); - - // Set the database entry for "key" to "value". Returns OK on success, - // and a non-OK status on error. - // Note: consider setting options.sync = true. - virtual Status Put(const WriteOptions& options, - const Slice& key, - const Slice& value) = 0; - - // Remove the database entry (if any) for "key". Returns OK on - // success, and a non-OK status on error. It is not an error if "key" - // did not exist in the database. - // Note: consider setting options.sync = true. - virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; - - // Apply the specified updates to the database. - // Returns OK on success, non-OK on failure. - // Note: consider setting options.sync = true. - virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; - - // If the database contains an entry for "key" store the - // corresponding value in *value and return OK. - // - // If there is no entry for "key" leave *value unchanged and return - // a status for which Status::IsNotFound() returns true. - // - // May return some other Status on an error. - virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) = 0; - - // Return a heap-allocated iterator over the contents of the database. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - // - // Caller should delete the iterator when it is no longer needed. - // The returned iterator should be deleted before this db is deleted. - virtual Iterator* NewIterator(const ReadOptions& options) = 0; - - // Return a handle to the current DB state. Iterators created with - // this handle will all observe a stable snapshot of the current DB - // state. The caller must call ReleaseSnapshot(result) when the - // snapshot is no longer needed. - virtual const Snapshot* GetSnapshot() = 0; - - // Release a previously acquired snapshot. The caller must not - // use "snapshot" after this call. - virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; - - // DB implementations can export properties about their state - // via this method. If "property" is a valid property understood by this - // DB implementation, fills "*value" with its current value and returns - // true. Otherwise returns false. - // - // - // Valid property names include: - // - // "leveldb.num-files-at-level" - return the number of files at level , - // where is an ASCII representation of a level number (e.g. "0"). - // "leveldb.stats" - returns a multi-line string that describes statistics - // about the internal operation of the DB. - virtual bool GetProperty(const Slice& property, std::string* value) = 0; - - // For each i in [0,n-1], store in "sizes[i]", the approximate - // file system space used by keys in "[range[i].start .. range[i].limit)". - // - // Note that the returned sizes measure file system space usage, so - // if the user data compresses by a factor of ten, the returned - // sizes will be one-tenth the size of the corresponding user data size. - // - // The results may not include the sizes of recently written data. - virtual void GetApproximateSizes(const Range* range, int n, - uint64_t* sizes) = 0; - - // Possible extensions: - // (1) Add a method to compact a range of keys - - private: - // No copying allowed - DB(const DB&); - void operator=(const DB&); -}; - -// Destroy the contents of the specified database. -// Be very careful using this method. -Status DestroyDB(const std::string& name, const Options& options); - -// If a DB cannot be opened, you may attempt to call this method to -// resurrect as much of the contents of the database as possible. -// Some data may be lost, so be careful when calling this function -// on a database that contains important information. -Status RepairDB(const std::string& dbname, const Options& options); - -} - -#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/leveldb/include/leveldb/env.h b/leveldb/include/leveldb/env.h deleted file mode 100644 index 4b6e712..0000000 --- a/leveldb/include/leveldb/env.h +++ /dev/null @@ -1,290 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// An Env is an interface used by the leveldb implementation to access -// operating system functionality like the filesystem etc. Callers -// may wish to provide a custom Env object when opening a database to -// get fine gain control; e.g., to rate limit file system operations. - -#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ -#define STORAGE_LEVELDB_INCLUDE_ENV_H_ - -#include -#include -#include -#include -#include "leveldb/status.h" - -namespace leveldb { - -class FileLock; -class RandomAccessFile; -class SequentialFile; -class Slice; -class WritableFile; - -class Env { - public: - Env() { } - virtual ~Env(); - - // Return a default environment suitable for the current operating - // system. Sophisticated users may wish to provide their own Env - // implementation instead of relying on this default environment. - // - // The result of Default() belongs to leveldb and must never be deleted. - static Env* Default(); - - // Create a brand new sequentially-readable file with the specified name. - // On success, stores a pointer to the new file in *result and returns OK. - // On failure stores NULL in *result and returns non-OK. If the file does - // not exist, returns a non-OK status. - // - // The returned file will only be accessed by one thread at a time. - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) = 0; - - // Create a brand new random access read-only file with the - // specified name. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores NULL in *result and - // returns non-OK. If the file does not exist, returns a non-OK - // status. - // - // The returned file may be concurrently accessed by multiple threads. - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) = 0; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores NULL in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) = 0; - - // Returns true iff the named file exists. - virtual bool FileExists(const std::string& fname) = 0; - - // Store in *result the names of the children of the specified directory. - // The names are relative to "dir". - // Original contents of *results are dropped. - virtual Status GetChildren(const std::string& dir, - std::vector* result) = 0; - - // Delete the named file. - virtual Status DeleteFile(const std::string& fname) = 0; - - // Create the specified directory. - virtual Status CreateDir(const std::string& dirname) = 0; - - // Delete the specified directory. - virtual Status DeleteDir(const std::string& dirname) = 0; - - // Store the size of fname in *file_size. - virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; - - // Rename file src to target. - virtual Status RenameFile(const std::string& src, - const std::string& target) = 0; - - // Lock the specified file. Used to prevent concurrent access to - // the same db by multiple processes. On failure, stores NULL in - // *lock and returns non-OK. - // - // On success, stores a pointer to the object that represents the - // acquired lock in *lock and returns OK. The caller should call - // UnlockFile(*lock) to release the lock. If the process exits, - // the lock will be automatically released. - // - // If somebody else already holds the lock, finishes immediately - // with a failure. I.e., this call does not wait for existing locks - // to go away. - // - // May create the named file if it does not already exist. - virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; - - // Release the lock acquired by a previous successful call to LockFile. - // REQUIRES: lock was returned by a successful LockFile() call - // REQUIRES: lock has not already been unlocked. - virtual Status UnlockFile(FileLock* lock) = 0; - - // Arrange to run "(*function)(arg)" once in a background thread. - // - // "function" may run in an unspecified thread. Multiple functions - // added to the same Env may run concurrently in different threads. - // I.e., the caller may not assume that background work items are - // serialized. - virtual void Schedule( - void (*function)(void* arg), - void* arg) = 0; - - // Start a new thread, invoking "function(arg)" within the new thread. - // When "function(arg)" returns, the thread will be destroyed. - virtual void StartThread(void (*function)(void* arg), void* arg) = 0; - - // *path is set to a temporary directory that can be used for testing. It may - // or many not have just been created. The directory may or may not differ - // between runs of the same process, but subsequent calls will return the - // same directory. - virtual Status GetTestDirectory(std::string* path) = 0; - - // Write an entry to the log file with the specified format. - virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0; - - // Returns the number of micro-seconds since some fixed point in time. Only - // useful for computing deltas of time. - virtual uint64_t NowMicros() = 0; - - // Sleep/delay the thread for the perscribed number of micro-seconds. - virtual void SleepForMicroseconds(int micros) = 0; - - private: - // No copying allowed - Env(const Env&); - void operator=(const Env&); -}; - -// A file abstraction for reading sequentially through a file -class SequentialFile { - public: - SequentialFile() { } - virtual ~SequentialFile(); - - // Read up to "n" bytes from the file. "scratch[0..n-1]" may be - // written by this routine. Sets "*result" to the data that was - // read (including if fewer than "n" bytes were successfully read). - // If an error was encountered, returns a non-OK status. - // - // REQUIRES: External synchronization - virtual Status Read(size_t n, Slice* result, char* scratch) = 0; -}; - -// A file abstraction for randomly reading the contents of a file. -class RandomAccessFile { - public: - RandomAccessFile() { } - virtual ~RandomAccessFile(); - - // Read up to "n" bytes from the file starting at "offset". - // "scratch[0..n-1]" may be written by this routine. Sets "*result" - // to the data that was read (including if fewer than "n" bytes were - // successfully read). If an error was encountered, returns a - // non-OK status. - // - // Safe for concurrent use by multiple threads. - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const = 0; -}; - -// A file abstraction for sequential writing. The implementation -// must provide buffering since callers may append small fragments -// at a time to the file. -class WritableFile { - public: - WritableFile() { } - virtual ~WritableFile(); - - virtual Status Append(const Slice& data) = 0; - virtual Status Close() = 0; - virtual Status Flush() = 0; - virtual Status Sync() = 0; - - private: - // No copying allowed - WritableFile(const WritableFile&); - void operator=(const WritableFile&); -}; - -// Identifies a locked file. -class FileLock { - public: - FileLock() { } - virtual ~FileLock(); - private: - // No copying allowed - FileLock(const FileLock&); - void operator=(const FileLock&); -}; - -// Log the specified data to *info_log if info_log is non-NULL. -extern void Log(Env* env, WritableFile* info_log, const char* format, ...) -# if defined(__GNUC__) || defined(__clang__) - __attribute__((__format__ (__printf__, 3, 4))) -# endif - ; - -// A utility routine: write "data" to the named file. -extern Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname); - -// A utility routine: read contents of named file into *data -extern Status ReadFileToString(Env* env, const std::string& fname, - std::string* data); - -// An implementation of Env that forwards all calls to another Env. -// May be useful to clients who wish to override just part of the -// functionality of another Env. -class EnvWrapper : public Env { - public: - // Initialize an EnvWrapper that delegates all calls to *target - explicit EnvWrapper(Env* target) : target_(target) { } - virtual ~EnvWrapper(); - - // Return the target to which this Env forwards all calls - Env* target() const { return target_; } - - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, SequentialFile** r) { - return target_->NewSequentialFile(f, r); - } - Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { - return target_->NewRandomAccessFile(f, r); - } - Status NewWritableFile(const std::string& f, WritableFile** r) { - return target_->NewWritableFile(f, r); - } - bool FileExists(const std::string& f) { return target_->FileExists(f); } - Status GetChildren(const std::string& dir, std::vector* r) { - return target_->GetChildren(dir, r); - } - Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } - Status CreateDir(const std::string& d) { return target_->CreateDir(d); } - Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } - Status GetFileSize(const std::string& f, uint64_t* s) { - return target_->GetFileSize(f, s); - } - Status RenameFile(const std::string& s, const std::string& t) { - return target_->RenameFile(s, t); - } - Status LockFile(const std::string& f, FileLock** l) { - return target_->LockFile(f, l); - } - Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } - void Schedule(void (*f)(void*), void* a) { - return target_->Schedule(f, a); - } - void StartThread(void (*f)(void*), void* a) { - return target_->StartThread(f, a); - } - virtual Status GetTestDirectory(std::string* path) { - return target_->GetTestDirectory(path); - } - virtual void Logv(WritableFile* log, const char* format, va_list ap) { - return target_->Logv(log, format, ap); - } - uint64_t NowMicros() { - return target_->NowMicros(); - } - void SleepForMicroseconds(int micros) { - target_->SleepForMicroseconds(micros); - } - private: - Env* target_; -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/leveldb/include/leveldb/iterator.h b/leveldb/include/leveldb/iterator.h deleted file mode 100644 index 1866fb5..0000000 --- a/leveldb/include/leveldb/iterator.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// An iterator yields a sequence of key/value pairs from a source. -// The following class defines the interface. Multiple implementations -// are provided by this library. In particular, iterators are provided -// to access the contents of a Table or a DB. - -#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ -#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ - -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class Iterator { - public: - Iterator(); - virtual ~Iterator(); - - // An iterator is either positioned at a key/value pair, or - // not valid. This method returns true iff the iterator is valid. - virtual bool Valid() const = 0; - - // Position at the first key in the source. The iterator is Valid() - // after this call iff the source is not empty. - virtual void SeekToFirst() = 0; - - // Position at the last key in the source. The iterator is - // Valid() after this call iff the source is not empty. - virtual void SeekToLast() = 0; - - // Position at the first key in the source that at or past target - // The iterator is Valid() after this call iff the source contains - // an entry that comes at or past target. - virtual void Seek(const Slice& target) = 0; - - // Moves to the next entry in the source. After this call, Valid() is - // true iff the iterator was not positioned at the last entry in the source. - // REQUIRES: Valid() - virtual void Next() = 0; - - // Moves to the previous entry in the source. After this call, Valid() is - // true iff the iterator was not positioned at the first entry in source. - // REQUIRES: Valid() - virtual void Prev() = 0; - - // Return the key for the current entry. The underlying storage for - // the returned slice is valid only until the next modification of - // the iterator. - // REQUIRES: Valid() - virtual Slice key() const = 0; - - // Return the value for the current entry. The underlying storage for - // the returned slice is valid only until the next modification of - // the iterator. - // REQUIRES: !AtEnd() && !AtStart() - virtual Slice value() const = 0; - - // If an error has occurred, return it. Else return an ok status. - virtual Status status() const = 0; - - // Clients are allowed to register function/arg1/arg2 triples that - // will be invoked when this iterator is destroyed. - // - // Note that unlike all of the preceding methods, this method is - // not abstract and therefore clients should not override it. - typedef void (*CleanupFunction)(void* arg1, void* arg2); - void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); - - private: - struct Cleanup { - CleanupFunction function; - void* arg1; - void* arg2; - Cleanup* next; - }; - Cleanup cleanup_; - - // No copying allowed - Iterator(const Iterator&); - void operator=(const Iterator&); -}; - -// Return an empty iterator (yields nothing). -extern Iterator* NewEmptyIterator(); - -// Return an empty iterator with the specified status. -extern Iterator* NewErrorIterator(const Status& status); - -} - -#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/leveldb/include/leveldb/options.h b/leveldb/include/leveldb/options.h deleted file mode 100644 index a94651f..0000000 --- a/leveldb/include/leveldb/options.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ -#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ - -#include - -namespace leveldb { - -class Cache; -class Comparator; -class Env; -class Snapshot; -class WritableFile; - -// DB contents are stored in a set of blocks, each of which holds a -// sequence of key,value pairs. Each block may be compressed before -// being stored in a file. The following enum describes which -// compression method (if any) is used to compress a block. -enum CompressionType { - // NOTE: do not change the values of existing entries, as these are - // part of the persistent format on disk. - kNoCompression = 0x0, - kSnappyCompression = 0x1, -}; - -// Options to control the behavior of a database (passed to DB::Open) -struct Options { - // ------------------- - // Parameters that affect behavior - - // Comparator used to define the order of keys in the table. - // Default: a comparator that uses lexicographic byte-wise ordering - // - // REQUIRES: The client must ensure that the comparator supplied - // here has the same name and orders keys *exactly* the same as the - // comparator provided to previous open calls on the same DB. - const Comparator* comparator; - - // If true, the database will be created if it is missing. - // Default: false - bool create_if_missing; - - // If true, an error is raised if the database already exists. - // Default: false - bool error_if_exists; - - // If true, the implementation will do aggressive checking of the - // data it is processing and will stop early if it detects any - // errors. This may have unforeseen ramifications: for example, a - // corruption of one DB entry may cause a large number of entries to - // become unreadable or for the entire DB to become unopenable. - // Default: false - bool paranoid_checks; - - // Use the specified object to interact with the environment, - // e.g. to read/write files, schedule background work, etc. - // Default: Env::Default() - Env* env; - - // Any internal progress/error information generated by the db will - // be to written to info_log if it is non-NULL, or to a file stored - // in the same directory as the DB contents if info_log is NULL. - // Default: NULL - WritableFile* info_log; - - // ------------------- - // Parameters that affect performance - - // Amount of data to build up in memory (backed by an unsorted log - // on disk) before converting to a sorted on-disk file. - // - // Larger values increase performance, especially during bulk loads. - // Up to two write buffers may be held in memory at the same time, - // so you may wish to adjust this parameter to control memory usage. - // - // Default: 4MB - size_t write_buffer_size; - - // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set (budget - // one open file per 2MB of working set). - // - // Default: 1000 - int max_open_files; - - // Control over blocks (user data is stored in a set of blocks, and - // a block is the unit of reading from disk). - - // If non-NULL, use the specified cache for blocks. - // If NULL, leveldb will automatically create and use an 8MB internal cache. - // Default: NULL - Cache* block_cache; - - // Approximate size of user data packed per block. Note that the - // block size specified here corresponds to uncompressed data. The - // actual size of the unit read from disk may be smaller if - // compression is enabled. This parameter can be changed dynamically. - // - // Default: 4K - size_t block_size; - - // Number of keys between restart points for delta encoding of keys. - // This parameter can be changed dynamically. Most clients should - // leave this parameter alone. - // - // Default: 16 - int block_restart_interval; - - // Compress blocks using the specified compression algorithm. This - // parameter can be changed dynamically. - // - // Default: kSnappyCompression, which gives lightweight but fast - // compression. - // - // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: - // ~200-500MB/s compression - // ~400-800MB/s decompression - // Note that these speeds are significantly faster than most - // persistent storage speeds, and therefore it is typically never - // worth switching to kNoCompression. Even if the input data is - // incompressible, the kSnappyCompression implementation will - // efficiently detect that and will switch to uncompressed mode. - CompressionType compression; - - // Create an Options object with default values for all fields. - Options(); -}; - -// Options that control read operations -struct ReadOptions { - // If true, all data read from underlying storage will be - // verified against corresponding checksums. - // Default: false - bool verify_checksums; - - // Should the data read for this iteration be cached in memory? - // Callers may wish to set this field to false for bulk scans. - // Default: true - bool fill_cache; - - // If "snapshot" is non-NULL, read as of the supplied snapshot - // (which must belong to the DB that is being read and which must - // not have been released). If "snapshot" is NULL, use an impliicit - // snapshot of the state at the beginning of this read operation. - // Default: NULL - const Snapshot* snapshot; - - ReadOptions() - : verify_checksums(false), - fill_cache(true), - snapshot(NULL) { - } -}; - -// Options that control write operations -struct WriteOptions { - // If true, the write will be flushed from the operating system - // buffer cache (by calling WritableFile::Sync()) before the write - // is considered complete. If this flag is true, writes will be - // slower. - // - // If this flag is false, and the machine crashes, some recent - // writes may be lost. Note that if it is just the process that - // crashes (i.e., the machine does not reboot), no writes will be - // lost even if sync==false. - // - // In other words, a DB write with sync==false has similar - // crash semantics as the "write()" system call. A DB write - // with sync==true has similar crash semantics to a "write()" - // system call followed by "fsync()". - // - // Default: false - bool sync; - - // If "post_write_snapshot" is non-NULL, and the write succeeds, - // *post_write_snapshot will be modified to point to a snapshot of - // the DB state immediately after this write. The caller must call - // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the - // snapshot is no longer needed. - // - // If "post_write_snapshot" is non-NULL, and the write fails, - // *post_write_snapshot will be set to NULL. - // - // Default: NULL - const Snapshot** post_write_snapshot; - - WriteOptions() - : sync(false), - post_write_snapshot(NULL) { - } -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/leveldb/include/leveldb/slice.h b/leveldb/include/leveldb/slice.h deleted file mode 100644 index 62cb894..0000000 --- a/leveldb/include/leveldb/slice.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Slice is a simple structure containing a pointer into some external -// storage and a size. The user of a Slice must ensure that the slice -// is not used after the corresponding external storage has been -// deallocated. - -#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ -#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ - -#include -#include -#include -#include - -namespace leveldb { - -class Slice { - public: - // Create an empty slice. - Slice() : data_(""), size_(0) { } - - // Create a slice that refers to data[0,n-1]. - Slice(const char* data, size_t n) : data_(data), size_(n) { } - - // Create a slice that refers to the contents of "s" - Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } - - // Create a slice that refers to s[0,strlen(s)-1] - Slice(const char* s) : data_(s), size_(strlen(s)) { } - - // Return a pointer to the beginning of the referenced data - const char* data() const { return data_; } - - // Return the length (in bytes) of the referenced data - size_t size() const { return size_; } - - // Return true iff the length of the referenced data is zero - bool empty() const { return size_ == 0; } - - // Return the ith byte in the referenced data. - // REQUIRES: n < size() - char operator[](size_t n) const { - assert(n < size()); - return data_[n]; - } - - // Change this slice to refer to an empty array - void clear() { data_ = ""; size_ = 0; } - - // Drop the first "n" bytes from this slice. - void remove_prefix(size_t n) { - assert(n <= size()); - data_ += n; - size_ -= n; - } - - // Return a string that contains the copy of the referenced data. - std::string ToString() const { return std::string(data_, size_); } - - // Three-way comparison. Returns value: - // < 0 iff "*this" < "b", - // == 0 iff "*this" == "b", - // > 0 iff "*this" > "b" - int compare(const Slice& b) const; - - // Return true iff "x" is a prefix of "*this" - bool starts_with(const Slice& x) const { - return ((size_ >= x.size_) && - (memcmp(data_, x.data_, x.size_) == 0)); - } - - private: - const char* data_; - size_t size_; - - // Intentionally copyable -}; - -inline bool operator==(const Slice& x, const Slice& y) { - return ((x.size() == y.size()) && - (memcmp(x.data(), y.data(), x.size()) == 0)); -} - -inline bool operator!=(const Slice& x, const Slice& y) { - return !(x == y); -} - -inline int Slice::compare(const Slice& b) const { - const int min_len = (size_ < b.size_) ? size_ : b.size_; - int r = memcmp(data_, b.data_, min_len); - if (r == 0) { - if (size_ < b.size_) r = -1; - else if (size_ > b.size_) r = +1; - } - return r; -} - -} - - -#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/leveldb/include/leveldb/status.h b/leveldb/include/leveldb/status.h deleted file mode 100644 index 47e3edf..0000000 --- a/leveldb/include/leveldb/status.h +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Status encapsulates the result of an operation. It may indicate success, -// or it may indicate an error with an associated error message. - -#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ -#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ - -#include -#include -#include "leveldb/slice.h" - -namespace leveldb { - -class Status { - public: - // Create a success status. - Status() : state_(NULL) { } - ~Status() { delete state_; } - - // Copy the specified status. - Status(const Status& s); - void operator=(const Status& s); - - // Return a success status. - static Status OK() { return Status(); } - - // Return error status of an appropriate type. - static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotFound, msg, Slice()); - } - static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kCorruption, msg, msg2); - } - static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotSupported, msg, msg2); - } - static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kInvalidArgument, msg, msg2); - } - static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kIOError, msg, msg2); - } - - // Returns true iff the status indicates success. - bool ok() const { return (state_ == NULL); } - - // Returns true iff the status indicates a NotFound error. - bool IsNotFound() const { return code() == kNotFound; } - - // Return a string representation of this status suitable for printing. - // Returns the string "OK" for success. - std::string ToString() const; - - private: - enum Code { - kOk = 0, - kNotFound = 1, - kCorruption = 2, - kNotSupported = 3, - kInvalidArgument = 4, - kIOError = 5, - }; - Code code() const { return (state_ == NULL) ? kOk : state_->first; } - - Status(Code code, const Slice& msg, const Slice& msg2); - - typedef std::pair State; - State* state_; -}; - -inline Status::Status(const Status& s) { - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); -} -inline void Status::operator=(const Status& s) { - if (this != &s) { - delete state_; - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); - } -} - -} - -#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/leveldb/include/leveldb/table.h b/leveldb/include/leveldb/table.h deleted file mode 100644 index bd99176..0000000 --- a/leveldb/include/leveldb/table.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ -#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ - -#include -#include "leveldb/iterator.h" - -namespace leveldb { - -class Block; -class BlockHandle; -struct Options; -class RandomAccessFile; -struct ReadOptions; - -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. -class Table { - public: - // Attempt to open the table that is stored in bytes [0..file_size) - // of "file", and read the metadata entries necessary to allow - // retrieving data from the table. - // - // If successful, returns ok and sets "*table" to the newly opened - // table. The client should delete "*table" when no longer needed. - // If there was an error while initializing the table, sets "*table" - // to NULL and returns a non-ok status. Does not take ownership of - // "*source", but the client must ensure that "source" remains live - // for the duration of the returned table's lifetime. - // - // *file must remain live while this Table is in use. - static Status Open(const Options& options, - RandomAccessFile* file, - uint64_t file_size, - Table** table); - - ~Table(); - - // Returns a new iterator over the table contents. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - Iterator* NewIterator(const ReadOptions&) const; - - // Given a key, return an approximate byte offset in the file where - // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file - // bytes, and so includes effects like compression of the underlying data. - // E.g., the approximate offset of the last key in the table will - // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key) const; - - private: - struct Rep; - Rep* rep_; - - explicit Table(Rep* rep) { rep_ = rep; } - static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); - - // No copying allowed - Table(const Table&); - void operator=(const Table&); -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/leveldb/include/leveldb/table_builder.h b/leveldb/include/leveldb/table_builder.h deleted file mode 100644 index 49d2d51..0000000 --- a/leveldb/include/leveldb/table_builder.h +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). - -#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ -#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ - -#include -#include "leveldb/options.h" -#include "leveldb/status.h" - -namespace leveldb { - -class BlockBuilder; -class BlockHandle; -class WritableFile; - -class TableBuilder { - public: - // Create a builder that will store the contents of the table it is - // building in *file. Does not close the file. It is up to the - // caller to close the file after calling Finish(). - TableBuilder(const Options& options, WritableFile* file); - - // REQUIRES: Either Finish() or Abandon() has been called. - ~TableBuilder(); - - // Change the options used by this builder. Note: only some of the - // option fields can be changed after construction. If a field is - // not allowed to change dynamically and its value in the structure - // passed to the constructor is different from its value in the - // structure passed to this method, this method will return an error - // without changing any fields. - Status ChangeOptions(const Options& options); - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value); - - // Advanced operation: flush any buffered key/value pairs to file. - // Can be used to ensure that two adjacent entries never live in - // the same data block. Most clients should not need to use this method. - // REQUIRES: Finish(), Abandon() have not been called - void Flush(); - - // Return non-ok iff some error has been detected. - Status status() const; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns. - // REQUIRES: Finish(), Abandon() have not been called - Status Finish(); - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon(); - - // Number of calls to Add() so far. - uint64_t NumEntries() const; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const; - - private: - bool ok() const { return status().ok(); } - void WriteBlock(BlockBuilder* block, BlockHandle* handle); - - struct Rep; - Rep* rep_; - - // No copying allowed - TableBuilder(const TableBuilder&); - void operator=(const TableBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/leveldb/include/leveldb/write_batch.h b/leveldb/include/leveldb/write_batch.h deleted file mode 100644 index 3411952..0000000 --- a/leveldb/include/leveldb/write_batch.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBatch holds a collection of updates to apply atomically to a DB. -// -// The updates are applied in the order in which they are added -// to the WriteBatch. For example, the value of "key" will be "v3" -// after the following batch is written: -// -// batch.Put("key", "v1"); -// batch.Delete("key"); -// batch.Put("key", "v2"); -// batch.Put("key", "v3"); - -#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ - -#include - -namespace leveldb { - -class Slice; - -class WriteBatch { - public: - WriteBatch(); - ~WriteBatch(); - - // Store the mapping "key->value" in the database. - void Put(const Slice& key, const Slice& value); - - // If the database contains a mapping for "key", erase it. Else do nothing. - void Delete(const Slice& key); - - // Clear all updates buffered in this batch. - void Clear(); - - private: - friend class WriteBatchInternal; - - std::string rep_; // See comment in write_batch.cc for the format of rep_ - - // Intentionally copyable -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb/leveldb.gyp b/leveldb/leveldb.gyp deleted file mode 100644 index 20d1b1d..0000000 --- a/leveldb/leveldb.gyp +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -{ - 'variables': { - 'use_snappy%': 0, - }, - 'target_defaults': { - 'defines': [ - 'LEVELDB_PLATFORM_CHROMIUM=1', - ], - 'include_dirs': [ - '.', - 'include/', - ], - 'conditions': [ - ['OS == "win"', { - 'include_dirs': [ - 'port/win', - ], - }], - ['use_snappy', { - 'defines': [ - 'USE_SNAPPY=1', - ], - }], - ], - }, - 'targets': [ - { - 'target_name': 'leveldb', - 'type': '<(library)', - 'dependencies': [ - # The base libary is a lightweight abstraction layer for things like - # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ - '../../base/base.gyp:base', - ], - 'conditions': [ - ['use_snappy', { - 'dependencies': [ - '../../third_party/snappy/snappy.gyp:snappy', - ], - }], - ], - 'direct_dependent_settings': { - 'include_dirs': [ - 'include/', - ], - }, - 'sources': [ - # Include and then exclude so that all files show up in IDEs, even if - # they don't build. - 'db/builder.cc', - 'db/builder.h', - 'db/db_impl.cc', - 'db/db_impl.h', - 'db/db_iter.cc', - 'db/db_iter.h', - 'db/filename.cc', - 'db/filename.h', - 'db/dbformat.cc', - 'db/dbformat.h', - 'db/log_format.h', - 'db/log_reader.cc', - 'db/log_reader.h', - 'db/log_writer.cc', - 'db/log_writer.h', - 'db/memtable.cc', - 'db/memtable.h', - 'db/repair.cc', - 'db/skiplist.h', - 'db/snapshot.h', - 'db/table_cache.cc', - 'db/table_cache.h', - 'db/version_edit.cc', - 'db/version_edit.h', - 'db/version_set.cc', - 'db/version_set.h', - 'db/write_batch.cc', - 'db/write_batch_internal.h', - 'include/leveldb/cache.h', - 'include/leveldb/comparator.h', - 'include/leveldb/db.h', - 'include/leveldb/env.h', - 'include/leveldb/iterator.h', - 'include/leveldb/options.h', - 'include/leveldb/slice.h', - 'include/leveldb/status.h', - 'include/leveldb/table.h', - 'include/leveldb/table_builder.h', - 'include/leveldb/write_batch.h', - 'port/port.h', - 'port/port_chromium.cc', - 'port/port_chromium.h', - 'port/port_example.h', - 'port/port_posix.cc', - 'port/port_posix.h', - 'table/block.cc', - 'table/block.h', - 'table/block_builder.cc', - 'table/block_builder.h', - 'table/format.cc', - 'table/format.h', - 'table/iterator.cc', - 'table/iterator_wrapper.h', - 'table/merger.cc', - 'table/merger.h', - 'table/table.cc', - 'table/table_builder.cc', - 'table/two_level_iterator.cc', - 'table/two_level_iterator.h', - 'util/arena.cc', - 'util/arena.h', - 'util/cache.cc', - 'util/coding.cc', - 'util/coding.h', - 'util/comparator.cc', - 'util/crc32c.cc', - 'util/crc32c.h', - 'util/env.cc', - 'util/env_chromium.cc', - 'util/env_posix.cc', - 'util/hash.cc', - 'util/hash.h', - 'util/logging.cc', - 'util/logging.h', - 'util/mutexlock.h', - 'util/options.cc', - 'util/random.h', - 'util/status.cc', - ], - 'sources/': [ - ['exclude', '_(android|example|portable|posix)\\.cc$'], - ], - }, - { - 'target_name': 'leveldb_testutil', - 'type': '<(library)', - 'dependencies': [ - '../../base/base.gyp:base', - 'leveldb', - ], - 'export_dependent_settings': [ - # The tests use include directories from these projects. - '../../base/base.gyp:base', - 'leveldb', - ], - 'sources': [ - 'util/histogram.cc', - 'util/histogram.h', - 'util/testharness.cc', - 'util/testharness.h', - 'util/testutil.cc', - 'util/testutil.h', - ], - }, - { - 'target_name': 'leveldb_arena_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/arena_test.cc', - ], - }, - { - 'target_name': 'leveldb_cache_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/cache_test.cc', - ], - }, - { - 'target_name': 'leveldb_coding_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/coding_test.cc', - ], - }, - { - 'target_name': 'leveldb_corruption_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/corruption_test.cc', - ], - }, - { - 'target_name': 'leveldb_crc32c_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/crc32c_test.cc', - ], - }, - { - 'target_name': 'leveldb_db_bench', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_bench.cc', - ], - }, - { - 'target_name': 'leveldb_db_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_test.cc', - ], - }, - { - 'target_name': 'leveldb_dbformat_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/dbformat_test.cc', - ], - }, - { - 'target_name': 'leveldb_env_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/env_test.cc', - ], - }, - { - 'target_name': 'leveldb_filename_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/filename_test.cc', - ], - }, - { - 'target_name': 'leveldb_log_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/log_test.cc', - ], - }, - { - 'target_name': 'leveldb_skiplist_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/skiplist_test.cc', - ], - }, - { - 'target_name': 'leveldb_table_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'table/table_test.cc', - ], - }, - { - 'target_name': 'leveldb_version_edit_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/version_edit_test.cc', - ], - }, - { - 'target_name': 'leveldb_write_batch_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/write_batch_test.cc', - ], - }, - ], -} - -# Local Variables: -# tab-width:2 -# indent-tabs-mode:nil -# End: -# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/port/README b/leveldb/port/README deleted file mode 100644 index 422563e..0000000 --- a/leveldb/port/README +++ /dev/null @@ -1,10 +0,0 @@ -This directory contains interfaces and implementations that isolate the -rest of the package from platform details. - -Code in the rest of the package includes "port.h" from this directory. -"port.h" in turn includes a platform specific "port_.h" file -that provides the platform specific implementation. - -See port_posix.h for an example of what must be provided in a platform -specific header file. - diff --git a/leveldb/port/port.h b/leveldb/port/port.h deleted file mode 100644 index 816826b..0000000 --- a/leveldb/port/port.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_PORT_PORT_H_ -#define STORAGE_LEVELDB_PORT_PORT_H_ - -#include - -// Include the appropriate platform specific file below. If you are -// porting to a new platform, see "port_example.h" for documentation -// of what the new port_.h file must provide. -#if defined(LEVELDB_PLATFORM_POSIX) -# include "port/port_posix.h" -#elif defined(LEVELDB_PLATFORM_CHROMIUM) -# include "port/port_chromium.h" -#elif defined(LEVELDB_PLATFORM_ANDROID) -# include "port/port_android.h" -#endif - -#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/leveldb/port/port_android.cc b/leveldb/port/port_android.cc deleted file mode 100644 index 240e9ca..0000000 --- a/leveldb/port/port_android.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_android.h" - -#include - -extern "C" { -size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { - return fread(a, b, c, d); -} - -size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { - return fwrite(a, b, c, d); -} - -int fflush_unlocked(FILE *f) { - return fflush(f); -} - -int fdatasync(int fd) { - return fsync(fd); -} -} - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { - PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); -} - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal(){ - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/leveldb/port/port_android.h b/leveldb/port/port_android.h deleted file mode 100644 index 13df9c9..0000000 --- a/leveldb/port/port_android.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ -#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ - -#include -#include -#include -#include -#include -#include - -// Collapse the plethora of ARM flavors available to an easier to manage set -// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto -#if defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7A__) -#define ARMV6_OR_7 1 -#endif - -extern "C" { - size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); - size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); - int fflush_unlocked(FILE *f); - int fdatasync (int fd); -} - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { - //TODO(gabor): How can I implement this? - } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - Mutex* mu_; - pthread_cond_t cv_; -}; - -#ifndef ARMV6_OR_7 -// On ARM chipsets = V6 -#ifdef ARMV6_OR_7 - __asm__ __volatile__("dmb" : : : "memory"); -#else - pLinuxKernelMemoryBarrier(); -#endif - } - - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - void* r = rep_; - MemoryBarrier(); - return r; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } - inline void* NoBarrier_Load() const { - void* r = rep_; - return r; - } - inline void NoBarrier_Store(void* v) { - rep_ = v; - } -}; - -// TODO(gabor): Implement compress -inline bool Snappy_Compress( - const char* input, - size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement uncompress -inline bool Snappy_Uncompress( - const char* input_data, - size_t input_length, - std::string* output) { - return false; -} - -inline uint64_t ThreadIdentifier() { - pthread_t tid = pthread_self(); - uint64_t r = 0; - memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); - return r; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/leveldb/port/port_chromium.cc b/leveldb/port/port_chromium.cc deleted file mode 100644 index 2ab49b9..0000000 --- a/leveldb/port/port_chromium.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_chromium.h" - -#include "util/logging.h" - -#if defined(USE_SNAPPY) -# include "third_party/snappy/src/snappy.h" -#endif - -namespace leveldb { -namespace port { - -Mutex::Mutex() { -} - -Mutex::~Mutex() { -} - -void Mutex::Lock() { - mu_.Acquire(); -} - -void Mutex::Unlock() { - mu_.Release(); -} - -void Mutex::AssertHeld() { - mu_.AssertAcquired(); -} - -CondVar::CondVar(Mutex* mu) - : cv_(&mu->mu_) { -} - -CondVar::~CondVar() { } - -void CondVar::Wait() { - cv_.Wait(); -} - -void CondVar::Signal(){ - cv_.Signal(); -} - -void CondVar::SignalAll() { - cv_.Broadcast(); -} - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - output->resize(snappy::MaxCompressedLength(input_length)); - size_t outlen; - snappy::RawCompress(input, input_length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#else - return false; -#endif -} - -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - size_t ulength; - if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { - return false; - } - output->resize(ulength); - return snappy::RawUncompress(input_data, input_length, &(*output)[0]); -#else - return false; -#endif -} - -} -} diff --git a/leveldb/port/port_chromium.h b/leveldb/port/port_chromium.h deleted file mode 100644 index 1851e6e..0000000 --- a/leveldb/port/port_chromium.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ -#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ - -#include -#include -#include -#include "base/atomicops.h" -#include "base/basictypes.h" -#include "base/logging.h" -#include "base/synchronization/condition_variable.h" -#include "base/synchronization/lock.h" - -// Linux's ThreadIdentifier() needs this. -#if defined(OS_LINUX) -# include -#endif - -#if defined(OS_WIN) -#define snprintf _snprintf -#define va_copy(a, b) do { (a) = (b); } while (0) -#endif - -namespace leveldb { -namespace port { - -// Chromium only supports little endian. -static const bool kLittleEndian = true; - -class Mutex { - public: - Mutex(); - ~Mutex(); - void Lock(); - void Unlock(); - void AssertHeld(); - - private: - base::Lock mu_; - - friend class CondVar; - DISALLOW_COPY_AND_ASSIGN(Mutex); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - - private: - base::ConditionVariable cv_; - - DISALLOW_COPY_AND_ASSIGN(CondVar); -}; - -class AtomicPointer { - private: - typedef base::subtle::AtomicWord Rep; - Rep rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(reinterpret_cast(p)) {} - inline void* Acquire_Load() const { - return reinterpret_cast(::base::subtle::Acquire_Load(&rep_)); - } - inline void Release_Store(void* v) { - ::base::subtle::Release_Store(&rep_, reinterpret_cast(v)); - } - inline void* NoBarrier_Load() const { - return reinterpret_cast(::base::subtle::NoBarrier_Load(&rep_)); - } - inline void NoBarrier_Store(void* v) { - ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast(v)); - } -}; - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/leveldb/port/port_example.h b/leveldb/port/port_example.h deleted file mode 100644 index 8a624f3..0000000 --- a/leveldb/port/port_example.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// This file contains the specification, but not the implementations, -// of the types/operations/etc. that should be defined by a platform -// specific port_.h file. Use this file as a reference for -// how to port this package to a new platform. - -#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ - -namespace leveldb { -namespace port { - -// TODO(jorlow): Many of these belong more in the environment class rather than -// here. We should try moving them and see if it affects perf. - -// The following boolean constant must be true on a little-endian machine -// and false otherwise. -static const bool kLittleEndian = true /* or some other expression */; - -// ------------------ Threading ------------------- - -// A Mutex represents an exclusive lock. -class Mutex { - public: - Mutex(); - ~Mutex(); - - // Lock the mutex. Waits until other lockers have exited. - // Will deadlock if the mutex is already locked by this thread. - void Lock(); - - // Unlock the mutex. - // REQUIRES: This mutex was locked by this thread. - void Unlock(); - - // Optionally crash if this thread does not hold this mutex. - // The implementation must be fast, especially if NDEBUG is - // defined. The implementation is allowed to skip all checks. - void AssertHeld(); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - - // Atomically release *mu and block on this condition variable until - // either a call to SignalAll(), or a call to Signal() that picks - // this thread to wakeup. - // REQUIRES: this thread holds *mu - void Wait(); - - // If there are some threads waiting, wake up at least one of them. - void Signal(); - - // Wake up all waiting threads. - void SignallAll(); -}; - -// A type that holds a pointer that can be read or written atomically -// (i.e., without word-tearing.) -class AtomicPointer { - private: - intptr_t rep_; - public: - // Initialize to arbitrary value - AtomicPointer(); - - // Initialize to hold v - explicit AtomicPointer(void* v) : rep_(v) { } - - // Read and return the stored pointer with the guarantee that no - // later memory access (read or write) by this thread can be - // reordered ahead of this read. - void* Acquire_Load() const; - - // Set v as the stored pointer with the guarantee that no earlier - // memory access (read or write) by this thread can be reordered - // after this store. - void Release_Store(void* v); - - // Read the stored pointer with no ordering guarantees. - void* NoBarrier_Load() const; - - // Set va as the stored pointer with no ordering guarantees. - void NoBarrier_Store(void* v); -}; - -// ------------------ Compression ------------------- - -// Store the snappy compression of "input[0,input_length-1]" in *output. -// Returns false if snappy is not supported by this port. -extern bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); - -// Attempt to snappy uncompress input[0,input_length-1] into *output. -// Returns true if successful, false if the input is invalid lightweight -// compressed data. -extern bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -// ------------------ Miscellaneous ------------------- - -// If heap profiling is not supported, returns false. -// Else repeatedly calls (*func)(arg, data, n) and then returns true. -// The concatenation of all "data[0,n-1]" fragments is the heap profile. -extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/leveldb/port/port_posix.cc b/leveldb/port/port_posix.cc deleted file mode 100644 index e75da8b..0000000 --- a/leveldb/port/port_posix.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_posix.h" - -#include -#include -#include -#include "util/logging.h" - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } - -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } - -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } - -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/leveldb/port/port_posix.h b/leveldb/port/port_posix.h deleted file mode 100644 index c158db1..0000000 --- a/leveldb/port/port_posix.h +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ - -#include -#include -#include -#include -#include -#include - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - pthread_cond_t cv_; - Mutex* mu_; -}; - -// Storage for a lock-free pointer -class AtomicPointer { - private: - std::atomic rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -// TODO(gabor): Implement actual compress -inline bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement actual uncompress -inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { - return false; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/leveldb/port/win/stdint.h b/leveldb/port/win/stdint.h deleted file mode 100644 index 39edd0d..0000000 --- a/leveldb/port/win/stdint.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// MSVC didn't ship with this file until the 2010 version. - -#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ -#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ - -#if !defined(_MSC_VER) -#error This file should only be included when compiling with MSVC. -#endif - -// Define C99 equivalent types. -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; -typedef signed long long int64_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ diff --git a/leveldb/table/block.cc b/leveldb/table/block.cc deleted file mode 100644 index 92b2877..0000000 --- a/leveldb/table/block.cc +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Decodes the blocks generated by block_builder.cc. - -#include "table/block.h" - -#include -#include -#include "leveldb/comparator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -inline uint32_t Block::NumRestarts() const { - assert(size_ >= 2*sizeof(uint32_t)); - return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); -} - -Block::Block(const char* data, size_t size) - : data_(data), - size_(size) { - if (size_ < sizeof(uint32_t)) { - size_ = 0; // Error marker - } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; - } - } -} - -Block::~Block() { - delete[] data_; -} - -// Helper routine: decode the next block entry starting at "p", -// storing the number of shared key bytes, non_shared key bytes, -// and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". -// -// If any errors are detected, returns NULL. Otherwise, returns a -// pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return NULL; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; - } - - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return NULL; - } - return p; -} - -class Block::Iter : public Iterator { - private: - const Comparator* const comparator_; - const char* const data_; // underlying block contents - uint32_t const restarts_; // Offset of restart array (list of fixed32) - uint32_t const num_restarts_; // Number of uint32_t entries in restart array - - // current_ is offset in data_ of current entry. >= restarts_ if !Valid - uint32_t current_; - uint32_t restart_index_; // Index of restart block in which current_ falls - std::string key_; - Slice value_; - Status status_; - - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); - } - - // Return the offset in data_ just past the end of the current entry. - inline uint32_t NextEntryOffset() const { - return (value_.data() + value_.size()) - data_; - } - - uint32_t GetRestartPoint(uint32_t index) { - assert(index < num_restarts_); - return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); - } - - void SeekToRestartPoint(uint32_t index) { - key_.clear(); - restart_index_ = index; - // current_ will be fixed by ParseNextKey(); - - // ParseNextKey() starts at the end of value_, so set value_ accordingly - uint32_t offset = GetRestartPoint(index); - value_ = Slice(data_ + offset, 0); - } - - public: - Iter(const Comparator* comparator, - const char* data, - uint32_t restarts, - uint32_t num_restarts) - : comparator_(comparator), - data_(data), - restarts_(restarts), - num_restarts_(num_restarts), - current_(restarts_), - restart_index_(num_restarts_) { - assert(num_restarts_ > 0); - } - - virtual bool Valid() const { return current_ < restarts_; } - virtual Status status() const { return status_; } - virtual Slice key() const { - assert(Valid()); - return key_; - } - virtual Slice value() const { - assert(Valid()); - return value_; - } - - virtual void Next() { - assert(Valid()); - ParseNextKey(); - } - - virtual void Prev() { - assert(Valid()); - - // Scan backwards to a restart point before current_ - const uint32_t original = current_; - while (GetRestartPoint(restart_index_) >= original) { - if (restart_index_ == 0) { - // No more entries - current_ = restarts_; - restart_index_ = num_restarts_; - return; - } - restart_index_--; - } - - SeekToRestartPoint(restart_index_); - do { - // Loop until end of current entry hits the start of original entry - } while (ParseNextKey() && NextEntryOffset() < original); - } - - virtual void Seek(const Slice& target) { - // Binary search in restart array to find the first restart point - // with a key >= target - uint32_t left = 0; - uint32_t right = num_restarts_ - 1; - while (left < right) { - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, - data_ + restarts_, - &shared, &non_shared, &value_length); - if (key_ptr == NULL || (shared != 0)) { - CorruptionError(); - return; - } - Slice mid_key(key_ptr, non_shared); - if (Compare(mid_key, target) < 0) { - // Key at "mid" is smaller than "target". Therefore all - // blocks before "mid" are uninteresting. - left = mid; - } else { - // Key at "mid" is >= "target". Therefore all blocks at or - // after "mid" are uninteresting. - right = mid - 1; - } - } - - // Linear search (within restart block) for first key >= target - SeekToRestartPoint(left); - while (true) { - if (!ParseNextKey()) { - return; - } - if (Compare(key_, target) >= 0) { - return; - } - } - } - - virtual void SeekToFirst() { - SeekToRestartPoint(0); - ParseNextKey(); - } - - virtual void SeekToLast() { - SeekToRestartPoint(num_restarts_ - 1); - while (ParseNextKey() && NextEntryOffset() < restarts_) { - // Keep skipping - } - } - - private: - void CorruptionError() { - current_ = restarts_; - restart_index_ = num_restarts_; - status_ = Status::Corruption("bad entry in block"); - key_.clear(); - value_.clear(); - } - - bool ParseNextKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. - current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); - if (p == NULL || key_.size() < shared) { - CorruptionError(); - return false; - } else { - key_.resize(shared); - key_.append(p, non_shared); - value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - return true; - } - } -}; - -Iterator* Block::NewIterator(const Comparator* cmp) { - if (size_ < 2*sizeof(uint32_t)) { - return NewErrorIterator(Status::Corruption("bad block contents")); - } - const uint32_t num_restarts = NumRestarts(); - if (num_restarts == 0) { - return NewEmptyIterator(); - } else { - return new Iter(cmp, data_, restart_offset_, num_restarts); - } -} - -} diff --git a/leveldb/table/block.h b/leveldb/table/block.h deleted file mode 100644 index cdf0598..0000000 --- a/leveldb/table/block.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ -#define STORAGE_LEVELDB_TABLE_BLOCK_H_ - -#include -#include -#include "leveldb/iterator.h" - -namespace leveldb { - -class Comparator; - -class Block { - public: - // Initialize the block with the specified contents. - // Takes ownership of data[] and will delete[] it when done. - Block(const char* data, size_t size); - - ~Block(); - - size_t size() const { return size_; } - Iterator* NewIterator(const Comparator* comparator); - - private: - uint32_t NumRestarts() const; - - const char* data_; - size_t size_; - uint32_t restart_offset_; // Offset in data_ of restart array - - // No copying allowed - Block(const Block&); - void operator=(const Block&); - - class Iter; -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/leveldb/table/block_builder.cc b/leveldb/table/block_builder.cc deleted file mode 100644 index dc958c8..0000000 --- a/leveldb/table/block_builder.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// BlockBuilder generates blocks where keys are prefix-compressed: -// -// When we store a key, we drop the prefix shared with the previous -// string. This helps reduce the space requirement significantly. -// Furthermore, once every K keys, we do not apply the prefix -// compression and store the entire key. We call this a "restart -// point". The tail end of the block stores the offsets of all of the -// restart points, and can be used to do a binary search when looking -// for a particular key. Values are stored as-is (without compression) -// immediately following the corresponding key. -// -// An entry for a particular key-value pair has the form: -// shared_bytes: varint32 -// unshared_bytes: varint32 -// value_length: varint32 -// key_delta: char[unshared_bytes] -// value: char[value_length] -// shared_bytes == 0 for restart points. -// -// The trailer of the block has the form: -// restarts: uint32[num_restarts] -// num_restarts: uint32 -// restarts[i] contains the offset within the block of the ith restart point. - -#include "table/block_builder.h" - -#include -#include -#include "leveldb/comparator.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" - -namespace leveldb { - -BlockBuilder::BlockBuilder(const Options* options) - : options_(options), - restarts_(), - counter_(0), - finished_(false) { - assert(options->block_restart_interval >= 1); - restarts_.push_back(0); // First restart point is at offset 0 -} - -void BlockBuilder::Reset() { - buffer_.clear(); - restarts_.clear(); - restarts_.push_back(0); // First restart point is at offset 0 - counter_ = 0; - finished_ = false; - last_key_.clear(); -} - -size_t BlockBuilder::CurrentSizeEstimate() const { - return (buffer_.size() + // Raw data buffer - restarts_.size() * sizeof(uint32_t) + // Restart array - sizeof(uint32_t)); // Restart array length -} - -Slice BlockBuilder::Finish() { - // Append restart array - for (size_t i = 0; i < restarts_.size(); i++) { - PutFixed32(&buffer_, restarts_[i]); - } - PutFixed32(&buffer_, restarts_.size()); - finished_ = true; - return Slice(buffer_); -} - -void BlockBuilder::Add(const Slice& key, const Slice& value) { - Slice last_key_piece(last_key_); - assert(!finished_); - assert(counter_ <= options_->block_restart_interval); - assert(buffer_.empty() // No values yet? - || options_->comparator->Compare(key, last_key_piece) > 0); - size_t shared = 0; - if (counter_ < options_->block_restart_interval) { - // See how much sharing to do with previous string - const size_t min_length = std::min(last_key_piece.size(), key.size()); - while ((shared < min_length) && (last_key_[shared] == key[shared])) { - shared++; - } - } else { - // Restart compression - restarts_.push_back(buffer_.size()); - counter_ = 0; - } - const size_t non_shared = key.size() - shared; - - // Add "" to buffer_ - PutVarint32(&buffer_, shared); - PutVarint32(&buffer_, non_shared); - PutVarint32(&buffer_, value.size()); - - // Add string delta to buffer_ followed by value - buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); - - // Update state - last_key_.resize(shared); - last_key_.append(key.data() + shared, non_shared); - assert(Slice(last_key_) == key); - counter_++; -} - -} diff --git a/leveldb/table/block_builder.h b/leveldb/table/block_builder.h deleted file mode 100644 index bf92a0f..0000000 --- a/leveldb/table/block_builder.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ -#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ - -#include - -#include -#include "leveldb/slice.h" - -namespace leveldb { - -struct Options; - -class BlockBuilder { - public: - explicit BlockBuilder(const Options* options); - - // Reset the contents as if the BlockBuilder was just constructed. - void Reset(); - - // REQUIRES: Finish() has not been callled since the last call to Reset(). - // REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); - - // Finish building the block and return a slice that refers to the - // block contents. The returned slice will remain valid for the - // lifetime of this builder or until Reset() is called. - Slice Finish(); - - // Returns an estimate of the current (uncompressed) size of the block - // we are building. - size_t CurrentSizeEstimate() const; - - // Return true iff no entries have been added since the last Reset() - bool empty() const { - return buffer_.empty(); - } - - private: - const Options* options_; - std::string buffer_; // Destination buffer - std::vector restarts_; // Restart points - int counter_; // Number of entries emitted since restart - bool finished_; // Has Finish() been called? - std::string last_key_; - - // No copying allowed - BlockBuilder(const BlockBuilder&); - void operator=(const BlockBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/leveldb/table/format.cc b/leveldb/table/format.cc deleted file mode 100644 index 63971db..0000000 --- a/leveldb/table/format.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/format.h" - -#include "leveldb/env.h" -#include "port/port.h" -#include "table/block.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { - -void BlockHandle::EncodeTo(std::string* dst) const { - // Sanity check that all fields have been set - assert(offset_ != ~static_cast(0)); - assert(size_ != ~static_cast(0)); - PutVarint64(dst, offset_); - PutVarint64(dst, size_); -} - -Status BlockHandle::DecodeFrom(Slice* input) { - if (GetVarint64(input, &offset_) && - GetVarint64(input, &size_)) { - return Status::OK(); - } else { - return Status::Corruption("bad block handle"); - } -} - -void Footer::EncodeTo(std::string* dst) const { -#ifndef NDEBUG - const size_t original_size = dst->size(); -#endif - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(kTableMagicNumber & 0xffffffffu)); - PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); - assert(dst->size() == original_size + kEncodedLength); -} - -Status Footer::DecodeFrom(Slice* input) { - const char* magic_ptr = input->data() + kEncodedLength - 8; - const uint32_t magic_lo = DecodeFixed32(magic_ptr); - const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); - const uint64_t magic = ((static_cast(magic_hi) << 32) | - (static_cast(magic_lo))); - if (magic != kTableMagicNumber) { - return Status::InvalidArgument("not an sstable (bad magic number)"); - } - - Status result = metaindex_handle_.DecodeFrom(input); - if (result.ok()) { - result = index_handle_.DecodeFrom(input); - } - if (result.ok()) { - // We skip over any leftover data (just padding for now) in "input" - const char* end = magic_ptr + 8; - *input = Slice(end, input->data() + input->size() - end); - } - return result; -} - -Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block) { - *block = NULL; - - // Read the block contents as well as the type/crc footer. - // See table_builder.cc for the code that built this structure. - size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; - Slice contents; - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - if (contents.size() != n + kBlockTrailerSize) { - delete[] buf; - return Status::Corruption("truncated block read"); - } - - // Check the crc of the type and the block contents - const char* data = contents.data(); // Pointer to where Read put the data - if (options.verify_checksums) { - const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); - const uint32_t actual = crc32c::Value(data, n + 1); - if (actual != crc) { - delete[] buf; - s = Status::Corruption("block checksum mismatch"); - return s; - } - } - - switch (data[n]) { - case kNoCompression: - if (data != buf) { - // File implementation gave us pointer to some other data. - // Copy into buf[]. - memcpy(buf, data, n + kBlockTrailerSize); - } - - // Ok - break; - case kSnappyCompression: { - std::string decompressed; - if (!port::Snappy_Uncompress(data, n, &decompressed)) { - delete[] buf; - s = Status::Corruption("corrupted compressed block contents"); - return s; - } - delete[] buf; // Done with uncompressed data - buf = new char[decompressed.size()]; - memcpy(buf, decompressed.data(), decompressed.size()); - n = decompressed.size(); - break; - } - default: - delete[] buf; - return Status::Corruption("bad block type"); - } - - *block = new Block(buf, n); // Block takes ownership of buf[] - return Status::OK(); -} - -} diff --git a/leveldb/table/format.h b/leveldb/table/format.h deleted file mode 100644 index a6ab964..0000000 --- a/leveldb/table/format.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ -#define STORAGE_LEVELDB_TABLE_FORMAT_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "leveldb/table_builder.h" - -namespace leveldb { - -class Block; -class RandomAccessFile; -struct ReadOptions; - -// BlockHandle is a pointer to the extent of a file that stores a data -// block or a meta block. -class BlockHandle { - public: - BlockHandle(); - - // The offset of the block in the file. - uint64_t offset() const { return offset_; } - void set_offset(uint64_t offset) { offset_ = offset; } - - // The size of the stored block - uint64_t size() const { return size_; } - void set_size(uint64_t size) { size_ = size; } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Maximum encoding length of a BlockHandle - enum { kMaxEncodedLength = 10 + 10 }; - - private: - uint64_t offset_; - uint64_t size_; -}; - -// Footer encapsulates the fixed information stored at the tail -// end of every table file. -class Footer { - public: - Footer() { } - - // The block handle for the metaindex block of the table - const BlockHandle& metaindex_handle() const { return metaindex_handle_; } - void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } - - // The block handle for the index block of the table - const BlockHandle& index_handle() const { - return index_handle_; - } - void set_index_handle(const BlockHandle& h) { - index_handle_ = h; - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Encoded length of a Footer. Note that the serialization of a - // Footer will always occupy exactly this many bytes. It consists - // of two block handles and a magic number. - enum { - kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 - }; - - private: - BlockHandle metaindex_handle_; - BlockHandle index_handle_; -}; - -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; - -// 1-byte type + 32-bit crc -static const size_t kBlockTrailerSize = 5; - -// Read the block identified by "handle" from "file". On success, -// store a pointer to the heap-allocated result in *block and return -// OK. On failure store NULL in *block and return non-OK. -extern Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block); - -// Implementation details follow. Clients should ignore, - -inline BlockHandle::BlockHandle() - : offset_(~static_cast(0)), - size_(~static_cast(0)) { -} - -} - -#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ diff --git a/leveldb/table/iterator.cc b/leveldb/table/iterator.cc deleted file mode 100644 index 4ddd55f..0000000 --- a/leveldb/table/iterator.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/iterator.h" -#include "util/logging.h" - -namespace leveldb { - -Iterator::Iterator() { - cleanup_.function = NULL; - cleanup_.next = NULL; -} - -Iterator::~Iterator() { - if (cleanup_.function != NULL) { - (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); - for (Cleanup* c = cleanup_.next; c != NULL; ) { - (*c->function)(c->arg1, c->arg2); - Cleanup* next = c->next; - delete c; - c = next; - } - } -} - -void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { - assert(func != NULL); - Cleanup* c; - if (cleanup_.function == NULL) { - c = &cleanup_; - } else { - c = new Cleanup; - c->next = cleanup_.next; - cleanup_.next = c; - } - c->function = func; - c->arg1 = arg1; - c->arg2 = arg2; -} - -namespace { -class EmptyIterator : public Iterator { - public: - EmptyIterator(const Status& s) : status_(s) { } - virtual bool Valid() const { return false; } - virtual void Seek(const Slice& target) { } - virtual void SeekToFirst() { } - virtual void SeekToLast() { } - virtual void Next() { assert(false); } - virtual void Prev() { assert(false); } - Slice key() const { assert(false); return Slice(); } - Slice value() const { assert(false); return Slice(); } - virtual Status status() const { return status_; } - private: - Status status_; -}; -} - -Iterator* NewEmptyIterator() { - return new EmptyIterator(Status::OK()); -} - -Iterator* NewErrorIterator(const Status& status) { - return new EmptyIterator(status); -} - -} diff --git a/leveldb/table/iterator_wrapper.h b/leveldb/table/iterator_wrapper.h deleted file mode 100644 index 158d3a7..0000000 --- a/leveldb/table/iterator_wrapper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ -#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ - -namespace leveldb { - -// A internal wrapper class with an interface similar to Iterator that -// caches the valid() and key() results for an underlying iterator. -// This can help avoid virtual function calls and also gives better -// cache locality. -class IteratorWrapper { - private: - Iterator* iter_; - bool valid_; - Slice key_; - public: - IteratorWrapper(): iter_(NULL), valid_(false) { } - explicit IteratorWrapper(Iterator* iter): iter_(NULL) { - Set(iter); - } - ~IteratorWrapper() { delete iter_; } - Iterator* iter() const { return iter_; } - - // Takes ownership of "iter" and will delete it when destroyed, or - // when Set() is invoked again. - void Set(Iterator* iter) { - delete iter_; - iter_ = iter; - if (iter_ == NULL) { - valid_ = false; - } else { - Update(); - } - } - - - // Iterator interface methods - bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } - // Methods below require iter() != NULL - Status status() const { assert(iter_); return iter_->status(); } - void Next() { assert(iter_); iter_->Next(); Update(); } - void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } - void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } - void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } - - private: - void Update() { - valid_ = iter_->Valid(); - if (valid_) { - key_ = iter_->key(); - } - } -}; - -} - - -#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/leveldb/table/merger.cc b/leveldb/table/merger.cc deleted file mode 100644 index 6ce06bb..0000000 --- a/leveldb/table/merger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/merger.h" - -#include "leveldb/comparator.h" -#include "leveldb/iterator.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { -class MergingIterator : public Iterator { - public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) - : comparator_(comparator), - children_(new IteratorWrapper[n]), - n_(n), - current_(NULL), - direction_(kForward) { - for (int i = 0; i < n; i++) { - children_[i].Set(children[i]); - } - } - - virtual ~MergingIterator() { - delete[] children_; - } - - virtual bool Valid() const { - return (current_ != NULL); - } - - virtual void SeekToFirst() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToFirst(); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void SeekToLast() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToLast(); - } - FindLargest(); - direction_ = kReverse; - } - - virtual void Seek(const Slice& target) { - for (int i = 0; i < n_; i++) { - children_[i].Seek(target); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void Next() { - assert(Valid()); - - // Ensure that all children are positioned after key(). - // If we are moving in the forward direction, it is already - // true for all of the non-current_ children since current_ is - // the smallest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. - if (direction_ != kForward) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid() && - comparator_->Compare(key(), child->key()) == 0) { - child->Next(); - } - } - } - direction_ = kForward; - } - - current_->Next(); - FindSmallest(); - } - - virtual void Prev() { - assert(Valid()); - - // Ensure that all children are positioned before key(). - // If we are moving in the reverse direction, it is already - // true for all of the non-current_ children since current_ is - // the largest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. - if (direction_ != kReverse) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid()) { - // Child is at first entry >= key(). Step back one to be < key() - child->Prev(); - } else { - // Child has no entries >= key(). Position at last entry. - child->SeekToLast(); - } - } - } - direction_ = kReverse; - } - - current_->Prev(); - FindLargest(); - } - - virtual Slice key() const { - assert(Valid()); - return current_->key(); - } - - virtual Slice value() const { - assert(Valid()); - return current_->value(); - } - - virtual Status status() const { - Status status; - for (int i = 0; i < n_; i++) { - status = children_[i].status(); - if (!status.ok()) { - break; - } - } - return status; - } - - private: - void FindSmallest(); - void FindLargest(); - - // We might want to use a heap in case there are lots of children. - // For now we use a simple array since we expect a very small number - // of children in leveldb. - const Comparator* comparator_; - IteratorWrapper* children_; - int n_; - IteratorWrapper* current_; - - // Which direction is the iterator moving? - enum Direction { - kForward, - kReverse - }; - Direction direction_; -}; - -void MergingIterator::FindSmallest() { - IteratorWrapper* smallest = NULL; - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (smallest == NULL) { - smallest = child; - } else if (comparator_->Compare(child->key(), smallest->key()) < 0) { - smallest = child; - } - } - } - current_ = smallest; -} - -void MergingIterator::FindLargest() { - IteratorWrapper* largest = NULL; - for (int i = n_-1; i >= 0; i--) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (largest == NULL) { - largest = child; - } else if (comparator_->Compare(child->key(), largest->key()) > 0) { - largest = child; - } - } - } - current_ = largest; -} -} - -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { - assert(n >= 0); - if (n == 0) { - return NewEmptyIterator(); - } else if (n == 1) { - return list[0]; - } else { - return new MergingIterator(cmp, list, n); - } -} - -} diff --git a/leveldb/table/merger.h b/leveldb/table/merger.h deleted file mode 100644 index 71d9dc5..0000000 --- a/leveldb/table/merger.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ -#define STORAGE_LEVELDB_TABLE_MERGER_H_ - -namespace leveldb { - -class Comparator; -class Iterator; - -// Return an iterator that provided the union of the data in -// children[0,n-1]. Takes ownership of the child iterators and -// will delete them when the result iterator is deleted. -// -// The result does no duplicate suppression. I.e., if a particular -// key is present in K child iterators, it will be yielded K times. -// -// REQUIRES: n >= 0 -extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); - -} - -#endif // STORAGE_LEVELDB_TABLE_MERGER_H_ diff --git a/leveldb/table/table.cc b/leveldb/table/table.cc deleted file mode 100644 index 9820753..0000000 --- a/leveldb/table/table.cc +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "table/block.h" -#include "table/format.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" - -namespace leveldb { - -struct Table::Rep { - ~Rep() { - delete index_block; - } - - Options options; - Status status; - RandomAccessFile* file; - uint64_t cache_id; - - BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer - Block* index_block; -}; - -Status Table::Open(const Options& options, - RandomAccessFile* file, - uint64_t size, - Table** table) { - *table = NULL; - if (size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, - &footer_input, footer_space); - if (!s.ok()) return s; - - Footer footer; - s = footer.DecodeFrom(&footer_input); - if (!s.ok()) return s; - - // Read the index block - Block* index_block = NULL; - if (s.ok()) { - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); - } - - if (s.ok()) { - // We've successfully read the footer and the index block: we're - // ready to serve requests. - Rep* rep = new Table::Rep; - rep->options = options; - rep->file = file; - rep->metaindex_handle = footer.metaindex_handle(); - rep->index_block = index_block; - rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); - *table = new Table(rep); - } else { - if (index_block) delete index_block; - } - - return s; -} - -Table::~Table() { - delete rep_; -} - -static void DeleteBlock(void* arg, void* ignored) { - delete reinterpret_cast(arg); -} - -static void DeleteCachedBlock(const Slice& key, void* value) { - Block* block = reinterpret_cast(value); - delete block; -} - -static void ReleaseBlock(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -Iterator* Table::BlockReader(void* arg, - const ReadOptions& options, - const Slice& index_value) { - Table* table = reinterpret_cast(arg); - Cache* block_cache = table->rep_->options.block_cache; - Block* block = NULL; - Cache::Handle* cache_handle = NULL; - - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. - - if (s.ok()) { - if (block_cache != NULL) { - char cache_key_buffer[16]; - EncodeFixed64(cache_key_buffer, table->rep_->cache_id); - EncodeFixed64(cache_key_buffer+8, handle.offset()); - Slice key(cache_key_buffer, sizeof(cache_key_buffer)); - cache_handle = block_cache->Lookup(key); - if (cache_handle != NULL) { - block = reinterpret_cast(block_cache->Value(cache_handle)); - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - if (s.ok() && options.fill_cache) { - cache_handle = block_cache->Insert( - key, block, block->size(), &DeleteCachedBlock); - } - } - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - } - } - - Iterator* iter; - if (block != NULL) { - iter = block->NewIterator(table->rep_->options.comparator); - if (cache_handle == NULL) { - iter->RegisterCleanup(&DeleteBlock, block, NULL); - } else { - iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); - } - } else { - iter = NewErrorIterator(s); - } - return iter; -} - -Iterator* Table::NewIterator(const ReadOptions& options) const { - return NewTwoLevelIterator( - rep_->index_block->NewIterator(rep_->options.comparator), - &Table::BlockReader, const_cast(this), options); -} - -uint64_t Table::ApproximateOffsetOf(const Slice& key) const { - Iterator* index_iter = - rep_->index_block->NewIterator(rep_->options.comparator); - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->metaindex_handle.offset(); - } - } else { - // key is past the last key in the file. Approximate the offset - // by returning the offset of the metaindex block (which is - // right near the end of the file). - result = rep_->metaindex_handle.offset(); - } - delete index_iter; - return result; -} - -} diff --git a/leveldb/table/table_builder.cc b/leveldb/table/table_builder.cc deleted file mode 100644 index 7ec7ad2..0000000 --- a/leveldb/table/table_builder.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table_builder.h" - -#include -#include -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/logging.h" - -namespace leveldb { - -struct TableBuilder::Rep { - Options options; - Options index_block_options; - WritableFile* file; - uint64_t offset; - Status status; - BlockBuilder data_block; - BlockBuilder index_block; - std::string last_key; - int64_t num_entries; - bool closed; // Either Finish() or Abandon() has been called. - - // We do not emit the index entry for a block until we have seen the - // first key for the next data block. This allows us to use shorter - // keys in the index block. For example, consider a block boundary - // between the keys "the quick brown fox" and "the who". We can use - // "the r" as the key for the index block entry since it is >= all - // entries in the first block and < all entries in subsequent - // blocks. - // - // Invariant: r->pending_index_entry is true only if data_block is empty. - bool pending_index_entry; - BlockHandle pending_handle; // Handle to add to index block - - std::string compressed_output; - - Rep(const Options& opt, WritableFile* f) - : options(opt), - index_block_options(opt), - file(f), - offset(0), - data_block(&options), - index_block(&index_block_options), - num_entries(0), - closed(false), - pending_index_entry(false) { - index_block_options.block_restart_interval = 1; - } -}; - -TableBuilder::TableBuilder(const Options& options, WritableFile* file) - : rep_(new Rep(options, file)) { -} - -TableBuilder::~TableBuilder() { - assert(rep_->closed); // Catch errors where caller forgot to call Finish() - delete rep_; -} - -Status TableBuilder::ChangeOptions(const Options& options) { - // Note: if more fields are added to Options, update - // this function to catch changes that should not be allowed to - // change in the middle of building a Table. - if (options.comparator != rep_->options.comparator) { - return Status::InvalidArgument("changing comparator while building table"); - } - - // Note that any live BlockBuilders point to rep_->options and therefore - // will automatically pick up the updated options. - rep_->options = options; - rep_->index_block_options = options; - rep_->index_block_options.block_restart_interval = 1; - return Status::OK(); -} - -void TableBuilder::Add(const Slice& key, const Slice& value) { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); - } - - if (r->pending_index_entry) { - assert(r->data_block.empty()); - r->options.comparator->FindShortestSeparator(&r->last_key, key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - - r->last_key.assign(key.data(), key.size()); - r->num_entries++; - r->data_block.Add(key, value); - - const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); - if (estimated_block_size >= r->options.block_size) { - Flush(); - } -} - -void TableBuilder::Flush() { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->data_block.empty()) return; - assert(!r->pending_index_entry); - WriteBlock(&r->data_block, &r->pending_handle); - if (ok()) { - r->pending_index_entry = true; - r->status = r->file->Flush(); - } -} - -void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { - // File format contains a sequence of blocks where each block has: - // block_data: uint8[n] - // type: uint8 - // crc: uint32 - assert(ok()); - Rep* r = rep_; - Slice raw = block->Finish(); - - Slice block_contents; - CompressionType type = r->options.compression; - // TODO(postrelease): Support more compression options: zlib? - switch (type) { - case kNoCompression: - block_contents = raw; - break; - - case kSnappyCompression: { - std::string* compressed = &r->compressed_output; - if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && - compressed->size() < raw.size() - (raw.size() / 8u)) { - block_contents = *compressed; - } else { - // Snappy not supported, or compressed less than 12.5%, so just - // store uncompressed form - block_contents = raw; - type = kNoCompression; - } - break; - } - } - handle->set_offset(r->offset); - handle->set_size(block_contents.size()); - r->status = r->file->Append(block_contents); - if (r->status.ok()) { - char trailer[kBlockTrailerSize]; - trailer[0] = type; - uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type - EncodeFixed32(trailer+1, crc32c::Mask(crc)); - r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); - if (r->status.ok()) { - r->offset += block_contents.size() + kBlockTrailerSize; - } - } - r->compressed_output.clear(); - block->Reset(); -} - -Status TableBuilder::status() const { - return rep_->status; -} - -Status TableBuilder::Finish() { - Rep* r = rep_; - Flush(); - assert(!r->closed); - r->closed = true; - BlockHandle metaindex_block_handle; - BlockHandle index_block_handle; - if (ok()) { - BlockBuilder meta_index_block(&r->options); - // TODO(postrelease): Add stats and other meta blocks - WriteBlock(&meta_index_block, &metaindex_block_handle); - } - if (ok()) { - if (r->pending_index_entry) { - r->options.comparator->FindShortSuccessor(&r->last_key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - WriteBlock(&r->index_block, &index_block_handle); - } - if (ok()) { - Footer footer; - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); - } - } - return r->status; -} - -void TableBuilder::Abandon() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; -} - -uint64_t TableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t TableBuilder::FileSize() const { - return rep_->offset; -} - -} diff --git a/leveldb/table/table_test.cc b/leveldb/table/table_test.cc deleted file mode 100644 index 4b3e85e..0000000 --- a/leveldb/table/table_test.cc +++ /dev/null @@ -1,841 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "leveldb/table_builder.h" -#include "table/block.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -// Return reverse of "key". -// Used to test non-lexicographic comparators. -static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(str.rbegin(), str.rend()); - return rev; -} - -namespace { -class ReverseKeyComparator : public Comparator { - public: - virtual const char* Name() const { - return "leveldb.ReverseBytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - std::string s = Reverse(*start); - std::string l = Reverse(limit); - BytewiseComparator()->FindShortestSeparator(&s, l); - *start = Reverse(s); - } - - virtual void FindShortSuccessor(std::string* key) const { - std::string s = Reverse(*key); - BytewiseComparator()->FindShortSuccessor(&s); - *key = Reverse(s); - } -}; -} -static ReverseKeyComparator reverse_key_comparator; - -static void Increment(const Comparator* cmp, std::string* key) { - if (cmp == BytewiseComparator()) { - key->push_back('\0'); - } else { - assert(cmp == &reverse_key_comparator); - std::string rev = Reverse(*key); - rev.push_back('\0'); - *key = Reverse(rev); - } -} - -// An STL comparator that uses a Comparator -namespace { -struct STLLessThan { - const Comparator* cmp; - - STLLessThan() : cmp(BytewiseComparator()) { } - STLLessThan(const Comparator* c) : cmp(c) { } - bool operator()(const std::string& a, const std::string& b) const { - return cmp->Compare(Slice(a), Slice(b)) < 0; - } -}; -} - -class StringSink: public WritableFile { - public: - ~StringSink() { } - - const std::string& contents() const { return contents_; } - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - - virtual Status Append(const Slice& data) { - contents_.append(data.data(), data.size()); - return Status::OK(); - } - - private: - std::string contents_; -}; - - -class StringSource: public RandomAccessFile { - public: - StringSource(const Slice& contents) - : contents_(contents.data(), contents.size()) { - } - - virtual ~StringSource() { } - - uint64_t Size() const { return contents_.size(); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); - } - if (offset + n > contents_.size()) { - n = contents_.size() - offset; - } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); - return Status::OK(); - } - - private: - std::string contents_; -}; - -typedef std::map KVMap; - -// Helper class for tests to unify the interface between -// BlockBuilder/TableBuilder and Block/Table. -class Constructor { - public: - explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } - virtual ~Constructor() { } - - void Add(const std::string& key, const Slice& value) { - data_[key] = value.ToString(); - } - - // Finish constructing the data structure with all the keys that have - // been added so far. Returns the keys in sorted order in "*keys" - // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { - *kvmap = data_; - keys->clear(); - for (KVMap::const_iterator it = data_.begin(); - it != data_.end(); - ++it) { - keys->push_back(it->first); - } - data_.clear(); - Status s = FinishImpl(options, *kvmap); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; - - virtual size_t NumBytes() const = 0; - - virtual Iterator* NewIterator() const = 0; - - virtual const KVMap& data() { return data_; } - - virtual DB* db() const { return NULL; } // Overridden in DBConstructor - - private: - KVMap data_; -}; - -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_size_(-1), - block_(NULL) { } - ~BlockConstructor() { - delete block_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete block_; - block_ = NULL; - BlockBuilder builder(&options); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - } - // Open the block - Slice block_data = builder.Finish(); - block_size_ = block_data.size(); - char* block_data_copy = new char[block_size_]; - memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_); - return Status::OK(); - } - virtual size_t NumBytes() const { return block_size_; } - - virtual Iterator* NewIterator() const { - return block_->NewIterator(comparator_); - } - - private: - const Comparator* comparator_; - int block_size_; - Block* block_; - - BlockConstructor(); -}; - -class TableConstructor: public Constructor { - public: - TableConstructor(const Comparator* cmp) - : Constructor(cmp), - source_(NULL), table_(NULL) { - } - ~TableConstructor() { - Reset(); - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - Reset(); - StringSink sink; - TableBuilder builder(options, &sink); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); - } - Status s = builder.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); - - ASSERT_EQ(sink.contents().size(), builder.FileSize()); - - // Open the table - source_ = new StringSource(sink.contents()); - Options table_options; - table_options.comparator = options.comparator; - return Table::Open(table_options, source_, sink.contents().size(), &table_); - } - virtual size_t NumBytes() const { return source_->Size(); } - - virtual Iterator* NewIterator() const { - return table_->NewIterator(ReadOptions()); - } - - uint64_t ApproximateOffsetOf(const Slice& key) const { - return table_->ApproximateOffsetOf(key); - } - - private: - void Reset() { - delete table_; - delete source_; - table_ = NULL; - source_ = NULL; - } - - StringSource* source_; - Table* table_; - - TableConstructor(); -}; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - std::string encoded; - AppendInternalKey(&encoded, ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; - -class MemTableConstructor: public Constructor { - public: - explicit MemTableConstructor(const Comparator* cmp) - : Constructor(cmp), - internal_comparator_(cmp) { - memtable_ = new MemTable(internal_comparator_); - } - ~MemTableConstructor() { - delete memtable_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete memtable_; - memtable_ = new MemTable(internal_comparator_); - int seq = 1; - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - memtable_->Add(seq, kTypeValue, it->first, it->second); - seq++; - } - return Status::OK(); - } - virtual size_t NumBytes() const { - return memtable_->ApproximateMemoryUsage(); - } - - virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator()); - } - - private: - InternalKeyComparator internal_comparator_; - MemTable* memtable_; -}; - -class DBConstructor: public Constructor { - public: - explicit DBConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp) { - db_ = NULL; - NewDB(); - } - ~DBConstructor() { - delete db_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete db_; - db_ = NULL; - NewDB(); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - WriteBatch batch; - batch.Put(it->first, it->second); - ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); - } - return Status::OK(); - } - virtual size_t NumBytes() const { - Range r("", "\xff\xff"); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - virtual Iterator* NewIterator() const { - return db_->NewIterator(ReadOptions()); - } - - virtual DB* db() const { return db_; } - - private: - void NewDB() { - std::string name = test::TmpDir() + "/table_testdb"; - - Options options; - options.comparator = comparator_; - Status status = DestroyDB(name, options); - ASSERT_TRUE(status.ok()) << status.ToString(); - - options.create_if_missing = true; - options.error_if_exists = true; - options.write_buffer_size = 10000; // Something small to force merging - status = DB::Open(options, name, &db_); - ASSERT_TRUE(status.ok()) << status.ToString(); - } - - const Comparator* comparator_; - DB* db_; -}; - -enum TestType { - TABLE_TEST, - BLOCK_TEST, - MEMTABLE_TEST, - DB_TEST, -}; - -struct TestArgs { - TestType type; - bool reverse_compare; - int restart_interval; -}; - -static const TestArgs kTestArgList[] = { - { TABLE_TEST, false, 16 }, - { TABLE_TEST, false, 1 }, - { TABLE_TEST, false, 1024 }, - { TABLE_TEST, true, 16 }, - { TABLE_TEST, true, 1 }, - { TABLE_TEST, true, 1024 }, - - { BLOCK_TEST, false, 16 }, - { BLOCK_TEST, false, 1 }, - { BLOCK_TEST, false, 1024 }, - { BLOCK_TEST, true, 16 }, - { BLOCK_TEST, true, 1 }, - { BLOCK_TEST, true, 1024 }, - - // Restart interval does not matter for memtables - { MEMTABLE_TEST, false, 16 }, - { MEMTABLE_TEST, true, 16 }, - - // Do not bother with restart interval variations for DB - { DB_TEST, false, 16 }, - { DB_TEST, true, 16 }, -}; -static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); - -class Harness { - public: - Harness() : constructor_(NULL) { } - - void Init(const TestArgs& args) { - delete constructor_; - constructor_ = NULL; - options_ = Options(); - - options_.block_restart_interval = args.restart_interval; - // Use shorter block size for tests to exercise block boundary - // conditions more. - options_.block_size = 256; - if (args.reverse_compare) { - options_.comparator = &reverse_key_comparator; - } - switch (args.type) { - case TABLE_TEST: - constructor_ = new TableConstructor(options_.comparator); - break; - case BLOCK_TEST: - constructor_ = new BlockConstructor(options_.comparator); - break; - case MEMTABLE_TEST: - constructor_ = new MemTableConstructor(options_.comparator); - break; - case DB_TEST: - constructor_ = new DBConstructor(options_.comparator); - break; - } - } - - ~Harness() { - delete constructor_; - } - - void Add(const std::string& key, const std::string& value) { - constructor_->Add(key, value); - } - - void Test(Random* rnd) { - std::vector keys; - KVMap data; - constructor_->Finish(options_, &keys, &data); - - TestForwardScan(keys, data); - TestBackwardScan(keys, data); - TestRandomAccess(rnd, keys, data); - } - - void TestForwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToFirst(); - for (KVMap::const_iterator model_iter = data.begin(); - model_iter != data.end(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Next(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestBackwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToLast(); - for (KVMap::const_reverse_iterator model_iter = data.rbegin(); - model_iter != data.rend(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Prev(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestRandomAccess(Random* rnd, - const std::vector& keys, - const KVMap& data) { - static const bool kVerbose = false; - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - KVMap::const_iterator model_iter = data.begin(); - if (kVerbose) fprintf(stderr, "---\n"); - for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); - switch (toss) { - case 0: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Next\n"); - iter->Next(); - ++model_iter; - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 1: { - if (kVerbose) fprintf(stderr, "SeekToFirst\n"); - iter->SeekToFirst(); - model_iter = data.begin(); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 2: { - std::string key = PickRandomKey(rnd, keys); - model_iter = data.lower_bound(key); - if (kVerbose) fprintf(stderr, "Seek '%s'\n", - EscapeString(key).c_str()); - iter->Seek(Slice(key)); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 3: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Prev\n"); - iter->Prev(); - if (model_iter == data.begin()) { - model_iter = data.end(); // Wrap around to invalid value - } else { - --model_iter; - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 4: { - if (kVerbose) fprintf(stderr, "SeekToLast\n"); - iter->SeekToLast(); - if (keys.empty()) { - model_iter = data.end(); - } else { - std::string last = data.rbegin()->first; - model_iter = data.lower_bound(last); - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - } - } - delete iter; - } - - std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { - if (it == data.end()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const KVMap& data, - const KVMap::const_reverse_iterator& it) { - if (it == data.rend()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const Iterator* it) { - if (!it->Valid()) { - return "END"; - } else { - return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; - } - } - - std::string PickRandomKey(Random* rnd, const std::vector& keys) { - if (keys.empty()) { - return "foo"; - } else { - const int index = rnd->Uniform(keys.size()); - std::string result = keys[index]; - switch (rnd->Uniform(3)) { - case 0: - // Return an existing key - break; - case 1: { - // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; - } - break; - } - case 2: { - // Return something larger than an existing key - Increment(options_.comparator, &result); - break; - } - } - return result; - } - } - - // Returns NULL if not running against a DB - DB* db() const { return constructor_->db(); } - - private: - Options options_; - Constructor* constructor_; -}; - -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } -} - -TEST(Harness, Randomized) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 1 : 200)) { - if ((num_entries % 10) == 0) { - fprintf(stderr, "case %d of %d: num_entries = %d\n", - (i + 1), int(kNumTestArgs), num_entries); - } - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } -} - -TEST(Harness, RandomizedLongDB) { - Random rnd(test::RandomSeed()); - TestArgs args = { DB_TEST, false, 16 }; - Init(args); - int num_entries = 100000; - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - - // We must have created enough data to force merging - std::string l0_files, l1_files; - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); - ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); - -} - -class MemTableTest { }; - -TEST(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable memtable(cmp); - WriteBatch batch; - WriteBatchInternal::SetSequence(&batch, 100); - batch.Put(std::string("k1"), std::string("v1")); - batch.Put(std::string("k2"), std::string("v2")); - batch.Put(std::string("k3"), std::string("v3")); - batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); - - Iterator* iter = memtable.NewIterator(); - iter->SeekToFirst(); - while (iter->Valid()) { - fprintf(stderr, "key: '%s' -> '%s'\n", - iter->key().ToString().c_str(), - iter->value().ToString().c_str()); - iter->Next(); - } - - delete iter; -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -class TableTest { }; - -TEST(TableTest, ApproximateOffsetOfPlain) { - TableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); - -} - -static bool SnappyCompressionSupported() { - std::string out; - Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(in.data(), in.size(), &out); -} - -TEST(TableTest, ApproximateOffsetOfCompressed) { - if (!SnappyCompressionSupported()) { - fprintf(stderr, "skipping compression tests\n"); - return; - } - - Random rnd(301); - TableConstructor c(BytewiseComparator()); - std::string tmp; - c.Add("k01", "hello"); - c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - c.Add("k03", "hello3"); - c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - std::vector keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kSnappyCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/table/two_level_iterator.cc b/leveldb/table/two_level_iterator.cc deleted file mode 100644 index 24a1241..0000000 --- a/leveldb/table/two_level_iterator.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/two_level_iterator.h" - -#include "leveldb/table.h" -#include "table/block.h" -#include "table/format.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { - -typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); - -class TwoLevelIterator: public Iterator { - public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options); - - virtual ~TwoLevelIterator(); - - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - virtual void Next(); - virtual void Prev(); - - virtual bool Valid() const { - return data_iter_.Valid(); - } - virtual Slice key() const { - assert(Valid()); - return data_iter_.key(); - } - virtual Slice value() const { - assert(Valid()); - return data_iter_.value(); - } - virtual Status status() const { - // It'd be nice if status() returned a const Status& instead of a Status - if (!index_iter_.status().ok()) { - return index_iter_.status(); - } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { - return data_iter_.status(); - } else { - return status_; - } - } - - private: - void SaveError(const Status& s) { - if (status_.ok() && !s.ok()) status_ = s; - } - void SkipEmptyDataBlocksForward(); - void SkipEmptyDataBlocksBackward(); - void SetDataIterator(Iterator* data_iter); - void InitDataBlock(); - - BlockFunction block_function_; - void* arg_; - const ReadOptions options_; - Status status_; - IteratorWrapper index_iter_; - IteratorWrapper data_iter_; // May be NULL - // If data_iter_ is non-NULL, then "data_block_handle_" holds the - // "index_value" passed to block_function_ to create the data_iter_. - std::string data_block_handle_; -}; - -TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) - : block_function_(block_function), - arg_(arg), - options_(options), - index_iter_(index_iter), - data_iter_(NULL) { -} - -TwoLevelIterator::~TwoLevelIterator() { -} - -void TwoLevelIterator::Seek(const Slice& target) { - index_iter_.Seek(target); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.Seek(target); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToFirst() { - index_iter_.SeekToFirst(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToLast() { - index_iter_.SeekToLast(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - SkipEmptyDataBlocksBackward(); -} - -void TwoLevelIterator::Next() { - assert(Valid()); - data_iter_.Next(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::Prev() { - assert(Valid()); - data_iter_.Prev(); - SkipEmptyDataBlocksBackward(); -} - - -void TwoLevelIterator::SkipEmptyDataBlocksForward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Next(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - } -} - -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Prev(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - } -} - -void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { - if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); - data_iter_.Set(data_iter); -} - -void TwoLevelIterator::InitDataBlock() { - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - } else { - Slice handle = index_iter_.value(); - if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { - // data_iter_ is already constructed with this iterator, so - // no need to change anything - } else { - Iterator* iter = (*block_function_)(arg_, options_, handle); - data_block_handle_.assign(handle.data(), handle.size()); - SetDataIterator(iter); - } - } -} - -} - -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) { - return new TwoLevelIterator(index_iter, block_function, arg, options); -} - -} diff --git a/leveldb/table/two_level_iterator.h b/leveldb/table/two_level_iterator.h deleted file mode 100644 index 5909e2b..0000000 --- a/leveldb/table/two_level_iterator.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ -#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ - -#include "leveldb/iterator.h" - -namespace leveldb { - -struct ReadOptions; - -// Return a new two level iterator. A two-level iterator contains an -// index iterator whose values point to a sequence of blocks where -// each block is itself a sequence of key,value pairs. The returned -// two-level iterator yields the concatenation of all key/value pairs -// in the sequence of blocks. Takes ownership of "index_iter" and -// will delete it when no longer needed. -// -// Uses a supplied function to convert an index_iter value into -// an iterator over the contents of the corresponding block. -extern Iterator* NewTwoLevelIterator( - Iterator* index_iter, - Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const Slice& index_value), - void* arg, - const ReadOptions& options); - -} - -#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/leveldb/util/arena.cc b/leveldb/util/arena.cc deleted file mode 100644 index 40ab99d..0000000 --- a/leveldb/util/arena.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" -#include - -namespace leveldb { - -static const int kBlockSize = 4096; - -Arena::Arena() { - blocks_memory_ = 0; - alloc_ptr_ = NULL; // First allocation will allocate a block - alloc_bytes_remaining_ = 0; -} - -Arena::~Arena() { - for (size_t i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; - } -} - -char* Arena::AllocateFallback(size_t bytes) { - if (bytes > kBlockSize / 4) { - // Object is more than a quarter of our block size. Allocate it separately - // to avoid wasting too much space in leftover bytes. - char* result = AllocateNewBlock(bytes); - return result; - } - - // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(kBlockSize); - alloc_bytes_remaining_ = kBlockSize; - - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; -} - -char* Arena::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size - assert((align & (align-1)) == 0); // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); - size_t needed = bytes + slop; - char* result; - if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; - alloc_bytes_remaining_ -= needed; - } else { - // AllocateFallback always returned aligned memory - result = AllocateFallback(bytes); - } - assert((reinterpret_cast(result) & (align-1)) == 0); - return result; -} - -char* Arena::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; - blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; -} - -} diff --git a/leveldb/util/arena.h b/leveldb/util/arena.h deleted file mode 100644 index fcb5d5b..0000000 --- a/leveldb/util/arena.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ -#define STORAGE_LEVELDB_UTIL_ARENA_H_ - -#include -#include -#include -#include - -namespace leveldb { - -class Arena { - public: - Arena(); - ~Arena(); - - // Return a pointer to a newly allocated memory block of "bytes" bytes. - char* Allocate(size_t bytes); - - // Allocate memory with the normal alignment guarantees provided by malloc - char* AllocateAligned(size_t bytes); - - // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user - // allocations). - size_t MemoryUsage() const { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); - } - - private: - char* AllocateFallback(size_t bytes); - char* AllocateNewBlock(size_t block_bytes); - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - - // Array of new[] allocated memory blocks - std::vector blocks_; - - // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -inline char* Arena::Allocate(size_t bytes) { - // The semantics of what to return are a bit messy if we allow - // 0-byte allocations, so we disallow them here (we don't need - // them for our internal use). - assert(bytes > 0); - if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; - } - return AllocateFallback(bytes); -} - -} - -#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/leveldb/util/arena_test.cc b/leveldb/util/arena_test.cc deleted file mode 100644 index c33b552..0000000 --- a/leveldb/util/arena_test.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" - -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -class ArenaTest { }; - -TEST(ArenaTest, Empty) { - Arena arena; -} - -TEST(ArenaTest, Simple) { - std::vector > allocated; - Arena arena; - const int N = 100000; - size_t bytes = 0; - Random rnd(301); - for (int i = 0; i < N; i++) { - size_t s; - if (i % (N / 10) == 0) { - s = i; - } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); - } - if (s == 0) { - // Our arena disallows size 0 allocations. - s = 1; - } - char* r; - if (rnd.OneIn(10)) { - r = arena.AllocateAligned(s); - } else { - r = arena.Allocate(s); - } - - for (int b = 0; b < s; b++) { - // Fill the "i"th allocation with a known bit pattern - r[b] = i % 256; - } - bytes += s; - allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena.MemoryUsage(), bytes); - if (i > N/10) { - ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); - } - } - for (int i = 0; i < allocated.size(); i++) { - size_t num_bytes = allocated[i].first; - const char* p = allocated[i].second; - for (int b = 0; b < num_bytes; b++) { - // Check the "i"th allocation for the known bit pattern - ASSERT_EQ(int(p[b]) & 0xff, i % 256); - } - } -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/cache.cc b/leveldb/util/cache.cc deleted file mode 100644 index d8a4426..0000000 --- a/leveldb/util/cache.cc +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) -#include -#elif defined(LEVELDB_PLATFORM_CHROMIUM) -#include "base/hash_tables.h" -#else -#include // TODO(sanjay): Switch to unordered_set when possible. -#endif - -#include - -#include "leveldb/cache.h" -#include "port/port.h" -#include "util/hash.h" -#include "util/mutexlock.h" - -namespace leveldb { - -Cache::~Cache() { -} - -namespace { - -// LRU cache implementation - -// An entry is a variable length heap-allocated structure. Entries -// are kept in a circular doubly linked list ordered by access time. -struct LRUHandle { - void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next; - LRUHandle* prev; - size_t charge; // TODO(opt): Only allow uint32_t? - size_t key_length; - size_t refs; // TODO(opt): Pack with "key_length"? - char key_data[1]; // Beginning of key - - Slice key() const { - // For cheaper lookups, we allow a temporary Handle object - // to store a pointer to a key in "value". - if (next == this) { - return *(reinterpret_cast(value)); - } else { - return Slice(key_data, key_length); - } - } -}; - -// Pick a platform specific hash_set instantiation -#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) - // Microsoft's hash_set deviates from the standard. See - // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx - // for details. Basically the 2 param () operator is a less than and - // the 1 param () operator is a hash function. - struct HandleHashCompare : public stdext::hash_compare { - size_t operator() (LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); - } - bool operator() (LRUHandle* a, LRUHandle* b) const { - return a->key().compare(b->key()) < 0; - } - }; - typedef base::hash_set HandleTable; -#else - struct HandleHash { - inline size_t operator()(LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); - } - }; - - struct HandleEq { - inline bool operator()(LRUHandle* a, LRUHandle* b) const { - return a->key() == b->key(); - } - }; -# if defined(LEVELDB_PLATFORM_CHROMIUM) - typedef base::hash_set HandleTable; -# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) - typedef std::unordered_set HandleTable; -# else - typedef __gnu_cxx::hash_set HandleTable; -# endif -#endif - -class LRUCache : public Cache { - public: - explicit LRUCache(size_t capacity); - virtual ~LRUCache(); - - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)); - virtual Handle* Lookup(const Slice& key); - virtual void Release(Handle* handle); - virtual void* Value(Handle* handle); - virtual void Erase(const Slice& key); - virtual uint64_t NewId(); - - private: - void LRU_Remove(LRUHandle* e); - void LRU_Append(LRUHandle* e); - void Unref(LRUHandle* e); - - // Constructor parameters - const size_t capacity_; - - // mutex_ protects the following state. - port::Mutex mutex_; - size_t usage_; - uint64_t last_id_; - - // Dummy head of LRU list. - // lru.prev is newest entry, lru.next is oldest entry. - LRUHandle lru_; - - HandleTable table_; -}; - -LRUCache::LRUCache(size_t capacity) - : capacity_(capacity), - usage_(0), - last_id_(0) { - // Make empty circular linked list - lru_.next = &lru_; - lru_.prev = &lru_; -} - -LRUCache::~LRUCache() { - table_.clear(); - for (LRUHandle* e = lru_.next; e != &lru_; ) { - LRUHandle* next = e->next; - assert(e->refs == 1); // Error if caller has an unreleased handle - Unref(e); - e = next; - } -} - -void LRUCache::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - if (e->refs <= 0) { - usage_ -= e->charge; - (*e->deleter)(e->key(), e->value); - free(e); - } -} - -void LRUCache::LRU_Remove(LRUHandle* e) { - e->next->prev = e->prev; - e->prev->next = e->next; -} - -void LRUCache::LRU_Append(LRUHandle* e) { - // Make "e" newest entry by inserting just before lru_ - e->next = &lru_; - e->prev = lru_.prev; - e->prev->next = e; - e->next->prev = e; -} - -Cache::Handle* LRUCache::Lookup(const Slice& key) { - MutexLock l(&mutex_); - - LRUHandle dummy; - dummy.next = &dummy; - dummy.value = const_cast(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter == table_.end()) { - return NULL; - } else { - LRUHandle* e = const_cast(*iter); - e->refs++; - LRU_Remove(e); - LRU_Append(e); - return reinterpret_cast(e); - } -} - -void* LRUCache::Value(Handle* handle) { - return reinterpret_cast(handle)->value; -} - -void LRUCache::Release(Handle* handle) { - MutexLock l(&mutex_); - Unref(reinterpret_cast(handle)); -} - -Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) { - MutexLock l(&mutex_); - - LRUHandle* e = reinterpret_cast( - malloc(sizeof(LRUHandle)-1 + key.size())); - e->value = value; - e->deleter = deleter; - e->charge = charge; - e->key_length = key.size(); - e->refs = 2; // One from LRUCache, one for the returned handle - memcpy(e->key_data, key.data(), key.size()); - LRU_Append(e); - usage_ += charge; - - std::pair p = table_.insert(e); - if (!p.second) { - // Kill existing entry - LRUHandle* old = const_cast(*(p.first)); - LRU_Remove(old); - table_.erase(p.first); - table_.insert(e); - Unref(old); - } - - while (usage_ > capacity_ && lru_.next != &lru_) { - LRUHandle* old = lru_.next; - LRU_Remove(old); - table_.erase(old); - Unref(old); - } - - return reinterpret_cast(e); -} - -void LRUCache::Erase(const Slice& key) { - MutexLock l(&mutex_); - - LRUHandle dummy; - dummy.next = &dummy; - dummy.value = const_cast(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter != table_.end()) { - LRUHandle* e = const_cast(*iter); - LRU_Remove(e); - table_.erase(iter); - Unref(e); - } -} - -uint64_t LRUCache::NewId() { - MutexLock l(&mutex_); - return ++(last_id_); -} - -} // end anonymous namespace - -Cache* NewLRUCache(size_t capacity) { - return new LRUCache(capacity); -} - -} diff --git a/leveldb/util/cache_test.cc b/leveldb/util/cache_test.cc deleted file mode 100644 index dbab988..0000000 --- a/leveldb/util/cache_test.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/cache.h" - -#include -#include "util/coding.h" -#include "util/testharness.h" - -namespace leveldb { - -// Conversions between numeric keys/values and the types expected by Cache. -static std::string EncodeKey(int k) { - std::string result; - PutFixed32(&result, k); - return result; -} -static int DecodeKey(const Slice& k) { - assert(k.size() == 4); - return DecodeFixed32(k.data()); -} -static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } -static int DecodeValue(void* v) { return reinterpret_cast(v); } - -class CacheTest { - public: - static CacheTest* current_; - - static void Deleter(const Slice& key, void* v) { - current_->deleted_keys_.push_back(DecodeKey(key)); - current_->deleted_values_.push_back(DecodeValue(v)); - } - - static const int kCacheSize = 100; - std::vector deleted_keys_; - std::vector deleted_values_; - Cache* cache_; - - CacheTest() : cache_(NewLRUCache(kCacheSize)) { - current_ = this; - } - - ~CacheTest() { - delete cache_; - } - - int Lookup(int key) { - Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); - const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); - if (handle != NULL) { - cache_->Release(handle); - } - return r; - } - - void Insert(int key, int value, int charge = 1) { - cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter)); - } - - void Erase(int key) { - cache_->Erase(EncodeKey(key)); - } -}; -CacheTest* CacheTest::current_; - -TEST(CacheTest, HitAndMiss) { - ASSERT_EQ(-1, Lookup(100)); - - Insert(100, 101); - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(-1, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - Insert(200, 201); - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - Insert(100, 102); - ASSERT_EQ(102, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); -} - -TEST(CacheTest, Erase) { - Erase(200); - ASSERT_EQ(0, deleted_keys_.size()); - - Insert(100, 101); - Insert(200, 201); - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); - - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1, deleted_keys_.size()); -} - -TEST(CacheTest, EntriesArePinned) { - Insert(100, 101); - Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); - ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); - - Insert(100, 102); - Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); - ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); - ASSERT_EQ(0, deleted_keys_.size()); - - cache_->Release(h1); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); - - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(1, deleted_keys_.size()); - - cache_->Release(h2); - ASSERT_EQ(2, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[1]); - ASSERT_EQ(102, deleted_values_[1]); -} - -TEST(CacheTest, EvictionPolicy) { - Insert(100, 101); - Insert(200, 201); - - // Frequently used entry must be kept around - for (int i = 0; i < kCacheSize; i++) { - Insert(1000+i, 2000+i); - ASSERT_EQ(2000+i, Lookup(1000+i)); - ASSERT_EQ(101, Lookup(100)); - } - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(2, deleted_keys_.size()); - ASSERT_EQ(200, deleted_keys_[0]); - ASSERT_EQ(201, deleted_values_[0]); -} - -TEST(CacheTest, HeavyEntry) { - Insert(100, 101); - Insert(200, 201, kCacheSize); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); -} - -TEST(CacheTest, NewId) { - uint64_t a = cache_->NewId(); - uint64_t b = cache_->NewId(); - ASSERT_NE(a, b); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/coding.cc b/leveldb/util/coding.cc deleted file mode 100644 index 14f21f7..0000000 --- a/leveldb/util/coding.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/coding.h" - -namespace leveldb { - -void EncodeFixed32(char* buf, uint32_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; -#endif -} - -void EncodeFixed64(char* buf, uint64_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; -#endif -} - -void PutFixed32(std::string* dst, uint32_t value) { - char buf[sizeof(value)]; - EncodeFixed32(buf, value); - dst->append(buf, sizeof(buf)); -} - -void PutFixed64(std::string* dst, uint64_t value) { - char buf[sizeof(value)]; - EncodeFixed64(buf, value); - dst->append(buf, sizeof(buf)); -} - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1<<7)) { - *(ptr++) = v; - } else if (v < (1<<14)) { - *(ptr++) = v | B; - *(ptr++) = v>>7; - } else if (v < (1<<21)) { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = v>>14; - } else if (v < (1<<28)) { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = v>>21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = (v>>21) | B; - *(ptr++) = v>>28; - } - return reinterpret_cast(ptr); -} - -void PutVarint32(std::string* dst, uint32_t v) { - char buf[5]; - char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B-1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; - char* ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); -} - -void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); - dst->append(value.data(), value.size()); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -const char* GetVarint32PtrFallback(const char* p, - const char* limit, - uint32_t* value) { - uint32_t result = 0; - for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { - uint32_t byte = *(reinterpret_cast(p)); - p++; - if (byte & 128) { - // More bytes are present - result |= ((byte & 127) << shift); - } else { - result |= (byte << shift); - *value = result; - return reinterpret_cast(p); - } - } - return NULL; -} - -bool GetVarint32(Slice* input, uint32_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint32Ptr(p, limit, value); - if (q == NULL) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { - uint64_t result = 0; - for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { - uint64_t byte = *(reinterpret_cast(p)); - p++; - if (byte & 128) { - // More bytes are present - result |= ((byte & 127) << shift); - } else { - result |= (byte << shift); - *value = result; - return reinterpret_cast(p); - } - } - return NULL; -} - -bool GetVarint64(Slice* input, uint64_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint64Ptr(p, limit, value); - if (q == NULL) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len; - p = GetVarint32Ptr(p, limit, &len); - if (p == NULL) return NULL; - if (p + len > limit) return NULL; - *result = Slice(p, len); - return p + len; -} - -bool GetLengthPrefixedSlice(Slice* input, Slice* result) { - uint32_t len; - if (GetVarint32(input, &len) && - input->size() >= len) { - *result = Slice(input->data(), len); - input->remove_prefix(len); - return true; - } else { - return false; - } -} - -} diff --git a/leveldb/util/coding.h b/leveldb/util/coding.h deleted file mode 100644 index 8755968..0000000 --- a/leveldb/util/coding.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Endian-neutral encoding: -// * Fixed-length numbers are encoded with least-significant byte first -// * In addition we support variable length "varint" encoding -// * Strings are encoded prefixed by their length in varint format - -#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ -#define STORAGE_LEVELDB_UTIL_CODING_H_ - -#include -#include -#include -#include "leveldb/slice.h" -#include "port/port.h" - -namespace leveldb { - -// Standard Put... routines append to a string -extern void PutFixed32(std::string* dst, uint32_t value); -extern void PutFixed64(std::string* dst, uint64_t value); -extern void PutVarint32(std::string* dst, uint32_t value); -extern void PutVarint64(std::string* dst, uint64_t value); -extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); - -// Standard Get... routines parse a value from the beginning of a Slice -// and advance the slice past the parsed value. -extern bool GetVarint32(Slice* input, uint32_t* value); -extern bool GetVarint64(Slice* input, uint64_t* value); -extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); - -// Pointer-based variants of GetVarint... These either store a value -// in *v and return a pointer just past the parsed value, or return -// NULL on error. These routines only look at bytes in the range -// [p..limit-1] -extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); -extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); - -// Returns the length of the varint32 or varint64 encoding of "v" -extern int VarintLength(uint64_t v); - -// Lower-level versions of Put... that write directly into a character buffer -// REQUIRES: dst has enough space for the value being written -extern void EncodeFixed32(char* dst, uint32_t value); -extern void EncodeFixed64(char* dst, uint64_t value); - -// Lower-level versions of Put... that write directly into a character buffer -// and return a pointer just past the last byte written. -// REQUIRES: dst has enough space for the value being written -extern char* EncodeVarint32(char* dst, uint32_t value); -extern char* EncodeVarint64(char* dst, uint64_t value); - -// Lower-level versions of Get... that read directly from a character buffer -// without any bounds checking. - -inline uint32_t DecodeFixed32(const char* ptr) { - if (port::kLittleEndian) { - // Load the raw bytes - uint32_t result; - memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load - return result; - } else { - return ((static_cast(ptr[0])) - | (static_cast(ptr[1]) << 8) - | (static_cast(ptr[2]) << 16) - | (static_cast(ptr[3]) << 24)); - } -} - -inline uint64_t DecodeFixed64(const char* ptr) { - if (port::kLittleEndian) { - // Load the raw bytes - uint64_t result; - memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load - return result; - } else { - uint64_t lo = DecodeFixed32(ptr); - uint64_t hi = DecodeFixed32(ptr + 4); - return (hi << 32) | lo; - } -} - -// Internal routine for use by fallback path of GetVarint32Ptr -extern const char* GetVarint32PtrFallback(const char* p, - const char* limit, - uint32_t* value); -inline const char* GetVarint32Ptr(const char* p, - const char* limit, - uint32_t* value) { - if (p < limit) { - uint32_t result = *(reinterpret_cast(p)); - if ((result & 128) == 0) { - *value = result; - return p + 1; - } - } - return GetVarint32PtrFallback(p, limit, value); -} - -} - -#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/leveldb/util/coding_test.cc b/leveldb/util/coding_test.cc deleted file mode 100644 index a8dba04..0000000 --- a/leveldb/util/coding_test.cc +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/coding.h" - -#include "util/testharness.h" - -namespace leveldb { - -class Coding { }; - -TEST(Coding, Fixed32) { - std::string s; - for (uint32_t v = 0; v < 100000; v++) { - PutFixed32(&s, v); - } - - const char* p = s.data(); - for (uint32_t v = 0; v < 100000; v++) { - uint32_t actual = DecodeFixed32(p); - ASSERT_EQ(v, actual); - p += sizeof(uint32_t); - } -} - -TEST(Coding, Fixed64) { - std::string s; - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast(1) << power; - PutFixed64(&s, v - 1); - PutFixed64(&s, v + 0); - PutFixed64(&s, v + 1); - } - - const char* p = s.data(); - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast(1) << power; - uint64_t actual; - actual = DecodeFixed64(p); - ASSERT_EQ(v-1, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+0, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+1, actual); - p += sizeof(uint64_t); - } -} - -TEST(Coding, Varint32) { - std::string s; - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t v = (i / 32) << (i % 32); - PutVarint32(&s, v); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; - const char* start = p; - p = GetVarint32Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(expected, actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, s.data() + s.size()); -} - -TEST(Coding, Varint64) { - // Construct the list of values to check - std::vector values; - // Some special values - values.push_back(0); - values.push_back(100); - values.push_back(~static_cast(0)); - values.push_back(~static_cast(0) - 1); - for (uint32_t k = 0; k < 64; k++) { - // Test values near powers of two - const uint64_t power = 1ull << k; - values.push_back(power); - values.push_back(power-1); - values.push_back(power+1); - }; - - std::string s; - for (int i = 0; i < values.size(); i++) { - PutVarint64(&s, values[i]); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (int i = 0; i < values.size(); i++) { - ASSERT_TRUE(p < limit); - uint64_t actual; - const char* start = p; - p = GetVarint64Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(values[i], actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, limit); - -} - -TEST(Coding, Varint32Overflow) { - uint32_t result; - std::string input("\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint32Truncation) { - uint32_t large_value = (1u << 31) + 100; - std::string s; - PutVarint32(&s, large_value); - uint32_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Varint64Overflow) { - uint64_t result; - std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint64Truncation) { - uint64_t large_value = (1ull << 63) + 100ull; - std::string s; - PutVarint64(&s, large_value); - uint64_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Strings) { - std::string s; - PutLengthPrefixedSlice(&s, Slice("")); - PutLengthPrefixedSlice(&s, Slice("foo")); - PutLengthPrefixedSlice(&s, Slice("bar")); - PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); - - Slice input(s); - Slice v; - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("foo", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("bar", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ(std::string(200, 'x'), v.ToString()); - ASSERT_EQ("", input.ToString()); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/comparator.cc b/leveldb/util/comparator.cc deleted file mode 100644 index cc2b263..0000000 --- a/leveldb/util/comparator.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "leveldb/comparator.h" -#include "leveldb/slice.h" -#include "util/logging.h" - -namespace leveldb { - -Comparator::~Comparator() { } - -namespace { -class BytewiseComparatorImpl : public Comparator { - public: - BytewiseComparatorImpl() { } - - virtual const char* Name() const { - return "leveldb.BytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return a.compare(b); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Find length of common prefix - size_t min_length = std::min(start->size(), limit.size()); - size_t diff_index = 0; - while ((diff_index < min_length) && - ((*start)[diff_index] == limit[diff_index])) { - diff_index++; - } - - if (diff_index >= min_length) { - // Do not shorten if one string is a prefix of the other - } else { - uint8_t diff_byte = static_cast((*start)[diff_index]); - if (diff_byte < static_cast(0xff) && - diff_byte + 1 < static_cast(limit[diff_index])) { - (*start)[diff_index]++; - start->resize(diff_index + 1); - assert(Compare(*start, limit) < 0); - } - } - } - - virtual void FindShortSuccessor(std::string* key) const { - // Find first character that can be incremented - size_t n = key->size(); - for (size_t i = 0; i < n; i++) { - const uint8_t byte = (*key)[i]; - if (byte != static_cast(0xff)) { - (*key)[i] = byte + 1; - key->resize(i+1); - return; - } - } - // *key is a run of 0xffs. Leave it alone. - } -}; -} -static const BytewiseComparatorImpl bytewise; - -const Comparator* BytewiseComparator() { - return &bytewise; -} - -} diff --git a/leveldb/util/crc32c.cc b/leveldb/util/crc32c.cc deleted file mode 100644 index 28c2401..0000000 --- a/leveldb/util/crc32c.cc +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A portable implementation of crc32c, optimized to handle -// four bytes at a time. - -#include "util/crc32c.h" - -#include -#include "util/coding.h" - -namespace leveldb { -namespace crc32c { - -static const uint32_t table0_[256] = { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, - 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, - 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, - 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, - 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, - 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, - 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, - 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, - 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, - 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, - 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, - 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, - 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, - 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, - 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, - 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, - 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, - 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, - 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, - 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, - 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, - 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, - 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, - 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, - 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, - 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, - 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, - 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, - 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, - 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, - 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, - 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, - 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 -}; -static const uint32_t table1_[256] = { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, - 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, - 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, - 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, - 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, - 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, - 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, - 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, - 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, - 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, - 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, - 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, - 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, - 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, - 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, - 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, - 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, - 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, - 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, - 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, - 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, - 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, - 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, - 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, - 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, - 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, - 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, - 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, - 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, - 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, - 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, - 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, - 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 -}; -static const uint32_t table2_[256] = { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, - 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, - 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, - 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, - 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, - 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, - 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, - 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, - 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, - 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, - 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, - 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, - 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, - 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, - 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, - 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, - 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, - 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, - 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, - 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, - 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, - 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, - 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, - 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, - 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, - 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, - 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, - 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, - 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, - 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, - 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, - 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, - 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 -}; -static const uint32_t table3_[256] = { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, - 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, - 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, - 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, - 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, - 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, - 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, - 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, - 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, - 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, - 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, - 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, - 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, - 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, - 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, - 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, - 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, - 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, - 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, - 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, - 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, - 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, - 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, - 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, - 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, - 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, - 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, - 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, - 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, - 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, - 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, - 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, - 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 -}; - -// Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - return DecodeFixed32(reinterpret_cast(p)); -} - -uint32_t Extend(uint32_t crc, const char* buf, size_t size) { - const uint8_t *p = reinterpret_cast(buf); - const uint8_t *e = p + size; - uint32_t l = crc ^ 0xffffffffu; - -#define STEP1 do { \ - int c = (l & 0xff) ^ *p++; \ - l = table0_[c] ^ (l >> 8); \ -} while (0) -#define STEP4 do { \ - uint32_t c = l ^ LE_LOAD32(p); \ - p += 4; \ - l = table3_[c & 0xff] ^ \ - table2_[(c >> 8) & 0xff] ^ \ - table1_[(c >> 16) & 0xff] ^ \ - table0_[c >> 24]; \ -} while (0) - - // Point x at first 4-byte aligned byte in string. This might be - // just past the end of the string. - const uintptr_t pval = reinterpret_cast(p); - const uint8_t* x = reinterpret_cast(((pval + 3) >> 2) << 2); - if (x <= e) { - // Process bytes until finished or p is 4-byte aligned - while (p != x) { - STEP1; - } - } - // Process bytes 16 at a time - while ((e-p) >= 16) { - STEP4; STEP4; STEP4; STEP4; - } - // Process bytes 4 at a time - while ((e-p) >= 4) { - STEP4; - } - // Process the last few bytes - while (p != e) { - STEP1; - } -#undef STEP4 -#undef STEP1 - return l ^ 0xffffffffu; -} - -} -} diff --git a/leveldb/util/crc32c.h b/leveldb/util/crc32c.h deleted file mode 100644 index 938d8ff..0000000 --- a/leveldb/util/crc32c.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ -#define STORAGE_LEVELDB_UTIL_CRC32C_H_ - -#include -#include - -namespace leveldb { -namespace crc32c { - -// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the -// crc32c of some string A. Extend() is often used to maintain the -// crc32c of a stream of data. -extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); - -// Return the crc32c of data[0,n-1] -inline uint32_t Value(const char* data, size_t n) { - return Extend(0, data, n); -} - -static const uint32_t kMaskDelta = 0xa282ead8ul; - -// Return a masked representation of crc. -// -// Motivation: it is problematic to compute the CRC of a string that -// contains embedded CRCs. Therefore we recommend that CRCs stored -// somewhere (e.g., in files) should be masked before being stored. -inline uint32_t Mask(uint32_t crc) { - // Rotate right by 15 bits and add a constant. - return ((crc >> 15) | (crc << 17)) + kMaskDelta; -} - -// Return the crc whose masked representation is masked_crc. -inline uint32_t Unmask(uint32_t masked_crc) { - uint32_t rot = masked_crc - kMaskDelta; - return ((rot >> 17) | (rot << 15)); -} - -} -} - -#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_ diff --git a/leveldb/util/crc32c_test.cc b/leveldb/util/crc32c_test.cc deleted file mode 100644 index ba9e804..0000000 --- a/leveldb/util/crc32c_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/crc32c.h" -#include "util/testharness.h" - -namespace leveldb { -namespace crc32c { - -class CRC { }; - -TEST(CRC, StandardResults) { - // From rfc3720 section B.4. - char buf[32]; - - memset(buf, 0, sizeof(buf)); - ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf))); - - memset(buf, 0xff, sizeof(buf)); - ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = i; - } - ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = 31 - i; - } - ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf))); - - unsigned char data[48] = { - 0x01, 0xc0, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x14, - 0x00, 0x00, 0x00, 0x18, - 0x28, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - }; - ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); -} - -TEST(CRC, Values) { - ASSERT_NE(Value("a", 1), Value("foo", 3)); -} - -TEST(CRC, Extend) { - ASSERT_EQ(Value("hello world", 11), - Extend(Value("hello ", 6), "world", 5)); -} - -TEST(CRC, Mask) { - uint32_t crc = Value("foo", 3); - ASSERT_NE(crc, Mask(crc)); - ASSERT_NE(crc, Mask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/env.cc b/leveldb/util/env.cc deleted file mode 100644 index e5297e7..0000000 --- a/leveldb/util/env.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/env.h" - -namespace leveldb { - -Env::~Env() { -} - -SequentialFile::~SequentialFile() { -} - -RandomAccessFile::~RandomAccessFile() { -} - -WritableFile::~WritableFile() { -} - -FileLock::~FileLock() { -} - -void Log(Env* env, WritableFile* info_log, const char* format, ...) { - va_list ap; - va_start(ap, format); - env->Logv(info_log, format, ap); - va_end(ap); -} - -Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname) { - WritableFile* file; - Status s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - s = file->Append(data); - if (s.ok()) { - s = file->Close(); - } - delete file; // Will auto-close if we did not close above - if (!s.ok()) { - env->DeleteFile(fname); - } - return s; -} - -Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - data->clear(); - SequentialFile* file; - Status s = env->NewSequentialFile(fname, &file); - if (!s.ok()) { - return s; - } - static const int kBufferSize = 8192; - char* space = new char[kBufferSize]; - while (true) { - Slice fragment; - s = file->Read(kBufferSize, &fragment, space); - if (!s.ok()) { - break; - } - data->append(fragment.data(), fragment.size()); - if (fragment.empty()) { - break; - } - } - delete[] space; - delete file; - return s; -} - -EnvWrapper::~EnvWrapper() { -} - -} diff --git a/leveldb/util/env_chromium.cc b/leveldb/util/env_chromium.cc deleted file mode 100644 index 7edc7a9..0000000 --- a/leveldb/util/env_chromium.cc +++ /dev/null @@ -1,603 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include -#include "base/at_exit.h" -#include "base/file_path.h" -#include "base/file_util.h" -#include "base/lazy_instance.h" -#include "base/memory/ref_counted.h" -#include "base/message_loop.h" -#include "base/platform_file.h" -#include "base/process_util.h" -#include "base/synchronization/lock.h" -#include "base/sys_info.h" -#include "base/task.h" -#include "base/threading/platform_thread.h" -#include "base/threading/thread.h" -#include "base/utf_string_conversions.h" -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -#if defined(OS_WIN) -#include -#include "base/win/win_util.h" -#endif - -#if defined(OS_MACOSX) || defined(OS_WIN) -// The following are glibc-specific -extern "C" { -size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { - return fread(ptr, size, n, file); -} - -size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { - return fwrite(ptr, size, n, file); -} - -int fflush_unlocked(FILE *file) { - return fflush(file); -} - -int fdatasync(int fildes) { -#if defined(OS_WIN) - return _commit(fildes); -#else - return fsync(fildes); -#endif -} -} -#endif - -namespace leveldb { - -namespace { - -class Thread; - -static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] - = FILE_PATH_LITERAL("leveldb-test-"); - -::FilePath CreateFilePath(const std::string& file_path) { -#if defined(OS_WIN) - return FilePath(UTF8ToUTF16(file_path)); -#else - return FilePath(file_path); -#endif -} - -std::string FilePathToString(const ::FilePath& file_path) { -#if defined(OS_WIN) - return UTF16ToUTF8(file_path.value()); -#else - return file_path.value(); -#endif -} - -// TODO(jorlow): This should be moved into Chromium's base. -const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { - switch (error) { - case ::base::PLATFORM_FILE_ERROR_FAILED: - return "Opening file failed."; - case ::base::PLATFORM_FILE_ERROR_IN_USE: - return "File currently in use."; - case ::base::PLATFORM_FILE_ERROR_EXISTS: - return "File already exists."; - case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: - return "File not found."; - case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: - return "Access denied."; - case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: - return "Too many files open."; - case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: - return "Out of memory."; - case ::base::PLATFORM_FILE_ERROR_NO_SPACE: - return "No space left on drive."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: - return "Not a directory."; - case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: - return "Invalid operation."; - case ::base::PLATFORM_FILE_ERROR_SECURITY: - return "Security error."; - case ::base::PLATFORM_FILE_ERROR_ABORT: - return "File operation aborted."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: - return "The supplied path was not a file."; - case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: - return "The file was not empty."; - } - NOTIMPLEMENTED(); - return "Unknown error."; -} - -class ChromiumSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~ChromiumSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class ChromiumRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - ::base::PlatformFile file_; - - public: - ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) - : filename_(fname), file_(file) { } - virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - int r = ::base::ReadPlatformFile(file_, offset, scratch, n); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, "Could not preform read"); - } - return s; - } -}; - -class ChromiumWritableFile : public WritableFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumWritableFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - - ~ChromiumWritableFile() { - if (file_ != NULL) { - // Ignoring any potential errors - fclose(file_); - } - } - - virtual Status Append(const Slice& data) { - size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); - Status result; - if (r != data.size()) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Close() { - Status result; - if (fclose(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - file_ = NULL; - return result; - } - - virtual Status Flush() { - Status result; - if (fflush_unlocked(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Sync() { - Status result; - if ((fflush_unlocked(file_) != 0) || - (fdatasync(fileno(file_)) != 0)) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } -}; - -class ChromiumFileLock : public FileLock { - public: - ::base::PlatformFile file_; -}; - -class ChromiumEnv : public Env { - public: - ChromiumEnv(); - virtual ~ChromiumEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "rb"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - *result = NULL; - return Status::IOError(fname, PlatformFileErrorString(error_code)); - } - *result = new ChromiumRandomAccessFile(fname, file); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - *result = NULL; - FILE* f = fopen(fname.c_str(), "wb"); - if (f == NULL) { - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumWritableFile(fname, f); - return Status::OK(); - } - } - - virtual bool FileExists(const std::string& fname) { - return ::file_util::PathExists(CreateFilePath(fname)); - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - result->clear(); - ::file_util::FileEnumerator iter( - CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); - ::FilePath current = iter.Next(); - while (!current.empty()) { - result->push_back(FilePathToString(current.BaseName())); - current = iter.Next(); - } - // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so - // we'll always return OK. Maybe manually check for error - // conditions like the file not existing? - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - // TODO(jorlow): Should we assert this is a file? - if (!::file_util::Delete(CreateFilePath(fname), false)) { - result = Status::IOError(fname, "Could not delete file."); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (!::file_util::CreateDirectory(CreateFilePath(name))) { - result = Status::IOError(name, "Could not create directory."); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - // TODO(jorlow): Should we assert this is a directory? - if (!::file_util::Delete(CreateFilePath(name), false)) { - result = Status::IOError(name, "Could not delete directory."); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - int64_t signed_size; - if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { - *size = 0; - s = Status::IOError(fname, "Could not determine file size."); - } else { - *size = static_cast(signed_size); - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& dst) { - Status result; - if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { - result = Status::IOError(src, "Could not rename file."); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | - ::base::PLATFORM_FILE_READ | - ::base::PLATFORM_FILE_WRITE | - ::base::PLATFORM_FILE_EXCLUSIVE_READ | - ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - result = Status::IOError(fname, PlatformFileErrorString(error_code)); - } else { - ChromiumFileLock* my_lock = new ChromiumFileLock; - my_lock->file_ = file; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - ChromiumFileLock* my_lock = reinterpret_cast(lock); - Status result; - if (!::base::ClosePlatformFile(my_lock->file_)) { - result = Status::IOError("Could not close lock file."); - } - delete my_lock; - return result; - } - - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual std::string UserIdentifier() { -#if defined(OS_WIN) - std::wstring user_sid; - bool ret = ::base::win::GetUserSidString(&user_sid); - DCHECK(ret); - return UTF16ToUTF8(user_sid); -#else - char buf[100]; - snprintf(buf, sizeof(buf), "%d", int(geteuid())); - return buf; -#endif - } - - virtual Status GetTestDirectory(std::string* path) { - mu_.Acquire(); - if (test_directory_.empty()) { - if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, - &test_directory_)) { - mu_.Release(); - return Status::IOError("Could not create temp directory."); - } - } - *path = FilePathToString(test_directory_); - mu_.Release(); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - // TODO(jorlow): We may want to just use Chromium's built in logging. - - uint64_t thread_id = 0; - // Coppied from base/logging.cc. -#if defined(OS_WIN) - thread_id = GetCurrentThreadId(); -#elif defined(OS_MACOSX) - thread_id = mach_thread_self(); -#elif defined(OS_LINUX) - thread_id = syscall(__NR_gettid); -#elif defined(OS_FREEBSD) || defined(OS_NACL) - // TODO(BSD): find a better thread ID - pthread_t tid = pthread_self(); - memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); -#endif - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000, - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - return snprintf(buffer, size, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000); - } - - virtual uint64_t NowMicros() { - return ::base::TimeTicks::HighResNow().ToInternalValue(); - } - - virtual void SleepForMicroseconds(int micros) { - // Round up to the next millisecond. - ::base::PlatformThread::Sleep((micros + 999) / 1000); - } - - private: - // BGThread() is the body of the background thread - void BGThread(); - static void BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - } - - FilePath test_directory_; - - size_t page_size_; - ::base::Lock mu_; - ::base::ConditionVariable bgsignal_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; -}; - -ChromiumEnv::ChromiumEnv() - : page_size_(::base::SysInfo::VMAllocationGranularity()), - bgsignal_(&mu_), - started_bgthread_(false) { -#if defined(OS_MACOSX) - ::base::EnableTerminationOnHeapCorruption(); - ::base::EnableTerminationOnOutOfMemory(); -#endif // OS_MACOSX -} - -class Thread : public ::base::PlatformThread::Delegate { - public: - Thread(void (*function)(void* arg), void* arg) - : function_(function), arg_(arg) { - ::base::PlatformThreadHandle handle; - bool success = ::base::PlatformThread::Create(0, this, &handle); - DCHECK(success); - } - virtual ~Thread() {} - virtual void ThreadMain() { - (*function_)(arg_); - delete this; - } - - private: - void (*function_)(void* arg); - void* arg_; -}; - -void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { - mu_.Acquire(); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - StartThread(&ChromiumEnv::BGThreadWrapper, this); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. - if (queue_.empty()) { - bgsignal_.Signal(); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - mu_.Release(); -} - -void ChromiumEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - mu_.Acquire(); - while (queue_.empty()) { - bgsignal_.Wait(); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - mu_.Release(); - (*function)(arg); - } -} - -void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { - new Thread(function, arg); // Will self-delete. -} - -::base::LazyInstance > - default_env(::base::LINKER_INITIALIZED); - -} - -Env* Env::Default() { - return default_env.Pointer(); -} - -} diff --git a/leveldb/util/env_posix.cc b/leveldb/util/env_posix.cc deleted file mode 100644 index 5cddb0c..0000000 --- a/leveldb/util/env_posix.cc +++ /dev/null @@ -1,599 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(LEVELDB_PLATFORM_ANDROID) -#include -#endif -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -namespace leveldb { - -namespace { - -class PosixSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - PosixSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~PosixSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class PosixRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - int fd_; - - public: - PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd) { } - virtual ~PosixRandomAccessFile() { close(fd_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - ssize_t r = pread(fd_, scratch, n, static_cast(offset)); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - return s; - } -}; - -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. -class PosixMmapFile : public WritableFile { - private: - std::string filename_; - int fd_; - size_t page_size_; - size_t map_size_; // How much extra memory to map at a time - char* base_; // The mapped region - char* limit_; // Limit of the mapped region - char* dst_; // Where to write next (in range [base_,limit_]) - char* last_sync_; // Where have we synced up to - uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? - bool pending_sync_; - - // Roundup x to a multiple of y - static size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; - } - - size_t TruncateToPageBoundary(size_t s) { - s -= (s & (page_size_ - 1)); - assert((s % page_size_) == 0); - return s; - } - - void UnmapCurrentRegion() { - if (base_ != NULL) { - if (last_sync_ < limit_) { - // Defer syncing this data until next Sync() call, if any - pending_sync_ = true; - } - munmap(base_, limit_ - base_); - file_offset_ += limit_ - base_; - base_ = NULL; - limit_ = NULL; - last_sync_ = NULL; - dst_ = NULL; - - // Increase the amount we map the next time, but capped at 1MB - if (map_size_ < (1<<20)) { - map_size_ *= 2; - } - } - } - - bool MapNewRegion() { - assert(base_ == NULL); - if (ftruncate(fd_, file_offset_ + map_size_) < 0) { - return false; - } - void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_, file_offset_); - if (ptr == MAP_FAILED) { - return false; - } - base_ = reinterpret_cast(ptr); - limit_ = base_ + map_size_; - dst_ = base_; - last_sync_ = base_; - return true; - } - - public: - PosixMmapFile(const std::string& fname, int fd, size_t page_size) - : filename_(fname), - fd_(fd), - page_size_(page_size), - map_size_(Roundup(65536, page_size)), - base_(NULL), - limit_(NULL), - dst_(NULL), - last_sync_(NULL), - file_offset_(0), - pending_sync_(false) { - assert((page_size & (page_size - 1)) == 0); - } - - - ~PosixMmapFile() { - if (fd_ >= 0) { - PosixMmapFile::Close(); - } - } - - virtual Status Append(const Slice& data) { - const char* src = data.data(); - size_t left = data.size(); - while (left > 0) { - assert(base_ <= dst_); - assert(dst_ <= limit_); - size_t avail = limit_ - dst_; - if (avail == 0) { - UnmapCurrentRegion(); - MapNewRegion(); - } - - size_t n = (left <= avail) ? left : avail; - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; - } - return Status::OK(); - } - - virtual Status Close() { - Status s; - size_t unused = limit_ - dst_; - UnmapCurrentRegion(); - if (unused > 0) { - // Trim the extra space at the end of the file - if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (close(fd_) < 0) { - if (s.ok()) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - fd_ = -1; - base_ = NULL; - limit_ = NULL; - return s; - } - - virtual Status Flush() { - return Status::OK(); - } - - virtual Status Sync() { - Status s; - - if (pending_sync_) { - // Some unmapped data was not synced - pending_sync_ = false; - if (fdatasync(fd_) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (dst_ > last_sync_) { - // Find the beginnings of the pages that contain the first and last - // bytes to be synced. - size_t p1 = TruncateToPageBoundary(last_sync_ - base_); - size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); - last_sync_ = dst_; - if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - return s; - } -}; - -static int LockOrUnlock(int fd, bool lock) { - errno = 0; - struct flock f; - memset(&f, 0, sizeof(f)); - f.l_type = (lock ? F_WRLCK : F_UNLCK); - f.l_whence = SEEK_SET; - f.l_start = 0; - f.l_len = 0; // Lock/unlock entire file - return fcntl(fd, F_SETLK, &f); -} - -class PosixFileLock : public FileLock { - public: - int fd_; -}; - -class PosixEnv : public Env { - public: - PosixEnv(); - virtual ~PosixEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "r"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int fd = open(fname.c_str(), O_RDONLY); - if (fd < 0) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } - *result = new PosixRandomAccessFile(fname, fd); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - Status s; - const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { - *result = NULL; - s = Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixMmapFile(fname, fd, page_size_); - } - return s; - } - - virtual bool FileExists(const std::string& fname) { - return access(fname.c_str(), F_OK) == 0; - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - result->clear(); - DIR* d = opendir(dir.c_str()); - if (d == NULL) { - return Status::IOError(dir, strerror(errno)); - } - struct dirent* entry; - while ((entry = readdir(d)) != NULL) { - result->push_back(entry->d_name); - } - closedir(d); - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - if (unlink(fname.c_str()) != 0) { - result = Status::IOError(fname, strerror(errno)); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (mkdir(name.c_str(), 0755) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - if (rmdir(name.c_str()) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - *size = 0; - s = Status::IOError(fname, strerror(errno)); - } else { - *size = sbuf.st_size; - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& target) { - Status result; - if (rename(src.c_str(), target.c_str()) != 0) { - result = Status::IOError(src, strerror(errno)); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); - if (fd < 0) { - result = Status::IOError(fname, strerror(errno)); - } else if (LockOrUnlock(fd, true) == -1) { - result = Status::IOError("lock " + fname, strerror(errno)); - close(fd); - } else { - PosixFileLock* my_lock = new PosixFileLock; - my_lock->fd_ = fd; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - PosixFileLock* my_lock = reinterpret_cast(lock); - Status result; - if (LockOrUnlock(my_lock->fd_, false) == -1) { - result = Status::IOError(strerror(errno)); - } - close(my_lock->fd_); - delete my_lock; - return result; - } - - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual Status GetTestDirectory(std::string* result) { - const char* env = getenv("TEST_TMPDIR"); - if (env && env[0] != '\0') { - *result = env; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); - *result = buf; - } - // Directory may already exist - CreateDir(*result); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - pthread_t tid = pthread_self(); - uint64_t thread_id = 0; - memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - struct timeval now_tv; - gettimeofday(&now_tv, NULL); - const time_t seconds = now_tv.tv_sec; - struct tm t; - localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec), - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual uint64_t NowMicros() { - struct timeval tv; - gettimeofday(&tv, NULL); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - virtual void SleepForMicroseconds(int micros) { - usleep(micros); - } - - private: - void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); - } - } - - // BGThread() is the body of the background thread - void BGThread(); - static void* BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - return NULL; - } - - size_t page_size_; - pthread_mutex_t mu_; - pthread_cond_t bgsignal_; - pthread_t bgthread_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; -}; - -PosixEnv::PosixEnv() : page_size_(getpagesize()), - started_bgthread_(false) { - PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); - PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); -} - -void PosixEnv::Schedule(void (*function)(void*), void* arg) { - PthreadCall("lock", pthread_mutex_lock(&mu_)); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - PthreadCall( - "create thread", - pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. - if (queue_.empty()) { - PthreadCall("signal", pthread_cond_signal(&bgsignal_)); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); -} - -void PosixEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - PthreadCall("lock", pthread_mutex_lock(&mu_)); - while (queue_.empty()) { - PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - (*function)(arg); - } -} - -namespace { -struct StartThreadState { - void (*user_function)(void*); - void* arg; -}; -} -static void* StartThreadWrapper(void* arg) { - StartThreadState* state = reinterpret_cast(arg); - state->user_function(state->arg); - delete state; - return NULL; -} - -void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { - pthread_t t; - StartThreadState* state = new StartThreadState; - state->user_function = function; - state->arg = arg; - PthreadCall("start thread", - pthread_create(&t, NULL, &StartThreadWrapper, state)); -} - -} - -static pthread_once_t once = PTHREAD_ONCE_INIT; -static Env* default_env; -static void InitDefaultEnv() { default_env = new PosixEnv; } - -Env* Env::Default() { - pthread_once(&once, InitDefaultEnv); - return default_env; -} - -} diff --git a/leveldb/util/env_test.cc b/leveldb/util/env_test.cc deleted file mode 100644 index 3c253be..0000000 --- a/leveldb/util/env_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/env.h" - -#include "port/port.h" -#include "util/testharness.h" - -namespace leveldb { - -static const int kDelayMicros = 100000; - -class EnvPosixTest { - private: - port::Mutex mu_; - std::string events_; - - public: - Env* env_; - EnvPosixTest() : env_(Env::Default()) { } -}; - -static void SetBool(void* ptr) { - *(reinterpret_cast(ptr)) = true; -} - -TEST(EnvPosixTest, RunImmediately) { - bool called = false; - env_->Schedule(&SetBool, &called); - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called); -} - -TEST(EnvPosixTest, RunMany) { - int last_id = 0; - - struct CB { - int* last_id_ptr; // Pointer to shared slot - int id; // Order# for the execution of this callback - - CB(int* p, int i) : last_id_ptr(p), id(i) { } - - static void Run(void* v) { - CB* cb = reinterpret_cast(v); - ASSERT_EQ(cb->id-1, *cb->last_id_ptr); - *cb->last_id_ptr = cb->id; - } - }; - - // Schedule in different order than start time - CB cb1(&last_id, 1); - CB cb2(&last_id, 2); - CB cb3(&last_id, 3); - CB cb4(&last_id, 4); - env_->Schedule(&CB::Run, &cb1); - env_->Schedule(&CB::Run, &cb2); - env_->Schedule(&CB::Run, &cb3); - env_->Schedule(&CB::Run, &cb4); - - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_EQ(4, last_id); -} - -struct State { - port::Mutex mu; - int val; - int num_running; -}; - -static void ThreadBody(void* arg) { - State* s = reinterpret_cast(arg); - s->mu.Lock(); - s->val += 1; - s->num_running -= 1; - s->mu.Unlock(); -} - -TEST(EnvPosixTest, StartThread) { - State state; - state.val = 0; - state.num_running = 3; - for (int i = 0; i < 3; i++) { - env_->StartThread(&ThreadBody, &state); - } - while (true) { - state.mu.Lock(); - int num = state.num_running; - state.mu.Unlock(); - if (num == 0) { - break; - } - Env::Default()->SleepForMicroseconds(kDelayMicros); - } - ASSERT_EQ(state.val, 3); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/hash.cc b/leveldb/util/hash.cc deleted file mode 100644 index d19afd1..0000000 --- a/leveldb/util/hash.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "util/coding.h" -#include "util/hash.h" - -namespace leveldb { - -uint32_t Hash(const char* data, size_t n, uint32_t seed) { - // Similar to murmur hash - const uint32_t m = 0xc6a4a793; - const uint32_t r = 24; - const char* limit = data + n; - uint32_t h = seed ^ (n * m); - - // Pick up four bytes at a time - while (data + 4 <= limit) { - uint32_t w = DecodeFixed32(data); - data += 4; - h += w; - h *= m; - h ^= (h >> 16); - } - - // Pick up remaining bytes - switch (limit - data) { - case 3: - h += data[2] << 16; - // fall through - case 2: - h += data[1] << 8; - // fall through - case 1: - h += data[0]; - h *= m; - h ^= (h >> r); - break; - } - return h; -} - - -} diff --git a/leveldb/util/hash.h b/leveldb/util/hash.h deleted file mode 100644 index 8889d56..0000000 --- a/leveldb/util/hash.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Simple hash function used for internal data structures - -#ifndef STORAGE_LEVELDB_UTIL_HASH_H_ -#define STORAGE_LEVELDB_UTIL_HASH_H_ - -#include -#include - -namespace leveldb { - -extern uint32_t Hash(const char* data, size_t n, uint32_t seed); - -} - -#endif // STORAGE_LEVELDB_UTIL_HASH_H_ diff --git a/leveldb/util/histogram.cc b/leveldb/util/histogram.cc deleted file mode 100644 index c5178ef..0000000 --- a/leveldb/util/histogram.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include "port/port.h" -#include "util/histogram.h" - -namespace leveldb { - -const double Histogram::kBucketLimit[kNumBuckets] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, - 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, - 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, - 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, - 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, - 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, - 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, - 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, - 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, - 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, - 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, - 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, - 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, - 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, - 1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000, - 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0, - 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0, - 1e200, -}; - -void Histogram::Clear() { - min_ = kBucketLimit[kNumBuckets-1]; - max_ = 0; - num_ = 0; - sum_ = 0; - sum_squares_ = 0; - for (int i = 0; i < kNumBuckets; i++) { - buckets_[i] = 0; - } -} - -void Histogram::Add(double value) { - // Linear search is fast enough for our usage in db_bench - int b = 0; - while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { - b++; - } - buckets_[b] += 1.0; - if (min_ > value) min_ = value; - if (max_ < value) max_ = value; - num_++; - sum_ += value; - sum_squares_ += (value * value); -} - -double Histogram::Median() const { - return Percentile(50.0); -} - -double Histogram::Percentile(double p) const { - double threshold = num_ * (p / 100.0); - double sum = 0; - for (int b = 0; b < kNumBuckets; b++) { - sum += buckets_[b]; - if (sum >= threshold) { - // Scale linearly within this bucket - double left_point = (b == 0) ? 0 : kBucketLimit[b-1]; - double right_point = kBucketLimit[b]; - double left_sum = sum - buckets_[b]; - double right_sum = sum; - double pos = (threshold - left_sum) / (right_sum - left_sum); - double r = left_point + (right_point - left_point) * pos; - if (r < min_) r = min_; - if (r > max_) r = max_; - return r; - } - } - return max_; -} - -double Histogram::Average() const { - if (num_ == 0.0) return 0; - return sum_ / num_; -} - -double Histogram::StandardDeviation() const { - if (num_ == 0.0) return 0; - double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); - return sqrt(variance); -} - -std::string Histogram::ToString() const { - std::string r; - char buf[200]; - snprintf(buf, sizeof(buf), - "Count: %.0f Average: %.4f StdDev: %.2f\n", - num_, Average(), StandardDeviation()); - r.append(buf); - snprintf(buf, sizeof(buf), - "Min: %.4f Median: %.4f Max: %.4f\n", - (num_ == 0.0 ? 0.0 : min_), Median(), max_); - r.append(buf); - r.append("------------------------------------------------------\n"); - const double mult = 100.0 / num_; - double sum = 0; - for (int b = 0; b < kNumBuckets; b++) { - if (buckets_[b] <= 0.0) continue; - sum += buckets_[b]; - snprintf(buf, sizeof(buf), - "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ", - ((b == 0) ? 0.0 : kBucketLimit[b-1]), // left - kBucketLimit[b], // right - buckets_[b], // count - mult * buckets_[b], // percentage - mult * sum); // cumulative percentage - r.append(buf); - - // Add hash marks based on percentage; 20 marks for 100%. - int marks = static_cast(20*(buckets_[b] / num_) + 0.5); - r.append(marks, '#'); - r.push_back('\n'); - } - return r; -} - -} diff --git a/leveldb/util/histogram.h b/leveldb/util/histogram.h deleted file mode 100644 index f72f122..0000000 --- a/leveldb/util/histogram.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ -#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ - -#include - -namespace leveldb { - -class Histogram { - public: - Histogram() { } - ~Histogram() { } - - void Clear(); - void Add(double value); - - std::string ToString() const; - - private: - double min_; - double max_; - double num_; - double sum_; - double sum_squares_; - - enum { kNumBuckets = 154 }; - static const double kBucketLimit[kNumBuckets]; - double buckets_[kNumBuckets]; - - double Median() const; - double Percentile(double p) const; - double Average() const; - double StandardDeviation() const; -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ diff --git a/leveldb/util/logging.cc b/leveldb/util/logging.cc deleted file mode 100644 index 760d335..0000000 --- a/leveldb/util/logging.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/logging.h" - -#include -#include -#include -#include -#include "leveldb/env.h" -#include "leveldb/slice.h" - -namespace leveldb { - -void AppendNumberTo(std::string* str, uint64_t num) { - char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); - str->append(buf); -} - -void AppendEscapedStringTo(std::string* str, const Slice& value) { - for (size_t i = 0; i < value.size(); i++) { - char c = value[i]; - if (c >= ' ' && c <= '~') { - str->push_back(c); - } else { - char buf[10]; - snprintf(buf, sizeof(buf), "\\x%02x", - static_cast(c) & 0xff); - str->append(buf); - } - } -} - -std::string NumberToString(uint64_t num) { - std::string r; - AppendNumberTo(&r, num); - return r; -} - -std::string EscapeString(const Slice& value) { - std::string r; - AppendEscapedStringTo(&r, value); - return r; -} - -bool ConsumeChar(Slice* in, char c) { - if (!in->empty() && (*in)[0] == c) { - in->remove_prefix(1); - return true; - } else { - return false; - } -} - -bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { - uint64_t v = 0; - int digits = 0; - while (!in->empty()) { - char c = (*in)[0]; - if (c >= '0' && c <= '9') { - ++digits; - const int delta = (c - '0'); - static const uint64_t kMaxUint64 = ~static_cast(0); - if (v > kMaxUint64/10 || - (v == kMaxUint64/10 && delta > kMaxUint64%10)) { - // Overflow - return false; - } - v = (v * 10) + delta; - in->remove_prefix(1); - } else { - break; - } - } - *val = v; - return (digits > 0); -} - -} diff --git a/leveldb/util/logging.h b/leveldb/util/logging.h deleted file mode 100644 index 1cd0a4b..0000000 --- a/leveldb/util/logging.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Must not be included from any .h files to avoid polluting the namespace -// with macros. - -#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ -#define STORAGE_LEVELDB_UTIL_LOGGING_H_ - -#include -#include -#include -#include "port/port.h" - -namespace leveldb { - -class Slice; -class WritableFile; - -// Append a human-readable printout of "num" to *str -extern void AppendNumberTo(std::string* str, uint64_t num); - -// Append a human-readable printout of "value" to *str. -// Escapes any non-printable characters found in "value". -extern void AppendEscapedStringTo(std::string* str, const Slice& value); - -// Return a human-readable printout of "num" -extern std::string NumberToString(uint64_t num); - -// Return a human-readable version of "value". -// Escapes any non-printable characters found in "value". -extern std::string EscapeString(const Slice& value); - -// If *in starts with "c", advances *in past the first character and -// returns true. Otherwise, returns false. -extern bool ConsumeChar(Slice* in, char c); - -// Parse a human-readable number from "*in" into *value. On success, -// advances "*in" past the consumed number and sets "*val" to the -// numeric value. Otherwise, returns false and leaves *in in an -// unspecified state. -extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); - -} - -#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/leveldb/util/mutexlock.h b/leveldb/util/mutexlock.h deleted file mode 100644 index 05fe279..0000000 --- a/leveldb/util/mutexlock.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ -#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ - -#include "port/port.h" - -namespace leveldb { - -// Helper class that locks a mutex on construction and unlocks the mutex when -// the destructor of the MutexLock object is invoked. -// -// Typical usage: -// -// void MyClass::MyMethod() { -// MutexLock l(&mu_); // mu_ is an instance variable -// ... some complex code, possibly with multiple return paths ... -// } - -class MutexLock { - public: - explicit MutexLock(port::Mutex *mu) : mu_(mu) { - this->mu_->Lock(); - } - ~MutexLock() { this->mu_->Unlock(); } - - private: - port::Mutex *const mu_; - // No copying allowed - MutexLock(const MutexLock&); - void operator=(const MutexLock&); -}; - -} - - -#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/leveldb/util/options.cc b/leveldb/util/options.cc deleted file mode 100644 index 0ea5c98..0000000 --- a/leveldb/util/options.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/options.h" - -#include "leveldb/comparator.h" -#include "leveldb/env.h" - -namespace leveldb { - -Options::Options() - : comparator(BytewiseComparator()), - create_if_missing(false), - error_if_exists(false), - paranoid_checks(false), - env(Env::Default()), - info_log(NULL), - write_buffer_size(4<<20), - max_open_files(1000), - block_cache(NULL), - block_size(4096), - block_restart_interval(16), - compression(kSnappyCompression) { -} - - -} diff --git a/leveldb/util/random.h b/leveldb/util/random.h deleted file mode 100644 index d886b4e..0000000 --- a/leveldb/util/random.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ -#define STORAGE_LEVELDB_UTIL_RANDOM_H_ - -#include - -namespace leveldb { - -// A very simple random number generator. Not especially good at -// generating truly random bits, but good enough for our needs in this -// package. -class Random { - private: - uint32_t seed_; - public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } - uint32_t Next() { - static const uint32_t M = 2147483647L; // 2^31-1 - static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 - // We are computing - // seed_ = (seed_ * A) % M, where M = 2^31-1 - // - // seed_ must not be zero or M, or else all subsequent computed values - // will be zero or M respectively. For all other values, seed_ will end - // up cycling through every number in [1,M-1] - uint64_t product = seed_ * A; - - // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = static_cast((product >> 31) + (product & M)); - // The first reduction may overflow by 1 bit, so we may need to - // repeat. mod == M is not possible; using > allows the faster - // sign-bit-based test. - if (seed_ > M) { - seed_ -= M; - } - return seed_; - } - // Returns a uniformly distributed value in the range [0..n-1] - // REQUIRES: n > 0 - uint32_t Uniform(int n) { return Next() % n; } - - // Randomly returns true ~"1/n" of the time, and false otherwise. - // REQUIRES: n > 0 - bool OneIn(int n) { return (Next() % n) == 0; } - - // Skewed: pick "base" uniformly from range [0,max_log] and then - // return "base" random bits. The effect is to pick a number in the - // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); - } -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/leveldb/util/status.cc b/leveldb/util/status.cc deleted file mode 100644 index d9b7195..0000000 --- a/leveldb/util/status.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "port/port.h" -#include "leveldb/status.h" - -namespace leveldb { - -Status::Status(Code code, const Slice& msg, const Slice& msg2) { - assert(code != kOk); - state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); - if (!msg2.empty()) { - state_->second.append(": "); - state_->second.append(msg2.data(), msg2.size()); - } -} - -std::string Status::ToString() const { - if (state_ == NULL) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (state_->first) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound"; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(state_->first)); - type = tmp; - break; - } - std::string result(type); - if (!state_->second.empty()) { - result.append(state_->second); - } - return result; - } -} - -} diff --git a/leveldb/util/testharness.cc b/leveldb/util/testharness.cc deleted file mode 100644 index b686ac3..0000000 --- a/leveldb/util/testharness.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/testharness.h" - -#include -#include - -namespace leveldb { -namespace test { - -namespace { -struct Test { - const char* base; - const char* name; - void (*func)(); -}; -std::vector* tests; -} - -bool RegisterTest(const char* base, const char* name, void (*func)()) { - if (tests == NULL) { - tests = new std::vector; - } - Test t; - t.base = base; - t.name = name; - t.func = func; - tests->push_back(t); - return true; -} - -int RunAllTests() { - int num = 0; - if (tests != NULL) { - for (int i = 0; i < tests->size(); i++) { - const Test& t = (*tests)[i]; - fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); - (*t.func)(); - ++num; - } - } - fprintf(stderr, "==== PASSED %d tests\n", num); - return 0; -} - -std::string TmpDir() { - std::string dir; - Status s = Env::Default()->GetTestDirectory(&dir); - ASSERT_TRUE(s.ok()) << s.ToString(); - return dir; -} - -int RandomSeed() { - const char* env = getenv("TEST_RANDOM_SEED"); - int result = (env != NULL ? atoi(env) : 301); - if (result <= 0) { - result = 301; - } - return result; -} - -} -} diff --git a/leveldb/util/testharness.h b/leveldb/util/testharness.h deleted file mode 100644 index 13ab914..0000000 --- a/leveldb/util/testharness.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ -#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ - -#include -#include -#include -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "util/random.h" - -namespace leveldb { -namespace test { - -// Run all tests registered by the TEST() macro. -// Returns 0 if all tests pass. -// Dies or returns a non-zero value if some test fails. -extern int RunAllTests(); - -// Return the directory to use for temporary storage. -extern std::string TmpDir(); - -// Return a randomization seed for this run. Typically returns the -// same number on repeated invocations of this binary, but automated -// runs may be able to vary the seed. -extern int RandomSeed(); - -// An instance of Tester is allocated to hold temporary state during -// the execution of an assertion. -class Tester { - private: - bool ok_; - const char* fname_; - int line_; - std::stringstream ss_; - - public: - Tester(const char* f, int l) - : ok_(true), fname_(f), line_(l) { - } - - ~Tester() { - if (!ok_) { - fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); - exit(1); - } - } - - Tester& Is(bool b, const char* msg) { - if (!b) { - ss_ << " Assertion failure " << msg; - ok_ = false; - } - return *this; - } - - Tester& IsOk(const Status& s) { - if (!s.ok()) { - ss_ << " " << s.ToString(); - ok_ = false; - } - return *this; - } - -#define BINARY_OP(name,op) \ - template \ - Tester& name(const X& x, const Y& y) { \ - if (! (x op y)) { \ - ss_ << " failed: " << x << (" " #op " ") << y; \ - ok_ = false; \ - } \ - return *this; \ - } - - BINARY_OP(IsEq, ==) - BINARY_OP(IsNe, !=) - BINARY_OP(IsGe, >=) - BINARY_OP(IsGt, >) - BINARY_OP(IsLe, <=) - BINARY_OP(IsLt, <) -#undef BINARY_OP - - // Attach the specified value to the error message if an error has occurred - template - Tester& operator<<(const V& value) { - if (!ok_) { - ss_ << " " << value; - } - return *this; - } -}; - -#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c) -#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s)) -#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) -#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) -#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) -#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) -#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) -#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) - -#define TCONCAT(a,b) TCONCAT1(a,b) -#define TCONCAT1(a,b) a##b - -#define TEST(base,name) \ -class TCONCAT(_Test_,name) : public base { \ - public: \ - void _Run(); \ - static void _RunIt() { \ - TCONCAT(_Test_,name) t; \ - t._Run(); \ - } \ -}; \ -bool TCONCAT(_Test_ignored_,name) = \ - ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ -void TCONCAT(_Test_,name)::_Run() - -// Register the specified test. Typically not used directly, but -// invoked via the macro expansion of TEST. -extern bool RegisterTest(const char* base, const char* name, void (*func)()); - - -} -} - -#endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ diff --git a/leveldb/util/testutil.cc b/leveldb/util/testutil.cc deleted file mode 100644 index 8d6cf3c..0000000 --- a/leveldb/util/testutil.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/testutil.h" - -#include "util/random.h" - -namespace leveldb { -namespace test { - -Slice RandomString(Random* rnd, int len, std::string* dst) { - dst->resize(len); - for (int i = 0; i < len; i++) { - (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' - } - return Slice(*dst); -} - -std::string RandomKey(Random* rnd, int len) { - // Make sure to generate a wide variety of characters so we - // test the boundary conditions for short-key optimizations. - static const char kTestChars[] = { - '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' - }; - std::string result; - for (int i = 0; i < len; i++) { - result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; - } - return result; -} - - -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst) { - int raw = static_cast(len * compressed_fraction); - if (raw < 1) raw = 1; - std::string raw_data; - RandomString(rnd, raw, &raw_data); - - // Duplicate the random data until we have filled "len" bytes - dst->clear(); - while (dst->size() < len) { - dst->append(raw_data); - } - dst->resize(len); - return Slice(*dst); -} - -} -} diff --git a/leveldb/util/testutil.h b/leveldb/util/testutil.h deleted file mode 100644 index a150c1a..0000000 --- a/leveldb/util/testutil.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_ - -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "util/random.h" - -namespace leveldb { -namespace test { - -// Store in *dst a random string of length "len" and return a Slice that -// references the generated data. -extern Slice RandomString(Random* rnd, int len, std::string* dst); - -// Return a random key with the specified length that may contain interesting -// characters (e.g. \x00, \xff, etc.). -extern std::string RandomKey(Random* rnd, int len); - -// Store in *dst a string of length "len" that will compress to -// "N*compressed_fraction" bytes and return a Slice that references -// the generated data. -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst); - -// A wrapper that allows injection of errors. -class ErrorEnv : public EnvWrapper { - public: - bool writable_file_error_; - int num_writable_file_errors_; - - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - if (writable_file_error_) { - ++num_writable_file_errors_; - *result = NULL; - return Status::IOError(fname, "fake error"); - } - return target()->NewWritableFile(fname, result); - } -}; - -} -} - -#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ diff --git a/port/README b/port/README new file mode 100644 index 0000000..422563e --- /dev/null +++ b/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/port/port.h b/port/port.h new file mode 100644 index 0000000..816826b --- /dev/null +++ b/port/port.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_PORT_H_ +#define STORAGE_LEVELDB_PORT_PORT_H_ + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(LEVELDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +# include "port/port_chromium.h" +#elif defined(LEVELDB_PLATFORM_ANDROID) +# include "port/port_android.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_android.cc b/port/port_android.cc new file mode 100644 index 0000000..240e9ca --- /dev/null +++ b/port/port_android.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port_android.h" + +#include + +extern "C" { +size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { + return fread(a, b, c, d); +} + +size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { + return fwrite(a, b, c, d); +} + +int fflush_unlocked(FILE *f) { + return fflush(f); +} + +int fdatasync(int fd) { + return fsync(fd); +} +} + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { + PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); +} + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal(){ + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_android.h b/port/port_android.h new file mode 100644 index 0000000..8680951 --- /dev/null +++ b/port/port_android.h @@ -0,0 +1,158 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ +#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ + +#include +#include +#include +#include +#include +#include +#include + +// Collapse the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto +#if defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +#define ARMV6_OR_7 1 +#endif + +extern "C" { + size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); + size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); + int fflush_unlocked(FILE *f); + int fdatasync (int fd); +} + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { + //TODO(gabor): How can I implement this? + } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + Mutex* mu_; + pthread_cond_t cv_; +}; + +#ifndef ARMV6_OR_7 +// On ARM chipsets = V6 +#ifdef ARMV6_OR_7 + __asm__ __volatile__("dmb" : : : "memory"); +#else + pLinuxKernelMemoryBarrier(); +#endif + } + + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + void* r = rep_; + MemoryBarrier(); + return r; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + rep_ = v; + } + inline void* NoBarrier_Load() const { + void* r = rep_; + return r; + } + inline void NoBarrier_Store(void* v) { + rep_ = v; + } +}; + +// TODO(gabor): Implement compress +inline bool Snappy_Compress( + const char* input, + size_t input_length, + std::string* output) { + return false; +} + +// TODO(gabor): Implement uncompress +inline bool Snappy_Uncompress( + const char* input_data, + size_t input_length, + std::string* output) { + return false; +} + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + SHA1_CTX sha1_ctx; + SHA1Init(&sha1_ctx); + SHA1Update(&sha1_ctx, (const u_char*)data, len); + SHA1Final((u_char*)hash_array, &sha1_ctx); +} + +inline uint64_t ThreadIdentifier() { + pthread_t tid = pthread_self(); + uint64_t r = 0; + memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); + return r; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/port/port_chromium.cc b/port/port_chromium.cc new file mode 100644 index 0000000..2ab49b9 --- /dev/null +++ b/port/port_chromium.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port_chromium.h" + +#include "util/logging.h" + +#if defined(USE_SNAPPY) +# include "third_party/snappy/src/snappy.h" +#endif + +namespace leveldb { +namespace port { + +Mutex::Mutex() { +} + +Mutex::~Mutex() { +} + +void Mutex::Lock() { + mu_.Acquire(); +} + +void Mutex::Unlock() { + mu_.Release(); +} + +void Mutex::AssertHeld() { + mu_.AssertAcquired(); +} + +CondVar::CondVar(Mutex* mu) + : cv_(&mu->mu_) { +} + +CondVar::~CondVar() { } + +void CondVar::Wait() { + cv_.Wait(); +} + +void CondVar::Signal(){ + cv_.Signal(); +} + +void CondVar::SignalAll() { + cv_.Broadcast(); +} + +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + output->resize(snappy::MaxCompressedLength(input_length)); + size_t outlen; + snappy::RawCompress(input, input_length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + return false; +#endif +} + +bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + size_t ulength; + if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { + return false; + } + output->resize(ulength); + return snappy::RawUncompress(input_data, input_length, &(*output)[0]); +#else + return false; +#endif +} + +} +} diff --git a/port/port_chromium.h b/port/port_chromium.h new file mode 100644 index 0000000..e349f8f --- /dev/null +++ b/port/port_chromium.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ +#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ + +#include +#include +#include +#include "base/atomicops.h" +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/sha1.h" +#include "base/synchronization/condition_variable.h" +#include "base/synchronization/lock.h" + +// Linux's ThreadIdentifier() needs this. +#if defined(OS_LINUX) +# include +#endif + +#if defined(OS_WIN) +#define snprintf _snprintf +#define va_copy(a, b) do { (a) = (b); } while (0) +#endif + +namespace leveldb { +namespace port { + +// Chromium only supports little endian. +static const bool kLittleEndian = true; + +class Mutex { + public: + Mutex(); + ~Mutex(); + void Lock(); + void Unlock(); + void AssertHeld(); + + private: + base::Lock mu_; + + friend class CondVar; + DISALLOW_COPY_AND_ASSIGN(Mutex); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + + private: + base::ConditionVariable cv_; + + DISALLOW_COPY_AND_ASSIGN(CondVar); +}; + +class AtomicPointer { + private: + typedef base::subtle::AtomicWord Rep; + Rep rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(reinterpret_cast(p)) {} + inline void* Acquire_Load() const { + return reinterpret_cast(::base::subtle::Acquire_Load(&rep_)); + } + inline void Release_Store(void* v) { + ::base::subtle::Release_Store(&rep_, reinterpret_cast(v)); + } + inline void* NoBarrier_Load() const { + return reinterpret_cast(::base::subtle::NoBarrier_Load(&rep_)); + } + inline void NoBarrier_Store(void* v) { + ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast(v)); + } +}; + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + return ::base::SHA1HashBytes(reinterpret_cast(data), + len, + reinterpret_cast(hash_array)); +} + +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); +bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output); + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/port/port_example.h b/port/port_example.h new file mode 100644 index 0000000..cf72617 --- /dev/null +++ b/port/port_example.h @@ -0,0 +1,120 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ + +namespace leveldb { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// A type that holds a pointer that can be read or written atomically +// (i.e., without word-tearing.) +class AtomicPointer { + private: + intptr_t rep_; + public: + // Initialize to arbitrary value + AtomicPointer(); + + // Initialize to hold v + explicit AtomicPointer(void* v) : rep_(v) { } + + // Read and return the stored pointer with the guarantee that no + // later memory access (read or write) by this thread can be + // reordered ahead of this read. + void* Acquire_Load() const; + + // Set v as the stored pointer with the guarantee that no earlier + // memory access (read or write) by this thread can be reordered + // after this store. + void Release_Store(void* v); + + // Read the stored pointer with no ordering guarantees. + void* NoBarrier_Load() const; + + // Set va as the stored pointer with no ordering guarantees. + void NoBarrier_Store(void* v); +}; + +// ------------------ Checksumming ------------------- + +// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]" +extern void SHA1_Hash(const char* data, size_t len, char* hash_array); + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output); + +// ------------------ Miscellaneous ------------------- + +// If heap profiling is not supported, returns false. +// Else repeatedly calls (*func)(arg, data, n) and then returns true. +// The concatenation of all "data[0,n-1]" fragments is the heap profile. +extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc new file mode 100644 index 0000000..e75da8b --- /dev/null +++ b/port/port_posix.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port_posix.h" + +#include +#include +#include +#include "util/logging.h" + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } + +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_posix.h b/port/port_posix.h new file mode 100644 index 0000000..7adbc01 --- /dev/null +++ b/port/port_posix.h @@ -0,0 +1,99 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ + +#include +#include +#include +#include +#include +#include +#include "port/sha1_portable.h" + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +// Storage for a lock-free pointer +class AtomicPointer { + private: + std::atomic rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); + } +}; + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + SHA1_Hash_Portable(data, len, hash_array); +} + +// TODO(gabor): Implement actual compress +inline bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { + return false; +} + +// TODO(gabor): Implement actual uncompress +inline bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { + return false; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc new file mode 100644 index 0000000..8fa7277 --- /dev/null +++ b/port/sha1_portable.cc @@ -0,0 +1,298 @@ +// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This module provides a slow but portable implementation of +// the SHA1 hash function. +// +// It is adapted from free code written by Paul E. Jones +// . See http://www.packetizer.com/security/sha1/ +// +// The license for the original code is: +/* + Copyright (C) 1998, 2009 + Paul E. Jones + + Freeware Public License (FPL) + + This software is licensed as "freeware." Permission to distribute + this software in source and binary forms, including incorporation + into other products, is hereby granted without a fee. THIS SOFTWARE + IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD + LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER + DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA + OR DATA BEING RENDERED INACCURATE. +*/ + +#include "port/sha1_portable.h" +#include +#include +#include + +namespace leveldb { +namespace port { + +/* + * Description: + * This class implements the Secure Hashing Standard as defined + * in FIPS PUB 180-1 published April 17, 1995. + */ + +/* + * This structure will hold context information for the hashing + * operation + */ +typedef struct SHA1Context { + unsigned Message_Digest[5]; /* Message Digest (output) */ + + unsigned Length_Low; /* Message length in bits */ + unsigned Length_High; /* Message length in bits */ + + unsigned char Message_Block[64]; /* 512-bit message blocks */ + int Message_Block_Index; /* Index into message block array */ + + bool Computed; /* Is the digest computed? */ + bool Corrupted; /* Is the message digest corruped? */ +} SHA1Context; + +/* + * Portability Issues: + * SHA-1 is defined in terms of 32-bit "words". This code was + * written with the expectation that the processor has at least + * a 32-bit machine word size. If the machine word size is larger, + * the code should still function properly. One caveat to that + * is that the input functions taking characters and character + * arrays assume that only 8 bits of information are stored in each + * character. + */ + +/* + * Define the circular shift macro + */ +#define SHA1CircularShift(bits,word) \ + ((((word) << (bits)) & 0xFFFFFFFF) | \ + ((word) >> (32-(bits)))) + +/* Function prototypes */ +static void SHA1ProcessMessageBlock(SHA1Context *); +static void SHA1PadMessage(SHA1Context *); + +// Initialize the SHA1Context in preparation for computing a new +// message digest. +static void SHA1Reset(SHA1Context* context) { + context->Length_Low = 0; + context->Length_High = 0; + context->Message_Block_Index = 0; + + context->Message_Digest[0] = 0x67452301; + context->Message_Digest[1] = 0xEFCDAB89; + context->Message_Digest[2] = 0x98BADCFE; + context->Message_Digest[3] = 0x10325476; + context->Message_Digest[4] = 0xC3D2E1F0; + + context->Computed = false; + context->Corrupted = false; +} + +// This function will return the 160-bit message digest into the +// Message_Digest array within the SHA1Context provided +static bool SHA1Result(SHA1Context *context) { + if (context->Corrupted) { + return false; + } + + if (!context->Computed) { + SHA1PadMessage(context); + context->Computed = true; + } + return true; +} + +// This function accepts an array of bytes as the next portion of +// the message. +static void SHA1Input(SHA1Context *context, + const unsigned char *message_array, + unsigned length) { + if (!length) return; + + if (context->Computed || context->Corrupted) { + context->Corrupted = true; + return; + } + + while(length-- && !context->Corrupted) { + context->Message_Block[context->Message_Block_Index++] = + (*message_array & 0xFF); + + context->Length_Low += 8; + /* Force it to 32 bits */ + context->Length_Low &= 0xFFFFFFFF; + if (context->Length_Low == 0) { + context->Length_High++; + /* Force it to 32 bits */ + context->Length_High &= 0xFFFFFFFF; + if (context->Length_High == 0) + { + /* Message is too long */ + context->Corrupted = true; + } + } + + if (context->Message_Block_Index == 64) + { + SHA1ProcessMessageBlock(context); + } + + message_array++; + } +} + +// This function will process the next 512 bits of the message stored +// in the Message_Block array. +static void SHA1ProcessMessageBlock(SHA1Context *context) { + const unsigned K[] = // Constants defined in SHA-1 + { + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6 + }; + int t; // Loop counter + unsigned temp; // Temporary word value + unsigned W[80]; // Word sequence + unsigned A, B, C, D, E; // Word buffers + + // Initialize the first 16 words in the array W + for(t = 0; t < 16; t++) { + W[t] = ((unsigned) context->Message_Block[t * 4]) << 24; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]); + } + + for(t = 16; t < 80; t++) { + W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); + } + + A = context->Message_Digest[0]; + B = context->Message_Digest[1]; + C = context->Message_Digest[2]; + D = context->Message_Digest[3]; + E = context->Message_Digest[4]; + + for(t = 0; t < 20; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | ((~B) & D)) + E + W[t] + K[0]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 20; t < 40; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 40; t < 60; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 60; t < 80; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF; + context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF; + context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF; + context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF; + context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF; + + context->Message_Block_Index = 0; +} + +// According to the standard, the message must be padded to an even +// 512 bits. The first padding bit must be a '1'. The last 64 bits +// represent the length of the original message. All bits in between +// should be 0. This function will pad the message according to those +// rules by filling the Message_Block array accordingly. It will also +// call SHA1ProcessMessageBlock() appropriately. When it returns, it +// can be assumed that the message digest has been computed. +static void SHA1PadMessage(SHA1Context *context) { + // Check to see if the current message block is too small to hold + // the initial padding bits and length. If so, we will pad the + // block, process it, and then continue padding into a second block. + if (context->Message_Block_Index > 55) { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 64) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + + SHA1ProcessMessageBlock(context); + + while(context->Message_Block_Index < 56) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } else { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 56) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } + + // Store the message length as the last 8 octets + context->Message_Block[56] = (context->Length_High >> 24) & 0xFF; + context->Message_Block[57] = (context->Length_High >> 16) & 0xFF; + context->Message_Block[58] = (context->Length_High >> 8) & 0xFF; + context->Message_Block[59] = (context->Length_High) & 0xFF; + context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF; + context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF; + context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF; + context->Message_Block[63] = (context->Length_Low) & 0xFF; + + SHA1ProcessMessageBlock(context); +} + + +void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) { + SHA1Context context; + SHA1Reset(&context); + SHA1Input(&context, reinterpret_cast(data), len); + bool ok = SHA1Result(&context); + if (!ok) { + fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n"); + exit(1); + } + for (int i = 0; i < 5; i++) { + uint32_t value = context.Message_Digest[i]; + hash_array[i*4 + 0] = (value >> 24) & 0xff; + hash_array[i*4 + 1] = (value >> 16) & 0xff; + hash_array[i*4 + 2] = (value >> 8) & 0xff; + hash_array[i*4 + 3] = value & 0xff; + } +} + +} +} diff --git a/port/sha1_portable.h b/port/sha1_portable.h new file mode 100644 index 0000000..31db305 --- /dev/null +++ b/port/sha1_portable.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ +#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ + +#include + +namespace leveldb { +namespace port { + +// Compute the SHA1 hash value of "data[0..len-1]" and store it in +// "hash_array[0..19]". hash_array must have 20 bytes of space available. +// +// This function is portable but may not be as fast as a version +// optimized for your platform. It is provided as a default method +// that can be used when porting leveldb to a new platform if no +// better SHA1 hash implementation is available. +void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array); + +} +} + +#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ diff --git a/port/sha1_test.cc b/port/sha1_test.cc new file mode 100644 index 0000000..b182e67 --- /dev/null +++ b/port/sha1_test.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port.h" +#include "util/testharness.h" + +namespace leveldb { +namespace port { + +class SHA1 { }; + +static std::string TestSHA1(const char* data, size_t len) { + char hash_val[20]; + SHA1_Hash(data, len, hash_val); + char buf[41]; + for (int i = 0; i < 20; i++) { + snprintf(buf + i * 2, 41 - i * 2, + "%02x", + static_cast(static_cast( + hash_val[i]))); + } + return std::string(buf, 40); +} + +TEST(SHA1, Simple) { + ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0)); + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5)); + std::string x(10000, 'x'); + ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75", + TestSHA1(x.data(), x.size())); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/port/win/stdint.h b/port/win/stdint.h new file mode 100644 index 0000000..39edd0d --- /dev/null +++ b/port/win/stdint.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// MSVC didn't ship with this file until the 2010 version. + +#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ +#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ + +#if !defined(_MSC_VER) +#error This file should only be included when compiling with MSVC. +#endif + +// Define C99 equivalent types. +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ diff --git a/table/block.cc b/table/block.cc new file mode 100644 index 0000000..0525d2d --- /dev/null +++ b/table/block.cc @@ -0,0 +1,261 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block.h" + +#include +#include +#include "leveldb/comparator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +inline uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); + return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); +} + +Block::Block(const char* data, size_t size) + : data_(data), + size_(size) { + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + } +} + +Block::~Block() { + delete[] data_; +} + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns NULL. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +static inline const char* DecodeEntry(const char* p, const char* limit, + uint32_t* shared, + uint32_t* non_shared, + uint32_t* value_length) { + if (limit - p < 3) return NULL; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; + } + + if (limit - p < (*non_shared + *value_length)) return NULL; + return p; +} + +class Block::Iter : public Iterator { + private: + const Comparator* const comparator_; + const char* const data_; // underlying block contents + uint32_t const restarts_; // Offset of restart array (list of fixed32) + uint32_t const num_restarts_; // Number of uint32_t entries in restart array + + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + uint32_t restart_index_; // Index of restart block in which current_ falls + std::string key_; + Slice value_; + Status status_; + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + // Return the offset in data_ just past the end of the current entry. + inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, + const char* data, + uint32_t restarts, + uint32_t num_restarts) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + // Binary search in restart array to find the first restart point + // with a key >= target + uint32_t left = 0; + uint32_t right = num_restarts_ - 1; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = DecodeEntry(data_ + region_offset, + data_ + restarts_, + &shared, &non_shared, &value_length); + if (key_ptr == NULL || (shared != 0)) { + CorruptionError(); + return; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + while (true) { + if (!ParseNextKey()) { + return; + } + if (Compare(key_, target) >= 0) { + return; + } + } + } + + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == NULL || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts); + } +} + +} diff --git a/table/block.h b/table/block.h new file mode 100644 index 0000000..cdf0598 --- /dev/null +++ b/table/block.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_H_ + +#include +#include +#include "leveldb/iterator.h" + +namespace leveldb { + +class Comparator; + +class Block { + public: + // Initialize the block with the specified contents. + // Takes ownership of data[] and will delete[] it when done. + Block(const char* data, size_t size); + + ~Block(); + + size_t size() const { return size_; } + Iterator* NewIterator(const Comparator* comparator); + + private: + uint32_t NumRestarts() const; + + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/table/block_builder.cc b/table/block_builder.cc new file mode 100644 index 0000000..ae18b36 --- /dev/null +++ b/table/block_builder.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. + +#include "table/block_builder.h" + +#include +#include +#include "leveldb/comparator.h" +#include "leveldb/table_builder.h" +#include "util/coding.h" + +namespace leveldb { + +BlockBuilder::BlockBuilder(const Options* options) + : options_(options), + restarts_(), + counter_(0), + finished_(false) { + assert(options->block_restart_interval >= 1); + restarts_.push_back(0); // First restart point is at offset 0 +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + counter_ = 0; + finished_ = false; + last_key_.clear(); +} + +size_t BlockBuilder::CurrentSizeEstimate() const { + return (buffer_.size() + // Raw data buffer + restarts_.size() * sizeof(uint32_t) + // Restart array + sizeof(uint32_t)); // Restart array length +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (int i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + PutFixed32(&buffer_, restarts_.size()); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value) { + Slice last_key_piece(last_key_); + assert(!finished_); + assert(counter_ <= options_->block_restart_interval); + assert(buffer_.empty() // No values yet? + || options_->comparator->Compare(key, last_key_piece) > 0); + size_t shared = 0; + if (counter_ < options_->block_restart_interval) { + // See how much sharing to do with previous string + const size_t min_length = std::min(last_key_piece.size(), key.size()); + while ((shared < min_length) && (last_key_[shared] == key[shared])) { + shared++; + } + } else { + // Restart compression + restarts_.push_back(buffer_.size()); + counter_ = 0; + } + const size_t non_shared = key.size() - shared; + + // Add "" to buffer_ + PutVarint32(&buffer_, shared); + PutVarint32(&buffer_, non_shared); + PutVarint32(&buffer_, value.size()); + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + buffer_.append(value.data(), value.size()); + + // Update state + last_key_.resize(shared); + last_key_.append(key.data() + shared, non_shared); + assert(Slice(last_key_) == key); + counter_++; +} + +} diff --git a/table/block_builder.h b/table/block_builder.h new file mode 100644 index 0000000..bf92a0f --- /dev/null +++ b/table/block_builder.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ + +#include + +#include +#include "leveldb/slice.h" + +namespace leveldb { + +struct Options; + +class BlockBuilder { + public: + explicit BlockBuilder(const Options* options); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been callled since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + size_t CurrentSizeEstimate() const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { + return buffer_.empty(); + } + + private: + const Options* options_; + std::string buffer_; // Destination buffer + std::vector restarts_; // Restart points + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + + // No copying allowed + BlockBuilder(const BlockBuilder&); + void operator=(const BlockBuilder&); +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/table/format.cc b/table/format.cc new file mode 100644 index 0000000..8c6b0f3 --- /dev/null +++ b/table/format.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/format.h" + +#include "leveldb/env.h" +#include "port/port.h" +#include "table/block.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + return Status::OK(); + } else { + return Status::Corruption("bad block handle"); + } +} + +void Footer::EncodeTo(std::string* dst) const { +#ifndef NDEBUG + const size_t original_size = dst->size(); +#endif + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast(kTableMagicNumber)); + PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); + assert(dst->size() == original_size + kEncodedLength); +} + +Status Footer::DecodeFrom(Slice* input) { + const char* magic_ptr = input->data() + kEncodedLength - 8; + const uint32_t magic_lo = DecodeFixed32(magic_ptr); + const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); + const uint64_t magic = ((static_cast(magic_hi) << 32) | + (static_cast(magic_lo))); + if (magic != kTableMagicNumber) { + return Status::InvalidArgument("not an sstable (bad magic number)"); + } + + Status result = metaindex_handle_.DecodeFrom(input); + if (result.ok()) { + result = index_handle_.DecodeFrom(input); + } + if (result.ok()) { + // We skip over any leftover data (just padding for now) in "input" + const char* end = magic_ptr + 8; + *input = Slice(end, input->data() + input->size() - end); + } + return result; +} + +Status ReadBlock(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + Block** block) { + *block = NULL; + + // Read the block contents as well as the type/crc footer. + // See table_builder.cc for the code that built this structure. + size_t n = handle.size(); + char* buf = new char[n + kBlockTrailerSize]; + Slice contents; + Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); + if (!s.ok()) { + delete[] buf; + return s; + } + if (contents.size() != n + kBlockTrailerSize) { + delete[] buf; + return Status::Corruption("truncated block read"); + } + + // Check the crc of the type and the block contents + const char* data = contents.data(); // Pointer to where Read put the data + if (options.verify_checksums) { + const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); + const uint32_t actual = crc32c::Value(data, n + 1); + if (actual != crc) { + delete[] buf; + s = Status::Corruption("block checksum mismatch"); + return s; + } + } + + switch (data[n]) { + case kNoCompression: + if (data != buf) { + // File implementation gave us pointer to some other data. + // Copy into buf[]. + memcpy(buf, data, n + kBlockTrailerSize); + } + + // Ok + break; + case kSnappyCompression: { + std::string decompressed; + if (!port::Snappy_Uncompress(data, n, &decompressed)) { + delete[] buf; + s = Status::Corruption("corrupted compressed block contents"); + return s; + } + delete[] buf; // Done with uncompressed data + buf = new char[decompressed.size()]; + memcpy(buf, decompressed.data(), decompressed.size()); + n = decompressed.size(); + break; + } + default: + delete[] buf; + return Status::Corruption("bad block type"); + } + + *block = new Block(buf, n); // Block takes ownership of buf[] + return Status::OK(); +} + +} diff --git a/table/format.h b/table/format.h new file mode 100644 index 0000000..a6ab964 --- /dev/null +++ b/table/format.h @@ -0,0 +1,103 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ +#define STORAGE_LEVELDB_TABLE_FORMAT_H_ + +#include +#include +#include "leveldb/slice.h" +#include "leveldb/status.h" +#include "leveldb/table_builder.h" + +namespace leveldb { + +class Block; +class RandomAccessFile; +struct ReadOptions; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + BlockHandle(); + + // The offset of the block in the file. + uint64_t offset() const { return offset_; } + void set_offset(uint64_t offset) { offset_ = offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t size) { size_ = size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_; + uint64_t size_; +}; + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + Footer() { } + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { + return index_handle_; + } + void set_index_handle(const BlockHandle& h) { + index_handle_ = h; + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a + // Footer will always occupy exactly this many bytes. It consists + // of two block handles and a magic number. + enum { + kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 + }; + + private: + BlockHandle metaindex_handle_; + BlockHandle index_handle_; +}; + +// kTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. +static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +// Read the block identified by "handle" from "file". On success, +// store a pointer to the heap-allocated result in *block and return +// OK. On failure store NULL in *block and return non-OK. +extern Status ReadBlock(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + Block** block); + +// Implementation details follow. Clients should ignore, + +inline BlockHandle::BlockHandle() + : offset_(~static_cast(0)), + size_(~static_cast(0)) { +} + +} + +#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ diff --git a/table/iterator.cc b/table/iterator.cc new file mode 100644 index 0000000..4ddd55f --- /dev/null +++ b/table/iterator.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/iterator.h" +#include "util/logging.h" + +namespace leveldb { + +Iterator::Iterator() { + cleanup_.function = NULL; + cleanup_.next = NULL; +} + +Iterator::~Iterator() { + if (cleanup_.function != NULL) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != NULL; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != NULL); + Cleanup* c; + if (cleanup_.function == NULL) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +} diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h new file mode 100644 index 0000000..158d3a7 --- /dev/null +++ b/table/iterator_wrapper.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ +#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ + +namespace leveldb { + +// A internal wrapper class with an interface similar to Iterator that +// caches the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +class IteratorWrapper { + private: + Iterator* iter_; + bool valid_; + Slice key_; + public: + IteratorWrapper(): iter_(NULL), valid_(false) { } + explicit IteratorWrapper(Iterator* iter): iter_(NULL) { + Set(iter); + } + ~IteratorWrapper() { delete iter_; } + Iterator* iter() const { return iter_; } + + // Takes ownership of "iter" and will delete it when destroyed, or + // when Set() is invoked again. + void Set(Iterator* iter) { + delete iter_; + iter_ = iter; + if (iter_ == NULL) { + valid_ = false; + } else { + Update(); + } + } + + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + Slice value() const { assert(Valid()); return iter_->value(); } + // Methods below require iter() != NULL + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + key_ = iter_->key(); + } + } +}; + +} + + +#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/table/merger.cc b/table/merger.cc new file mode 100644 index 0000000..6ce06bb --- /dev/null +++ b/table/merger.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merger.h" + +#include "leveldb/comparator.h" +#include "leveldb/iterator.h" +#include "table/iterator_wrapper.h" + +namespace leveldb { + +namespace { +class MergingIterator : public Iterator { + public: + MergingIterator(const Comparator* comparator, Iterator** children, int n) + : comparator_(comparator), + children_(new IteratorWrapper[n]), + n_(n), + current_(NULL), + direction_(kForward) { + for (int i = 0; i < n; i++) { + children_[i].Set(children[i]); + } + } + + virtual ~MergingIterator() { + delete[] children_; + } + + virtual bool Valid() const { + return (current_ != NULL); + } + + virtual void SeekToFirst() { + for (int i = 0; i < n_; i++) { + children_[i].SeekToFirst(); + } + FindSmallest(); + direction_ = kForward; + } + + virtual void SeekToLast() { + for (int i = 0; i < n_; i++) { + children_[i].SeekToLast(); + } + FindLargest(); + direction_ = kReverse; + } + + virtual void Seek(const Slice& target) { + for (int i = 0; i < n_; i++) { + children_[i].Seek(target); + } + FindSmallest(); + direction_ = kForward; + } + + virtual void Next() { + assert(Valid()); + + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current_ children since current_ is + // the smallest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kForward) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid() && + comparator_->Compare(key(), child->key()) == 0) { + child->Next(); + } + } + } + direction_ = kForward; + } + + current_->Next(); + FindSmallest(); + } + + virtual void Prev() { + assert(Valid()); + + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child->Prev(); + } else { + // Child has no entries >= key(). Position at last entry. + child->SeekToLast(); + } + } + } + direction_ = kReverse; + } + + current_->Prev(); + FindLargest(); + } + + virtual Slice key() const { + assert(Valid()); + return current_->key(); + } + + virtual Slice value() const { + assert(Valid()); + return current_->value(); + } + + virtual Status status() const { + Status status; + for (int i = 0; i < n_; i++) { + status = children_[i].status(); + if (!status.ok()) { + break; + } + } + return status; + } + + private: + void FindSmallest(); + void FindLargest(); + + // We might want to use a heap in case there are lots of children. + // For now we use a simple array since we expect a very small number + // of children in leveldb. + const Comparator* comparator_; + IteratorWrapper* children_; + int n_; + IteratorWrapper* current_; + + // Which direction is the iterator moving? + enum Direction { + kForward, + kReverse + }; + Direction direction_; +}; + +void MergingIterator::FindSmallest() { + IteratorWrapper* smallest = NULL; + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child->Valid()) { + if (smallest == NULL) { + smallest = child; + } else if (comparator_->Compare(child->key(), smallest->key()) < 0) { + smallest = child; + } + } + } + current_ = smallest; +} + +void MergingIterator::FindLargest() { + IteratorWrapper* largest = NULL; + for (int i = n_-1; i >= 0; i--) { + IteratorWrapper* child = &children_[i]; + if (child->Valid()) { + if (largest == NULL) { + largest = child; + } else if (comparator_->Compare(child->key(), largest->key()) > 0) { + largest = child; + } + } + } + current_ = largest; +} +} + +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { + assert(n >= 0); + if (n == 0) { + return NewEmptyIterator(); + } else if (n == 1) { + return list[0]; + } else { + return new MergingIterator(cmp, list, n); + } +} + +} diff --git a/table/merger.h b/table/merger.h new file mode 100644 index 0000000..71d9dc5 --- /dev/null +++ b/table/merger.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ +#define STORAGE_LEVELDB_TABLE_MERGER_H_ + +namespace leveldb { + +class Comparator; +class Iterator; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern Iterator* NewMergingIterator( + const Comparator* comparator, Iterator** children, int n); + +} + +#endif // STORAGE_LEVELDB_TABLE_MERGER_H_ diff --git a/table/table.cc b/table/table.cc new file mode 100644 index 0000000..9820753 --- /dev/null +++ b/table/table.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table.h" + +#include "leveldb/cache.h" +#include "leveldb/env.h" +#include "table/block.h" +#include "table/format.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" + +namespace leveldb { + +struct Table::Rep { + ~Rep() { + delete index_block; + } + + Options options; + Status status; + RandomAccessFile* file; + uint64_t cache_id; + + BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer + Block* index_block; +}; + +Status Table::Open(const Options& options, + RandomAccessFile* file, + uint64_t size, + Table** table) { + *table = NULL; + if (size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, + &footer_input, footer_space); + if (!s.ok()) return s; + + Footer footer; + s = footer.DecodeFrom(&footer_input); + if (!s.ok()) return s; + + // Read the index block + Block* index_block = NULL; + if (s.ok()) { + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); + } + + if (s.ok()) { + // We've successfully read the footer and the index block: we're + // ready to serve requests. + Rep* rep = new Table::Rep; + rep->options = options; + rep->file = file; + rep->metaindex_handle = footer.metaindex_handle(); + rep->index_block = index_block; + rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); + *table = new Table(rep); + } else { + if (index_block) delete index_block; + } + + return s; +} + +Table::~Table() { + delete rep_; +} + +static void DeleteBlock(void* arg, void* ignored) { + delete reinterpret_cast(arg); +} + +static void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +static void ReleaseBlock(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +Iterator* Table::BlockReader(void* arg, + const ReadOptions& options, + const Slice& index_value) { + Table* table = reinterpret_cast(arg); + Cache* block_cache = table->rep_->options.block_cache; + Block* block = NULL; + Cache::Handle* cache_handle = NULL; + + BlockHandle handle; + Slice input = index_value; + Status s = handle.DecodeFrom(&input); + // We intentionally allow extra stuff in index_value so that we + // can add more features in the future. + + if (s.ok()) { + if (block_cache != NULL) { + char cache_key_buffer[16]; + EncodeFixed64(cache_key_buffer, table->rep_->cache_id); + EncodeFixed64(cache_key_buffer+8, handle.offset()); + Slice key(cache_key_buffer, sizeof(cache_key_buffer)); + cache_handle = block_cache->Lookup(key); + if (cache_handle != NULL) { + block = reinterpret_cast(block_cache->Value(cache_handle)); + } else { + s = ReadBlock(table->rep_->file, options, handle, &block); + if (s.ok() && options.fill_cache) { + cache_handle = block_cache->Insert( + key, block, block->size(), &DeleteCachedBlock); + } + } + } else { + s = ReadBlock(table->rep_->file, options, handle, &block); + } + } + + Iterator* iter; + if (block != NULL) { + iter = block->NewIterator(table->rep_->options.comparator); + if (cache_handle == NULL) { + iter->RegisterCleanup(&DeleteBlock, block, NULL); + } else { + iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); + } + } else { + iter = NewErrorIterator(s); + } + return iter; +} + +Iterator* Table::NewIterator(const ReadOptions& options) const { + return NewTwoLevelIterator( + rep_->index_block->NewIterator(rep_->options.comparator), + &Table::BlockReader, const_cast(this), options); +} + +uint64_t Table::ApproximateOffsetOf(const Slice& key) const { + Iterator* index_iter = + rep_->index_block->NewIterator(rep_->options.comparator); + index_iter->Seek(key); + uint64_t result; + if (index_iter->Valid()) { + BlockHandle handle; + Slice input = index_iter->value(); + Status s = handle.DecodeFrom(&input); + if (s.ok()) { + result = handle.offset(); + } else { + // Strange: we can't decode the block handle in the index block. + // We'll just return the offset of the metaindex block, which is + // close to the whole file size for this case. + result = rep_->metaindex_handle.offset(); + } + } else { + // key is past the last key in the file. Approximate the offset + // by returning the offset of the metaindex block (which is + // right near the end of the file). + result = rep_->metaindex_handle.offset(); + } + delete index_iter; + return result; +} + +} diff --git a/table/table_builder.cc b/table/table_builder.cc new file mode 100644 index 0000000..7ec7ad2 --- /dev/null +++ b/table/table_builder.cc @@ -0,0 +1,227 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table_builder.h" + +#include +#include +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/logging.h" + +namespace leveldb { + +struct TableBuilder::Rep { + Options options; + Options index_block_options; + WritableFile* file; + uint64_t offset; + Status status; + BlockBuilder data_block; + BlockBuilder index_block; + std::string last_key; + int64_t num_entries; + bool closed; // Either Finish() or Abandon() has been called. + + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + // + // Invariant: r->pending_index_entry is true only if data_block is empty. + bool pending_index_entry; + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + + Rep(const Options& opt, WritableFile* f) + : options(opt), + index_block_options(opt), + file(f), + offset(0), + data_block(&options), + index_block(&index_block_options), + num_entries(0), + closed(false), + pending_index_entry(false) { + index_block_options.block_restart_interval = 1; + } +}; + +TableBuilder::TableBuilder(const Options& options, WritableFile* file) + : rep_(new Rep(options, file)) { +} + +TableBuilder::~TableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_; +} + +Status TableBuilder::ChangeOptions(const Options& options) { + // Note: if more fields are added to Options, update + // this function to catch changes that should not be allowed to + // change in the middle of building a Table. + if (options.comparator != rep_->options.comparator) { + return Status::InvalidArgument("changing comparator while building table"); + } + + // Note that any live BlockBuilders point to rep_->options and therefore + // will automatically pick up the updated options. + rep_->options = options; + rep_->index_block_options = options; + rep_->index_block_options.block_restart_interval = 1; + return Status::OK(); +} + +void TableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->num_entries > 0) { + assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + } + + if (r->pending_index_entry) { + assert(r->data_block.empty()); + r->options.comparator->FindShortestSeparator(&r->last_key, key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + + r->last_key.assign(key.data(), key.size()); + r->num_entries++; + r->data_block.Add(key, value); + + const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); + if (estimated_block_size >= r->options.block_size) { + Flush(); + } +} + +void TableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + assert(!r->pending_index_entry); + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->pending_index_entry = true; + r->status = r->file->Flush(); + } +} + +void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + Slice raw = block->Finish(); + + Slice block_contents; + CompressionType type = r->options.compression; + // TODO(postrelease): Support more compression options: zlib? + switch (type) { + case kNoCompression: + block_contents = raw; + break; + + case kSnappyCompression: { + std::string* compressed = &r->compressed_output; + if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && + compressed->size() < raw.size() - (raw.size() / 8u)) { + block_contents = *compressed; + } else { + // Snappy not supported, or compressed less than 12.5%, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; + } + break; + } + } + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type + EncodeFixed32(trailer+1, crc32c::Mask(crc)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } + r->compressed_output.clear(); + block->Reset(); +} + +Status TableBuilder::status() const { + return rep_->status; +} + +Status TableBuilder::Finish() { + Rep* r = rep_; + Flush(); + assert(!r->closed); + r->closed = true; + BlockHandle metaindex_block_handle; + BlockHandle index_block_handle; + if (ok()) { + BlockBuilder meta_index_block(&r->options); + // TODO(postrelease): Add stats and other meta blocks + WriteBlock(&meta_index_block, &metaindex_block_handle); + } + if (ok()) { + if (r->pending_index_entry) { + r->options.comparator->FindShortSuccessor(&r->last_key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + WriteBlock(&r->index_block, &index_block_handle); + } + if (ok()) { + Footer footer; + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + return r->status; +} + +void TableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t TableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t TableBuilder::FileSize() const { + return rep_->offset; +} + +} diff --git a/table/table_test.cc b/table/table_test.cc new file mode 100644 index 0000000..4b3e85e --- /dev/null +++ b/table/table_test.cc @@ -0,0 +1,841 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table.h" + +#include +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "leveldb/table_builder.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +// Return reverse of "key". +// Used to test non-lexicographic comparators. +static std::string Reverse(const Slice& key) { + std::string str(key.ToString()); + std::string rev(str.rbegin(), str.rend()); + return rev; +} + +namespace { +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "leveldb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; +} +static ReverseKeyComparator reverse_key_comparator; + +static void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +namespace { +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; +} + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + StringSource(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + std::vector* keys, + KVMap* kvmap) { + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + + virtual size_t NumBytes() const = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return NULL; } // Overridden in DBConstructor + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_size_(-1), + block_(NULL) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete block_; + block_ = NULL; + BlockBuilder builder(&options); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + Slice block_data = builder.Finish(); + block_size_ = block_data.size(); + char* block_data_copy = new char[block_size_]; + memcpy(block_data_copy, block_data.data(), block_size_); + block_ = new Block(block_data_copy, block_size_); + return Status::OK(); + } + virtual size_t NumBytes() const { return block_size_; } + + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + int block_size_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor: public Constructor { + public: + TableConstructor(const Comparator* cmp) + : Constructor(cmp), + source_(NULL), table_(NULL) { + } + ~TableConstructor() { + Reset(); + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + Reset(); + StringSink sink; + TableBuilder builder(options, &sink); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + ASSERT_TRUE(builder.status().ok()); + } + Status s = builder.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink.contents().size(), builder.FileSize()); + + // Open the table + source_ = new StringSource(sink.contents()); + Options table_options; + table_options.comparator = options.comparator; + return Table::Open(table_options, source_, sink.contents().size(), &table_); + } + virtual size_t NumBytes() const { return source_->Size(); } + + virtual Iterator* NewIterator() const { + return table_->NewIterator(ReadOptions()); + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + return table_->ApproximateOffsetOf(key); + } + + private: + void Reset() { + delete table_; + delete source_; + table_ = NULL; + source_ = NULL; + } + + StringSource* source_; + Table* table_; + + TableConstructor(); +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp) { + memtable_ = new MemTable(internal_comparator_); + } + ~MemTableConstructor() { + delete memtable_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete memtable_; + memtable_ = new MemTable(internal_comparator_); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual size_t NumBytes() const { + return memtable_->ApproximateMemoryUsage(); + } + + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator()); + } + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; +}; + +class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = NULL; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete db_; + db_ = NULL; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual size_t NumBytes() const { + Range r("", "\xff\xff"); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +enum TestType { + TABLE_TEST, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST, +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; +}; + +static const TestArgs kTestArgList[] = { + { TABLE_TEST, false, 16 }, + { TABLE_TEST, false, 1 }, + { TABLE_TEST, false, 1024 }, + { TABLE_TEST, true, 16 }, + { TABLE_TEST, true, 1 }, + { TABLE_TEST, true, 1024 }, + + { BLOCK_TEST, false, 16 }, + { BLOCK_TEST, false, 1 }, + { BLOCK_TEST, false, 1024 }, + { BLOCK_TEST, true, 16 }, + { BLOCK_TEST, true, 1 }, + { BLOCK_TEST, true, 1024 }, + + // Restart interval does not matter for memtables + { MEMTABLE_TEST, false, 16 }, + { MEMTABLE_TEST, true, 16 }, + + // Do not bother with restart interval variations for DB + { DB_TEST, false, 16 }, + { DB_TEST, true, 16 }, +}; +static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); + +class Harness { + public: + Harness() : constructor_(NULL) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = NULL; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + // Use shorter block size for tests to exercise block boundary + // conditions more. + options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + switch (args.type) { + case TABLE_TEST: + constructor_ = new TableConstructor(options_.comparator); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, &keys, &data); + + TestForwardScan(keys, data); + TestBackwardScan(keys, data); + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(5); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(3)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size()-1] > '\0') { + result[result.size()-1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns NULL if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_; + Constructor* constructor_; +}; + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +TEST(Harness, Randomized) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", + (i + 1), int(kNumTestArgs), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16 }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + std::string l0_files, l1_files; + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); + ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); + +} + +class MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable memtable(cmp); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); + + Iterator* iter = memtable.NewIterator(); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +class TableTest { }; + +TEST(TableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); + +} + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + +TEST(TableTest, ApproximateOffsetOfCompressed) { + if (!SnappyCompressionSupported()) { + fprintf(stderr, "skipping compression tests\n"); + return; + } + + Random rnd(301); + TableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kSnappyCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc new file mode 100644 index 0000000..24a1241 --- /dev/null +++ b/table/two_level_iterator.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "leveldb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "table/iterator_wrapper.h" + +namespace leveldb { + +namespace { + +typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); + +class TwoLevelIterator: public Iterator { + public: + TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options); + + virtual ~TwoLevelIterator(); + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return data_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return data_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return data_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!index_iter_.status().ok()) { + return index_iter_.status(); + } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { + return data_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetDataIterator(Iterator* data_iter); + void InitDataBlock(); + + BlockFunction block_function_; + void* arg_; + const ReadOptions options_; + Status status_; + IteratorWrapper index_iter_; + IteratorWrapper data_iter_; // May be NULL + // If data_iter_ is non-NULL, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the data_iter_. + std::string data_block_handle_; +}; + +TwoLevelIterator::TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) + : block_function_(block_function), + arg_(arg), + options_(options), + index_iter_(index_iter), + data_iter_(NULL) { +} + +TwoLevelIterator::~TwoLevelIterator() { +} + +void TwoLevelIterator::Seek(const Slice& target) { + index_iter_.Seek(target); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.Seek(target); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + index_iter_.SeekToFirst(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + index_iter_.SeekToLast(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + data_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + data_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Next(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Prev(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + } +} + +void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { + if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); + data_iter_.Set(data_iter); +} + +void TwoLevelIterator::InitDataBlock() { + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + } else { + Slice handle = index_iter_.value(); + if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { + // data_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + Iterator* iter = (*block_function_)(arg_, options_, handle); + data_block_handle_.assign(handle.data(), handle.size()); + SetDataIterator(iter); + } + } +} + +} + +Iterator* NewTwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) { + return new TwoLevelIterator(index_iter, block_function, arg, options); +} + +} diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h new file mode 100644 index 0000000..5909e2b --- /dev/null +++ b/table/two_level_iterator.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ +#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ + +#include "leveldb/iterator.h" + +namespace leveldb { + +struct ReadOptions; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +extern Iterator* NewTwoLevelIterator( + Iterator* index_iter, + Iterator* (*block_function)( + void* arg, + const ReadOptions& options, + const Slice& index_value), + void* arg, + const ReadOptions& options); + +} + +#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/util/arena.cc b/util/arena.cc new file mode 100644 index 0000000..4bf6e36 --- /dev/null +++ b/util/arena.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include + +namespace leveldb { + +static const int kBlockSize = 4096; + +Arena::Arena() { + blocks_memory_ = 0; + alloc_ptr_ = NULL; // First allocation will allocate a block + alloc_bytes_remaining_ = 0; +} + +Arena::~Arena() { + for (int i = 0; i < blocks_.size(); i++) { + delete[] blocks_[i]; + } +} + +char* Arena::AllocateFallback(size_t bytes) { + if (bytes > kBlockSize / 4) { + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + char* result = AllocateNewBlock(bytes); + return result; + } + + // We waste the remaining space in the current block. + alloc_ptr_ = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize; + + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; +} + +char* Arena::AllocateAligned(size_t bytes) { + const int align = sizeof(void*); // We'll align to pointer size + assert((align & (align-1)) == 0); // Pointer size should be a power of 2 + size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); + size_t slop = (current_mod == 0 ? 0 : align - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = alloc_ptr_ + slop; + alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returned aligned memory + result = AllocateFallback(bytes); + } + assert((reinterpret_cast(result) & (align-1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + char* result = new char[block_bytes]; + blocks_memory_ += block_bytes; + blocks_.push_back(result); + return result; +} + +} diff --git a/util/arena.h b/util/arena.h new file mode 100644 index 0000000..fcb5d5b --- /dev/null +++ b/util/arena.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ +#define STORAGE_LEVELDB_UTIL_ARENA_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Arena { + public: + Arena(); + ~Arena(); + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + char* Allocate(size_t bytes); + + // Allocate memory with the normal alignment guarantees provided by malloc + char* AllocateAligned(size_t bytes); + + // Returns an estimate of the total memory usage of data allocated + // by the arena (including space allocated but not yet used for user + // allocations). + size_t MemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*); + } + + private: + char* AllocateFallback(size_t bytes); + char* AllocateNewBlock(size_t block_bytes); + + // Allocation state + char* alloc_ptr_; + size_t alloc_bytes_remaining_; + + // Array of new[] allocated memory blocks + std::vector blocks_; + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; + + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; + } + return AllocateFallback(bytes); +} + +} + +#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/util/arena_test.cc b/util/arena_test.cc new file mode 100644 index 0000000..c33b552 --- /dev/null +++ b/util/arena_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" + +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { + +class ArenaTest { }; + +TEST(ArenaTest, Empty) { + Arena arena; +} + +TEST(ArenaTest, Simple) { + std::vector > allocated; + Arena arena; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) ? rnd.Uniform(6000) : + (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. + s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.MemoryUsage(), bytes); + if (i > N/10) { + ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); + } + } + for (int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, i % 256); + } + } +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/cache.cc b/util/cache.cc new file mode 100644 index 0000000..d8a4426 --- /dev/null +++ b/util/cache.cc @@ -0,0 +1,253 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) +#include +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +#include "base/hash_tables.h" +#else +#include // TODO(sanjay): Switch to unordered_set when possible. +#endif + +#include + +#include "leveldb/cache.h" +#include "port/port.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace leveldb { + +Cache::~Cache() { +} + +namespace { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + void (*deleter)(const Slice&, void* value); + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + size_t refs; // TODO(opt): Pack with "key_length"? + char key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// Pick a platform specific hash_set instantiation +#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) + // Microsoft's hash_set deviates from the standard. See + // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx + // for details. Basically the 2 param () operator is a less than and + // the 1 param () operator is a hash function. + struct HandleHashCompare : public stdext::hash_compare { + size_t operator() (LRUHandle* h) const { + Slice k = h->key(); + return Hash(k.data(), k.size(), 0); + } + bool operator() (LRUHandle* a, LRUHandle* b) const { + return a->key().compare(b->key()) < 0; + } + }; + typedef base::hash_set HandleTable; +#else + struct HandleHash { + inline size_t operator()(LRUHandle* h) const { + Slice k = h->key(); + return Hash(k.data(), k.size(), 0); + } + }; + + struct HandleEq { + inline bool operator()(LRUHandle* a, LRUHandle* b) const { + return a->key() == b->key(); + } + }; +# if defined(LEVELDB_PLATFORM_CHROMIUM) + typedef base::hash_set HandleTable; +# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) + typedef std::unordered_set HandleTable; +# else + typedef __gnu_cxx::hash_set HandleTable; +# endif +#endif + +class LRUCache : public Cache { + public: + explicit LRUCache(size_t capacity); + virtual ~LRUCache(); + + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)); + virtual Handle* Lookup(const Slice& key); + virtual void Release(Handle* handle); + virtual void* Value(Handle* handle); + virtual void Erase(const Slice& key); + virtual uint64_t NewId(); + + private: + void LRU_Remove(LRUHandle* e); + void LRU_Append(LRUHandle* e); + void Unref(LRUHandle* e); + + // Constructor parameters + const size_t capacity_; + + // mutex_ protects the following state. + port::Mutex mutex_; + size_t usage_; + uint64_t last_id_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + LRUHandle lru_; + + HandleTable table_; +}; + +LRUCache::LRUCache(size_t capacity) + : capacity_(capacity), + usage_(0), + last_id_(0) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +LRUCache::~LRUCache() { + table_.clear(); + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + assert(e->refs == 1); // Error if caller has an unreleased handle + Unref(e); + e = next; + } +} + +void LRUCache::Unref(LRUHandle* e) { + assert(e->refs > 0); + e->refs--; + if (e->refs <= 0) { + usage_ -= e->charge; + (*e->deleter)(e->key(), e->value); + free(e); + } +} + +void LRUCache::LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; +} + +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; +} + +Cache::Handle* LRUCache::Lookup(const Slice& key) { + MutexLock l(&mutex_); + + LRUHandle dummy; + dummy.next = &dummy; + dummy.value = const_cast(&key); + HandleTable::iterator iter = table_.find(&dummy); + if (iter == table_.end()) { + return NULL; + } else { + LRUHandle* e = const_cast(*iter); + e->refs++; + LRU_Remove(e); + LRU_Append(e); + return reinterpret_cast(e); + } +} + +void* LRUCache::Value(Handle* handle) { + return reinterpret_cast(handle)->value; +} + +void LRUCache::Release(Handle* handle) { + MutexLock l(&mutex_); + Unref(reinterpret_cast(handle)); +} + +Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + MutexLock l(&mutex_); + + LRUHandle* e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); + LRU_Append(e); + usage_ += charge; + + std::pair p = table_.insert(e); + if (!p.second) { + // Kill existing entry + LRUHandle* old = const_cast(*(p.first)); + LRU_Remove(old); + table_.erase(p.first); + table_.insert(e); + Unref(old); + } + + while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.erase(old); + Unref(old); + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key) { + MutexLock l(&mutex_); + + LRUHandle dummy; + dummy.next = &dummy; + dummy.value = const_cast(&key); + HandleTable::iterator iter = table_.find(&dummy); + if (iter != table_.end()) { + LRUHandle* e = const_cast(*iter); + LRU_Remove(e); + table_.erase(iter); + Unref(e); + } +} + +uint64_t LRUCache::NewId() { + MutexLock l(&mutex_); + return ++(last_id_); +} + +} // end anonymous namespace + +Cache* NewLRUCache(size_t capacity) { + return new LRUCache(capacity); +} + +} diff --git a/util/cache_test.cc b/util/cache_test.cc new file mode 100644 index 0000000..dbab988 --- /dev/null +++ b/util/cache_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/cache.h" + +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace leveldb { + +// Conversions between numeric keys/values and the types expected by Cache. +static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 100; + std::vector deleted_keys_; + std::vector deleted_values_; + Cache* cache_; + + CacheTest() : cache_(NewLRUCache(kCacheSize)) { + current_ = this; + } + + ~CacheTest() { + delete cache_; + } + + int Lookup(int key) { + Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); + const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); + if (handle != NULL) { + cache_->Release(handle); + } + return r; + } + + void Insert(int key, int value, int charge = 1) { + cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(int key) { + cache_->Erase(EncodeKey(key)); + } +}; +CacheTest* CacheTest::current_; + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(200, deleted_keys_[0]); + ASSERT_EQ(201, deleted_values_[0]); +} + +TEST(CacheTest, HeavyEntry) { + Insert(100, 101); + Insert(200, 201, kCacheSize); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/coding.cc b/util/coding.cc new file mode 100644 index 0000000..680e2ad --- /dev/null +++ b/util/coding.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +namespace leveldb { + +void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast(ptr); +} + +void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B-1)) | B; + v >>= 7; + } + *(ptr++) = v; + return reinterpret_cast(ptr); +} + +void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len; + p = GetVarint32Ptr(p, limit, &len); + if (p == NULL) return NULL; + if (p + len > limit) return NULL; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +} diff --git a/util/coding.h b/util/coding.h new file mode 100644 index 0000000..8755968 --- /dev/null +++ b/util/coding.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include +#include +#include +#include "leveldb/slice.h" +#include "port/port.h" + +namespace leveldb { + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(ptr[0])) + | (static_cast(ptr[1]) << 8) + | (static_cast(ptr[2]) << 16) + | (static_cast(ptr[3]) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +} + +#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/util/coding_test.cc b/util/coding_test.cc new file mode 100644 index 0000000..a8dba04 --- /dev/null +++ b/util/coding_test.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +#include "util/testharness.h" + +namespace leveldb { + +class Coding { }; + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != NULL); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != NULL); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == NULL); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); + } + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) + == NULL); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); + } + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/comparator.cc b/util/comparator.cc new file mode 100644 index 0000000..e2b27e3 --- /dev/null +++ b/util/comparator.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "leveldb/comparator.h" +#include "leveldb/slice.h" +#include "util/logging.h" + +namespace leveldb { + +Comparator::~Comparator() { } + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast((*start)[diff_index]); + if (diff_byte < static_cast(0xff) && + diff_byte + 1 < static_cast(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (int i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} +static const BytewiseComparatorImpl bytewise; + +const Comparator* BytewiseComparator() { + return &bytewise; +} + +} diff --git a/util/crc32c.cc b/util/crc32c.cc new file mode 100644 index 0000000..28c2401 --- /dev/null +++ b/util/crc32c.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. + +#include "util/crc32c.h" + +#include +#include "util/coding.h" + +namespace leveldb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) +#define STEP4 do { \ + uint32_t c = l ^ LE_LOAD32(p); \ + p += 4; \ + l = table3_[c & 0xff] ^ \ + table2_[(c >> 8) & 0xff] ^ \ + table1_[(c >> 16) & 0xff] ^ \ + table0_[c >> 24]; \ +} while (0) + + // Point x at first 4-byte aligned byte in string. This might be + // just past the end of the string. + const uintptr_t pval = reinterpret_cast(p); + const uint8_t* x = reinterpret_cast(((pval + 3) >> 2) << 2); + if (x <= e) { + // Process bytes until finished or p is 4-byte aligned + while (p != x) { + STEP1; + } + } + // Process bytes 16 at a time + while ((e-p) >= 16) { + STEP4; STEP4; STEP4; STEP4; + } + // Process bytes 4 at a time + while ((e-p) >= 4) { + STEP4; + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP4 +#undef STEP1 + return l ^ 0xffffffffu; +} + +} +} diff --git a/util/crc32c.h b/util/crc32c.h new file mode 100644 index 0000000..938d8ff --- /dev/null +++ b/util/crc32c.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ +#define STORAGE_LEVELDB_UTIL_CRC32C_H_ + +#include +#include + +namespace leveldb { +namespace crc32c { + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. +extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { + return Extend(0, data, n); +} + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. +inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. + return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. +inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} +} + +#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_ diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc new file mode 100644 index 0000000..ba9e804 --- /dev/null +++ b/util/crc32c_test.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/crc32c.h" +#include "util/testharness.h" + +namespace leveldb { +namespace crc32c { + +class CRC { }; + +TEST(CRC, StandardResults) { + // From rfc3720 section B.4. + char buf[32]; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf))); + + memset(buf, 0xff, sizeof(buf)); + ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = i; + } + ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = 31 - i; + } + ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf))); + + unsigned char data[48] = { + 0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + }; + ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); +} + +TEST(CRC, Values) { + ASSERT_NE(Value("a", 1), Value("foo", 3)); +} + +TEST(CRC, Extend) { + ASSERT_EQ(Value("hello world", 11), + Extend(Value("hello ", 6), "world", 5)); +} + +TEST(CRC, Mask) { + uint32_t crc = Value("foo", 3); + ASSERT_NE(crc, Mask(crc)); + ASSERT_NE(crc, Mask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/env.cc b/util/env.cc new file mode 100644 index 0000000..e5297e7 --- /dev/null +++ b/util/env.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/env.h" + +namespace leveldb { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +FileLock::~FileLock() { +} + +void Log(Env* env, WritableFile* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + env->Logv(info_log, format, ap); + va_end(ap); +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + WritableFile* file; + Status s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok()) { + s = file->Close(); + } + delete file; // Will auto-close if we did not close above + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + data->clear(); + SequentialFile* file; + Status s = env->NewSequentialFile(fname, &file); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + delete file; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +} diff --git a/util/env_chromium.cc b/util/env_chromium.cc new file mode 100644 index 0000000..7edc7a9 --- /dev/null +++ b/util/env_chromium.cc @@ -0,0 +1,603 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include "base/at_exit.h" +#include "base/file_path.h" +#include "base/file_util.h" +#include "base/lazy_instance.h" +#include "base/memory/ref_counted.h" +#include "base/message_loop.h" +#include "base/platform_file.h" +#include "base/process_util.h" +#include "base/synchronization/lock.h" +#include "base/sys_info.h" +#include "base/task.h" +#include "base/threading/platform_thread.h" +#include "base/threading/thread.h" +#include "base/utf_string_conversions.h" +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +#if defined(OS_WIN) +#include +#include "base/win/win_util.h" +#endif + +#if defined(OS_MACOSX) || defined(OS_WIN) +// The following are glibc-specific +extern "C" { +size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { + return fread(ptr, size, n, file); +} + +size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { + return fwrite(ptr, size, n, file); +} + +int fflush_unlocked(FILE *file) { + return fflush(file); +} + +int fdatasync(int fildes) { +#if defined(OS_WIN) + return _commit(fildes); +#else + return fsync(fildes); +#endif +} +} +#endif + +namespace leveldb { + +namespace { + +class Thread; + +static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] + = FILE_PATH_LITERAL("leveldb-test-"); + +::FilePath CreateFilePath(const std::string& file_path) { +#if defined(OS_WIN) + return FilePath(UTF8ToUTF16(file_path)); +#else + return FilePath(file_path); +#endif +} + +std::string FilePathToString(const ::FilePath& file_path) { +#if defined(OS_WIN) + return UTF16ToUTF8(file_path.value()); +#else + return file_path.value(); +#endif +} + +// TODO(jorlow): This should be moved into Chromium's base. +const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { + switch (error) { + case ::base::PLATFORM_FILE_ERROR_FAILED: + return "Opening file failed."; + case ::base::PLATFORM_FILE_ERROR_IN_USE: + return "File currently in use."; + case ::base::PLATFORM_FILE_ERROR_EXISTS: + return "File already exists."; + case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: + return "File not found."; + case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: + return "Access denied."; + case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: + return "Too many files open."; + case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: + return "Out of memory."; + case ::base::PLATFORM_FILE_ERROR_NO_SPACE: + return "No space left on drive."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: + return "Not a directory."; + case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: + return "Invalid operation."; + case ::base::PLATFORM_FILE_ERROR_SECURITY: + return "Security error."; + case ::base::PLATFORM_FILE_ERROR_ABORT: + return "File operation aborted."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: + return "The supplied path was not a file."; + case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: + return "The file was not empty."; + } + NOTIMPLEMENTED(); + return "Unknown error."; +} + +class ChromiumSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + ChromiumSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~ChromiumSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + } + return s; + } +}; + +class ChromiumRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + ::base::PlatformFile file_; + + public: + ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) + : filename_(fname), file_(file) { } + virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + int r = ::base::ReadPlatformFile(file_, offset, scratch, n); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = Status::IOError(filename_, "Could not preform read"); + } + return s; + } +}; + +class ChromiumWritableFile : public WritableFile { + private: + std::string filename_; + FILE* file_; + + public: + ChromiumWritableFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + + ~ChromiumWritableFile() { + if (file_ != NULL) { + // Ignoring any potential errors + fclose(file_); + } + } + + virtual Status Append(const Slice& data) { + size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); + Status result; + if (r != data.size()) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } + + virtual Status Close() { + Status result; + if (fclose(file_) != 0) { + result = Status::IOError(filename_, strerror(errno)); + } + file_ = NULL; + return result; + } + + virtual Status Flush() { + Status result; + if (fflush_unlocked(file_) != 0) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } + + virtual Status Sync() { + Status result; + if ((fflush_unlocked(file_) != 0) || + (fdatasync(fileno(file_)) != 0)) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } +}; + +class ChromiumFileLock : public FileLock { + public: + ::base::PlatformFile file_; +}; + +class ChromiumEnv : public Env { + public: + ChromiumEnv(); + virtual ~ChromiumEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "rb"); + if (f == NULL) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } else { + *result = new ChromiumSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; + bool created; + ::base::PlatformFileError error_code; + ::base::PlatformFile file = ::base::CreatePlatformFile( + CreateFilePath(fname), flags, &created, &error_code); + if (error_code != ::base::PLATFORM_FILE_OK) { + *result = NULL; + return Status::IOError(fname, PlatformFileErrorString(error_code)); + } + *result = new ChromiumRandomAccessFile(fname, file); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + *result = NULL; + FILE* f = fopen(fname.c_str(), "wb"); + if (f == NULL) { + return Status::IOError(fname, strerror(errno)); + } else { + *result = new ChromiumWritableFile(fname, f); + return Status::OK(); + } + } + + virtual bool FileExists(const std::string& fname) { + return ::file_util::PathExists(CreateFilePath(fname)); + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + result->clear(); + ::file_util::FileEnumerator iter( + CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); + ::FilePath current = iter.Next(); + while (!current.empty()) { + result->push_back(FilePathToString(current.BaseName())); + current = iter.Next(); + } + // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so + // we'll always return OK. Maybe manually check for error + // conditions like the file not existing? + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + // TODO(jorlow): Should we assert this is a file? + if (!::file_util::Delete(CreateFilePath(fname), false)) { + result = Status::IOError(fname, "Could not delete file."); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (!::file_util::CreateDirectory(CreateFilePath(name))) { + result = Status::IOError(name, "Could not create directory."); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + // TODO(jorlow): Should we assert this is a directory? + if (!::file_util::Delete(CreateFilePath(name), false)) { + result = Status::IOError(name, "Could not delete directory."); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + int64_t signed_size; + if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { + *size = 0; + s = Status::IOError(fname, "Could not determine file size."); + } else { + *size = static_cast(signed_size); + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& dst) { + Status result; + if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { + result = Status::IOError(src, "Could not rename file."); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | + ::base::PLATFORM_FILE_READ | + ::base::PLATFORM_FILE_WRITE | + ::base::PLATFORM_FILE_EXCLUSIVE_READ | + ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; + bool created; + ::base::PlatformFileError error_code; + ::base::PlatformFile file = ::base::CreatePlatformFile( + CreateFilePath(fname), flags, &created, &error_code); + if (error_code != ::base::PLATFORM_FILE_OK) { + result = Status::IOError(fname, PlatformFileErrorString(error_code)); + } else { + ChromiumFileLock* my_lock = new ChromiumFileLock; + my_lock->file_ = file; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + ChromiumFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (!::base::ClosePlatformFile(my_lock->file_)) { + result = Status::IOError("Could not close lock file."); + } + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual std::string UserIdentifier() { +#if defined(OS_WIN) + std::wstring user_sid; + bool ret = ::base::win::GetUserSidString(&user_sid); + DCHECK(ret); + return UTF16ToUTF8(user_sid); +#else + char buf[100]; + snprintf(buf, sizeof(buf), "%d", int(geteuid())); + return buf; +#endif + } + + virtual Status GetTestDirectory(std::string* path) { + mu_.Acquire(); + if (test_directory_.empty()) { + if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, + &test_directory_)) { + mu_.Release(); + return Status::IOError("Could not create temp directory."); + } + } + *path = FilePathToString(test_directory_); + mu_.Release(); + return Status::OK(); + } + + virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { + // TODO(jorlow): We may want to just use Chromium's built in logging. + + uint64_t thread_id = 0; + // Coppied from base/logging.cc. +#if defined(OS_WIN) + thread_id = GetCurrentThreadId(); +#elif defined(OS_MACOSX) + thread_id = mach_thread_self(); +#elif defined(OS_LINUX) + thread_id = syscall(__NR_gettid); +#elif defined(OS_FREEBSD) || defined(OS_NACL) + // TODO(BSD): find a better thread ID + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); +#endif + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + ::base::Time::Exploded t; + ::base::Time::Now().LocalExplode(&t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.year, + t.month, + t.day_of_month, + t.hour, + t.minute, + t.second, + static_cast(t.millisecond) * 1000, + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + info_log->Append(Slice(base, p - base)); + info_log->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { + ::base::Time::Exploded t; + ::base::Time::Now().LocalExplode(&t); + return snprintf(buffer, size, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d", + t.year, + t.month, + t.day_of_month, + t.hour, + t.minute, + t.second, + static_cast(t.millisecond) * 1000); + } + + virtual uint64_t NowMicros() { + return ::base::TimeTicks::HighResNow().ToInternalValue(); + } + + virtual void SleepForMicroseconds(int micros) { + // Round up to the next millisecond. + ::base::PlatformThread::Sleep((micros + 999) / 1000); + } + + private: + // BGThread() is the body of the background thread + void BGThread(); + static void BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + } + + FilePath test_directory_; + + size_t page_size_; + ::base::Lock mu_; + ::base::ConditionVariable bgsignal_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + BGQueue queue_; +}; + +ChromiumEnv::ChromiumEnv() + : page_size_(::base::SysInfo::VMAllocationGranularity()), + bgsignal_(&mu_), + started_bgthread_(false) { +#if defined(OS_MACOSX) + ::base::EnableTerminationOnHeapCorruption(); + ::base::EnableTerminationOnOutOfMemory(); +#endif // OS_MACOSX +} + +class Thread : public ::base::PlatformThread::Delegate { + public: + Thread(void (*function)(void* arg), void* arg) + : function_(function), arg_(arg) { + ::base::PlatformThreadHandle handle; + bool success = ::base::PlatformThread::Create(0, this, &handle); + DCHECK(success); + } + virtual ~Thread() {} + virtual void ThreadMain() { + (*function_)(arg_); + delete this; + } + + private: + void (*function_)(void* arg); + void* arg_; +}; + +void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { + mu_.Acquire(); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + StartThread(&ChromiumEnv::BGThreadWrapper, this); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. + if (queue_.empty()) { + bgsignal_.Signal(); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + mu_.Release(); +} + +void ChromiumEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + mu_.Acquire(); + while (queue_.empty()) { + bgsignal_.Wait(); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + mu_.Release(); + (*function)(arg); + } +} + +void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { + new Thread(function, arg); // Will self-delete. +} + +::base::LazyInstance > + default_env(::base::LINKER_INITIALIZED); + +} + +Env* Env::Default() { + return default_env.Pointer(); +} + +} diff --git a/util/env_posix.cc b/util/env_posix.cc new file mode 100644 index 0000000..5cddb0c --- /dev/null +++ b/util/env_posix.cc @@ -0,0 +1,599 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(LEVELDB_PLATFORM_ANDROID) +#include +#endif +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace leveldb { + +namespace { + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + } + return s; + } +}; + +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + int fd_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd) + : filename_(fname), fd_(fd) { } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + return s; + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + + // Have we done an munmap of unsynced data? + bool pending_sync_; + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + void UnmapCurrentRegion() { + if (base_ != NULL) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + munmap(base_, limit_ - base_); + file_offset_ += limit_ - base_; + base_ = NULL; + limit_ = NULL; + last_sync_ = NULL; + dst_ = NULL; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + } + + bool MapNewRegion() { + assert(base_ == NULL); + if (ftruncate(fd_, file_offset_ + map_size_) < 0) { + return false; + } + void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return false; + } + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return true; + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(NULL), + limit_(NULL), + dst_(NULL), + last_sync_(NULL), + file_offset_(0), + pending_sync_(false) { + assert((page_size & (page_size - 1)) == 0); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + UnmapCurrentRegion(); + MapNewRegion(); + } + + size_t n = (left <= avail) ? left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + UnmapCurrentRegion(); + if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + fd_ = -1; + base_ = NULL; + limit_ = NULL; + return s; + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + return s; + } +}; + +static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); +} + +class PosixFileLock : public FileLock { + public: + int fd_; +}; + +class PosixEnv : public Env { + public: + PosixEnv(); + virtual ~PosixEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "r"); + if (f == NULL) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } else { + *result = new PosixSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } + *result = new PosixRandomAccessFile(fname, fd); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = Status::IOError(fname, strerror(errno)); + } else { + *result = new PosixMmapFile(fname, fd, page_size_); + } + return s; + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == NULL) { + return Status::IOError(dir, strerror(errno)); + } + struct dirent* entry; + while ((entry = readdir(d)) != NULL) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = Status::IOError(fname, strerror(errno)); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = Status::IOError(name, strerror(errno)); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = Status::IOError(name, strerror(errno)); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = Status::IOError(fname, strerror(errno)); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = Status::IOError(src, strerror(errno)); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = Status::IOError(fname, strerror(errno)); + } else if (LockOrUnlock(fd, true) == -1) { + result = Status::IOError("lock " + fname, strerror(errno)); + close(fd); + } else { + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = Status::IOError(strerror(errno)); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, NULL); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + info_log->Append(Slice(base, p - base)); + info_log->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + + private: + void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } + } + + // BGThread() is the body of the background thread + void BGThread(); + static void* BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + return NULL; + } + + size_t page_size_; + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + pthread_t bgthread_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + BGQueue queue_; +}; + +PosixEnv::PosixEnv() : page_size_(getpagesize()), + started_bgthread_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + PthreadCall( + "create thread", + pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. + if (queue_.empty()) { + PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty()) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return NULL; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, NULL, &StartThreadWrapper, state)); +} + +} + +static pthread_once_t once = PTHREAD_ONCE_INIT; +static Env* default_env; +static void InitDefaultEnv() { default_env = new PosixEnv; } + +Env* Env::Default() { + pthread_once(&once, InitDefaultEnv); + return default_env; +} + +} diff --git a/util/env_test.cc b/util/env_test.cc new file mode 100644 index 0000000..3c253be --- /dev/null +++ b/util/env_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/env.h" + +#include "port/port.h" +#include "util/testharness.h" + +namespace leveldb { + +static const int kDelayMicros = 100000; + +class EnvPosixTest { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } +}; + +static void SetBool(void* ptr) { + *(reinterpret_cast(ptr)) = true; +} + +TEST(EnvPosixTest, RunImmediately) { + bool called = false; + env_->Schedule(&SetBool, &called); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called); +} + +TEST(EnvPosixTest, RunMany) { + int last_id = 0; + + struct CB { + int* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback + + CB(int* p, int i) : last_id_ptr(p), id(i) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + ASSERT_EQ(cb->id-1, *cb->last_id_ptr); + *cb->last_id_ptr = cb->id; + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(4, last_id); +} + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST(EnvPosixTest, StartThread) { + State state; + state.val = 0; + state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); + } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/hash.cc b/util/hash.cc new file mode 100644 index 0000000..d19afd1 --- /dev/null +++ b/util/hash.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "util/coding.h" +#include "util/hash.h" + +namespace leveldb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = seed ^ (n * m); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + case 3: + h += data[2] << 16; + // fall through + case 2: + h += data[1] << 8; + // fall through + case 1: + h += data[0]; + h *= m; + h ^= (h >> r); + break; + } + return h; +} + + +} diff --git a/util/hash.h b/util/hash.h new file mode 100644 index 0000000..8889d56 --- /dev/null +++ b/util/hash.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Simple hash function used for internal data structures + +#ifndef STORAGE_LEVELDB_UTIL_HASH_H_ +#define STORAGE_LEVELDB_UTIL_HASH_H_ + +#include +#include + +namespace leveldb { + +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +} + +#endif // STORAGE_LEVELDB_UTIL_HASH_H_ diff --git a/util/histogram.cc b/util/histogram.cc new file mode 100644 index 0000000..c5178ef --- /dev/null +++ b/util/histogram.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include "port/port.h" +#include "util/histogram.h" + +namespace leveldb { + +const double Histogram::kBucketLimit[kNumBuckets] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, + 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, + 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, + 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, + 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, + 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, + 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, + 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, + 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, + 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, + 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, + 1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000, + 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0, + 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0, + 1e200, +}; + +void Histogram::Clear() { + min_ = kBucketLimit[kNumBuckets-1]; + max_ = 0; + num_ = 0; + sum_ = 0; + sum_squares_ = 0; + for (int i = 0; i < kNumBuckets; i++) { + buckets_[i] = 0; + } +} + +void Histogram::Add(double value) { + // Linear search is fast enough for our usage in db_bench + int b = 0; + while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { + b++; + } + buckets_[b] += 1.0; + if (min_ > value) min_ = value; + if (max_ < value) max_ = value; + num_++; + sum_ += value; + sum_squares_ += (value * value); +} + +double Histogram::Median() const { + return Percentile(50.0); +} + +double Histogram::Percentile(double p) const { + double threshold = num_ * (p / 100.0); + double sum = 0; + for (int b = 0; b < kNumBuckets; b++) { + sum += buckets_[b]; + if (sum >= threshold) { + // Scale linearly within this bucket + double left_point = (b == 0) ? 0 : kBucketLimit[b-1]; + double right_point = kBucketLimit[b]; + double left_sum = sum - buckets_[b]; + double right_sum = sum; + double pos = (threshold - left_sum) / (right_sum - left_sum); + double r = left_point + (right_point - left_point) * pos; + if (r < min_) r = min_; + if (r > max_) r = max_; + return r; + } + } + return max_; +} + +double Histogram::Average() const { + if (num_ == 0.0) return 0; + return sum_ / num_; +} + +double Histogram::StandardDeviation() const { + if (num_ == 0.0) return 0; + double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); + return sqrt(variance); +} + +std::string Histogram::ToString() const { + std::string r; + char buf[200]; + snprintf(buf, sizeof(buf), + "Count: %.0f Average: %.4f StdDev: %.2f\n", + num_, Average(), StandardDeviation()); + r.append(buf); + snprintf(buf, sizeof(buf), + "Min: %.4f Median: %.4f Max: %.4f\n", + (num_ == 0.0 ? 0.0 : min_), Median(), max_); + r.append(buf); + r.append("------------------------------------------------------\n"); + const double mult = 100.0 / num_; + double sum = 0; + for (int b = 0; b < kNumBuckets; b++) { + if (buckets_[b] <= 0.0) continue; + sum += buckets_[b]; + snprintf(buf, sizeof(buf), + "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ", + ((b == 0) ? 0.0 : kBucketLimit[b-1]), // left + kBucketLimit[b], // right + buckets_[b], // count + mult * buckets_[b], // percentage + mult * sum); // cumulative percentage + r.append(buf); + + // Add hash marks based on percentage; 20 marks for 100%. + int marks = static_cast(20*(buckets_[b] / num_) + 0.5); + r.append(marks, '#'); + r.push_back('\n'); + } + return r; +} + +} diff --git a/util/histogram.h b/util/histogram.h new file mode 100644 index 0000000..f72f122 --- /dev/null +++ b/util/histogram.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ +#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ + +#include + +namespace leveldb { + +class Histogram { + public: + Histogram() { } + ~Histogram() { } + + void Clear(); + void Add(double value); + + std::string ToString() const; + + private: + double min_; + double max_; + double num_; + double sum_; + double sum_squares_; + + enum { kNumBuckets = 154 }; + static const double kBucketLimit[kNumBuckets]; + double buckets_[kNumBuckets]; + + double Median() const; + double Percentile(double p) const; + double Average() const; + double StandardDeviation() const; +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ diff --git a/util/logging.cc b/util/logging.cc new file mode 100644 index 0000000..5c9bd4a --- /dev/null +++ b/util/logging.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/logging.h" + +#include +#include +#include +#include +#include "leveldb/env.h" +#include "leveldb/slice.h" + +namespace leveldb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (int i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 0000000..1cd0a4b --- /dev/null +++ b/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ +#define STORAGE_LEVELDB_UTIL_LOGGING_H_ + +#include +#include +#include +#include "port/port.h" + +namespace leveldb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} + +#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/util/mutexlock.h b/util/mutexlock.h new file mode 100644 index 0000000..05fe279 --- /dev/null +++ b/util/mutexlock.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ +#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ + +#include "port/port.h" + +namespace leveldb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... +// } + +class MutexLock { + public: + explicit MutexLock(port::Mutex *mu) : mu_(mu) { + this->mu_->Lock(); + } + ~MutexLock() { this->mu_->Unlock(); } + + private: + port::Mutex *const mu_; + // No copying allowed + MutexLock(const MutexLock&); + void operator=(const MutexLock&); +}; + +} + + +#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/util/options.cc b/util/options.cc new file mode 100644 index 0000000..29272fe --- /dev/null +++ b/util/options.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/options.h" + +#include "leveldb/comparator.h" +#include "leveldb/env.h" + +namespace leveldb { + +Options::Options() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(NULL), + write_buffer_size(4<<20), + max_open_files(1000), + large_value_threshold(65536), + block_cache(NULL), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression) { +} + + +} diff --git a/util/random.h b/util/random.h new file mode 100644 index 0000000..2d458e8 --- /dev/null +++ b/util/random.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ +#define STORAGE_LEVELDB_UTIL_RANDOM_H_ + +#include + +namespace leveldb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = (product >> 31) + (product & M); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/util/status.cc b/util/status.cc new file mode 100644 index 0000000..d9b7195 --- /dev/null +++ b/util/status.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "port/port.h" +#include "leveldb/status.h" + +namespace leveldb { + +Status::Status(Code code, const Slice& msg, const Slice& msg2) { + assert(code != kOk); + state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); + if (!msg2.empty()) { + state_->second.append(": "); + state_->second.append(msg2.data(), msg2.size()); + } +} + +std::string Status::ToString() const { + if (state_ == NULL) { + return "OK"; + } else { + char tmp[30]; + const char* type; + switch (state_->first) { + case kOk: + type = "OK"; + break; + case kNotFound: + type = "NotFound"; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(state_->first)); + type = tmp; + break; + } + std::string result(type); + if (!state_->second.empty()) { + result.append(state_->second); + } + return result; + } +} + +} diff --git a/util/testharness.cc b/util/testharness.cc new file mode 100644 index 0000000..b686ac3 --- /dev/null +++ b/util/testharness.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testharness.h" + +#include +#include + +namespace leveldb { +namespace test { + +namespace { +struct Test { + const char* base; + const char* name; + void (*func)(); +}; +std::vector* tests; +} + +bool RegisterTest(const char* base, const char* name, void (*func)()) { + if (tests == NULL) { + tests = new std::vector; + } + Test t; + t.base = base; + t.name = name; + t.func = func; + tests->push_back(t); + return true; +} + +int RunAllTests() { + int num = 0; + if (tests != NULL) { + for (int i = 0; i < tests->size(); i++) { + const Test& t = (*tests)[i]; + fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); + (*t.func)(); + ++num; + } + } + fprintf(stderr, "==== PASSED %d tests\n", num); + return 0; +} + +std::string TmpDir() { + std::string dir; + Status s = Env::Default()->GetTestDirectory(&dir); + ASSERT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != NULL ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} +} diff --git a/util/testharness.h b/util/testharness.h new file mode 100644 index 0000000..13ab914 --- /dev/null +++ b/util/testharness.h @@ -0,0 +1,129 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ +#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ + +#include +#include +#include +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "util/random.h" + +namespace leveldb { +namespace test { + +// Run all tests registered by the TEST() macro. +// Returns 0 if all tests pass. +// Dies or returns a non-zero value if some test fails. +extern int RunAllTests(); + +// Return the directory to use for temporary storage. +extern std::string TmpDir(); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +extern int RandomSeed(); + +// An instance of Tester is allocated to hold temporary state during +// the execution of an assertion. +class Tester { + private: + bool ok_; + const char* fname_; + int line_; + std::stringstream ss_; + + public: + Tester(const char* f, int l) + : ok_(true), fname_(f), line_(l) { + } + + ~Tester() { + if (!ok_) { + fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + exit(1); + } + } + + Tester& Is(bool b, const char* msg) { + if (!b) { + ss_ << " Assertion failure " << msg; + ok_ = false; + } + return *this; + } + + Tester& IsOk(const Status& s) { + if (!s.ok()) { + ss_ << " " << s.ToString(); + ok_ = false; + } + return *this; + } + +#define BINARY_OP(name,op) \ + template \ + Tester& name(const X& x, const Y& y) { \ + if (! (x op y)) { \ + ss_ << " failed: " << x << (" " #op " ") << y; \ + ok_ = false; \ + } \ + return *this; \ + } + + BINARY_OP(IsEq, ==) + BINARY_OP(IsNe, !=) + BINARY_OP(IsGe, >=) + BINARY_OP(IsGt, >) + BINARY_OP(IsLe, <=) + BINARY_OP(IsLt, <) +#undef BINARY_OP + + // Attach the specified value to the error message if an error has occurred + template + Tester& operator<<(const V& value) { + if (!ok_) { + ss_ << " " << value; + } + return *this; + } +}; + +#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) +#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) +#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) +#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) +#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) +#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) + +#define TCONCAT(a,b) TCONCAT1(a,b) +#define TCONCAT1(a,b) a##b + +#define TEST(base,name) \ +class TCONCAT(_Test_,name) : public base { \ + public: \ + void _Run(); \ + static void _RunIt() { \ + TCONCAT(_Test_,name) t; \ + t._Run(); \ + } \ +}; \ +bool TCONCAT(_Test_ignored_,name) = \ + ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ +void TCONCAT(_Test_,name)::_Run() + +// Register the specified test. Typically not used directly, but +// invoked via the macro expansion of TEST. +extern bool RegisterTest(const char* base, const char* name, void (*func)()); + + +} +} + +#endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ diff --git a/util/testutil.cc b/util/testutil.cc new file mode 100644 index 0000000..8d6cf3c --- /dev/null +++ b/util/testutil.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include "util/random.h" + +namespace leveldb { +namespace test { + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +std::string RandomKey(Random* rnd, int len) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = { + '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' + }; + std::string result; + for (int i = 0; i < len; i++) { + result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + } + return result; +} + + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +} +} diff --git a/util/testutil.h b/util/testutil.h new file mode 100644 index 0000000..a150c1a --- /dev/null +++ b/util/testutil.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_ +#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_ + +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "util/random.h" + +namespace leveldb { +namespace test { + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +extern std::string RandomKey(Random* rnd, int len); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + if (writable_file_error_) { + ++num_writable_file_errors_; + *result = NULL; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result); + } +}; + +} +} + +#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_