From 9013f13b1512f6ab8c04518e8f036e58be271eba Mon Sep 17 00:00:00 2001 From: Sanjay Ghemawat Date: Thu, 15 Mar 2012 09:14:00 -0700 Subject: [PATCH] use mmap on 64-bit machines to speed-up reads; small build fixes --- Makefile | 36 ++++++++++++++++++------------------ table/block.cc | 9 ++++++--- table/block.h | 6 ++++-- table/format.cc | 19 +++++++++++++------ table/format.h | 5 ++++- table/table.cc | 11 +++++++---- table/table_test.cc | 2 +- util/env_posix.cc | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 99 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d337905..c284d4c 100644 --- a/Makefile +++ b/Makefile @@ -124,65 +124,65 @@ db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS) db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/coding_test.o $(LIBOBJECTS) 
$(TESTHARNESS) -o $@ $(LDFLAGS) corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/version_edit_test.o 
$(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(MEMENVLIBRARY) : helpers/memenv/memenv.o rm -f $@ $(AR) -rs $@ helpers/memenv/memenv.o memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) - $(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ + $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and diff --git a/table/block.cc b/table/block.cc index 40aa318..06eb6f8 100644 --- a/table/block.cc +++ b/table/block.cc @@ -19,9 +19,10 @@ inline uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const char* data, size_t size) +Block::Block(const char* data, size_t size, bool take_ownership) : data_(data), - size_(size) { + size_(size), + owned_(take_ownership) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -35,7 +36,9 @@ Block::Block(const char* data, size_t size) } Block::~Block() { - delete[] data_; + if (owned_) { + delete[] data_; + } } // Helper routine: decode the next block entry starting at "p", diff --git a/table/block.h b/table/block.h index 9eb6f02..76088a4 100644 --- a/table/block.h +++ b/table/block.h @@ -16,8 +16,9 @@ class Comparator; class Block { public: // Initialize the block with the specified contents. 
- // Takes ownership of data[] and will delete[] it when done. - Block(const char* data, size_t size); + // Takes ownership of data[] and will delete[] it when done iff + // "take_ownership" is true. + Block(const char* data, size_t size, bool take_ownership); ~Block(); @@ -30,6 +31,7 @@ class Block { const char* data_; size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] // No copying allowed Block(const Block&); diff --git a/table/format.cc b/table/format.cc index 23728d8..25b85a2 100644 --- a/table/format.cc +++ b/table/format.cc @@ -66,8 +66,10 @@ Status Footer::DecodeFrom(Slice* input) { Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block) { + Block** block, + bool* may_cache) { *block = NULL; + *may_cache = false; // Read the block contents as well as the type/crc footer. // See table_builder.cc for the code that built this structure. @@ -100,8 +102,14 @@ Status ReadBlock(RandomAccessFile* file, case kNoCompression: if (data != buf) { // File implementation gave us pointer to some other data. - // Copy into buf[]. - memcpy(buf, data, n + kBlockTrailerSize); + // Use it directly under the assumption that it will be live + // while the file is open. 
+ delete[] buf; + *block = new Block(data, n, false /* do not take ownership */); + *may_cache = false; // Do not double-cache + } else { + *block = new Block(buf, n, true /* take ownership */); + *may_cache = true; } // Ok @@ -119,8 +127,8 @@ Status ReadBlock(RandomAccessFile* file, return Status::Corruption("corrupted compressed block contents"); } delete[] buf; - buf = ubuf; - n = ulength; + *block = new Block(ubuf, ulength, true /* take ownership */); + *may_cache = true; break; } default: @@ -128,7 +136,6 @@ Status ReadBlock(RandomAccessFile* file, return Status::Corruption("bad block type"); } - *block = new Block(buf, n); // Block takes ownership of buf[] return Status::OK(); } diff --git a/table/format.h b/table/format.h index 2a3e1ac..66a15da 100644 --- a/table/format.h +++ b/table/format.h @@ -86,10 +86,13 @@ static const size_t kBlockTrailerSize = 5; // Read the block identified by "handle" from "file". On success, // store a pointer to the heap-allocated result in *block and return // OK. On failure store NULL in *block and return non-OK. +// On success, stores true in *may_cache if the result may be +// cached, false if it must not be cached. extern Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block); + Block** block, + bool* may_cache); // Implementation details follow. Clients should ignore, diff --git a/table/table.cc b/table/table.cc index 5f9238e..07dcffd 100644 --- a/table/table.cc +++ b/table/table.cc @@ -49,7 +49,9 @@ Status Table::Open(const Options& options, // Read the index block Block* index_block = NULL; if (s.ok()) { - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); + bool may_cache; // Ignored result + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block, + &may_cache); } if (s.ok()) { @@ -105,6 +107,7 @@ Iterator* Table::BlockReader(void* arg, // can add more features in the future. 
if (s.ok()) { + bool may_cache; if (block_cache != NULL) { char cache_key_buffer[16]; EncodeFixed64(cache_key_buffer, table->rep_->cache_id); @@ -114,14 +117,14 @@ Iterator* Table::BlockReader(void* arg, if (cache_handle != NULL) { block = reinterpret_cast<Block*>(block_cache->Value(cache_handle)); } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - if (s.ok() && options.fill_cache) { + s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); + if (s.ok() && may_cache && options.fill_cache) { cache_handle = block_cache->Insert( key, block, block->size(), &DeleteCachedBlock); } } } else { - s = ReadBlock(table->rep_->file, options, handle, &block); + s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); } } diff --git a/table/table_test.cc b/table/table_test.cc index cd85b4b..0c8e676 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -205,7 +205,7 @@ class BlockConstructor: public Constructor { block_size_ = block_data.size(); char* block_data_copy = new char[block_size_]; memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_); + block_ = new Block(block_data_copy, block_size_, true /* take ownership */); return Status::OK(); } virtual size_t NumBytes() const { return block_size_; } diff --git a/util/env_posix.cc b/util/env_posix.cc index cc73348..cb1f6fc 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -66,6 +66,7 @@ class PosixSequentialFile: public SequentialFile { } }; +// pread() based random-access class PosixRandomAccessFile: public RandomAccessFile { private: std::string filename_; @@ -89,6 +90,32 @@ class PosixRandomAccessFile: public RandomAccessFile { } }; +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. 
+ PosixMmapReadableFile(const std::string& fname, void* base, size_t length) + : filename_(fname), mmapped_region_(base), length_(length) { } + virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset > length_ || n > length_ - offset) { // Overflow-safe bounds check + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); + } + return s; + } +}; + // We preallocate up to an extra megabyte and use memcpy to append new // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code @@ -297,13 +324,28 @@ class PosixEnv : public Env { virtual Status NewRandomAccessFile(const std::string& fname, RandomAccessFile** result) { + *result = NULL; + Status s; int fd = open(fname.c_str(), O_RDONLY); if (fd < 0) { - *result = NULL; - return IOError(fname, errno); + s = IOError(fname, errno); + } else if (sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + *result = new PosixMmapReadableFile(fname, base, size); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + *result = new PosixRandomAccessFile(fname, fd); } - *result = new PosixRandomAccessFile(fname, fd); - return Status::OK(); + return s; } virtual Status NewWritableFile(const std::string& fname,