diff --git a/db/builder.cc b/db/builder.cc index f3d0fe2..d5585c3 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -74,7 +74,9 @@ Status BuildTable(const std::string& dbname, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number); + Iterator* it = table_cache->NewIterator(ReadOptions(), + meta->number, + meta->file_size); s = it->status(); delete it; } diff --git a/db/db_bench.cc b/db/db_bench.cc index 7026ca1..c7a662d 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -354,7 +354,7 @@ class Benchmark { private: void Crc32c(int size, const char* label) { // Checksum about 500MB of data total - string data(size, 'x'); + std::string data(size, 'x'); int64_t bytes = 0; uint32_t crc = 0; while (bytes < 500 * 1048576) { @@ -371,7 +371,7 @@ class Benchmark { void SHA1(int size, const char* label) { // SHA1 about 100MB of data total - string data(size, 'x'); + std::string data(size, 'x'); int64_t bytes = 0; char sha1[20]; while (bytes < 100 * 1048576) { diff --git a/db/db_impl.cc b/db/db_impl.cc index 12c02b3..f14167a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -642,7 +642,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable - Iterator* iter = table_cache_->NewIterator(ReadOptions(),output_number); + Iterator* iter = table_cache_->NewIterator(ReadOptions(), + output_number, + current_bytes); s = iter->status(); delete iter; if (s.ok()) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 6726b51..beb4d74 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -340,8 +340,11 @@ void DBIter::ReadIndirectValue(Slice ref) const { std::string fname = LargeValueFileName(*dbname_, large_ref); RandomAccessFile* file; Status s = env_->NewRandomAccessFile(fname, &file); + uint64_t file_size = 0; + if (s.ok()) { + s = env_->GetFileSize(fname, &file_size); + } if (s.ok()) { - uint64_t file_size = file->Size(); uint64_t value_size = large_ref.ValueSize(); large_->value.resize(value_size); Slice result; diff --git a/db/log_reader.cc b/db/log_reader.cc index 39a6d2b..407700d 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -105,7 +105,7 @@ void Reader::ReportDrop(size_t bytes, const char* reason) { unsigned int Reader::ReadPhysicalRecord(Slice* result) { while (true) { - if (buffer_.size() <= kHeaderSize) { + if (buffer_.size() < kHeaderSize) { if (!eof_) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); @@ -124,12 +124,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { } else if (buffer_.size() == 0) { // End of file return kEof; - } else if (buffer_.size() < kHeaderSize) { + } else { ReportDrop(buffer_.size(), "truncated record at end of file"); buffer_.clear(); return kEof; - } else { - // We have a trailing zero-length record. Fall through and check it. } } diff --git a/db/log_writer.cc b/db/log_writer.cc index 465eca2..fc33e6e 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -35,18 +35,19 @@ Status Writer::AddRecord(const Slice& slice) { do { const int leftover = kBlockSize - block_offset_; assert(leftover >= 0); - if (leftover <= kHeaderSize) { + if (leftover < kHeaderSize) { // Switch to a new block if (leftover > 0) { - // Fill the trailer - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover)); + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); } block_offset_ = 0; } - // Invariant: we never leave <= kHeaderSize bytes in a block. + // Invariant: we never leave < kHeaderSize bytes in a block. const int avail = kBlockSize - block_offset_ - kHeaderSize; - assert(avail > 0); + assert(avail >= 0); const size_t fragment_length = (left < avail) ? left : avail; diff --git a/db/repair.cc b/db/repair.cc index 0727914..745b31a 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -261,7 +261,7 @@ class Repairer { Status status = env_->GetFileSize(fname, &t->meta.file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), t->meta.number); + ReadOptions(), t->meta.number, t->meta.file_size); bool empty = true; ParsedInternalKey parsed; t->max_sequence = 0; diff --git a/db/table_cache.cc b/db/table_cache.cc index 604298d..6f750d6 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -44,6 +44,7 @@ TableCache::~TableCache() { Iterator* TableCache::NewIterator(const ReadOptions& options, uint64_t file_number, + uint64_t file_size, Table** tableptr) { if (tableptr != NULL) { *tableptr = NULL; @@ -59,7 +60,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Table* table = NULL; Status s = env_->NewRandomAccessFile(fname, &file); if (s.ok()) { - s = Table::Open(*options_, file, &table); + s = Table::Open(*options_, file, file_size, &table); } if (!s.ok()) { diff --git a/db/table_cache.h b/db/table_cache.h index 6c357df..5564dfc 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -23,15 +23,16 @@ class TableCache { TableCache(const std::string& dbname, const Options* options, int entries); ~TableCache(); - // Get an iterator for the specified file number and return it. If - // "tableptr" is non-NULL, also sets "*tableptr" to point to the - // Table object underlying the returned iterator, or NULL if no - // Table object underlies the returned iterator. The returned - // "*tableptr" object is owned by the cache and should not be - // deleted, and is valid for as long as the returned iterator is - // live. + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). If "tableptr" is + // non-NULL, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or NULL if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. Iterator* NewIterator(const ReadOptions& options, uint64_t file_number, + uint64_t file_size, Table** tableptr = NULL); // Evict any entry for the specified file number diff --git a/db/version_set.cc b/db/version_set.cc index caf0b2d..b826e5b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -75,8 +75,8 @@ Version::~Version() { // An internal iterator. For a given version/level pair, yields // information about the files in the level. For a given entry, key() // is the largest key that occurs in the file, and value() is an -// 8-byte value containing the file number of the file, encoding using -// EncodeFixed64. +// 16-byte value containing the file number and file size, both +// encoded using EncodeFixed64. class Version::LevelFileNumIterator : public Iterator { public: LevelFileNumIterator(const Version* version, @@ -129,6 +129,7 @@ class Version::LevelFileNumIterator : public Iterator { Slice value() const { assert(Valid()); EncodeFixed64(value_buf_, (*flist_)[index_]->number); + EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); return Slice(value_buf_, sizeof(value_buf_)); } virtual Status status() const { return Status::OK(); } @@ -137,18 +138,21 @@ class Version::LevelFileNumIterator : public Iterator { const std::vector* const flist_; int index_; - mutable char value_buf_[8]; // Used for encoding the file number for value() + // Backing store for value(). Holds the file number and size. + mutable char value_buf_[16]; }; static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const Slice& file_value) { TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != 8) { + if (file_value.size() != 16) { return NewErrorIterator( Status::Corruption("FileReader invoked with unexpected value")); } else { - return cache->NewIterator(options, DecodeFixed64(file_value.data())); + return cache->NewIterator(options, + DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); } } @@ -164,7 +168,8 @@ void Version::AddIterators(const ReadOptions& options, // Merge all level zero files together since they may overlap for (int i = 0; i < files_[0].size(); i++) { iters->push_back( - vset_->table_cache_->NewIterator(options, files_[0][i]->number)); + vset_->table_cache_->NewIterator( + options, files_[0][i]->number, files_[0][i]->file_size)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -650,7 +655,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. Table* tableptr; Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, &tableptr); + ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); if (tableptr != NULL) { result += tableptr->ApproximateOffsetOf(ikey.Encode()); } @@ -855,7 +860,8 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (c->level() + which == 0) { const std::vector& files = c->inputs_[which]; for (int i = 0; i < files.size(); i++) { - list[num++] = table_cache_->NewIterator(options, files[i]->number); + list[num++] = table_cache_->NewIterator( + options, files[i]->number, files[i]->file_size); } } else { // Create concatenating iterator for the files from this level diff --git a/doc/log_format.txt b/doc/log_format.txt index 9a801d4..3a0414b 100644 --- a/doc/log_format.txt +++ b/doc/log_format.txt @@ -9,12 +9,15 @@ Each block consists of a sequence of records: type: uint8 // One of FULL, FIRST, MIDDLE, LAST data: uint8[length] -A record never starts within the last seven bytes of a block. Any -leftover bytes here form the trailer, which must consist entirely of -zero bytes and must be skipped by readers. In particular, even if -there are exactly seven bytes left in the block, and a zero-length -user record is added (which will fit in these seven bytes), the writer -must skip these trailer bytes and add the record to the next block. +A record never starts within the last six bytes of a block (since it +won't fit). Any leftover bytes here form the trailer, which must +consist entirely of zero bytes and must be skipped by readers. + +Aside: if exactly seven bytes are left in the current block, and a new +non-zero length record is added, the writer must emit a FIRST record +(which contains zero bytes of user data) to fill up the trailing seven +bytes of the block and then emit all of the user data in subsequent +blocks. More types may be added in the future. Some Readers may skip record types they do not understand, others may report that some data was diff --git a/include/env.h b/include/env.h index a728f29..14ddf29 100644 --- a/include/env.h +++ b/include/env.h @@ -168,9 +168,6 @@ class RandomAccessFile { RandomAccessFile() { } virtual ~RandomAccessFile(); - // Return the length of this file in bytes. - virtual uint64_t Size() const = 0; - // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" // to the data that was read (including if fewer than "n" bytes were diff --git a/include/table.h b/include/table.h index 96b2196..c2a4cf9 100644 --- a/include/table.h +++ b/include/table.h @@ -20,8 +20,9 @@ struct ReadOptions; // immutable and persistent. class Table { public: - // Attempt to open the table that is stored in "file", and read the - // metadata entries necessary to allow retrieving data from the table. + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. // // If successful, returns ok and sets "*table" to the newly opened // table. The client should delete "*table" when no longer needed. @@ -33,6 +34,7 @@ class Table { // *file must remain live while this Table is in use. static Status Open(const Options& options, RandomAccessFile* file, + uint64_t file_size, Table** table); ~Table(); diff --git a/table/table.cc b/table/table.cc index dffc217..bd0fbb5 100644 --- a/table/table.cc +++ b/table/table.cc @@ -29,9 +29,9 @@ struct Table::Rep { Status Table::Open(const Options& options, RandomAccessFile* file, + uint64_t size, Table** table) { *table = NULL; - const uint64_t size = file->Size(); if (size < Footer::kEncodedLength) { return Status::InvalidArgument("file is too short to be an sstable"); } diff --git a/table/table_test.cc b/table/table_test.cc index eabb257..d67c58b 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -110,7 +110,7 @@ class StringSource: public RandomAccessFile { virtual ~StringSource() { } - virtual uint64_t Size() const { return contents_.size(); } + uint64_t Size() const { return contents_.size(); } virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { @@ -246,7 +246,7 @@ class TableConstructor: public Constructor { source_ = new StringSource(sink.contents()); Options table_options; table_options.comparator = options.comparator; - return Table::Open(table_options, source_, &table_); + return Table::Open(table_options, source_, sink.contents().size(), &table_); } virtual size_t NumBytes() const { return source_->Size(); } diff --git a/util/env_chromium.cc b/util/env_chromium.cc index d79406f..834ec2d 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -144,17 +144,13 @@ class ChromiumSequentialFile: public SequentialFile { class ChromiumRandomAccessFile: public RandomAccessFile { private: std::string filename_; - uint64_t size_; ::base::PlatformFile file_; public: - ChromiumRandomAccessFile(const std::string& fname, uint64_t size, - ::base::PlatformFile file) - : filename_(fname), size_(size), file_(file) { } + ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) + : filename_(fname), file_(file) { } virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } - virtual uint64_t Size() const { return size_; } - virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status s; @@ -256,13 +252,7 @@ class ChromiumEnv : public Env { *result = NULL; return Status::IOError(fname, PlatformFileErrorString(error_code)); } - ::base::PlatformFileInfo info; - if (!::base::GetPlatformFileInfo(file, &info)) { - *result = NULL; - ::base::ClosePlatformFile(file); - return Status::IOError(fname, PlatformFileErrorString(error_code)); - } - *result = new ChromiumRandomAccessFile(fname, info.size, file); + *result = new ChromiumRandomAccessFile(fname, file); return Status::OK(); } diff --git a/util/env_posix.cc b/util/env_posix.cc index f5174d3..5c58449 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -57,16 +57,13 @@ class PosixSequentialFile: public SequentialFile { class PosixRandomAccessFile: public RandomAccessFile { private: std::string filename_; - uint64_t size_; int fd_; public: - PosixRandomAccessFile(const std::string& fname, uint64_t size, int fd) - : filename_(fname), size_(size), fd_(fd) { } + PosixRandomAccessFile(const std::string& fname, int fd) + : filename_(fname), fd_(fd) { } virtual ~PosixRandomAccessFile() { close(fd_); } - virtual uint64_t Size() const { return size_; } - virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status s; @@ -286,14 +283,7 @@ class PosixEnv : public Env { *result = NULL; return Status::IOError(fname, strerror(errno)); } - struct stat sbuf; - if (fstat(fd, &sbuf) != 0) { - *result = NULL; - Status s = Status::IOError(fname, strerror(errno)); - close(fd); - return s; - } - *result = new PosixRandomAccessFile(fname, sbuf.st_size, fd); + *result = new PosixRandomAccessFile(fname, fd); return Status::OK(); }