From e2da744e1241fec301351ca34730877e6cff722a Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
 <jorlow@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>
Date: Mon, 28 Mar 2011 20:43:44 +0000
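Subject: [PATCH] Upstream changes.

Pass the table file size explicitly to Table::Open() and
TableCache::NewIterator(), and remove RandomAccessFile::Size() from the
Env interface and its POSIX/Chromium implementations.
Let the log writer emit a record header when exactly kHeaderSize bytes
remain in a block, and update the log reader and doc/log_format.txt to
match.
Qualify string as std::string in db_bench.cc.
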
git-svn-id: https://leveldb.googlecode.com/svn/trunk@16 62dab493-f737-651d-591e-8d6aee1b9529
---
 db/builder.cc        |  4 +++-
 db/db_bench.cc       |  4 ++--
 db/db_impl.cc        |  4 +++-
 db/db_iter.cc        |  5 ++++-
 db/log_reader.cc     |  6 ++----
 db/log_writer.cc     | 11 ++++++-----
 db/repair.cc         |  2 +-
 db/table_cache.cc    |  3 ++-
 db/table_cache.h     | 15 ++++++++-------
 db/version_set.cc    | 22 ++++++++++++++--------
 doc/log_format.txt   | 15 +++++++++------
 include/env.h        |  3 ---
 include/table.h      |  6 ++++--
 table/table.cc       |  2 +-
 table/table_test.cc  |  4 ++--
 util/env_chromium.cc | 16 +++-------------
 util/env_posix.cc    | 16 +++-------------
 17 files changed, 67 insertions(+), 71 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index f3d0fe2..d5585c3 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -74,7 +74,9 @@ Status BuildTable(const std::string& dbname,
 
     if (s.ok()) {
       // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              meta->number,
+                                              meta->file_size);
       s = it->status();
       delete it;
     }
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 7026ca1..c7a662d 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -354,7 +354,7 @@ class Benchmark {
  private:
   void Crc32c(int size, const char* label) {
     // Checksum about 500MB of data total
-    string data(size, 'x');
+    std::string data(size, 'x');
     int64_t bytes = 0;
     uint32_t crc = 0;
     while (bytes < 500 * 1048576) {
@@ -371,7 +371,7 @@ class Benchmark {
 
   void SHA1(int size, const char* label) {
     // SHA1 about 100MB of data total
-    string data(size, 'x');
+    std::string data(size, 'x');
     int64_t bytes = 0;
     char sha1[20];
     while (bytes < 100 * 1048576) {
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 12c02b3..f14167a 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -642,7 +642,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
 
   if (s.ok() && current_entries > 0) {
     // Verify that the table is usable
-    Iterator* iter = table_cache_->NewIterator(ReadOptions(),output_number);
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                               output_number,
+                                               current_bytes);
     s = iter->status();
     delete iter;
     if (s.ok()) {
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 6726b51..beb4d74 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -340,8 +340,11 @@ void DBIter::ReadIndirectValue(Slice ref) const {
   std::string fname = LargeValueFileName(*dbname_, large_ref);
   RandomAccessFile* file;
   Status s = env_->NewRandomAccessFile(fname, &file);
+  uint64_t file_size = 0;
+  if (s.ok()) {
+    s = env_->GetFileSize(fname, &file_size);
+  }
   if (s.ok()) {
-    uint64_t file_size = file->Size();
     uint64_t value_size = large_ref.ValueSize();
     large_->value.resize(value_size);
     Slice result;
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 39a6d2b..407700d 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -105,7 +105,7 @@ void Reader::ReportDrop(size_t bytes, const char* reason) {
 
 unsigned int Reader::ReadPhysicalRecord(Slice* result) {
   while (true) {
-    if (buffer_.size() <= kHeaderSize) {
+    if (buffer_.size() < kHeaderSize) {
       if (!eof_) {
         // Last read was a full read, so this is a trailer to skip
         buffer_.clear();
@@ -124,12 +124,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
       } else if (buffer_.size() == 0) {
         // End of file
         return kEof;
-      } else if (buffer_.size() < kHeaderSize) {
+      } else {
         ReportDrop(buffer_.size(), "truncated record at end of file");
         buffer_.clear();
         return kEof;
-      } else {
-        // We have a trailing zero-length record.  Fall through and check it.
       }
     }
 
diff --git a/db/log_writer.cc b/db/log_writer.cc
index 465eca2..fc33e6e 100644
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -35,18 +35,19 @@ Status Writer::AddRecord(const Slice& slice) {
   do {
     const int leftover = kBlockSize - block_offset_;
     assert(leftover >= 0);
-    if (leftover <= kHeaderSize) {
+    if (leftover < kHeaderSize) {
       // Switch to a new block
       if (leftover > 0) {
-        // Fill the trailer
-        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
       }
       block_offset_ = 0;
     }
 
-    // Invariant: we never leave <= kHeaderSize bytes in a block.
+    // Invariant: we never leave < kHeaderSize bytes in a block.
     const int avail = kBlockSize - block_offset_ - kHeaderSize;
-    assert(avail > 0);
+    assert(avail >= 0);
 
     const size_t fragment_length = (left < avail) ? left : avail;
 
diff --git a/db/repair.cc b/db/repair.cc
index 0727914..745b31a 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -261,7 +261,7 @@ class Repairer {
     Status status = env_->GetFileSize(fname, &t->meta.file_size);
     if (status.ok()) {
       Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), t->meta.number);
+          ReadOptions(), t->meta.number, t->meta.file_size);
       bool empty = true;
       ParsedInternalKey parsed;
       t->max_sequence = 0;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 604298d..6f750d6 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -44,6 +44,7 @@ TableCache::~TableCache() {
 
 Iterator* TableCache::NewIterator(const ReadOptions& options,
                                   uint64_t file_number,
+                                  uint64_t file_size,
                                   Table** tableptr) {
   if (tableptr != NULL) {
     *tableptr = NULL;
@@ -59,7 +60,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
     Table* table = NULL;
     Status s = env_->NewRandomAccessFile(fname, &file);
     if (s.ok()) {
-      s = Table::Open(*options_, file, &table);
+      s = Table::Open(*options_, file, file_size, &table);
     }
 
     if (!s.ok()) {
diff --git a/db/table_cache.h b/db/table_cache.h
index 6c357df..5564dfc 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -23,15 +23,16 @@ class TableCache {
   TableCache(const std::string& dbname, const Options* options, int entries);
   ~TableCache();
 
-  // Get an iterator for the specified file number and return it.  If
-  // "tableptr" is non-NULL, also sets "*tableptr" to point to the
-  // Table object underlying the returned iterator, or NULL if no
-  // Table object underlies the returned iterator.  The returned
-  // "*tableptr" object is owned by the cache and should not be
-  // deleted, and is valid for as long as the returned iterator is
-  // live.
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-NULL, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or NULL if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
   Iterator* NewIterator(const ReadOptions& options,
                         uint64_t file_number,
+                        uint64_t file_size,
                         Table** tableptr = NULL);
 
   // Evict any entry for the specified file number
diff --git a/db/version_set.cc b/db/version_set.cc
index caf0b2d..b826e5b 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -75,8 +75,8 @@ Version::~Version() {
 // An internal iterator.  For a given version/level pair, yields
 // information about the files in the level.  For a given entry, key()
 // is the largest key that occurs in the file, and value() is an
-// 8-byte value containing the file number of the file, encoding using
-// EncodeFixed64.
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
 class Version::LevelFileNumIterator : public Iterator {
  public:
   LevelFileNumIterator(const Version* version,
@@ -129,6 +129,7 @@ class Version::LevelFileNumIterator : public Iterator {
   Slice value() const {
     assert(Valid());
     EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
     return Slice(value_buf_, sizeof(value_buf_));
   }
   virtual Status status() const { return Status::OK(); }
@@ -137,18 +138,21 @@ class Version::LevelFileNumIterator : public Iterator {
   const std::vector<FileMetaData*>* const flist_;
   int index_;
 
-  mutable char value_buf_[8];  // Used for encoding the file number for value()
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
 };
 
 static Iterator* GetFileIterator(void* arg,
                                  const ReadOptions& options,
                                  const Slice& file_value) {
   TableCache* cache = reinterpret_cast<TableCache*>(arg);
-  if (file_value.size() != 8) {
+  if (file_value.size() != 16) {
     return NewErrorIterator(
         Status::Corruption("FileReader invoked with unexpected value"));
   } else {
-    return cache->NewIterator(options, DecodeFixed64(file_value.data()));
+    return cache->NewIterator(options,
+                              DecodeFixed64(file_value.data()),
+                              DecodeFixed64(file_value.data() + 8));
   }
 }
 
@@ -164,7 +168,8 @@ void Version::AddIterators(const ReadOptions& options,
   // Merge all level zero files together since they may overlap
   for (int i = 0; i < files_[0].size(); i++) {
     iters->push_back(
-        vset_->table_cache_->NewIterator(options, files_[0][i]->number));
+        vset_->table_cache_->NewIterator(
+            options, files_[0][i]->number, files_[0][i]->file_size));
   }
 
   // For levels > 0, we can use a concatenating iterator that sequentially
@@ -650,7 +655,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
         // approximate offset of "ikey" within the table.
         Table* tableptr;
         Iterator* iter = table_cache_->NewIterator(
-            ReadOptions(), files[i]->number, &tableptr);
+            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
         if (tableptr != NULL) {
           result += tableptr->ApproximateOffsetOf(ikey.Encode());
         }
@@ -855,7 +860,8 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
       if (c->level() + which == 0) {
         const std::vector<FileMetaData*>& files = c->inputs_[which];
         for (int i = 0; i < files.size(); i++) {
-          list[num++] = table_cache_->NewIterator(options, files[i]->number);
+          list[num++] = table_cache_->NewIterator(
+              options, files[i]->number, files[i]->file_size);
         }
       } else {
         // Create concatenating iterator for the files from this level
diff --git a/doc/log_format.txt b/doc/log_format.txt
index 9a801d4..3a0414b 100644
--- a/doc/log_format.txt
+++ b/doc/log_format.txt
@@ -9,12 +9,15 @@ Each block consists of a sequence of records:
 	type: uint8		// One of FULL, FIRST, MIDDLE, LAST
 	data: uint8[length]
 
-A record never starts within the last seven bytes of a block.  Any
-leftover bytes here form the trailer, which must consist entirely of
-zero bytes and must be skipped by readers.  In particular, even if
-there are exactly seven bytes left in the block, and a zero-length
-user record is added (which will fit in these seven bytes), the writer
-must skip these trailer bytes and add the record to the next block.
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero-length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
 
 More types may be added in the future.  Some Readers may skip record
 types they do not understand, others may report that some data was
diff --git a/include/env.h b/include/env.h
index a728f29..14ddf29 100644
--- a/include/env.h
+++ b/include/env.h
@@ -168,9 +168,6 @@ class RandomAccessFile {
   RandomAccessFile() { }
   virtual ~RandomAccessFile();
 
-  // Return the length of this file in bytes.
-  virtual uint64_t Size() const = 0;
-
   // Read up to "n" bytes from the file starting at "offset".
   // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
   // to the data that was read (including if fewer than "n" bytes were
diff --git a/include/table.h b/include/table.h
index 96b2196..c2a4cf9 100644
--- a/include/table.h
+++ b/include/table.h
@@ -20,8 +20,9 @@ struct ReadOptions;
 // immutable and persistent.
 class Table {
  public:
-  // Attempt to open the table that is stored in "file", and read the
-  // metadata entries necessary to allow retrieving data from the table.
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
   //
   // If successful, returns ok and sets "*table" to the newly opened
   // table.  The client should delete "*table" when no longer needed.
@@ -33,6 +34,7 @@ class Table {
   // *file must remain live while this Table is in use.
   static Status Open(const Options& options,
                      RandomAccessFile* file,
+                     uint64_t file_size,
                      Table** table);
 
   ~Table();
diff --git a/table/table.cc b/table/table.cc
index dffc217..bd0fbb5 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -29,9 +29,9 @@ struct Table::Rep {
 
 Status Table::Open(const Options& options,
                    RandomAccessFile* file,
+                   uint64_t size,
                    Table** table) {
   *table = NULL;
-  const uint64_t size = file->Size();
   if (size < Footer::kEncodedLength) {
     return Status::InvalidArgument("file is too short to be an sstable");
   }
diff --git a/table/table_test.cc b/table/table_test.cc
index eabb257..d67c58b 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -110,7 +110,7 @@ class StringSource: public RandomAccessFile {
 
   virtual ~StringSource() { }
 
-  virtual uint64_t Size() const { return contents_.size(); }
+  uint64_t Size() const { return contents_.size(); }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                        char* scratch) const {
@@ -246,7 +246,7 @@ class TableConstructor: public Constructor {
     source_ = new StringSource(sink.contents());
     Options table_options;
     table_options.comparator = options.comparator;
-    return Table::Open(table_options, source_, &table_);
+    return Table::Open(table_options, source_, sink.contents().size(), &table_);
   }
   virtual size_t NumBytes() const { return source_->Size(); }
 
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index d79406f..834ec2d 100644
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@@ -144,17 +144,13 @@ class ChromiumSequentialFile: public SequentialFile {
 class ChromiumRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
-  uint64_t size_;
   ::base::PlatformFile file_;
 
  public:
-  ChromiumRandomAccessFile(const std::string& fname, uint64_t size,
-                           ::base::PlatformFile file)
-      : filename_(fname), size_(size), file_(file) { }
+  ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file)
+      : filename_(fname), file_(file) { }
   virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); }
 
-  virtual uint64_t Size() const { return size_; }
-
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
     Status s;
@@ -256,13 +252,7 @@ class ChromiumEnv : public Env {
       *result = NULL;
       return Status::IOError(fname, PlatformFileErrorString(error_code));
     }
-    ::base::PlatformFileInfo info;
-    if (!::base::GetPlatformFileInfo(file, &info)) {
-      *result = NULL;
-      ::base::ClosePlatformFile(file);
-      return Status::IOError(fname, PlatformFileErrorString(error_code));
-    }
-    *result = new ChromiumRandomAccessFile(fname, info.size, file);
+    *result = new ChromiumRandomAccessFile(fname, file);
     return Status::OK();
   }
 
diff --git a/util/env_posix.cc b/util/env_posix.cc
index f5174d3..5c58449 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -57,16 +57,13 @@ class PosixSequentialFile: public SequentialFile {
 class PosixRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
-  uint64_t size_;
   int fd_;
 
  public:
-  PosixRandomAccessFile(const std::string& fname, uint64_t size, int fd)
-      : filename_(fname), size_(size), fd_(fd) { }
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd) { }
   virtual ~PosixRandomAccessFile() { close(fd_); }
 
-  virtual uint64_t Size() const { return size_; }
-
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
     Status s;
@@ -286,14 +283,7 @@ class PosixEnv : public Env {
       *result = NULL;
       return Status::IOError(fname, strerror(errno));
     }
-    struct stat sbuf;
-    if (fstat(fd, &sbuf) != 0) {
-      *result = NULL;
-      Status s = Status::IOError(fname, strerror(errno));
-      close(fd);
-      return s;
-    }
-    *result = new PosixRandomAccessFile(fname, sbuf.st_size, fd);
+    *result = new PosixRandomAccessFile(fname, fd);
     return Status::OK();
   }