作者: 谢瑞阳 10225101483 徐翔宇 10225101535
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

412 lines
11 KiB

  1. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  4. #include "db/db_iter.h"
  5. #include "db/filename.h"
  6. #include "db/dbformat.h"
  7. #include "include/env.h"
  8. #include "include/iterator.h"
  9. #include "port/port.h"
  10. #include "util/logging.h"
  11. #include "util/mutexlock.h"
  12. namespace leveldb {
  13. #if 0
  14. static void DumpInternalIter(Iterator* iter) {
  15. for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  16. ParsedInternalKey k;
  17. if (!ParseInternalKey(iter->key(), &k)) {
  18. fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
  19. } else {
  20. fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
  21. }
  22. }
  23. }
  24. #endif
  25. namespace {
  26. // Memtables and sstables that make the DB representation contain
  27. // (userkey,seq,type) => uservalue entries. DBIter
  28. // combines multiple entries for the same userkey found in the DB
  29. // representation into a single entry while accounting for sequence
  30. // numbers, deletion markers, overwrites, etc.
  31. class DBIter: public Iterator {
  32. public:
  33. DBIter(const std::string* dbname, Env* env,
  34. const Comparator* cmp, Iterator* iter, SequenceNumber s)
  35. : dbname_(dbname),
  36. env_(env),
  37. user_comparator_(cmp),
  38. iter_(iter),
  39. sequence_(s),
  40. large_(NULL),
  41. valid_(false) {
  42. }
  43. virtual ~DBIter() {
  44. delete iter_;
  45. delete large_;
  46. }
  47. virtual bool Valid() const { return valid_; }
  48. virtual Slice key() const {
  49. assert(valid_);
  50. return key_;
  51. }
  52. virtual Slice value() const {
  53. assert(valid_);
  54. if (large_ == NULL) {
  55. return value_;
  56. } else {
  57. MutexLock l(&large_->mutex);
  58. if (!large_->produced) {
  59. ReadIndirectValue();
  60. }
  61. return large_->value;
  62. }
  63. }
  64. virtual void Next() {
  65. assert(valid_);
  66. // iter_ is already positioned past DBIter::key()
  67. FindNextUserEntry();
  68. }
  69. virtual void Prev() {
  70. assert(valid_);
  71. bool ignored;
  72. ScanUntilBeforeCurrentKey(&ignored);
  73. FindPrevUserEntry();
  74. }
  75. virtual void Seek(const Slice& target) {
  76. ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
  77. std::string tmp;
  78. AppendInternalKey(&tmp, ikey);
  79. iter_->Seek(tmp);
  80. FindNextUserEntry();
  81. }
  82. virtual void SeekToFirst() {
  83. iter_->SeekToFirst();
  84. FindNextUserEntry();
  85. }
  86. virtual void SeekToLast();
  87. virtual Status status() const {
  88. if (status_.ok()) {
  89. if (large_ != NULL && !large_->status.ok()) return large_->status;
  90. return iter_->status();
  91. } else {
  92. return status_;
  93. }
  94. }
  95. private:
  96. void FindNextUserEntry();
  97. void FindPrevUserEntry();
  98. void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
  99. void SaveValue(const Slice& v) {
  100. if (value_.capacity() > v.size() + 1048576) {
  101. std::string empty;
  102. swap(empty, value_);
  103. }
  104. value_.assign(v.data(), v.size());
  105. }
  106. bool ParseKey(ParsedInternalKey* key);
  107. void SkipPast(const Slice& k);
  108. void ScanUntilBeforeCurrentKey(bool* found_live);
  109. void ReadIndirectValue() const;
  110. struct Large {
  111. port::Mutex mutex;
  112. std::string value;
  113. bool produced;
  114. Status status;
  115. };
  116. const std::string* const dbname_;
  117. Env* const env_;
  118. const Comparator* const user_comparator_;
  119. // iter_ is positioned just past current entry for DBIter if valid_
  120. Iterator* const iter_;
  121. SequenceNumber const sequence_;
  122. Status status_;
  123. std::string key_; // Always a user key
  124. std::string value_;
  125. Large* large_; // Non-NULL if value is an indirect reference
  126. bool valid_;
  127. // No copying allowed
  128. DBIter(const DBIter&);
  129. void operator=(const DBIter&);
  130. };
  131. inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
  132. if (!ParseInternalKey(iter_->key(), ikey)) {
  133. status_ = Status::Corruption("corrupted internal key in DBIter");
  134. return false;
  135. } else {
  136. return true;
  137. }
  138. }
  139. void DBIter::FindNextUserEntry() {
  140. if (large_ != NULL) {
  141. if (status_.ok() && !large_->status.ok()) {
  142. status_ = large_->status;
  143. }
  144. delete large_;
  145. large_ = NULL;
  146. }
  147. while (iter_->Valid()) {
  148. ParsedInternalKey ikey;
  149. if (!ParseKey(&ikey)) {
  150. // Skip past corrupted entry
  151. iter_->Next();
  152. continue;
  153. }
  154. if (ikey.sequence > sequence_) {
  155. // Ignore entries newer than the snapshot
  156. iter_->Next();
  157. continue;
  158. }
  159. switch (ikey.type) {
  160. case kTypeDeletion:
  161. SaveKey(ikey.user_key); // Make local copy for use by SkipPast()
  162. iter_->Next();
  163. SkipPast(key_);
  164. // Do not return deleted entries. Instead keep looping.
  165. break;
  166. case kTypeValue:
  167. SaveKey(ikey.user_key);
  168. SaveValue(iter_->value());
  169. iter_->Next();
  170. SkipPast(key_);
  171. // Yield the value we just found.
  172. valid_ = true;
  173. return;
  174. case kTypeLargeValueRef:
  175. SaveKey(ikey.user_key);
  176. // Save the large value ref as value_, and read it lazily on a call
  177. // to value()
  178. SaveValue(iter_->value());
  179. large_ = new Large;
  180. large_->produced = false;
  181. iter_->Next();
  182. SkipPast(key_);
  183. // Yield the value we just found.
  184. valid_ = true;
  185. return;
  186. }
  187. }
  188. valid_ = false;
  189. key_.clear();
  190. value_.clear();
  191. assert(large_ == NULL);
  192. }
  193. void DBIter::SkipPast(const Slice& k) {
  194. while (iter_->Valid()) {
  195. ParsedInternalKey ikey;
  196. // Note that if we cannot parse an internal key, we keep looping
  197. // so that if we have a run like the following:
  198. // <x,100,v> => value100
  199. // <corrupted entry for user key x>
  200. // <x,50,v> => value50
  201. // we will skip over the corrupted entry as well as value50.
  202. if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) {
  203. break;
  204. }
  205. iter_->Next();
  206. }
  207. }
  208. void DBIter::SeekToLast() {
  209. // Position iter_ at the last uncorrupted user key and then
  210. // let FindPrevUserEntry() do the heavy lifting to find
  211. // a user key that is live.
  212. iter_->SeekToLast();
  213. ParsedInternalKey current;
  214. while (iter_->Valid() && !ParseKey(&current)) {
  215. iter_->Prev();
  216. }
  217. if (iter_->Valid()) {
  218. SaveKey(current.user_key);
  219. }
  220. FindPrevUserEntry();
  221. }
  222. // Let X be the user key at which iter_ is currently positioned.
  223. // Adjust DBIter to point at the last entry with a key <= X that
  224. // has a live value.
  225. void DBIter::FindPrevUserEntry() {
  226. // Consider the following example:
  227. //
  228. // A@540
  229. // A@400
  230. //
  231. // B@300
  232. // B@200
  233. // B@100 <- iter_
  234. //
  235. // C@301
  236. // C@201
  237. //
  238. // The comments marked "(first iteration)" below relate what happens
  239. // for the preceding example in the first iteration of the while loop
  240. // below. There may be more than one iteration either if there are
  241. // no live values for B, or if there is a corruption.
  242. while (iter_->Valid()) {
  243. std::string saved = key_;
  244. bool found_live;
  245. ScanUntilBeforeCurrentKey(&found_live);
  246. // (first iteration) iter_ at A@400
  247. if (found_live) {
  248. // Step forward into range of entries with user key >= saved
  249. if (!iter_->Valid()) {
  250. iter_->SeekToFirst();
  251. } else {
  252. iter_->Next();
  253. }
  254. // (first iteration) iter_ at B@300
  255. FindNextUserEntry(); // Sets key_ to the key of the next value it found
  256. if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
  257. // (first iteration) iter_ at C@301
  258. return;
  259. }
  260. // FindNextUserEntry() could not find any entries under the
  261. // user key "saved". This is probably a corruption since
  262. // ScanUntilBefore(saved) found a live value. So we skip
  263. // backwards to an earlier key and ignore the corrupted
  264. // entries for "saved".
  265. //
  266. // (first iteration) iter_ at C@301 and saved == "B"
  267. key_ = saved;
  268. bool ignored;
  269. ScanUntilBeforeCurrentKey(&ignored);
  270. // (first iteration) iter_ at A@400
  271. }
  272. }
  273. valid_ = false;
  274. key_.clear();
  275. value_.clear();
  276. }
  277. void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
  278. *found_live = false;
  279. if (!iter_->Valid()) {
  280. iter_->SeekToLast();
  281. }
  282. while (iter_->Valid()) {
  283. ParsedInternalKey current;
  284. if (!ParseKey(&current)) {
  285. iter_->Prev();
  286. continue;
  287. }
  288. if (current.sequence > sequence_) {
  289. // Ignore entries that are serialized after this read
  290. iter_->Prev();
  291. continue;
  292. }
  293. const int cmp = user_comparator_->Compare(current.user_key, key_);
  294. if (cmp < 0) {
  295. SaveKey(current.user_key);
  296. return;
  297. } else if (cmp == 0) {
  298. switch (current.type) {
  299. case kTypeDeletion:
  300. *found_live = false;
  301. break;
  302. case kTypeValue:
  303. case kTypeLargeValueRef:
  304. *found_live = true;
  305. break;
  306. }
  307. } else { // cmp > 0
  308. *found_live = false;
  309. }
  310. iter_->Prev();
  311. }
  312. }
  313. void DBIter::ReadIndirectValue() const {
  314. assert(!large_->produced);
  315. large_->produced = true;
  316. LargeValueRef large_ref;
  317. if (value_.size() != LargeValueRef::ByteSize()) {
  318. large_->status = Status::Corruption("malformed large value reference");
  319. return;
  320. }
  321. memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
  322. std::string fname = LargeValueFileName(*dbname_, large_ref);
  323. RandomAccessFile* file;
  324. Status s = env_->NewRandomAccessFile(fname, &file);
  325. if (s.ok()) {
  326. uint64_t file_size = file->Size();
  327. uint64_t value_size = large_ref.ValueSize();
  328. large_->value.resize(value_size);
  329. Slice result;
  330. s = file->Read(0, file_size, &result,
  331. const_cast<char*>(large_->value.data()));
  332. if (s.ok()) {
  333. if (result.size() == file_size) {
  334. switch (large_ref.compression_type()) {
  335. case kNoCompression: {
  336. if (result.data() != large_->value.data()) {
  337. large_->value.assign(result.data(), result.size());
  338. }
  339. break;
  340. }
  341. case kLightweightCompression: {
  342. std::string uncompressed;
  343. if (port::Lightweight_Uncompress(result.data(), result.size(),
  344. &uncompressed) &&
  345. uncompressed.size() == large_ref.ValueSize()) {
  346. swap(uncompressed, large_->value);
  347. } else {
  348. s = Status::Corruption(
  349. "Unable to read entire compressed large value file");
  350. }
  351. }
  352. }
  353. } else {
  354. s = Status::Corruption("Unable to read entire large value file");
  355. }
  356. }
  357. delete file; // Ignore errors on closing
  358. }
  359. if (!s.ok()) {
  360. large_->value.clear();
  361. large_->status = s;
  362. }
  363. }
  364. } // anonymous namespace
  365. Iterator* NewDBIterator(
  366. const std::string* dbname,
  367. Env* env,
  368. const Comparator* user_key_comparator,
  369. Iterator* internal_iter,
  370. const SequenceNumber& sequence) {
  371. return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
  372. }
  373. }