4 Commits

Author SHA1 Message Date
  ArcueidType f71888a2fe Add Report.pdf 8 months ago
  wesley 05beef2928 update report 8 months ago
  wesley 1790adbd74 fix tests and report 8 months ago
  wesley c185695359 test 8 months ago
8 changed files with 64 additions and 286 deletions
Split View
  1. +0
    -5
      CMakeLists.txt
  2. +14
    -4
      README.md
  3. BIN
      Report.pdf
  4. BIN
      assets/latency.png
  5. BIN
      assets/randvalu_2.png
  6. BIN
      assets/randvalue.png
  7. +50
    -1
      test/test_bench.cc
  8. +0
    -276
      test/test_bench_gc.cc

+ 0
- 5
CMakeLists.txt View File

@ -551,8 +551,3 @@ add_executable(test_bench
"${PROJECT_SOURCE_DIR}/test/test_bench.cc"
)
target_link_libraries(test_bench PRIVATE leveldb gtest)
add_executable(test_bench_gc
"${PROJECT_SOURCE_DIR}/test/test_bench_gc.cc"
)
target_link_libraries(test_bench_gc PRIVATE leveldb gtest)

+ 14
- 4
README.md View File

@ -322,7 +322,7 @@ class VTableManager {
由于 `VTable Manager` 是在 `leveldb` 运行过程中在内存中维护的数据结构,因此一旦程序停止运行,即数据库关闭, `VTable Manager` 存储的元数据就会丢失,而作为一个数据库,其可以在关闭后重启,重启时可以通过遍历所有 `VTable` 并且用其中的key恢复上次关闭时的元数据信息,但是这样恢复的效率在数据量较大时会产生大量的读磁盘,导致启动速度极慢。为了解决这个问题,这里采用了类似 `Log` 的机制,`VTable Manager` 提供了 `SaveVTableMeta` 方法用于将当前元数据状态写入磁盘, `LoadVTableMeta` 方法用于从磁盘中读取并恢复元数据状态
在时机上,遵从 `leveldb` 的机制,即在 `version set` `LogAndApply` 时调用 `SaveVTableMeta` 保存当前数据库 `VTable Meta` 状态,在数据库启动时的恢复过程中调用 `LoadVTableMeta` 读取关机时数据库中的 `VTable Meta` 状态
在时机上,遵从 `leveldb` 的机制,即在 `version set` `LogAndApply` 时调用 `SaveVTableMeta` 保存当前数据库 `VTable Meta` 状态,在数据库启动时的恢复过程(`Recover`方法)中调用 `LoadVTableMeta` 读取关机时数据库中的 `VTable Meta` 状态
- **VTable Builder**
@ -636,7 +636,11 @@ GC是KV分离中的一个非常重要的问题,虽然KV分离的功能显著
![insert](./assets/Insert_static_vsize.png)
类似地,使用`test_bench.cc`可以测得`Put()`的延迟,以及其他操作的吞吐量和延迟。这里不一一作图展示。
从图中可见,当`value_size`较小时,由于不发生KV分离,性能差距并不明显;而当`value_size`大到足够发生KV分离时,实现了KV分离的数据的写入数据性能明显优于原本的LevelDB。而由于后台GC会阻塞读写进程,对整体性能产生影响,触发了GC的数据库性能略差于不触发GC的数据库。
在本测试中测出的各操作延迟如下表(单位:微秒/次操作,micros/op),根据平均延迟,红色字体最优,黄色格子次之:
![lats](./assets/latency.png)
##### Part. C 面对不固定的value_size时数据库的性能
@ -650,9 +654,15 @@ GC是KV分离中的一个非常重要的问题,虽然KV分离的功能显著
在这次性能测试中,我们设定的`value_size`的随机范围是`500~2048`,预设的KV分离阈值是1000,正好可以模拟实际应用情况。同时,我们还是设定了对于全部数据都使用KV分离和完全不使用KV分离的两种数据库作为对照组。
![randvsizet](./assets/randvalue.png)
由于分离阈值对`value_size`较小的数据的读取很友好,因此此次测试还使用了`GetDataSmallSize`函数,用于对比读取较小数据时各版本的性能。
图中红色柱是全部数据KV分离,蓝色柱是部分数据KV分离(测试目标),黄色柱是完全不进行KV分离。
![randvalue](./assets/randvalu_2.png)
从图中可以看出,写入速度最快的是部分进行KV分离的数据库,因为对于不分离的小数据,没有了将其`value`编写为`VTable`并写入磁盘的操作,写入速度比完全KV分离的更快。
Latency等数据不便于在图中展示,这里省略。
而在读取全部数据时,部分KV分离的数据库性能介于不KV分离和完全KV分离之间;但在读取小数据时,我们实现的数据库由于没有分离小数据,其读取性能远高于完全KV分离的数据库
##### Part. D 不同数据量的写放大开销

BIN
Report.pdf View File


BIN
assets/latency.png View File

Before After
Width: 1643  |  Height: 654  |  Size: 414 KiB

BIN
assets/randvalu_2.png View File

Before After
Width: 1348  |  Height: 817  |  Size: 50 KiB

BIN
assets/randvalue.png View File

Before After
Width: 1348  |  Height: 818  |  Size: 63 KiB

+ 50
- 1
test/test_bench.cc View File

@ -11,7 +11,7 @@ using namespace leveldb;
// Number of key/values to operate in database
constexpr int num_ = 100000;
// Size of each value
constexpr int value_size_ = 500;
constexpr int value_size_ = 1000;
// Number of read operations
constexpr int reads_ = 100000;
// Number of findkeysbyfield operations
@ -19,6 +19,7 @@ constexpr int search_ = 10;
int64_t bytes_ = 0;
std::set<int> key_set;
Status OpenDB(std::string dbName, DB **db) {
Options options;
@ -56,6 +57,10 @@ void InsertData(DB *db, std::vector &lats) {
latency = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - last_time).count();
last_time = end_time;
lats.emplace_back(latency);
// if (value_size < 1000) {
// key_set.insert(key_);
// }
}
bytes_ += bytes;
}
@ -83,6 +88,31 @@ void GetData(DB *db, std::vector &lats) {
bytes_ += bytes;
}
// DB::Get() benchmark restricted to the keys recorded in the global
// `key_set` (keys whose values were small enough to skip KV separation).
// Appends each operation's latency in nanoseconds to `lats` and stores
// the total payload size read into the global `bytes_`.
// Note: the previous version seeded the PRNG with srand(0) but never
// called rand() here, and carried a block of commented-out sampling
// code; both were dead and have been removed.
void GetDataSmallSize(DB *db, std::vector<int64_t> &lats) {
  ReadOptions readOptions;
  bytes_ = 0;
  int64_t bytes = 0;
  auto last_time = std::chrono::steady_clock::now();
  for (auto key_ : key_set) {
    std::string key = std::to_string(key_);
    Fields ret;
    db->Get(readOptions, key, &ret);
    bytes += ret.size();
    auto end_time = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - last_time).count());
    last_time = end_time;
  }
  bytes_ += bytes;
}
// DB::Iterator()->Seek() PointQuery
void PointQuery(DB *db, std::vector<int64_t> &lats) {
ReadOptions options;
@ -183,6 +213,7 @@ TEST(TestBench, Throughput) {
std::vector<int64_t> lats;
// Put()
auto start_time = std::chrono::steady_clock::now();
InsertData(db, lats);
@ -190,6 +221,8 @@ TEST(TestBench, Throughput) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
// std::cout << "Throughput of Put(): " << std::fixed << num_ * 1e6 / duration << " ops/s" << std::endl;
std::cout << "Throughput of Put(): " << std::fixed << std::setprecision(2) << std::endl << (bytes_ / 1048576.0) / (duration * 1e-6) << " MB/s" << std::endl << std::endl;
std::cout << "set size:" << key_set.size() << std::endl;
// Get()
start_time = std::chrono::steady_clock::now();
GetData(db, lats);
@ -197,6 +230,14 @@ TEST(TestBench, Throughput) {
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
// std::cout << "Throughput of Get(): " << std::fixed << reads_ * 1e6 / duration << " ops/s" << std::endl;
std::cout << "Throughput of Get(): " << std::fixed << std::setprecision(2) << std::endl << (bytes_ / 1048576.0) / (duration * 1e-6) << " MB/s" << std::endl << std::endl;
// start_time = std::chrono::steady_clock::now();
// GetDataSmallSize(db, lats);
// end_time = std::chrono::steady_clock::now();
// duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
// // std::cout << "Throughput of Get(): " << std::fixed << reads_ * 1e6 / duration << " ops/s" << std::endl;
// std::cout << "Throughput of Get(): " << std::fixed << std::setprecision(2) << std::endl << (bytes_ / 1048576.0) / (duration * 1e-6) << " MB/s" << std::endl << std::endl;
// Iterator()
start_time = std::chrono::steady_clock::now();
ReadOrdered(db, lats);
@ -223,6 +264,7 @@ TEST(TestBench, Latency) {
std::vector<int64_t> put_lats;
std::vector<int64_t> get_lats;
std::vector<int64_t> get_lats_2;
std::vector<int64_t> iter_lats;
std::vector<int64_t> search_lats;
auto calc_lat = [](const std::vector<int64_t>& latencies) {
@ -252,6 +294,13 @@ TEST(TestBench, Latency) {
double get_p99 = std::get<2>(get_latency);
std::cout << "Get Latency (avg, P75, P99): " << std::fixed << std::endl << std::setprecision(2) << get_avg * 1e-3 << " micros/op, " << get_p75 * 1e-3 << " micros/op, " << get_p99 * 1e-3 << " micros/op" << std::endl << std::endl;
// GetDataSmallSize(db, get_lats_2);
// std::tuple<double, double, double> get_latency_2 = calc_lat(get_lats_2);
// double get_avg_s = std::get<0>(get_latency_2);
// double get_p75_s = std::get<1>(get_latency_2);
// double get_p99_s = std::get<2>(get_latency_2);
// std::cout << "Small Get Latency (avg, P75, P99): " << std::fixed << std::endl << std::setprecision(2) << get_avg_s * 1e-3 << " micros/op, " << get_p75_s * 1e-3 << " micros/op, " << get_p99_s * 1e-3 << " micros/op" << std::endl << std::endl;
ReadOrdered(db, iter_lats);
std::tuple<double, double, double> iter_latency = calc_lat(iter_lats);
double iter_avg = std::get<0>(iter_latency);

+ 0
- 276
test/test_bench_gc.cc View File

@ -1,276 +0,0 @@
#include <iostream>
#include <gtest/gtest.h>
#include <chrono>
#include <vector>
#include <random>
#include "leveldb/env.h"
#include "leveldb/db.h"
using namespace leveldb;
// Number of key/values to operate in database
constexpr int num_ = 100000;
// Size of each value
constexpr int value_size_ = 500;
// Number of read operations
constexpr int reads_ = 100000;
// Number of findkeysbyfield operations
constexpr int search_ = 20;
int64_t bytes_ = 0;
// Opens (creating if necessary) the LevelDB database at `dbName`,
// storing the handle into `*db`. Returns the status of DB::Open.
Status OpenDB(std::string dbName, DB **db) {
  Options opts;
  opts.create_if_missing = true;
  return DB::Open(opts, dbName, db);
}
// DB::Put()
// DB::Put() benchmark: writes num_ entries with pseudo-random keys
// (seeded srand(0)) and value_size_-byte values, appending each op's
// latency in nanoseconds to `lats` and the total payload written to
// the global `bytes_`.
void InsertData(DB *db, std::vector<int64_t> &lats) {
  WriteOptions wopt;
  bytes_ = 0;
  int64_t written = 0;
  srand(0);
  auto prev = std::chrono::steady_clock::now();
  for (int i = 0; i < num_; ++i) {
    const int key_id = rand() % num_ + 1;
    // Dead value, but the std::rand() call is kept on purpose: it
    // advances the shared PRNG so the key sequence matches other runs.
    (void)(std::rand() % (num_ + 1));
    const std::string payload(value_size_, 'a');
    FieldArray field_array = {
      {"1", payload},
    };
    auto fields = Fields(field_array);
    db->Put(wopt, std::to_string(key_id), fields);
    written += fields.size();
    const auto now = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(now - prev).count());
    prev = now;
  }
  bytes_ += written;
}
// DB::Get() PointQuery Random
// DB::Get() benchmark: performs reads_ point lookups using the same
// srand(0) key sequence InsertData used, appending each op's latency
// in nanoseconds to `lats` and the total bytes fetched to `bytes_`.
void GetData(DB *db, std::vector<int64_t> &lats) {
  ReadOptions ropt;
  bytes_ = 0;
  int64_t fetched = 0;
  srand(0);
  auto prev = std::chrono::steady_clock::now();
  for (int i = 0; i < reads_; ++i) {
    const std::string key = std::to_string(rand() % num_ + 1);
    Fields ret;
    db->Get(ropt, key, &ret);
    fetched += ret.size();
    const auto now = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(now - prev).count());
    prev = now;
  }
  bytes_ += fetched;
}
// DB::Iterator()->Seek() PointQuery
// Iterator::Seek() point-query benchmark: seeks to reads_ pseudo-random
// keys (srand(0), keys chained through the previous key), appending each
// op's latency in nanoseconds to `lats` and the bytes read to `bytes_`.
void PointQuery(DB *db, std::vector<int64_t> &lats) {
  ReadOptions ropt;
  srand(0);
  bytes_ = 0;
  int64_t fetched = 0;
  Iterator* it = db->NewIterator(ropt);
  int key_id = 0;
  auto prev = std::chrono::steady_clock::now();
  for (int i = 0; i < reads_; ++i) {
    key_id = (key_id + rand()) % num_ + 1;
    it->Seek(std::to_string(key_id));
    fetched += it->fields().size();
    const auto now = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(now - prev).count());
    prev = now;
  }
  bytes_ += fetched;
  delete it;
}
// DB::Iterator()->SeekToFirst() RangeQuery
// Ordered range-scan benchmark: iterates from the first key for up to
// reads_ valid entries, appending each entry's latency in nanoseconds
// to `lats` and the total bytes scanned to `bytes_`.
void ReadOrdered(DB *db, std::vector<int64_t> &lats) {
  Iterator* it = db->NewIterator(ReadOptions());
  bytes_ = 0;
  int64_t scanned = 0;
  int count = 0;
  auto prev = std::chrono::steady_clock::now();
  for (it->SeekToFirst(); count < reads_ && it->Valid(); it->Next()) {
    ++count;
    scanned += it->fields().size();
    const auto now = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(now - prev).count());
    prev = now;
  }
  bytes_ += scanned;
  delete it;
}
// DB::FindKeysByField()
// DB::FindKeysByField() benchmark: runs search_ secondary lookups for
// pseudo-random field values (srand(0)), appending each op's latency
// in nanoseconds to `lats`.
void SearchField(DB *db, std::vector<int64_t> &lats) {
  auto prev = std::chrono::steady_clock::now();
  srand(0);
  for (int i = 0; i < search_; ++i) {
    const int target = std::rand() % (num_ + 1);
    Field field_to_search = {"1", std::to_string(target)};
    const std::vector<std::string> key_ret = db->FindKeysByField(field_to_search);
    const auto now = std::chrono::steady_clock::now();
    lats.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(now - prev).count());
    prev = now;
  }
}
// Insert many k/vs in order to start background GC
// Preloads the database with two rounds of put + get + full compaction
// so that enough stale data accumulates to trigger background GC.
// Latencies gathered here are discarded.
void InsertMany(DB *db) {
  std::vector<int64_t> scratch;
  for (int round = 0; round < 2; ++round) {
    InsertData(db, scratch);
    GetData(db, scratch);
    db->CompactRange(nullptr, nullptr);
    std::cout << "put and get " << round << " of Many" << std::endl;
  }
}
// Returns the latency at the given percentile (0.0..1.0) of `latencies`,
// or 0.0 for an empty sample. The selection index is
// floor(percentile * size), clamped to the last element — unchanged from
// the original. Uses std::nth_element (O(n) average) instead of a full
// sort, since only one order statistic is needed.
double CalculatePercentile(const std::vector<int64_t>& latencies, double percentile) {
  if (latencies.empty()) return 0.0;
  std::vector<int64_t> sample = latencies;
  size_t index = static_cast<size_t>(percentile * sample.size());
  if (index >= sample.size()) index = sample.size() - 1;
  std::nth_element(sample.begin(), sample.begin() + index, sample.end());
  return static_cast<double>(sample[index]);
}
// End-to-end benchmark run after InsertMany() has triggered background GC:
// measures throughput (MB/s, except ops/s for FindKeysByField) and
// avg/P75/P99 latency (printed as microseconds) for Put, Get, ordered
// iteration, and FindKeysByField.
// Bug fix: the Get/Iterator/FindKeysByField latency lines previously
// printed the copy-pasted label "Put Latency".
TEST(TestBench, WithGC) {
  DB *db;
  if (OpenDB("testdb", &db).ok() == false) {
    std::cerr << "open db failed" << std::endl;
    abort();
  }
  std::vector<int64_t> put_lats;
  std::vector<int64_t> get_lats;
  std::vector<int64_t> iter_lats;
  std::vector<int64_t> search_lats;
  // Reduces a latency sample (ns) to (avg, P75, P99).
  auto calc_lat = [](const std::vector<int64_t>& latencies) {
    double avg = 0.0;
    for (auto latency : latencies) {
      avg += latency;
    }
    avg /= latencies.size();
    double p75 = CalculatePercentile(latencies, 0.75);
    double p99 = CalculatePercentile(latencies, 0.99);
    return std::make_tuple(avg, p75, p99);
  };
  // Accumulate enough garbage to start background GC before measuring.
  InsertMany(db);
  // Put()
  auto start_time = std::chrono::steady_clock::now();
  InsertData(db, put_lats);
  auto end_time = std::chrono::steady_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
  std::cout << std::endl << "Write size: " << bytes_ << " bytes" << std::endl << std::endl;
  std::cout << "Throughput of Put(): " << std::fixed << std::setprecision(2) << (bytes_ / 1048576.0) / (duration * 1e-6) << std::endl << std::endl; // MB/s
  std::tuple<double, double, double> put_latency = calc_lat(put_lats);
  double put_avg = std::get<0>(put_latency);
  double put_p75 = std::get<1>(put_latency);
  double put_p99 = std::get<2>(put_latency);
  std::cout << "Put Latency (avg, P75, P99): " << std::endl << std::setprecision(2) << put_avg * 1e-3 // micros/op
            << std::endl << put_p75 * 1e-3
            << std::endl << put_p99 * 1e-3
            << std::endl << std::endl;
  // Get()
  start_time = std::chrono::steady_clock::now();
  GetData(db, get_lats);
  end_time = std::chrono::steady_clock::now();
  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
  std::cout << "Throughput of Get(): " << std::fixed << std::setprecision(2) << (bytes_ / 1048576.0) / (duration * 1e-6) << std::endl << std::endl; // MB/s
  std::tuple<double, double, double> get_latency = calc_lat(get_lats);
  double get_avg = std::get<0>(get_latency);
  double get_p75 = std::get<1>(get_latency);
  double get_p99 = std::get<2>(get_latency);
  std::cout << "Get Latency (avg, P75, P99): " << std::endl << std::setprecision(2) << get_avg * 1e-3 // micros/op
            << std::endl << get_p75 * 1e-3
            << std::endl << get_p99 * 1e-3
            << std::endl << std::endl;
  // Iterator()
  start_time = std::chrono::steady_clock::now();
  ReadOrdered(db, iter_lats);
  end_time = std::chrono::steady_clock::now();
  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
  std::cout << "Throughput of Iterator(): " << std::fixed << std::setprecision(2) << (bytes_ / 1048576.0) / (duration * 1e-6) << std::endl << std::endl; // MB/s
  std::tuple<double, double, double> iter_latency = calc_lat(iter_lats);
  double iter_avg = std::get<0>(iter_latency);
  double iter_p75 = std::get<1>(iter_latency);
  double iter_p99 = std::get<2>(iter_latency);
  std::cout << "Iterator Latency (avg, P75, P99): " << std::endl << std::setprecision(2) << iter_avg * 1e-3 // micros/op
            << std::endl << iter_p75 * 1e-3
            << std::endl << iter_p99 * 1e-3
            << std::endl << std::endl;
  // FindKeysByField()
  start_time = std::chrono::steady_clock::now();
  SearchField(db, search_lats);
  end_time = std::chrono::steady_clock::now();
  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
  std::cout << "Throughput of FindKeysbyField(): " << std::setprecision(2) << search_ * 1e6 / duration << std::endl << std::endl; // ops/s
  std::tuple<double, double, double> search_latency = calc_lat(search_lats);
  double search_avg = std::get<0>(search_latency);
  double search_p75 = std::get<1>(search_latency);
  double search_p99 = std::get<2>(search_latency);
  std::cout << "FindKeysByField Latency (avg, P75, P99): " << std::endl << std::setprecision(2) << search_avg * 1e-3 // micros/op
            << std::endl << search_p75 * 1e-3
            << std::endl << search_p99 * 1e-3
            << std::endl << std::endl;
  delete db;
}
// Entry point: initializes GoogleTest and runs every registered
// benchmark test; the process exit code reflects test success/failure.
int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

Loading…
Cancel
Save