
Clean up util/coding.{h,cc}.

1) Inline EncodeFixed{32,64}(). They emit single machine instructions on 64-bit processors.
2) Remove size narrowing compiler warnings from DecodeFixed{32,64}().
3) Add comments explaining the current state of optimizations in compilers we care about.
4) Change C-style includes, like <stdint.h>, to C++ style, like <cstdint>.
5) memcpy -> std::memcpy.

The optimization comments are based on https://godbolt.org/z/RdIqS1. The missed optimization opportunities in clang have been reported as https://bugs.llvm.org/show_bug.cgi?id=41761
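
For reference, a minimal round trip through the fixed-width helpers touched here could look like the sketch below. It is illustrative only: it assumes a translation unit that can include "util/coding.h" from the LevelDB source tree.

#include <cassert>
#include <cstdint>

#include "util/coding.h"  // declares leveldb::EncodeFixed{32,64} / DecodeFixed{32,64}

int main() {
  // 32-bit round trip: encode into a raw character buffer, decode it back.
  char buf32[sizeof(uint32_t)];
  leveldb::EncodeFixed32(buf32, 0xdeadbeef);
  assert(leveldb::DecodeFixed32(buf32) == 0xdeadbeef);

  // 64-bit round trip.
  char buf64[sizeof(uint64_t)];
  leveldb::EncodeFixed64(buf64, 0x0123456789abcdefULL);
  assert(leveldb::DecodeFixed64(buf64) == 0x0123456789abcdefULL);
  return 0;
}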

The change does not have significant impact on benchmarks. Results below.

LevelDB:    version 1.22
Date:       Mon May  6 10:42:18 2019
CPU:        72 * Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz
CPUCache:   25344 KB
Keys:       16 bytes each
Values:     100 bytes each (50 bytes after compression)
Entries:    1000000
RawSize:    110.6 MB (estimated)
FileSize:   62.9 MB (estimated)

With change
------------------------------------------------
fillseq      :       2.327 micros/op;   47.5 MB/s
fillsync     :    4185.526 micros/op;    0.0 MB/s (1000 ops)
fillrandom   :       3.662 micros/op;   30.2 MB/s
overwrite    :       4.261 micros/op;   26.0 MB/s
readrandom   :       4.239 micros/op; (1000000 of 1000000 found)
readrandom   :       3.649 micros/op; (1000000 of 1000000 found)
readseq      :       0.174 micros/op;  636.7 MB/s
readreverse  :       0.271 micros/op;  408.7 MB/s
compact      :  570495.000 micros/op;
readrandom   :       2.735 micros/op; (1000000 of 1000000 found)
readseq      :       0.118 micros/op;  937.3 MB/s
readreverse  :       0.190 micros/op;  583.7 MB/s
fill100K     :     860.164 micros/op;  110.9 MB/s (1000 ops)
crc32c       :       1.131 micros/op; 3455.2 MB/s (4K per op)
snappycomp   :       3.034 micros/op; 1287.5 MB/s (output: 55.1%)
snappyuncomp :       0.544 micros/op; 7176.0 MB/s

Baseline
------------------------------------------------
fillseq      :       2.365 micros/op;   46.8 MB/s
fillsync     :    4240.165 micros/op;    0.0 MB/s (1000 ops)
fillrandom   :       3.244 micros/op;   34.1 MB/s
overwrite    :       4.153 micros/op;   26.6 MB/s
readrandom   :       4.698 micros/op; (1000000 of 1000000 found)
readrandom   :       4.065 micros/op; (1000000 of 1000000 found)
readseq      :       0.192 micros/op;  576.3 MB/s
readreverse  :       0.286 micros/op;  386.7 MB/s
compact      :  635979.000 micros/op;
readrandom   :       3.264 micros/op; (1000000 of 1000000 found)
readseq      :       0.169 micros/op;  652.8 MB/s
readreverse  :       0.213 micros/op;  519.5 MB/s
fill100K     :    1055.367 micros/op;   90.4 MB/s (1000 ops)
crc32c       :       1.353 micros/op; 2887.3 MB/s (4K per op)
snappycomp   :       3.036 micros/op; 1286.7 MB/s (output: 55.1%)
snappyuncomp :       0.540 micros/op; 7238.6 MB/s
PiperOrigin-RevId: 246856811
Victor Costan authored 5 years ago, committed by Victor Costan
commit a7528a5d2b

2 changed files with 81 additions and 47 deletions:
  1. util/coding.cc  +0  -26
  2. util/coding.h   +81 -21

util/coding.cc  (+0, -26)

@@ -6,32 +6,6 @@
 namespace leveldb {

-void EncodeFixed32(char* dst, uint32_t value) {
-  if (port::kLittleEndian) {
-    memcpy(dst, &value, sizeof(value));
-  } else {
-    dst[0] = value & 0xff;
-    dst[1] = (value >> 8) & 0xff;
-    dst[2] = (value >> 16) & 0xff;
-    dst[3] = (value >> 24) & 0xff;
-  }
-}
-
-void EncodeFixed64(char* dst, uint64_t value) {
-  if (port::kLittleEndian) {
-    memcpy(dst, &value, sizeof(value));
-  } else {
-    dst[0] = value & 0xff;
-    dst[1] = (value >> 8) & 0xff;
-    dst[2] = (value >> 16) & 0xff;
-    dst[3] = (value >> 24) & 0xff;
-    dst[4] = (value >> 32) & 0xff;
-    dst[5] = (value >> 40) & 0xff;
-    dst[6] = (value >> 48) & 0xff;
-    dst[7] = (value >> 56) & 0xff;
-  }
-}
-
 void PutFixed32(std::string* dst, uint32_t value) {
   char buf[sizeof(value)];
   EncodeFixed32(buf, value);

util/coding.h  (+81, -21)

@@ -10,9 +10,8 @@
 #ifndef STORAGE_LEVELDB_UTIL_CODING_H_
 #define STORAGE_LEVELDB_UTIL_CODING_H_

-#include <stdint.h>
-#include <string.h>
-
+#include <cstdint>
+#include <cstring>
 #include <string>

 #include "leveldb/slice.h"
@@ -44,44 +43,105 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v);
 int VarintLength(uint64_t v);

-// Lower-level versions of Put... that write directly into a character buffer
-// REQUIRES: dst has enough space for the value being written
-void EncodeFixed32(char* dst, uint32_t value);
-void EncodeFixed64(char* dst, uint64_t value);
-
 // Lower-level versions of Put... that write directly into a character buffer
 // and return a pointer just past the last byte written.
 // REQUIRES: dst has enough space for the value being written
 char* EncodeVarint32(char* dst, uint32_t value);
 char* EncodeVarint64(char* dst, uint64_t value);

+// TODO(costan): Remove port::kLittleEndian and the fast paths based on
+//               std::memcpy when clang learns to optimize the generic code, as
+//               described in https://bugs.llvm.org/show_bug.cgi?id=41761
+//
+// The platform-independent code in DecodeFixed{32,64}() gets optimized to mov
+// on x86 and ldr on ARM64, by both clang and gcc. However, only gcc optimizes
+// the platform-independent code in EncodeFixed{32,64}() to mov / str.
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+inline void EncodeFixed32(char* dst, uint32_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint32_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+}
+
+inline void EncodeFixed64(char* dst, uint64_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint64_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+  buffer[4] = static_cast<uint8_t>(value >> 32);
+  buffer[5] = static_cast<uint8_t>(value >> 40);
+  buffer[6] = static_cast<uint8_t>(value >> 48);
+  buffer[7] = static_cast<uint8_t>(value >> 56);
+}
+
 // Lower-level versions of Get... that read directly from a character buffer
 // without any bounds checking.

 inline uint32_t DecodeFixed32(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
   if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
     uint32_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint32_t));
     return result;
-  } else {
-    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
   }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint32_t>(buffer[0])) |
+         (static_cast<uint32_t>(buffer[1]) << 8) |
+         (static_cast<uint32_t>(buffer[2]) << 16) |
+         (static_cast<uint32_t>(buffer[3]) << 24);
 }

 inline uint64_t DecodeFixed64(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
   if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
     uint64_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint64_t));
     return result;
-  } else {
-    uint64_t lo = DecodeFixed32(ptr);
-    uint64_t hi = DecodeFixed32(ptr + 4);
-    return (hi << 32) | lo;
   }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint64_t>(buffer[0])) |
+         (static_cast<uint64_t>(buffer[1]) << 8) |
+         (static_cast<uint64_t>(buffer[2]) << 16) |
+         (static_cast<uint64_t>(buffer[3]) << 24) |
+         (static_cast<uint64_t>(buffer[4]) << 32) |
+         (static_cast<uint64_t>(buffer[5]) << 40) |
+         (static_cast<uint64_t>(buffer[6]) << 48) |
+         (static_cast<uint64_t>(buffer[7]) << 56);
 }

 // Internal routine for use by fallback path of GetVarint32Ptr
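
As a standalone illustration of why the std::memcpy fast path and the platform-independent byte-shift path in the new EncodeFixed32() produce the same bytes on little-endian targets, a sketch along these lines could be used (EncodeViaMemcpy and EncodeViaShifts are hypothetical stand-ins for the two branches, not part of this patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical stand-in for the little-endian fast path.
void EncodeViaMemcpy(char* dst, uint32_t value) {
  std::memcpy(dst, &value, sizeof(uint32_t));  // byte order follows the CPU
}

// Hypothetical stand-in for the platform-independent path.
void EncodeViaShifts(char* dst, uint32_t value) {
  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
  buffer[0] = static_cast<uint8_t>(value);
  buffer[1] = static_cast<uint8_t>(value >> 8);
  buffer[2] = static_cast<uint8_t>(value >> 16);
  buffer[3] = static_cast<uint8_t>(value >> 24);
}

int main() {
  char a[4];
  char b[4];
  EncodeViaMemcpy(a, 0x01020304);  // matches the shift encoding only on little-endian CPUs
  EncodeViaShifts(b, 0x01020304);  // endian-independent little-endian encoding
  assert(std::memcmp(a, b, sizeof(a)) == 0);
  return 0;
}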
