diff --git a/util/coding.cc b/util/coding.cc index e2089df..55be020 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -6,32 +6,6 @@ namespace leveldb { -void EncodeFixed32(char* dst, uint32_t value) { - if (port::kLittleEndian) { - memcpy(dst, &value, sizeof(value)); - } else { - dst[0] = value & 0xff; - dst[1] = (value >> 8) & 0xff; - dst[2] = (value >> 16) & 0xff; - dst[3] = (value >> 24) & 0xff; - } -} - -void EncodeFixed64(char* dst, uint64_t value) { - if (port::kLittleEndian) { - memcpy(dst, &value, sizeof(value)); - } else { - dst[0] = value & 0xff; - dst[1] = (value >> 8) & 0xff; - dst[2] = (value >> 16) & 0xff; - dst[3] = (value >> 24) & 0xff; - dst[4] = (value >> 32) & 0xff; - dst[5] = (value >> 40) & 0xff; - dst[6] = (value >> 48) & 0xff; - dst[7] = (value >> 56) & 0xff; - } -} - void PutFixed32(std::string* dst, uint32_t value) { char buf[sizeof(value)]; EncodeFixed32(buf, value); diff --git a/util/coding.h b/util/coding.h index d9eeaa3..92a961f 100644 --- a/util/coding.h +++ b/util/coding.h @@ -10,9 +10,8 @@ #ifndef STORAGE_LEVELDB_UTIL_CODING_H_ #define STORAGE_LEVELDB_UTIL_CODING_H_ -#include -#include - +#include +#include #include #include "leveldb/slice.h" @@ -44,44 +43,105 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v); int VarintLength(uint64_t v); // Lower-level versions of Put... that write directly into a character buffer -// REQUIRES: dst has enough space for the value being written -void EncodeFixed32(char* dst, uint32_t value); -void EncodeFixed64(char* dst, uint64_t value); - -// Lower-level versions of Put... that write directly into a character buffer // and return a pointer just past the last byte written. // REQUIRES: dst has enough space for the value being written char* EncodeVarint32(char* dst, uint32_t value); char* EncodeVarint64(char* dst, uint64_t value); +// TODO(costan): Remove port::kLittleEndian and the fast paths based on +// std::memcpy when clang learns to optimize the generic code, as +// described in https://bugs.llvm.org/show_bug.cgi?id=41761 +// +// The platform-independent code in DecodeFixed{32,64}() gets optimized to mov +// on x86 and ldr on ARM64, by both clang and gcc. However, only gcc optimizes +// the platform-independent code in EncodeFixed{32,64}() to mov / str. + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written + +inline void EncodeFixed32(char* dst, uint32_t value) { + uint8_t* const buffer = reinterpret_cast(dst); + + if (port::kLittleEndian) { + // Fast path for little-endian CPUs. All major compilers optimize this to a + // single mov (x86_64) / str (ARM) instruction. + std::memcpy(buffer, &value, sizeof(uint32_t)); + return; + } + + // Platform-independent code. + // Currently, only gcc optimizes this to a single mov / str instruction. + buffer[0] = static_cast(value); + buffer[1] = static_cast(value >> 8); + buffer[2] = static_cast(value >> 16); + buffer[3] = static_cast(value >> 24); +} + +inline void EncodeFixed64(char* dst, uint64_t value) { + uint8_t* const buffer = reinterpret_cast(dst); + + if (port::kLittleEndian) { + // Fast path for little-endian CPUs. All major compilers optimize this to a + // single mov (x86_64) / str (ARM) instruction. + std::memcpy(buffer, &value, sizeof(uint64_t)); + return; + } + + // Platform-independent code. + // Currently, only gcc optimizes this to a single mov / str instruction. + buffer[0] = static_cast(value); + buffer[1] = static_cast(value >> 8); + buffer[2] = static_cast(value >> 16); + buffer[3] = static_cast(value >> 24); + buffer[4] = static_cast(value >> 32); + buffer[5] = static_cast(value >> 40); + buffer[6] = static_cast(value >> 48); + buffer[7] = static_cast(value >> 56); +} + // Lower-level versions of Get... that read directly from a character buffer // without any bounds checking. inline uint32_t DecodeFixed32(const char* ptr) { + const uint8_t* const buffer = reinterpret_cast(ptr); + if (port::kLittleEndian) { - // Load the raw bytes + // Fast path for little-endian CPUs. All major compilers optimize this to a + // single mov (x86_64) / ldr (ARM) instruction. uint32_t result; - memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + std::memcpy(&result, buffer, sizeof(uint32_t)); return result; - } else { - return ((static_cast(static_cast(ptr[0]))) | - (static_cast(static_cast(ptr[1])) << 8) | - (static_cast(static_cast(ptr[2])) << 16) | - (static_cast(static_cast(ptr[3])) << 24)); } + + // Platform-independent code. + // Clang and gcc optimize this to a single mov / ldr instruction. + return (static_cast(buffer[0])) | + (static_cast(buffer[1]) << 8) | + (static_cast(buffer[2]) << 16) | + (static_cast(buffer[3]) << 24); } inline uint64_t DecodeFixed64(const char* ptr) { + const uint8_t* const buffer = reinterpret_cast(ptr); + if (port::kLittleEndian) { - // Load the raw bytes + // Fast path for little-endian CPUs. All major compilers optimize this to a + // single mov (x86_64) / ldr (ARM) instruction. uint64_t result; - memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + std::memcpy(&result, buffer, sizeof(uint64_t)); return result; - } else { - uint64_t lo = DecodeFixed32(ptr); - uint64_t hi = DecodeFixed32(ptr + 4); - return (hi << 32) | lo; } + + // Platform-independent code. + // Clang and gcc optimize this to a single mov / ldr instruction. + return (static_cast(buffer[0])) | + (static_cast(buffer[1]) << 8) | + (static_cast(buffer[2]) << 16) | + (static_cast(buffer[3]) << 24) | + (static_cast(buffer[4]) << 32) | + (static_cast(buffer[5]) << 40) | + (static_cast(buffer[6]) << 48) | + (static_cast(buffer[7]) << 56); } // Internal routine for use by fallback path of GetVarint32Ptr