From 4d6144f97faf9d2a6c89f41d7d2360f21f0b71e2 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 15 Jun 2012 14:19:11 +0200 Subject: [PATCH] Compact serialization for variable-length integers Variable-length integers: bytes are a MSB base-128 encoding of the number. The high bit in each byte signifies whether another digit follows. To make the encoding is one-to-one, one is subtracted from all but the last digit. Thus, the byte sequence a[] with length len, where all but the last byte has bit 128 set, encodes the number: (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) Properties: * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) * Every integer has exactly one encoding * Encoding does not depend on size of original integer type --- src/serialize.h | 94 +++++++++++++++++++++++++++++++++++- src/test/serialize_tests.cpp | 45 +++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 src/test/serialize_tests.cpp diff --git a/src/serialize.h b/src/serialize.h index 63df3160a..549e46cbc 100644 --- a/src/serialize.h +++ b/src/serialize.h @@ -238,9 +238,75 @@ uint64 ReadCompactSize(Stream& is) return nSizeRet; } +// Variable-length integers: bytes are a MSB base-128 encoding of the number. +// The high bit in each byte signifies whether another digit follows. To make +// the encoding is one-to-one, one is subtracted from all but the last digit. +// Thus, the byte sequence a[] with length len, where all but the last byte +// has bit 128 set, encodes the number: +// +// (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) +// +// Properties: +// * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) +// * Every integer has exactly one encoding +// * Encoding does not depend on size of original integer type +// * No redundancy: every (infinite) byte sequence corresponds to a list +// of encoded integers. +// +// 0: [0x00] 256: [0x81 0x00] +// 1: [0x01] 16383: [0xFE 0x7F] +// 127: [0x7F] 16384: [0xFF 0x00] +// 128: [0x80 0x00] 16511: [0x80 0xFF 0x7F] +// 255: [0x80 0x7F] 65535: [0x82 0xFD 0x7F] +// 2^32: [0x8E 0xFE 0xFE 0xFF 0x00] + +template +inline unsigned int GetSizeOfVarInt(I n) +{ + int nRet = 0; + while(true) { + nRet++; + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + } + return nRet; +} + +template +void WriteVarInt(Stream& os, I n) +{ + unsigned char tmp[(sizeof(n)*8+6)/7]; + int len=0; + while(true) { + tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00); + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + len++; + } + do { + WRITEDATA(os, tmp[len]); + } while(len--); +} +template +I ReadVarInt(Stream& is) +{ + I n = 0; + while(true) { + unsigned char chData; + READDATA(is, chData); + n = (n << 7) | (chData & 0x7F); + if (chData & 0x80) + n++; + else + return n; + } +} -#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj))) +#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj))) +#define VARINT(obj) REF(WrapVarInt(REF(obj))) /** Wrapper for serializing arrays and POD. */ @@ -274,6 +340,32 @@ public: } }; +template +class CVarInt +{ +protected: + I &n; +public: + CVarInt(I& nIn) : n(nIn) { } + + unsigned int GetSerializeSize(int, int) const { + return GetSizeOfVarInt(n); + } + + template + void Serialize(Stream &s, int, int) const { + WriteVarInt(s, n); + } + + template + void Unserialize(Stream& s, int, int) { + n = ReadVarInt(s); + } +}; + +template +CVarInt WrapVarInt(I& n) { return CVarInt(n); } + // // Forward declarations // diff --git a/src/test/serialize_tests.cpp b/src/test/serialize_tests.cpp new file mode 100644 index 000000000..90ac89f8c --- /dev/null +++ b/src/test/serialize_tests.cpp @@ -0,0 +1,45 @@ +#include + +#include +#include + +#include "serialize.h" + +using namespace std; + +BOOST_AUTO_TEST_SUITE(serialize_tests) + +BOOST_AUTO_TEST_CASE(varints) +{ + // encode + + CDataStream ss(SER_DISK, 0); + CDataStream::size_type size = 0; + for (int i = 0; i < 100000; i++) { + ss << VARINT(i); + size += ::GetSerializeSize(VARINT(i), 0, 0); + BOOST_CHECK(size == ss.size()); + } + + for (uint64 i = 0; i < 100000000000ULL; i += 999999937) { + ss << VARINT(i); + size += ::GetSerializeSize(VARINT(i), 0, 0); + BOOST_CHECK(size == ss.size()); + } + + // decode + for (int i = 0; i < 100000; i++) { + int j; + ss >> VARINT(j); + BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i); + } + + for (uint64 i = 0; i < 100000000000ULL; i += 999999937) { + uint64 j; + ss >> VARINT(j); + BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i); + } + +} + +BOOST_AUTO_TEST_SUITE_END()