1. 获取原始数据大小N,将N存储到输出结果开头位置,占用1-5个字节不等:
- N < (1 << 7)时,占1个字节
out[0] = N
- N < (1 << 14)时,占2个字节
out[0] = N | 128; out[1] = N >> 7;
- N < (1 << 21)时,占3个字节
out[0] = N | 128; out[1] = (N >> 7) | 128; out[2] = N >> 14;
- N < (1 << 28)时,占4个字节
out[0] = N | 128; out[1] = (N >> 7) | 128; out[2] = (N >> 14) | 128; out[3] = N >> 21;
- other
out[0] = N | 128; out[1] = (N >> 7) | 128; out[2] = (N >> 14) | 128; out[3] = (N >> 21) | 128; out[4] = N >> 28;
2. 将数据切分为65536字节的块,循环处理各个块
(1)维护一张类型为uint16的hash表,初始化为0,hash表中保存滑动窗口中所有不同字符串的偏移值,hash表的索引为字符串前4个字节的hash值,hash表的大小如下公式计算:
uint32_t CalculateTableSize(uint32_t input_size) {
// input_size为原始数据大小
// kMaxHashTableSize = 1 << 14;
if (input_size > kMaxHashTableSize) {
return kMaxHashTableSize;
}
// kMinHashTableSize = 1 << 8
if (input_size < kMinHashTableSize) {
return kMinHashTableSize;
}
// This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
// 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
return 2u << Bits::Log2Floor(input_size - 1);
}
inline int Bits::Log2Floor(uint32_t n) {
return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
}
inline int Bits::Log2FloorNonZero(uint32_t n) {
assert(n != 0);
// (31 ^ x) is equivalent to (31 - x) for x in [0, 31]. An easy proof
// represents subtraction in base 2 and observes that there's no carry.
//
// GCC and Clang represent __builtin_clz on x86 as 31 ^ _bit_scan_reverse(x).
// Using "31 ^" here instead of "31 -" allows the optimizer to strip the
// function body down to _bit_scan_reverse(x).
// __builtin_clz(n)是获取n的二进制前端(高位)有多少个0
return 31 ^ __builtin_cl