一、简介
对于一个整数统计二进制形式中bit为1的个数
- 逐一的右移,进行判断累加
- while(num) {num &= (num-1); count++}
在redis中,又学习到了可以通过映射表以及通过移位操作进行统计。
对于redis中的redisPopcount函数,分为了三部分
- 处理地址没有对齐32bit的,直接通过映射表直接获取
- 批量处理,每次处理28字节
- 处理剩余不足28字节的数据,也是直接通过映射表获取
二、映射表直接获取
/* Lookup table: bitsinbyte[v] is the number of 1 bits in the byte value v.
 * Laid out 16 entries per row, so row r covers values 16*r .. 16*r+15. */
static const unsigned char bitsinbyte[256] = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, /* 0x00 - 0x0F */
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /* 0x10 - 0x1F */
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /* 0x20 - 0x2F */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0x30 - 0x3F */
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /* 0x40 - 0x4F */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0x50 - 0x5F */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0x60 - 0x6F */
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 0x70 - 0x7F */
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /* 0x80 - 0x8F */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0x90 - 0x9F */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0xA0 - 0xAF */
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 0xB0 - 0xBF */
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 0xC0 - 0xCF */
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 0xD0 - 0xDF */
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 0xE0 - 0xEF */
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8  /* 0xF0 - 0xFF */
};
一个无符号字节能表示的数字范围是[0,255], 所以redis中定义了一个大小为256的静态常量数组bitsinbyte: 数组下标代表某个字节的取值,而数组元素代表该取值对应的二进制中bit为1的个数。
比如:
bitsinbyte[0] = 0, 表示 0 (00000000)包含0个bit为1
bitsinbyte[1] = 1, 表示 1(00000001) 包含1个bit为1
bitsinbyte[2] = 1, 表示 2(00000010) 包含1个bit为1
bitsinbyte[3] = 2, 表示 3(00000011) 包含2个bit为1
…
bitsinbyte[255] = 8, 表示 255(11111111) 包含8个bit为1
个人觉得可以直接都从映射表中获取统计,但是redis进行了更进一步的优化。
地址对齐,加速数据的读取,为后面的批处理做准备
2.1 对于字符串地址未对齐的
long long redisPopcount(void *s, long count) {
...
/* Count initial bytes not aligned to 32 bit. */
while((unsigned long)p & 3 && count) {
bits += bitsinbyte[*p++];
count--;
}
...
}
2.2 批处理后剩下的
long long redisPopcount(void *s, long count) {
...
/* Count the remaining bytes. */
p = (unsigned char*)p4;
while(count--) bits += bitsinbyte[*p++];
...
}
三、批量处理
- 批处理,减少了循环次数
- 可能会运用上SIMD(Single Instruction, Multiple Data,单指令多数据)技术, 实现并行运算,加速计算(猜测,后续继续学习)
long long redisPopcount(void *s, long count) {
...
/* Count bits 28 bytes at a time */
p4 = (uint32_t*)p;
while(count>=28) {
uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7;
aux1 = *p4++;
aux2 = *p4++;
aux3 = *p4++;
aux4 = *p4++;
aux5 = *p4++;
aux6 = *p4++;
aux7 = *p4++;
count -= 28;
aux1 = aux1 - ((aux1 >> 1) & 0x55555555);
aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333);
aux2 = aux2 - ((aux2 >> 1) & 0x55555555);
aux2 = (aux2 & 0x33333333) + ((aux2 >> 2) & 0x33333333);
aux3 = aux3 - ((aux3 >> 1) & 0x55555555);
aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333);
aux4 = aux4 - ((aux4 >> 1) & 0x55555555);
aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333);
aux5 = aux5 - ((aux5 >> 1) & 0x55555555);
aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333);
aux6 = aux6 - ((aux6 >> 1) & 0x55555555);
aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333);
aux7 = aux7 - ((aux7 >> 1) & 0x55555555);
aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333);
bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) +
((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) +
((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) +
((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) +
((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) +
((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) +
((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24;
}
...
}
3.1 原理
对于一个无符号的32bit的整数aux1, 其中bx为0或1
1. 首先是每两bit为一组的进行处理
aux1 = aux1 - ((aux1 >> 1) & 0x55555555);
aux1
= b0*$2^{0}$ + b1*$2^{1}$ + b2*$2^{2}$ + b3*$2^{3}$ + … + b30*$2^{30}$ + b31*$2^{31}$
aux1>>1
= b1*$2^{0}$ + b2*$2^{1}$ + b3*$2^{2}$ + … + b30*$2^{29}$ + b31*$2^{30}$ + 0*$2^{31}$
0x55555555
= 0101 0101 0101 0101 0101 0101 0101 0101
= $2^{0}$ + $2^{2}$ + $2^{4}$ + $2^{6}$ + … + $2^{28}$ + $2^{30}$
(aux1>>1) & 0x55555555
= b1*$2^{0}$ + b3*$2^{2}$ + b5*$2^{4}$ + b7*$2^{6}$ + … + b27*$2^{26}$ + b29*$2^{28}$ + b31*$2^{30}$
aux1 - ((aux1>>1) & 0x55555555)
(对每个奇数位 b(2k+1), 有 b(2k+1)*$2^{2k+1}$ - b(2k+1)*$2^{2k}$ = b(2k+1)*$2^{2k}$)
= b0*$2^{0}$ + b1*$2^{0}$ + b2*$2^{2}$ + b3*$2^{2}$ + … + b28*$2^{28}$ + b29*$2^{28}$ + b30*$2^{30}$ + b31*$2^{30}$
= (b0+b1)*$2^{0}$ + (b2+b3)*$2^{2}$ + (b4+b5)*$2^{4}$ + … + (b28+b29)*$2^{28}$ + (b30+b31)*$2^{30}$
2. 接下来就是每四bit为一组的进行统计
aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333);
0x33333333
= 0011 0011 0011 0011 0011 0011 0011 0011
= $2^{0}$ + $2^{1}$ + $2^{4}$ + $2^{5}$ + $2^{8}$ + $2^{9}$ + … + $2^{28}$ + $2^{29}$
aux1 & 0x33333333 =
(b0+b1)*$2^{0}$ + (b4+b5)*$2^{4}$ + (b8+b9)*$2^{8}$ + (b12+b13)*$2^{12}$ + (b16+b17)*$2^{16}$ + (b20+b21)*$2^{20}$ + (b24+b25)*$2^{24}$ + (b28+b29)*$2^{28}$
aux1>>2 =
(b2+b3)*$2^{0}$ + (b4+b5)*$2^{2}$ + … + (b28+b29)*$2^{26}$ + (b30+b31)*$2^{28}$
(aux1>>2)&0x33333333 =
(b2+b3)*$2^{0}$ + (b6+b7)*$2^{4}$ + (b10+b11)*$2^{8}$ + (b14+b15)*$2^{12}$ + (b18+b19)*$2^{16}$ + (b22+b23)*$2^{20}$ + (b26+b27)*$2^{24}$ + (b30+b31)*$2^{28}$
(aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333) =
(b0+b1)*$2^{0}$ + (b4+b5)*$2^{4}$ + (b8+b9)*$2^{8}$ + (b12+b13)*$2^{12}$ + (b16+b17)*$2^{16}$ + (b20+b21)*$2^{20}$ + (b24+b25)*$2^{24}$ + (b28+b29)*$2^{28}$ + (b2+b3)*$2^{0}$ + (b6+b7)*$2^{4}$ + (b10+b11)*$2^{8}$ + (b14+b15)*$2^{12}$ + (b18+b19)*$2^{16}$ + (b22+b23)*$2^{20}$ + (b26+b27)*$2^{24}$ + (b30+b31)*$2^{28}$
= (b0+b1+b2+b3)*$2^{0}$ + (b4+b5+b6+b7)*$2^{4}$ + (b8+b9+b10+b11)*$2^{8}$ + (b12+b13+b14+b15)*$2^{12}$ + (b16+b17+b18+b19)*$2^{16}$ + (b20+b21+b22+b23)*$2^{20}$ + (b24+b25+b26+b27)*$2^{24}$ + (b28+b29+b30+b31)*$2^{28}$
3. 接下来是每八bit为一组
((aux1 + (aux1 >> 4)) & 0x0F0F0F0F)
aux1>>4 =
(b4+b5+b6+b7)*$2^{0}$ + (b8+b9+b10+b11)*$2^{4}$ + (b12+b13+b14+b15)*$2^{8}$ + (b16+b17+b18+b19)*$2^{12}$ + (b20+b21+b22+b23)*$2^{16}$ + (b24+b25+b26+b27)*$2^{20}$ + (b28+b29+b30+b31)*$2^{24}$
aux1+(aux1>>4) =
(b0+b1+b2+b3)*$2^{0}$ + (b4+b5+b6+b7)*$2^{4}$ + (b8+b9+b10+b11)*$2^{8}$ + (b12+b13+b14+b15)*$2^{12}$ + (b16+b17+b18+b19)*$2^{16}$ + (b20+b21+b22+b23)*$2^{20}$ + (b24+b25+b26+b27)*$2^{24}$ + (b28+b29+b30+b31)*$2^{28}$ + (b4+b5+b6+b7)*$2^{0}$ + (b8+b9+b10+b11)*$2^{4}$ + (b12+b13+b14+b15)*$2^{8}$ + (b16+b17+b18+b19)*$2^{12}$ + (b20+b21+b22+b23)*$2^{16}$ + (b24+b25+b26+b27)*$2^{20}$ + (b28+b29+b30+b31)*$2^{24}$
= (b0+b1+b2+b3+b4+b5+b6+b7)*$2^{0}$ + (b4+b5+b6+b7+b8+b9+b10+b11)*$2^{4}$ + (b8+b9+b10+b11+b12+b13+b14+b15)*$2^{8}$ + (b12+b13+b14+b15+b16+b17+b18+b19)*$2^{12}$ + (b16+b17+b18+b19+b20+b21+b22+b23)*$2^{16}$ + (b20+b21+b22+b23+b24+b25+b26+b27)*$2^{20}$ + (b24+b25+b26+b27+b28+b29+b30+b31)*$2^{24}$ + (b28+b29+b30+b31)*$2^{28}$
(aux1+(aux1>>4) )&0x0F0F0F0F =
(b0+b1+b2+b3+b4+b5+b6+b7)*$2^{0}$ + (b8+b9+b10+b11+b12+b13+b14+b15)*$2^{8}$ + (b16+b17+b18+b19+b20+b21+b22+b23)*$2^{16}$ + (b24+b25+b26+b27+b28+b29+b30+b31)*$2^{24}$
4. 乘以0x01010101,将值都推向最高字节
将分散在低字节的数据都移动到最高的字节位置
0x01010101 = $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$
aux1 * 0x01010101 =
( (b0+b1+b2+b3+b4+b5+b6+b7)*$2^{0}$ + (b8+b9+b10+b11+b12+b13+b14+b15)*$2^{8}$ + (b16+b17+b18+b19+b20+b21+b22+b23)*$2^{16}$ + (b24+b25+b26+b27+b28+b29+b30+b31)*$2^{24}$ ) * ( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ )
假设:
x = b0+b1+b2+b3+b4+b5+b6+b7
y = b8+b9+b10+b11+b12+b13+b14+b15
z = b16+b17+b18+b19+b20+b21+b22+b23
p = b24+b25+b26+b27+b28+b29+b30+b31
简化为:
aux1 * 0x01010101 =
( x*$2^{0}$ + y*$2^{8}$ + z*$2^{16}$ + p*$2^{24}$ ) * ( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ )
= x*$2^{0}$*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ ) + y*$2^{8}$*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ ) + z*$2^{16}$*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ ) + p*$2^{24}$*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ )
= x*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ ) + y*( $2^{8}$ + $2^{16}$ + $2^{24}$ + $2^{32}$ ) + z*( $2^{16}$ + $2^{24}$ + $2^{32}$ + $2^{40}$ ) + p*( $2^{24}$ + $2^{32}$ + $2^{40}$ + $2^{48}$ )
由于结果保存在32位无符号整数中,乘积按模 $2^{32}$ 截断: 次方数不小于32的项($2^{32}$、$2^{40}$、$2^{48}$)溢出后被直接丢弃,并不会回绕到低位
= x*( $2^{0}$ + $2^{8}$ + $2^{16}$ + $2^{24}$ ) + y*( $2^{8}$ + $2^{16}$ + $2^{24}$ ) + z*( $2^{16}$ + $2^{24}$ ) + p*$2^{24}$
= $2^{24}$*(x+y+z+p) + x*( $2^{0}$ + $2^{8}$ + $2^{16}$ ) + y*( $2^{8}$ + $2^{16}$ ) + z*$2^{16}$
其中24~31共8位,最大能表示255,所以能装x+y+z+p的值; 低三个字节的值分别是 x、x+y、x+y+z, 同样都小于256, 不会向最高字节产生进位
5. 获取最高字节以获取统计值
其他6个int都是相同的处理方式,最后再右移24位,取最高字节作为统计数据
( $2^{24}$*(x+y+z+p) + x*( $2^{0}$ + $2^{8}$ + $2^{16}$ ) + y*( $2^{8}$ + $2^{16}$ ) + z*$2^{16}$ ) >> 24
= x+y+z+p
= b0+b1+b2+b3+b4+…+b31
6. 一次处理为啥是28字节
因为最终取最高字节作为位数的统计,一个无符号字节最大能存255,而一个字节最多有8个置1的bit位,所以一次最多可以累加 255/8 = 31 个字节(向下取整); 又因为是按无符号的32位int进行处理,每个int占4字节,所以取不大于31的4的最大倍数,即 31/4 * 4 = 28(整数除法), 对应7个int, 最多 28*8 = 224 个置1的bit, 不会超过255
四、参考
https://www.136.la/nginx/show-132078.html