// Lookup table: WordNumBit1[v] is the number of 1 bits in the binary
// representation of v, for every v in the range [0, 2^16).
// The entries are filled in by createBitTable().
unsigned short WordNumBit1[1 << 16];
// Performance option: selects the word type (stype) used when scanning
// bit arrays. SMOVE is the shift that converts a bit count into a count of
// stype words (5 for the unsigned int option, assuming a 32-bit unsigned
// int; 6 for uint64_t).
#if 0
typedef unsigned int stype;
# define SMOVE 5
#else
typedef uint64_t stype; // performs better on large arrays, most noticeably on 64-bit (author's note)
# define SMOVE 6
#endif
// Return the number of 1 bits in the binary representation of n.
// The implementation is selected at compile time:
//   POPCNT non-zero  : hardware popcnt intrinsics;
//   else TREE2 == 0  : 16-bit lookups in WordNumBit1 (this is also the
//                      path taken when neither macro is defined; the
//                      table must have been filled by createBitTable());
//   otherwise        : branch-free parallel bit summation.
static inline int countBit1(stype n)
{
#if POPCNT
	// popcnt instruction: Intel i7 / SSE4.2, AMD Phenom / SSE4a
# if _M_AMD64 || __x86_64__
	return _mm_popcnt_u64(n);
# elif (SMOVE == 5)
	return _mm_popcnt_u32(n);
# else
	// Not an x86-64 build and stype is not the 32-bit option:
	// count each 32-bit half separately.
	const int low_ones = _mm_popcnt_u32(n);
	const int high_ones = _mm_popcnt_u32(n >> 32);
	return low_ones + high_ones;
# endif
#elif TREE2 == 0
# if SMOVE == 6
	// Split the 64-bit word into four 16-bit pieces and look each one up.
	const uint upper = n >> 32;
	const uint lower = (uint)n;
	int ones = WordNumBit1[(ushort)lower];
	ones += WordNumBit1[lower >> 16];
	ones += WordNumBit1[(ushort)upper];
	ones += WordNumBit1[upper >> 16];
	return ones;
# else
	// Two 16-bit pieces: low half and high half.
	const int low_ones = WordNumBit1[n & 0xffff];
	const int high_ones = WordNumBit1[n >> 16];
	return low_ones + high_ones;
# endif
#else
# if SMOVE == 6
	n = n - ((n >> 1) & 0x5555555555555555ull);                           // sums of bit pairs
	n = ((n >> 2) & 0x3333333333333333ull) + (n & 0x3333333333333333ull); // sums per 4 bits
	n = ((n >> 4) + n) & 0x0F0F0F0F0F0F0F0Full;                           // sums per byte
	// Fold all byte sums into the lowest byte (at most 64, so no carry).
	n = n + (n >> 8);
	n = n + (n >> 16);
	n = n + (n >> 32);
	return (int)(n & 0x00000000FF);
# else
	n = n - ((n >> 1) & 0x55555555);                  // sums of bit pairs
	n = ((n >> 2) & 0x33333333) + (n & 0x33333333);   // sums per 4 bits
	n = ((n >> 4) + n) & 0x0F0F0F0F;                  // sums per byte
	// Fold all byte sums into the lowest byte (at most 32, fits in 6 bits).
	n = n + (n >> 8);
	n = n + (n >> 16);
	return (int)(n & 0x0000003F);
# endif
#endif
}
// Fill WordNumBit1 so that entry v holds the number of 1 bits in v,
// for every index of the table.
void createBitTable()
{
	const int entries = sizeof(WordNumBit1) / sizeof(WordNumBit1[0]);
	int value = 0;

	WordNumBit1[0] = 0;
	while (++value < entries) {
		// value has the same 1 bits as value / 2, plus its own lowest bit.
		const int lowest_bit = value & 1;
		WordNumBit1[value] = WordNumBit1[value >> 1] + lowest_bit;
	}
}
// Count the 0 bits in a bit array of bitleng bits.
// The scan always covers (bitleng >> SMOVE) + 1 whole stype words, and every
// 0 bit in those words is counted. The caller must therefore make sure all
// bits from bitleng up to the end of the last scanned word are set to 1
// (original note: the 8 bytes following bitleng are all 1 bits).
static int countBit0Array(stype bitarray[], const int bitleng )
{
	const int words = 1 + (bitleng >> SMOVE);
	int ones = 0;
	int i;

	for (i = 0; i < words; i++)
		ones += countBit1(bitarray[i]);

	// Total scanned bits minus the 1 bits leaves the 0 bits.
	return (words << SMOVE) - ones;
}
// Count the 0 bits that remain after OR-ing two bit arrays of bitleng bits.
// The scan always covers (bitleng >> SMOVE) + 1 whole stype words of each
// array. In at least one of the two arrays, all bits from bitleng up to the
// end of the last scanned word must be set to 1 (original note: the 8 bytes
// following the bit length are all 1 bits in at least one array).
static int countBit0ArrayOr(stype bitarray1[], stype bitarray2[], const int bitleng)
{
	const int words = 1 + (bitleng >> SMOVE);
	int ones = 0;
	int i;

	for (i = 0; i < words; i++) {
		const stype merged = bitarray1[i] | bitarray2[i];
		ones += countBit1(merged);
	}

	// Total scanned bits minus the 1 bits leaves the 0 bits.
	return (words << SMOVE) - ones;
}
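// Conclusion: performance on a 64-bit OS is much better than on a 32-bit one.
// On machines that support the POPCNT instruction, the hardware count is
// about 50% faster than the table lookup.
// The performance measurements are listed below.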