__builtin_popcountll 内建函数与 popcnt 指令

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Shorthand names for the unsigned integer types used throughout this file. */
typedef unsigned long long u64; /* the 64-bit word whose set bits are counted */
typedef unsigned int       u32; /* result type; assumed to be exactly 32 bits wide (popcount64_4 relies on that) */

/*
 * Population count (number of 1 bits) of x, using the compiler builtin.
 * This is the reference implementation the other variants are checked
 * against.
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition provides no external definition, so any call the
 * compiler chooses not to inline (e.g. at -O0) would fail to link.
 * `static inline` is valid and equivalent in both C and C++.
 */
static inline u32 popcount64_1(u64 x)
{
    /* The builtin returns int; the value is always in 0..64. */
    return (u32)__builtin_popcountll(x);
}

/*
 * Population count of x by pure divide-and-conquer: adjacent fields are
 * added pairwise, doubling the field width at every step
 * (1 -> 2 -> 4 -> 8 -> 16 -> 32 -> 64 bits).
 *
 * From the 8-bit step onwards the masks keep only as many low bits of each
 * field as the partial count can occupy: a byte count is at most 8 (4 bits),
 * a 16-bit count at most 16 (5 bits), a 32-bit count at most 32 (6 bits).
 *
 * `static inline` instead of plain `inline` so the function also links when
 * compiled as C99/C11 without inlining; behaviour is unchanged in C++.
 */
static inline u32 popcount64_2(u64 x)
{
    x = (x & 0x5555555555555555ULL) + ((x >>  1) & 0x5555555555555555ULL);
    x = (x & 0x3333333333333333ULL) + ((x >>  2) & 0x3333333333333333ULL);
    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >>  4) & 0x0F0F0F0F0F0F0F0FULL);
    x = (x & 0x000F000F000F000FULL) + ((x >>  8) & 0x000F000F000F000FULL);
    x = (x & 0x0000001F0000001FULL) + ((x >> 16) & 0x0000001F0000001FULL);

    /* Final step: add the two 32-bit halves; the total is in 0..64. */
    return (u32)((x & 0x3FULL) + ((x >> 32) & 0x3FULL));
}

/*
 * Population count of x: three pairwise-add steps reduce x to eight
 * per-byte counts (each 0..8), then a single 64-bit multiply sums them.
 *
 * Multiplying by 0x0101010101010101 accumulates the sum of all eight bytes
 * in the most significant byte; the total is at most 64, so no partial sum
 * overflows a byte, and shifting right by 56 extracts the result.
 *
 * `static inline` instead of plain `inline` so the function also links when
 * compiled as C99/C11 without inlining; behaviour is unchanged in C++.
 */
static inline u32 popcount64_3(u64 x)
{
    x = (x & 0x5555555555555555ULL) + ((x >>  1) & 0x5555555555555555ULL);
    x = (x & 0x3333333333333333ULL) + ((x >>  2) & 0x3333333333333333ULL);
    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >>  4) & 0x0F0F0F0F0F0F0F0FULL);

    return (u32)((x * 0x0101010101010101ULL) >> 56);
}

/*
 * Population count of x: same per-byte reduction as popcount64_3, but the
 * byte counts are summed with two 32-bit multiplies (one per half of x)
 * instead of one 64-bit multiply.
 *
 * Each multiply by 0x01010101 wraps modulo 2^32 and leaves the sum of the
 * four bytes of that half (at most 32) in the top byte, which the shift by
 * 24 extracts. This relies on u32 being exactly 32 bits wide.
 *
 * `static inline` instead of plain `inline` so the function also links when
 * compiled as C99/C11 without inlining; behaviour is unchanged in C++.
 */
static inline u32 popcount64_4(u64 x)
{
    x = (x & 0x5555555555555555ULL) + ((x >>  1) & 0x5555555555555555ULL);
    x = (x & 0x3333333333333333ULL) + ((x >>  2) & 0x3333333333333333ULL);
    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >>  4) & 0x0F0F0F0F0F0F0F0FULL);

    return (((u32)(x >> 32) * 0x01010101U) >> 24) +
           (((u32)x         * 0x01010101U) >> 24);
}

/* Test words: filled with random values in main(), then read by the
 * consistency checks and by every timing loop. */
u64 data[1024];

u64 lrand64(void)
{
    u32 lo = lrand48();
    u32 hi = lrand48();

    return ((u64)hi << 32) | lo; 
}

/* Sink for the results computed in the timing loops of main(). Being
 * volatile, every store to it must actually be performed, so the compiler
 * cannot discard the popcount calls being measured. */
volatile u32 pt; 

main()
{
    int i, j;
    u32 p, pp;
    time_t t1, t2;

    srand48(time(0));

    for (i = 0; i < 1024; i++)
        data[i] = lrand64();

    for (i = 0; i < 1024; i++)
    {
        p = popcount64_1(data[i]);

        if ((pp = popcount64_2(data[i])) != p)
        {
            printf("FAIL 2: %llx %d %d\n", data[i], p, pp);
            exit(1);
        }

        if ((pp = popcount64_3(data[i])) != p)
        {
            printf("FAIL 3: %llx %d %d\n", data[i], p, pp);
            exit(1);
        }

        if ((pp = popcount64_4(data[i])) != p)
        {
            printf("FAIL 4: %llx %d %d\n", data[i], p, pp);
            exit(1);
        }
    }

    t1 = clock();
    for (j = 0; j < 1000000; j++)
        for (i = 0; i < 1024; i++)
            pt = popcount64_1(data[i]);
    t2 = clock();

    printf("popcount64_1 = %d clocks\n", t2 - t1);

    t1 = clock();
    for (j = 0; j < 1000000; j++)
        for (i = 0; i < 1024; i++)
            pt = popcount64_2(data[i]);
    t2 = clock();

    printf("popcount64_2 = %d clocks\n", t2 - t1);

    t1 = clock();
    for (j = 0; j < 1000000; j++)
        for (i = 0; i < 1024; i++)
            pt = popcount64_3(data[i]);
    t2 = clock();
    printf("popcount64_3 = %d clocks\n", t2 - t1);

    t1 = clock();
    for (j = 0; j < 1000000; j++)
        for (i = 0; i < 1024; i++)
            pt = popcount64_4(data[i]);
    t2 = clock();

    printf("popcount64_4 = %d clocks\n", t2 - t1);

    return 0;
}


g++ test.cpp -msse4.2

popcount64_1 = 3620000 clocks
popcount64_2 = 13830000 clocks
popcount64_3 = 9030000 clocks
popcount64_4 = 10920000 clocks

参考:

https://iafita.wordpress.com/2009/01/30/popcount-problem-%E6%B1%82%E4%BA%8C%E8%BF%9B%E5%88%B6%E6%95%B0%E4%B8%AD1%E7%9A%84%E4%B8%AA%E6%95%B0-%E4%B8%8B/

http://leexh.com/blog/2014/10/25/popcount-problem/


0000000000400ad1 <_Z12popcount64_1y>:
  400ad1:       55                      push   %rbp
  400ad2:       48 89 e5                mov    %rsp,%rbp
  400ad5:       48 89 7d f8             mov    %rdi,-0x8(%rbp)
  400ad9:       f3 48 0f b8 45 f8       popcnt -0x8(%rbp),%rax
  400adf:       c9                      leaveq 
  400ae0:       c3                      retq   

从上面的反汇编可以看出:使用 -msse4.2 编译时,__builtin_popcountll 会被直接编译成一条 popcnt 硬件指令,由 CPU 在硬件中完成位计数,因此比前面几种纯软件的位运算实现快得多。

阅读更多
换一批

没有更多推荐了,返回首页