Intel硬件指令加速计算CRC32

最新推荐文章于 2024-08-16 08:29:22 发布

lkkey80

最新推荐文章于 2024-08-16 08:29:22 发布

阅读量1.2w

点赞数 2

分类专栏：性能优化

本文链接：https://blog.csdn.net/lkkey80/article/details/43732819

版权

性能优化专栏收录该内容

3 篇文章 0 订阅

订阅专栏

纯软件实现的CRC32通常借助查表完成（https://pycrc.org/ 是一个可以生成CRC计算C代码的工具）。当CRC32计算过于频繁时，可通过硬件指令进行优化，以减少对CPU的占用。目前Intel处理器可用于计算CRC的指令有CRC32和PCLMULQDQ两条，本文仅讨论CRC32指令的使用。CRC32指令计算的是iSCSI CRC（即CRC-32C），也就是生成多项式为0x11EDC6F41的32位CRC。

使用CRC32指令的方式有2种：一种是直接使用（内联）汇编代码；另一种是借助编译器intrinsics。本文介绍借助编译器intrinsics计算CRC32的过程。

1 使用CRC32指令之前必须检测处理器是否支持SSE4.2

可通过 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1) 来判断。

1.1 使用汇编指令

/**
 * Report whether the processor advertises SSE4.2 (and therefore the CRC32
 * instruction) by executing CPUID leaf 1 and testing ECX bit 20.
 *
 * @return 1 if CPUID.01H:ECX[bit 20] is set, 0 otherwise.
 */
int check_support_sse4_2(void) {
    unsigned int eax = 1;  /* CPUID leaf 1: feature information */
    unsigned int ebx;
    unsigned int ecx = 0;
    unsigned int edx;

    /*
     * Hand the registers back as asm outputs and do the bit test in C.
     * A write-only memory output that the asm stores to only on one branch
     * lets the compiler discard the C-level initializer, so the result
     * would be indeterminate when the feature bit is clear.
     */
    __asm__ __volatile__("cpuid"
                         : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
    (void)ebx;
    (void)edx;

    return (ecx & 0x0100000u) != 0;  /* 0x0100000 == 1u << 20 */
}

1.2 利用gcc提供的cpuid.h

#include <cpuid.h>
#include <stdio.h>
 
/*
 * Print a line when CPUID leaf 1 reports SSE4.2 support.
 * Exits with status 0 when the query succeeded (whether or not SSE4.2 is
 * present) and 1 when CPUID leaf 1 could not be queried.
 */
int
main (void) {
  unsigned int eax, ebx, ecx, edx;

  /* __get_cpuid returns 0 when the requested leaf is not available; the
     output registers are left unwritten in that case, so do not read them. */
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return 1;
  }

  if (ecx & bit_SSE4_2) {
    printf ("SSE4.2 is supported\n");
  }

  return 0;
}

 2 使用compiler intrinsics(x86intrin.h) 计算CRC32c

   unsigned int _mm_crc32_u8( unsigned int crc, unsigned char data ) 
 

   unsigned int _mm_crc32_u16( unsigned int crc, unsigned short data ) 
 

   unsigned int _mm_crc32_u32( unsigned int crc, unsigned int data ) 
 

   unsigned __int64 _mm_crc32_u64( unsigned __int64 crc, unsigned __int64 data ) 
 
 
  
 
 
 
  #ifdef __x86_64__
#define ALIGN_SIZE 8
#else
#define ALIGN_SIZE 4
#endif
#define ALIGN_MASK (ALIGN_SIZE - 1)

#include <stddef.h>     /* size_t */
#include <stdint.h>     /* uint32_t, uint64_t, uintptr_t */
#include <string.h>     /* memcpy */
#include <nmmintrin.h>  /* _mm_crc32_u8/u16/u32/u64 (SSE4.2) */

/**
 * Extend a running CRC-32C (iSCSI polynomial) over another span of bytes
 * using the SSE4.2 CRC32 instruction.
 *
 * The value is pre- and post-inverted with 0xffffffff, so the result of one
 * call can be passed as init_crc of the next call to checksum data that
 * arrives in pieces. Pass 0 as init_crc to start a new checksum.
 *
 * The caller must have verified that the CPU supports SSE4.2 before calling.
 * The target attribute lets this function use the intrinsics even when the
 * rest of the translation unit is not built with -msse4.2.
 *
 * @param init_crc CRC of the bytes processed so far (0 for none).
 * @param data     Bytes to fold in; may be unaligned. Not dereferenced when
 *                 n is 0.
 * @param n        Number of bytes at data.
 * @return         CRC-32C of the previous bytes followed by data[0..n).
 */
__attribute__((target("sse4.2")))
uint32_t extend(uint32_t init_crc, const char *data, size_t n) {
    const unsigned char *bytes = (const unsigned char *)data;
    uint32_t res = init_crc ^ 0xffffffffu;
    size_t i = 0;

    /* Consume single bytes until the read position is word-aligned. */
    while (i < n && ((uintptr_t)(bytes + i) & ALIGN_MASK) != 0) {
        res = _mm_crc32_u8(res, bytes[i]);
        ++i;
    }

    /*
     * Wide chunks are loaded with memcpy rather than by dereferencing a
     * casted pointer: reading a char buffer through uint64_t* / uint32_t* /
     * uint16_t* breaks the strict-aliasing rule (and discards const).
     */
#ifdef __x86_64__
    {
        uint64_t acc = res;
        for (; n - i >= sizeof(uint64_t); i += sizeof(uint64_t)) {
            uint64_t chunk;
            memcpy(&chunk, bytes + i, sizeof chunk);
            acc = _mm_crc32_u64(acc, chunk);
        }
        res = (uint32_t)acc;
    }
#endif
    for (; n - i >= sizeof(uint32_t); i += sizeof(uint32_t)) {
        uint32_t chunk;
        memcpy(&chunk, bytes + i, sizeof chunk);
        res = _mm_crc32_u32(res, chunk);
    }
    for (; n - i >= sizeof(uint16_t); i += sizeof(uint16_t)) {
        uint16_t chunk;
        memcpy(&chunk, bytes + i, sizeof chunk);
        res = _mm_crc32_u16(res, chunk);
    }
    for (; i < n; ++i) {
        res = _mm_crc32_u8(res, bytes[i]);
    }

    return res ^ 0xffffffffu;
}
/* Compute the CRC-32C of data[0..n) in one shot, starting from an initial CRC of 0. */
static inline uint32_t crc32c(const char *data, size_t n) {
    const uint32_t initial_crc = 0;

    return extend(initial_crc, data, n);
}
  
 
 
 
  3 再优化 
 
 
      其实还可以使用CRC32指令并行计算CRC32，具体见引用[2]。 
 
 
  
  
 
4 引用
 [1] Intel® 64 and IA-32 Architectures Software Developer’s Manual
 [2]Choosing a CRC polynomial and associated method for Fast CRC Computation on Intel® Processors
 [3] simd，http://dirlt.com/simd.html
[4]Professional Assembly Language