原代码:
// Baseline implementation: copy the payload into a scratch buffer, swap the
// two bytes of every 16-bit word, then copy the result into the item value.
unsigned char tmp;
// Zero-initialised scratch buffer with a fixed capacity of 990 bytes.
unsigned char buff[990] = {0};
int k = 0;
// NOTE(review): no check is visible here that data_length <= sizeof(buff);
// a larger value would overflow buff - confirm against the caller.
memcpy(buff, uchar_recv_data + index + 4, data_length);
// Swaps every adjacent byte pair across the whole 990-byte buffer, not just
// the first data_length bytes (the remainder is zero padding).
for(k = 0; k < sizeof(buff); k+=2)
{
tmp = buff[k];
buff[k] = buff[k+1];
buff[k+1] = tmp;
}
// strlen stops at the first zero byte, so a payload containing 0x00 is cut
// short, and the terminating zero itself is not copied to sVal.
// NOTE(review): if all 990 bytes are non-zero, strlen reads past the end of buff.
memcpy(ret_info.item[i].sVal, (char*)buff, strlen((char *)buff));
优化1:
short* psrc = (short*)pdata;
short* pdst = (short*)ret_info.item[ret_info.total].sVal;
//要留下\0的位置
if (data_length > sizeof(ret_info.item[ret_info.total].sVal) - 2)
data_length = sizeof(ret_info.item[ret_info.total].sVal) - 2;
for (uint16_t k = 0; k < data_length / 2; k++)
pdst[k] = ntohs(psrc[k]);
//\0结尾,防止访问时越界
pdst[data_length / 2] = 0;
优化2:
# Byte-swap routines for arrays of 16-bit words (GNU as, AT&T syntax).
.section .text
# 64-byte alignment lets the routines fetch the table with aligned 32-byte
# and 64-byte loads.
.align 64
# 64-byte shuffle-index table: byte i holds the index i^1
# (1,0,3,2, ... ,0x3f,0x3e), i.e. each entry selects the other byte of the
# same 16-bit word. The quads are stored little-endian.
shufidx: .quad 0x607040502030001,0xe0f0c0d0a0b0809,0x1617141512131011,0x1e1f1c1d1a1b1819,0x2627242522232021,0x2e2f2c2d2a2b2829,0x3637343532333031,0x3e3f3c3d3a3b3839
# Exported entry points.
.global bswapw512p
.global bswapw512s
.global bswapw256s
# bswapw256s: swap the two bytes of every 16-bit word, 32 bytes per
# iteration, using vpshufb on ymm registers.
# Requires: AVX2.
# dst, src: need not be 32-byte aligned, but both buffers must be a whole
#           multiple of 32 bytes in size, because complete 32-byte blocks
#           are always loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 32
#           (the last partial block is processed as a full block).
#           When srcc is 0 nothing is read or written.
# void bswapw256s(uint16_t *dst(%rdi), uint16_t *src(%rsi), size_t srcc(%rdx));
.align 64
bswapw256s:
    # Empty input: return before touching memory or vector registers.
    test %rdx, %rdx
    jz bswapw256s_done
    # %rcx = one past the last valid source byte.
    lea (%rsi,%rdx), %rcx
    # RIP-relative load keeps the code position independent (PIE/shared objects).
    vmovdqa shufidx(%rip), %ymm2
bswapw256s_lp:
    vmovdqu (%rsi), %ymm1
    # vpshufb shuffles within each 128-bit lane and uses only the low 4 bits
    # of each index, so the first 32 table bytes work for both lanes.
    vpshufb %ymm2, %ymm1, %ymm0
    vmovdqu %ymm0, (%rdi)
    add $32, %rsi
    add $32, %rdi
    cmp %rcx, %rsi
    jb bswapw256s_lp
    # Clear the upper vector state to avoid AVX/SSE transition penalties in the caller.
    vzeroupper
bswapw256s_done:
    ret
# bswapw512p: swap the two bytes of every 16-bit word, 64 bytes per
# iteration, using the full-width byte permute vpermb.
# Requires: AVX512F (zmm loads/stores) + AVX512_VBMI (vpermb).
# dst, src: need not be 64-byte aligned, but both buffers must be a whole
#           multiple of 64 bytes in size, because complete 64-byte blocks
#           are always loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 64
#           (the last partial block is processed as a full block).
#           When srcc is 0 nothing is read or written.
# void bswapw512p(uint16_t *dst(%rdi), uint16_t *src(%rsi), size_t srcc(%rdx));
.align 64
bswapw512p:
    # Empty input: return before touching memory or vector registers.
    test %rdx, %rdx
    jz bswapw512p_done
    # %rcx = one past the last valid source byte.
    lea (%rsi,%rdx), %rcx
    # RIP-relative load keeps the code position independent (PIE/shared objects).
    vmovdqa64 shufidx(%rip), %zmm2
bswapw512p_lp:
    vmovdqu64 (%rsi), %zmm1
    # AT&T operand order: data (%zmm1), indices (%zmm2), destination (%zmm0).
    vpermb %zmm1, %zmm2, %zmm0
    vmovdqu64 %zmm0, (%rdi)
    add $64, %rsi
    add $64, %rdi
    cmp %rcx, %rsi
    jb bswapw512p_lp
    # Clear the upper vector state to avoid AVX/SSE transition penalties in the caller.
    vzeroupper
bswapw512p_done:
    ret
# bswapw512s: swap the two bytes of every 16-bit word, 64 bytes per
# iteration, using the in-lane byte shuffle vpshufb on zmm registers.
# Requires: AVX512F (zmm loads/stores) + AVX512BW (512-bit vpshufb).
# dst, src: need not be 64-byte aligned, but both buffers must be a whole
#           multiple of 64 bytes in size, because complete 64-byte blocks
#           are always loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 64
#           (the last partial block is processed as a full block).
#           When srcc is 0 nothing is read or written.
# void bswapw512s(uint16_t *dst(%rdi), uint16_t *src(%rsi), size_t srcc(%rdx));
.align 64
bswapw512s:
    # Empty input: return before touching memory or vector registers.
    test %rdx, %rdx
    jz bswapw512s_done
    # %rcx = one past the last valid source byte.
    lea (%rsi,%rdx), %rcx
    # RIP-relative load keeps the code position independent (PIE/shared objects).
    vmovdqa64 shufidx(%rip), %zmm2
bswapw512s_lp:
    vmovdqu64 (%rsi), %zmm1
    # vpshufb shuffles within each 128-bit lane and uses only the low 4 bits
    # of each index, so the 64-entry table works for all four lanes.
    vpshufb %zmm2, %zmm1, %zmm0
    vmovdqu64 %zmm0, (%rdi)
    add $64, %rsi
    add $64, %rdi
    cmp %rcx, %rsi
    jb bswapw512s_lp
    # Clear the upper vector state to avoid AVX/SSE transition penalties in the caller.
    vzeroupper
bswapw512s_done:
    ret
速度对比:
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/time.h>
// Byte-swap routines implemented in the separate assembly file; extern "C"
// keeps their symbol names unmangled so the linker can match them.
// Each takes the destination, the source and srcc, the source length in bytes.
extern "C" void bswapw512p(uint16_t * dst, uint16_t * src, size_t srcc);
extern "C" void bswapw512s(uint16_t * dst, uint16_t * src, size_t srcc);
extern "C" void bswapw256s(uint16_t * dst, uint16_t * src, size_t srcc);
// Number of repetitions of each timed loop in main.
#define COUNT 1000000000
// Benchmark driver: times five ways of swapping the bytes of 16-bit words
// (scratch-buffer baseline, ntohs loop, and three assembly routines), COUNT
// repetitions each, then prints whether all outputs match the baseline.
int main()
{
// One 512-byte source buffer and one 512-byte output buffer per variant.
char pdata[512];
char newdata[512];
char newdata1[512];
char newdata2[512];
char newdata3[512];
char newdata4[512];
// 16-bit views of the same buffers.
uint16_t* psrc = (uint16_t*)pdata;
uint16_t* pdst = (uint16_t*)newdata;
uint16_t* pdst1 = (uint16_t*)newdata1;
uint16_t* pdst2 = (uint16_t*)newdata2;
uint16_t* pdst3 = (uint16_t*)newdata3;
uint16_t* pdst4 = (uint16_t*)newdata4;
// Payload length in bytes used by every variant.
uint16_t data_length = 500;
struct timeval start, end, result;
// Fill the source with pseudo-random words and every output with 0xffff.
for (uint16_t k = 0; k < sizeof(pdata) / 2; k++)
{
psrc[k] = (short)rand();
pdst[k] = 0xffff;
pdst1[k] = 0xffff;
pdst2[k] = 0xffff;
pdst3[k] = 0xffff;
pdst4[k] = 0xffff;
}
// Same clamp as the production snippet; with data_length == 500 and a
// 512-byte buffer it does not change the value.
if (data_length > sizeof(pdata) - 2)
data_length = sizeof(pdata) - 2;
unsigned char tmp;
unsigned char buff[990] = { 0 };
// Variant 1 (baseline): copy into buff, swap every byte pair of the whole
// 990-byte buffer, copy data_length bytes out again.
gettimeofday(&start, NULL);
for (int i = 0; i < COUNT; i++)
{
memcpy(buff, pdata, data_length);
for (size_t k = 0; k < sizeof(buff); k += 2)
{
tmp = buff[k];
buff[k] = buff[k + 1];
buff[k + 1] = tmp;
}
memcpy(newdata, (char*)buff, data_length);
}
gettimeofday(&end, NULL);
timersub(&end, &start, &result);
// Elapsed time is printed in milliseconds (seconds * 1000 + microseconds / 1000).
printf(" origin: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);
// Variant 2: convert data_length / 2 words with ntohs, writing directly to the output.
// NOTE(review): ntohs only swaps bytes on little-endian hosts; the baseline always swaps.
gettimeofday(&start, NULL);
for (int i = 0; i < COUNT; i++)
{
for (uint16_t k = 0; k < data_length / 2; k++)
pdst1[k] = ntohs(psrc[k]);
}
// Terminator write as in the production snippet; it lies beyond the
// data_length bytes compared at the end.
pdst1[data_length / 2] = 0;
gettimeofday(&end, NULL);
timersub(&end, &start, &result);
printf(" ntohs: %10.4lf\n",result.tv_sec * 1000.0 + result.tv_usec / 1000.0);
// Variant 3: assembly routine bswapw512p.
gettimeofday(&start, NULL);
for (int i = 0; i < COUNT; i++)
{
bswapw512p(pdst2, psrc, data_length);
}
pdst2[data_length / 2] = 0;
gettimeofday(&end, NULL);
timersub(&end, &start, &result);
printf("asm512p: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);
// Variant 4: assembly routine bswapw512s.
gettimeofday(&start, NULL);
for (int i = 0; i < COUNT; i++)
{
bswapw512s(pdst3, psrc, data_length);
}
pdst3[data_length / 2] = 0;
gettimeofday(&end, NULL);
timersub(&end, &start, &result);
printf("asm512s: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);
// Variant 5: assembly routine bswapw256s.
gettimeofday(&start, NULL);
for (int i = 0; i < COUNT; i++)
{
bswapw256s(pdst4, psrc, data_length);
}
pdst4[data_length / 2] = 0;
gettimeofday(&end, NULL);
timersub(&end, &start, &result);
printf("asm256s: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);
// Compare the first data_length bytes of each variant's output with the
// baseline; every memcmp result is 0 when the outputs are identical.
printf("结果对比: %d,%d,%d,%d\n", memcmp(pdst, pdst1, data_length), memcmp(pdst, newdata2, data_length), memcmp(pdst, newdata3, data_length), memcmp(pdst, newdata4, data_length));
return 0;
}
Release(循环 1000000000 次, 耗时单位: 毫秒):
origin: 189899.0370 tmp buff
ntohs: 57464.1800 ntohs
asm512p: 8309.1700 vpermb
asm512s: 3893.6680 vpshufb (还是vpshufb比vpermb效率更高一些呀)
asm256s: 5206.3640 vpshufb
结果对比: 0,0,0,0
从文档中看,vpshufb确实是比vpermb更高效:
vpermb:
Architecture | Latency | Throughput (CPI) |
---|---|---|
Icelake Intel Core | - | 1 |
Icelake Xeon | 3 | 1 |
Sapphire Rapids | 3 | 1 |
vpshufb:
Architecture | Latency | Throughput (CPI) |
---|---|---|
Icelake Intel Core | - | 1 |
Icelake Xeon | 1 | 1 |
Sapphire Rapids | 1 | 1 |
Skylake | 1 | 1 |