设备通讯中,有时会遇到大小端的问题,需要“交换字节序”。

文章展示了对原始代码中字节交换操作的两种优化方法,一种基于ntohs函数,另一种利用AVX2和AVX512指令集。通过性能测试,AVX512的实现(vpshufb)在处理大数据量时表现出更高的效率,比ntohs和AVX2的实现更快。测试结果显示,使用AVX512指令的优化版本在处理大量数据时具有显著优势。
摘要由CSDN通过智能技术生成

原代码:

/*
 * Original (pre-optimisation) code: copy the payload into a scratch buffer,
 * swap the two bytes of every 16-bit pair in place, then copy the result
 * into the destination field.
 */
unsigned char tmp;

unsigned char buff[990] = {0};

int k = 0;

/* NOTE(review): data_length is not checked against sizeof(buff) in this
 * fragment; confirm the caller bounds it to 990. */
memcpy(buff, uchar_recv_data + index + 4, data_length);

/* The loop walks the whole 990-byte buffer, not just the data_length bytes
 * that were copied in. 990 is even, so buff[k+1] stays in bounds. */
for(k = 0; k < sizeof(buff); k+=2)

{

    tmp = buff[k];

    buff[k] = buff[k+1];

    buff[k+1] = tmp;

}

/* strlen() stops at the first zero byte, so the number of bytes copied is
 * determined by the data content rather than by data_length. */
memcpy(ret_info.item[i].sVal, (char*)buff, strlen((char *)buff));

优化1:

/*
 * Optimisation 1: convert each 16-bit word with ntohs() while writing it
 * straight into the destination field, so no scratch buffer is needed.
 * Note that ntohs() only swaps bytes on little-endian hosts.
 */
/* Usable payload size: two bytes are held back for the zero terminator. */
const size_t val_capacity = sizeof(ret_info.item[ret_info.total].sVal) - 2;
short *dst_words = (short *)ret_info.item[ret_info.total].sVal;
short *src_words = (short *)pdata;

if (data_length > val_capacity) {
    data_length = val_capacity;
}

for (uint16_t word = 0; word < data_length / 2; word++) {
    dst_words[word] = ntohs(src_words[word]);
}

/* Zero-terminate so that readers of the field cannot run past its end. */
dst_words[data_length / 2] = 0;

优化2(AVX2 / AVX512 汇编实现,AT&T 语法):

.section .text

# 64-byte alignment lets the routines below load this table with the aligned
# forms vmovdqa / vmovdqa64.
.align 64

# Byte-index table 01 00 03 02 ... 3f 3e (in memory order): the two indices of
# every 16-bit pair are exchanged. The first 32 bytes are used as the ymm
# shuffle mask, the full 64 bytes as the zmm shuffle / permute mask.
shufidx: .quad 0x607040502030001,0xe0f0c0d0a0b0809,0x1617141512131011,0x1e1f1c1d1a1b1819,0x2627242522232021,0x2e2f2c2d2a2b2829,0x3637343532333031,0x3e3f3c3d3a3b3839

# Entry points exported to the C/C++ caller.
.global bswapw512p

.global bswapw512s

.global bswapw256s

# void bswapw256s(uint16_t *dst /* %rdi */, uint16_t *src /* %rsi */, size_t srcc /* %rdx */);
#
# Swaps the two bytes of every 16-bit word, 32 bytes per iteration (vpshufb).
# Requires: AVX2
# dst, src: need not be 32-byte aligned, but both buffers must be a whole
#           multiple of 32 bytes in size, because full vectors are always
#           loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 32.
#           The routine processes ceil(srcc / 32) vectors; srcc == 0 is a no-op.
# Clobbers: %rcx, %rsi, %rdi, %ymm0-%ymm2 (all caller-saved in the SysV ABI).
.align 64
bswapw256s:
    test    %rdx, %rdx                  # nothing to do for an empty input
    jz      bswapw256s_done
    lea     (%rsi,%rdx), %rcx           # rcx = one past the last valid source byte
    vmovdqa shufidx(%rip), %ymm2        # RIP-relative so the object links as PIE
bswapw256s_lp:
    vmovdqu (%rsi), %ymm1
    vpshufb %ymm2, %ymm1, %ymm0         # per-lane shuffle: only the low 4 index bits matter
    vmovdqu %ymm0, (%rdi)
    add     $32, %rsi
    add     $32, %rdi
    cmp     %rcx, %rsi
    jb      bswapw256s_lp
    vzeroupper                          # avoid AVX->SSE transition penalties in the caller
bswapw256s_done:
    ret

# void bswapw512p(uint16_t *dst /* %rdi */, uint16_t *src /* %rsi */, size_t srcc /* %rdx */);
#
# Swaps the two bytes of every 16-bit word, 64 bytes per iteration, using the
# full-width byte permute vpermb.
# Requires: AVX512F + AVX512_VBMI (vpermb on zmm registers)
# dst, src: need not be 64-byte aligned, but both buffers must be a whole
#           multiple of 64 bytes in size, because full vectors are always
#           loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 64.
#           The routine processes ceil(srcc / 64) vectors; srcc == 0 is a no-op.
# Clobbers: %rcx, %rsi, %rdi, %zmm0-%zmm2 (all caller-saved in the SysV ABI).
.align 64
bswapw512p:
    test      %rdx, %rdx                # nothing to do for an empty input
    jz        bswapw512p_done
    lea       (%rsi,%rdx), %rcx         # rcx = one past the last valid source byte
    vmovdqa64 shufidx(%rip), %zmm2      # RIP-relative so the object links as PIE
bswapw512p_lp:
    vmovdqu64 (%rsi), %zmm1
    vpermb    %zmm1, %zmm2, %zmm0       # zmm0[i] = zmm1[zmm2[i]] across the whole register
    vmovdqu64 %zmm0, (%rdi)
    add       $64, %rsi
    add       $64, %rdi
    cmp       %rcx, %rsi
    jb        bswapw512p_lp
    vzeroupper                          # avoid AVX->SSE transition penalties in the caller
bswapw512p_done:
    ret

# void bswapw512s(uint16_t *dst /* %rdi */, uint16_t *src /* %rsi */, size_t srcc /* %rdx */);
#
# Swaps the two bytes of every 16-bit word, 64 bytes per iteration, using the
# per-128-bit-lane byte shuffle vpshufb.
# Requires: AVX512F + AVX512BW (vpshufb on zmm registers)
# dst, src: need not be 64-byte aligned, but both buffers must be a whole
#           multiple of 64 bytes in size, because full vectors are always
#           loaded and stored.
# srcc:     number of valid bytes in src; need not be a multiple of 64.
#           The routine processes ceil(srcc / 64) vectors; srcc == 0 is a no-op.
# Clobbers: %rcx, %rsi, %rdi, %zmm0-%zmm2 (all caller-saved in the SysV ABI).
.align 64
bswapw512s:
    test      %rdx, %rdx                # nothing to do for an empty input
    jz        bswapw512s_done
    lea       (%rsi,%rdx), %rcx         # rcx = one past the last valid source byte
    vmovdqa64 shufidx(%rip), %zmm2      # RIP-relative so the object links as PIE
bswapw512s_lp:
    vmovdqu64 (%rsi), %zmm1
    vpshufb   %zmm2, %zmm1, %zmm0       # per-lane shuffle: only the low 4 index bits matter
    vmovdqu64 %zmm0, (%rdi)
    add       $64, %rsi
    add       $64, %rdi
    cmp       %rcx, %rsi
    jb        bswapw512s_lp
    vzeroupper                          # avoid AVX->SSE transition penalties in the caller
bswapw512s_done:
    ret

速度对比:

#include <stdint.h>

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <arpa/inet.h>

#include <sys/time.h>

// Byte-swap routines exported (.global) by the assembly listing; declared with
// C linkage so this C++ translation unit links against the unmangled labels.
// dst/src are the destination and source buffers, srcc the number of valid
// source bytes.
extern "C" void bswapw512p(uint16_t * dst, uint16_t * src, size_t srcc);

extern "C" void bswapw512s(uint16_t * dst, uint16_t * src, size_t srcc);

extern "C" void bswapw256s(uint16_t * dst, uint16_t * src, size_t srcc);

// Number of repetitions of each timed loop in main().
#define COUNT 1000000000

// Benchmark driver: runs five byte-swap implementations COUNT times each over
// the same source buffer, prints the elapsed time of each in milliseconds,
// and finally prints memcmp() results comparing every optimised output with
// the output of the original implementation (0 means identical).
int main()

{

// One source buffer and five destination buffers, one per implementation.
// 512 bytes is a whole multiple of both 32 and 64, which matches the buffer
// size requirement stated in the assembly routines' comments.
char pdata[512];

char newdata[512];

char newdata1[512];

char newdata2[512];

char newdata3[512];

char newdata4[512];

// 16-bit views of the buffers above.
uint16_t* psrc = (uint16_t*)pdata;

uint16_t* pdst = (uint16_t*)newdata;

uint16_t* pdst1 = (uint16_t*)newdata1;

uint16_t* pdst2 = (uint16_t*)newdata2;

uint16_t* pdst3 = (uint16_t*)newdata3;

uint16_t* pdst4 = (uint16_t*)newdata4;

// Number of payload bytes to convert in every run.
uint16_t data_length = 500;

struct timeval start, end, result;

// Fill the source with pseudo-random words and every destination with 0xffff.
for (uint16_t k = 0; k < sizeof(pdata) / 2; k++)

{

psrc[k] = (short)rand();

pdst[k] = 0xffff;

pdst1[k] = 0xffff;

pdst2[k] = 0xffff;

pdst3[k] = 0xffff;

pdst4[k] = 0xffff;

}

// Same clamp as in the production snippet (room for a 2-byte terminator);
// with data_length == 500 and a 512-byte buffer it does not change anything.
if (data_length > sizeof(pdata) - 2)

data_length = sizeof(pdata) - 2;

unsigned char tmp;

unsigned char buff[990] = { 0 };

// --- Variant 1: original code (copy to scratch buffer, swap in place, copy out).
gettimeofday(&start, NULL);

for (int i = 0; i < COUNT; i++)

{

memcpy(buff, pdata, data_length);

// As in the original code, all 990 bytes of buff are swapped, not only the
// data_length bytes that were copied in.
for (size_t k = 0; k < sizeof(buff); k += 2)

{

tmp = buff[k];

buff[k] = buff[k + 1];

buff[k + 1] = tmp;

}

memcpy(newdata, (char*)buff, data_length);

}

gettimeofday(&end, NULL);

timersub(&end, &start, &result);

// Seconds * 1000 + microseconds / 1000 = elapsed milliseconds.
printf(" origin: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);

// --- Variant 2: per-word ntohs() conversion.
gettimeofday(&start, NULL);

for (int i = 0; i < COUNT; i++)

{

for (uint16_t k = 0; k < data_length / 2; k++)

pdst1[k] = ntohs(psrc[k]);

}

// Zero terminator word; it lies just past the data_length bytes compared at
// the end, so it does not affect the memcmp() results.
pdst1[data_length / 2] = 0;

gettimeofday(&end, NULL);

timersub(&end, &start, &result);

printf(" ntohs: %10.4lf\n",result.tv_sec * 1000.0 + result.tv_usec / 1000.0);

// --- Variant 3: AVX-512 routine based on vpermb.
gettimeofday(&start, NULL);

for (int i = 0; i < COUNT; i++)

{

bswapw512p(pdst2, psrc, data_length);

}

pdst2[data_length / 2] = 0;

gettimeofday(&end, NULL);

timersub(&end, &start, &result);

printf("asm512p: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);

// --- Variant 4: AVX-512 routine based on vpshufb.
gettimeofday(&start, NULL);

for (int i = 0; i < COUNT; i++)

{

bswapw512s(pdst3, psrc, data_length);

}

pdst3[data_length / 2] = 0;

gettimeofday(&end, NULL);

timersub(&end, &start, &result);

printf("asm512s: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);

// --- Variant 5: AVX2 routine based on vpshufb.
gettimeofday(&start, NULL);

for (int i = 0; i < COUNT; i++)

{

bswapw256s(pdst4, psrc, data_length);

}

pdst4[data_length / 2] = 0;

gettimeofday(&end, NULL);

timersub(&end, &start, &result);

printf("asm256s: %10.4lf\n", result.tv_sec * 1000.0 + result.tv_usec / 1000.0);

// Compare the first data_length bytes of each optimised result against the
// output of the original implementation.
printf("结果对比: %d,%d,%d,%d\n", memcmp(pdst, pdst1, data_length), memcmp(pdst, newdata2, data_length), memcmp(pdst, newdata3, data_length), memcmp(pdst, newdata4, data_length));

return 0;

}

Release(循环 1000000000 次,耗时单位:毫秒):

origin: 189899.0370 tmp buff

ntohs: 57464.1800 ntohs

asm512p: 8309.1700 vpermb

asm512s: 3893.6680 vpshufb (还是vpshufb比vpermb效率更高一些呀)

asm256s: 5206.3640 vpshufb

结果对比: 0,0,0,0

从文档中看,vpshufb确实是比vpermb更高效:

vpermb:

| Architecture | Latency | Throughput (CPI) |
| --- | --- | --- |
| Icelake Intel Core | - | 1 |
| Icelake Xeon | 3 | 1 |
| Sapphire Rapids | 3 | 1 |

vpshufb:

| Architecture | Latency | Throughput (CPI) |
| --- | --- | --- |
| Icelake Intel Core | - | 1 |
| Icelake Xeon | 1 | 1 |
| Sapphire Rapids | 1 | 1 |
| Skylake | 1 | 1 |
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值