项目中遇到个需求, 要把TCP接收到的数据写到日志中, 直接二进制写的话, 不好查看, 就需要把二进制数据以16进制字符串的方式写到日志中.
之前的同学是用sprintf来实现的:
char buf[LEN]; //二进制数据
char priBuf1[LEN * 2 + 1]; //保存HEX字符串
priBuf1[0] = 0;
for (int n = 0; n < sizeof(buf); n++)
{
sprintf(priBuf1, "%s%02X", priBuf1, (unsigned char)(buf)[n]);
}
//循环sprintf: 且不说这种写法本身就是错误的(源和目标缓冲区重叠, 属于未定义行为, 见下面的引用), 单论效率, 它也是所有方法中最慢的, 超级慢的那种(好像GCC12之后对它有所优化)
//If a call to sprintf or snprintf causes copying to take place between objects that overlap, the behavior is undefined (e.g. sprintf(buf, "%s text", buf);)
这里有说明:
//https://en.cppreference.com/w/cpp/io/c/fprintf
//https://gcc.gnu.org/onlinedocs/
//https://www.gnu.org/software/gcc/mirrors.html
改正1:
//sprintf + strcat, 这种方法虽然不是UB, 但是算法复杂度跟"循环sprintf"是一样的
priBuf2[0] = 0;
for (int n = 0; n < sizeof(buf); n++)
{
sprintf(tmp, "%02X", (unsigned char)(buf)[n]);
strcat(priBuf2, tmp);
}
改正2:
//这样速度快了很多
for (int n = 0; n < sizeof(buf); n++)
{
sprintf(priBuf2 + n * 2, "%02X", (unsigned char)(buf)[n]);
}
或者:
char *p = priBuf2;
for (int n = 0; n < sizeof(buf); n++)
{
p+=sprintf(p, "%02X", (unsigned char)(buf)[n]);
}
//优化1: xlat换码法
RDTSC(dw_start);
formatx_xlat(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
RDTSC(tm_xlat);
printf("fx_xlat:%llu\n", tm_xlat - dw_start);
//优化2: C语言查表法
RDTSC(dw_start);
formatc(priBuf2, (const unsigned char*)buf, sizeof(buf));
RDTSC(tm_c);
printf("formatc:%llu\n", tm_c - dw_start);
//优化3: asm查表法
RDTSC(dw_start);
formatx_asm(priBuf6, buf, sizeof(priBuf6), sizeof(buf));
RDTSC(tm_asm);
printf("fmt_asm:%llu\n", tm_asm - dw_start);
//优化4: SSE换码法
RDTSC(dw_start);
formatx_sse(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
RDTSC(tm_sse);
printf("fmt_sse:%llu\n", tm_sse - dw_start);
//优化5: avx512换码法
RDTSC(dw_start);
formatx_avx(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
RDTSC(tm_avx);
printf("fmt_avx:%llu\n", tm_avx - dw_start);
测试结果:
LEN 40960 //GCC4.8.5 Xeon(R) Platinum 8163 CPU 2.50GHz
cncprtf:12915588974
prtfcat:114271680
sprintf:9926708
sprintp:9402810
fx_xlat:806896
formatc:496936
fmt_asm:291836
fmt_sse:26240
fmt_avx:14678
贴代码:
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
// Reads the CPU time-stamp counter into `val` (any 64-bit unsigned lvalue).
// - `volatile` stops the compiler from merging, hoisting or dropping the
//   reads, which would otherwise make the measured interval meaningless.
// - RDTSC returns its result split across EDX:EAX; on x86-64 the "A"
//   constraint does not describe that pair, so both halves are bound
//   explicitly and combined in C.
// - Wrapped in do { } while (0) so that `RDTSC(x);` is exactly one statement.
#define RDTSC(val) \
    do { \
        unsigned int rdtsc_lo_, rdtsc_hi_; \
        __asm__ __volatile__("rdtsc" : "=a"(rdtsc_lo_), "=d"(rdtsc_hi_)); \
        (val) = ((unsigned long long)rdtsc_hi_ << 32) | rdtsc_lo_; \
    } while (0)
extern "C" void formatx_avx(char* dst, const char* src, size_t dst_size, size_t src_len);
extern "C" void formatx_sse(char* dst, const char* src, size_t dst_size, size_t src_len);
extern "C" void formatx_xlat(char* dst, const char* src, size_t dst_size, size_t src_len);
extern "C" void formatx_asm(char* dst, const char* src, size_t dst_size, size_t src_len);
// Author's note: the compiler optimises this very well; the hand-written
// formatx_asm uses the same algorithm and was only slightly faster.

// Nibble -> upper-case hex digit. Deliberately a plain (non-const) global so
// the symbol keeps external linkage: the assembly routines refer to it by name.
char hex_map[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };

// Writes the upper-case hex representation of src[0 .. src_len) into dst,
// two characters per input byte (high nibble first), followed by a NUL.
// dst must have room for 2 * src_len + 1 bytes; a non-positive src_len
// produces an empty string.
void formatc(char* dst, const unsigned char* src, int src_len)
{
    char* out = dst;
    int idx = 0;
    while (idx < src_len)
    {
        const unsigned char octet = src[idx];
        out[0] = hex_map[octet >> 4];
        out[1] = hex_map[octet & 0xF];
        out += 2;
        ++idx;
    }
    *out = 0;
}
#define LEN 40
// Benchmark driver: fills `buf` with random bytes, hex-encodes it with each
// implementation in turn, prints the number of TSC ticks every variant took,
// and finally prints a memcmp() result (0 means the output of the last
// encoder matches the reference string).
int main()
{
    unsigned long long dw_start, tm_sprintf, tm_str_cat, tm_c, tm_sse, tm_xlat, tm_asm, tm_avx;
    char tmp[3];                 // one "%02X" conversion: two digits + NUL
    char buf[LEN];               // binary input
    char priBuf1[LEN * 2 + 1];   // reference hex string
    char priBuf2[LEN * 2 + 1];   // output of the variant under test

    // Fill the input with random data.
    srand((unsigned int)time(NULL));
    for (size_t i = 0; i < sizeof(buf); i++)
    {
        buf[i] = (char)rand();
    }
    buf[0] = (char)0xab;
    printf("输出结果:\n");

    // sprintf onto itself in a loop. This is kept only as the baseline from
    // the original code: it is incorrect (see the quote below) and also by far
    // the slowest variant (GCC 12 and later seem to optimise it somewhat).
    // "If a call to sprintf or snprintf causes copying to take place between
    //  objects that overlap, the behavior is undefined
    //  (e.g. sprintf(buf, "%s text", buf);)"
    // References: cppreference page for std::printf / std::fprintf /
    // std::sprintf / std::snprintf; GCC online documentation; GCC mirror sites.
    RDTSC(dw_start);
    priBuf1[0] = 0;
    for (size_t n = 0; n < sizeof(buf); n++)
    {
        sprintf(priBuf1, "%s%02X", priBuf1, (unsigned char)(buf)[n]);
    }
    RDTSC(tm_sprintf);
    printf("cncprtf:%llu\n", tm_sprintf - dw_start);

    // sprintf + strcat: not undefined behaviour, but strcat rescans the whole
    // string on every call, so the complexity matches the looped sprintf.
    RDTSC(dw_start);
    priBuf2[0] = 0;
    for (size_t n = 0; n < sizeof(buf); n++)
    {
        sprintf(tmp, "%02X", (unsigned char)(buf)[n]);
        strcat(priBuf2, tmp);
    }
    RDTSC(tm_str_cat);
    printf("prtfcat:%llu\n", tm_str_cat - dw_start);

    // sprintf straight to the final position of each digit pair.
    RDTSC(dw_start);
    for (size_t n = 0; n < sizeof(buf); n++)
    {
        sprintf(priBuf2 + n * 2, "%02X", (unsigned char)(buf)[n]);
    }
    RDTSC(tm_str_cat);
    printf("sprintf:%llu\n", tm_str_cat - dw_start);

    // Same complexity as the previous variant, written with a running pointer.
    char *p = priBuf2;
    RDTSC(dw_start);
    for (size_t n = 0; n < sizeof(buf); n++)
    {
        p += sprintf(p, "%02X", (unsigned char)(buf)[n]);
    }
    RDTSC(tm_str_cat);
    printf("sprintp:%llu\n", tm_str_cat - dw_start);

    // xlat table translation.
    RDTSC(dw_start);
    formatx_xlat(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
    RDTSC(tm_xlat);
    printf("fx_xlat:%llu\n", tm_xlat - dw_start);

    // Table lookup in C.
    RDTSC(dw_start);
    formatc(priBuf2, (const unsigned char*)buf, sizeof(buf));
    RDTSC(tm_c);
    printf("formatc:%llu\n", tm_c - dw_start);

    // Table lookup in assembly. The original passed `priBuf6`, which is not
    // declared anywhere; priBuf2 is the buffer every other variant writes to.
    RDTSC(dw_start);
    formatx_asm(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
    RDTSC(tm_asm);
    printf("fmt_asm:%llu\n", tm_asm - dw_start);

    // SSE shuffle.
    RDTSC(dw_start);
    formatx_sse(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
    RDTSC(tm_sse);
    printf("fmt_sse:%llu\n", tm_sse - dw_start);

    // AVX-512 shuffle.
    RDTSC(dw_start);
    formatx_avx(priBuf2, buf, sizeof(priBuf2), sizeof(buf));
    RDTSC(tm_avx);
    printf("fmt_avx:%llu\n", tm_avx - dw_start);

    // Rebuild the reference with the portable encoder before comparing: the
    // sprintf-onto-itself loop above is undefined behaviour, so whatever it
    // left in priBuf1 cannot be trusted as the expected result.
    formatc(priBuf1, (const unsigned char*)buf, sizeof(buf));
    printf("检查结果:%d\n", memcmp(priBuf1, priBuf2, sizeof(priBuf1)));
    return 0;
}
汇编部分 (GNU as, 使用 .intel_syntax noprefix 的 Intel 语法; 不是 nasm):
##########################################################################################
.section .text
.intel_syntax noprefix #开头的时候声明一次就可以了,声明了之后,AT&T语法依然有效,然后就可以两种语法混写了,但是还是用INTEL语法写吧,移植方便些
#下一个版本, 搞vpermb #见最后
#这些本身就是常量,放text段也可以
map_avx: .ascii "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"
#-----------------------------------------------------------------------
# extern "C" void formatx_avx(char* dst, const char* src,
#                             size_t dst_size, size_t src_len);
# ABI:   System V AMD64
# In:    rdi = dst, rsi = src, rdx = dst_size, rcx = src_len
# Out:   dst holds the upper-case hex text of the first
#        n = min(src_len, (dst_size - 1) / 2) source bytes plus a NUL.
#        When dst_size is 0 nothing is written.
# Clobb: rax, rcx, rdx, rsi, rdi, k1, k2, zmm1-zmm3, zmm5, zmm7, flags
# Needs: AVX-512F and AVX-512BW (vpmovzxbw/vpshufb on zmm, byte masks).
#
# Whole 32-byte groups are converted with full-width loads and stores.
# The last 1..31 bytes use a masked load and a masked store, so the
# routine reads exactly n bytes of src and writes exactly 2*n + 1 bytes
# of dst. No alignment or size-multiple requirement is placed on either
# buffer.
#-----------------------------------------------------------------------
        .global formatx_avx
formatx_avx:
        test    rdx, rdx
        jz      .Lformatx_avx_ret               # no room even for the terminator
        dec     rdx
        shr     rdx, 1                          # rdx = bytes that fit next to the NUL
        cmp     rcx, rdx
        cmova   rcx, rdx                        # rcx = n, number of bytes to convert
        vmovdqu64 zmm7, zmmword ptr [rip + map_avx]     # digit table, one copy per 128-bit lane
        mov     rax, rcx
        shr     rax, 5                          # rax = number of complete 32-byte groups
        jz      .Lformatx_avx_tail
.Lformatx_avx_block:
        vpmovzxbw zmm3, ymmword ptr [rsi]       # each byte 0xXY becomes the word 0x00XY
        vpsrlw  zmm1, zmm3, 4                   # 0x000X : high nibble in the low byte
        vpsllw  zmm3, zmm3, 12
        vpsrlw  zmm3, zmm3, 4                   # 0x0Y00 : low nibble in the high byte
        vpord   zmm5, zmm1, zmm3                # 0x0Y0X : in memory order X then Y
        vpshufb zmm2, zmm7, zmm5                # nibble value -> ASCII digit
        vmovdqu64 zmmword ptr [rdi], zmm2
        add     rsi, 32                         # 32 source bytes consumed
        add     rdi, 64                         # 64 characters produced
        dec     rax
        jnz     .Lformatx_avx_block
.Lformatx_avx_tail:
        and     ecx, 31                         # rcx = leftover bytes, 0..31
        jz      .Lformatx_avx_done
        mov     eax, 1
        shl     eax, cl
        dec     eax
        kmovd   k1, eax                         # one mask bit per leftover source byte
        add     ecx, ecx                        # rcx = leftover characters, 2..62
        mov     eax, 1
        shl     rax, cl
        dec     rax
        kmovq   k2, rax                         # one mask bit per leftover character
        vpmovzxbw zmm3{k1}{z}, ymmword ptr [rsi]        # masked-off bytes are not read
        vpsrlw  zmm1, zmm3, 4
        vpsllw  zmm3, zmm3, 12
        vpsrlw  zmm3, zmm3, 4
        vpord   zmm5, zmm1, zmm3
        vpshufb zmm2, zmm7, zmm5
        vmovdqu8 zmmword ptr [rdi]{k2}, zmm2    # masked-off characters are not written
        add     rdi, rcx
.Lformatx_avx_done:
        mov     byte ptr [rdi], 0               # terminate the string
        vzeroupper                              # leave no dirty upper state for SSE code in the caller
.Lformatx_avx_ret:
        ret
#-----------------------------------------------------------------------
# extern "C" void formatx_sse(char* dst, const char* src,
#                             size_t dst_size, size_t src_len);
# ABI:   System V AMD64
# In:    rdi = dst, rsi = src, rdx = dst_size, rcx = src_len
# Out:   dst holds the upper-case hex text of the first
#        n = min(src_len, (dst_size - 1) / 2) source bytes plus a NUL.
#        When dst_size is 0 nothing is written.
# Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0-xmm4, xmm7, flags
# Needs: SSSE3 (pshufb). Uses the 16-byte digit table `hex_map`.
#
# Groups of 16 bytes go through the shuffle; the remaining 0..15 bytes
# are converted one at a time with plain table loads, so no callee-saved
# register has to be touched.
#-----------------------------------------------------------------------
        .global formatx_sse
formatx_sse:
        test    rdx, rdx
        jz      .Lformatx_sse_ret               # no room even for the terminator
        dec     rdx
        shr     rdx, 1                          # rdx = bytes that fit next to the NUL
        cmp     rcx, rdx
        cmova   rcx, rdx                        # rcx = n, number of bytes to convert
        lea     r8, [rip + hex_map]             # r8 = digit table
        mov     rdx, rcx
        and     edx, 15                         # rdx = leftover bytes after the 16-byte groups
        shr     rcx, 4                          # rcx = number of 16-byte groups
        jz      .Lformatx_sse_tail
        movdqu  xmm7, [r8]                      # xmm7 = digit table
        pxor    xmm2, xmm2                      # zero, used to widen bytes to words
.Lformatx_sse_block:
        movdqu  xmm1, [rsi]                     # 16 source bytes
        movdqa  xmm4, xmm1
        punpcklbw xmm1, xmm2                    # low 8 bytes  -> words 0x00XY
        punpckhbw xmm4, xmm2                    # high 8 bytes -> words 0x00XY

        movdqa  xmm3, xmm1
        psrlw   xmm1, 4                         # 0x000X
        psllw   xmm3, 12
        psrlw   xmm3, 4                         # 0x0Y00
        por     xmm1, xmm3                      # 0x0Y0X : in memory order X then Y
        movdqa  xmm0, xmm7                      # pshufb overwrites its table operand
        pshufb  xmm0, xmm1
        movdqu  [rdi], xmm0                     # characters for source bytes 0..7

        movdqa  xmm3, xmm4
        psrlw   xmm4, 4
        psllw   xmm3, 12
        psrlw   xmm3, 4
        por     xmm4, xmm3
        movdqa  xmm0, xmm7
        pshufb  xmm0, xmm4
        movdqu  [rdi + 16], xmm0                # characters for source bytes 8..15

        add     rsi, 16
        add     rdi, 32
        dec     rcx
        jnz     .Lformatx_sse_block
.Lformatx_sse_tail:
        test    edx, edx
        jz      .Lformatx_sse_done
.Lformatx_sse_byte:
        movzx   eax, byte ptr [rsi]
        mov     ecx, eax
        shr     eax, 4                          # high nibble
        and     ecx, 15                         # low nibble
        movzx   eax, byte ptr [r8 + rax]
        movzx   ecx, byte ptr [r8 + rcx]
        mov     [rdi], al
        mov     [rdi + 1], cl
        inc     rsi
        add     rdi, 2
        dec     edx
        jnz     .Lformatx_sse_byte
.Lformatx_sse_done:
        mov     byte ptr [rdi], 0               # terminate the string
.Lformatx_sse_ret:
        ret
#-----------------------------------------------------------------------
# extern "C" void formatx_xlat(char* dst, const char* src,
#                              size_t dst_size, size_t src_len);
# ABI:   System V AMD64
# In:    rdi = dst, rsi = src, rdx = dst_size, rcx = src_len
# Out:   dst holds the upper-case hex text of the first
#        n = min(src_len, (dst_size - 1) / 2) source bytes plus a NUL.
#        When dst_size is 0 nothing is written.
# Clobb: rax, rcx, rdx, rsi, rdi, flags (rbx is saved and restored)
#
# Demonstration of the legacy xlat instruction, which looks up
# [rbx + al] in the `hex_map` table. Author's note: xlat is not fast;
# it is a relic kept here for comparison.
#-----------------------------------------------------------------------
        .global formatx_xlat
formatx_xlat:
        test    rdx, rdx
        jz      .Lformatx_xlat_ret              # no room even for the terminator
        dec     rdx
        shr     rdx, 1                          # rdx = bytes that fit next to the NUL
        cmp     rcx, rdx
        cmova   rcx, rdx                        # rcx = n, number of bytes to convert
        test    rcx, rcx
        jz      .Lformatx_xlat_done             # nothing to convert: emit "" only
        push    rbx                             # xlat needs rbx, which is callee-saved
        lea     rbx, [rip + hex_map]
        xor     eax, eax                        # bits 8..31 must be clear before the rotate
.Lformatx_xlat_byte:
        mov     al, [rsi]                       # eax = 0x000000XY
        ror     eax, 4                          # al = X, Y parked in bits 28..31
        xlat                                    # al = digit for the high nibble
        mov     [rdi], al
        shr     eax, 28                         # eax = Y
        xlat                                    # al = digit for the low nibble
        mov     [rdi + 1], al
        inc     rsi
        add     rdi, 2
        dec     rcx
        jnz     .Lformatx_xlat_byte
        pop     rbx
.Lformatx_xlat_done:
        mov     byte ptr [rdi], 0               # terminate the string
.Lformatx_xlat_ret:
        ret
#-----------------------------------------------------------------------
# extern "C" void formatx_asm(char* dst, const char* src,
#                             size_t dst_size, size_t src_len);
# ABI:   System V AMD64
# In:    rdi = dst, rsi = src, rdx = dst_size, rcx = src_len
# Out:   dst holds the upper-case hex text of the first
#        n = min(src_len, (dst_size - 1) / 2) source bytes plus a NUL.
#        When dst_size is 0 nothing is written.
# Clobb: rax, rcx, rdx, rsi, rdi, r8, flags
#
# Same table-lookup algorithm as the C function formatc, using the
# `hex_map` table. Author's measurements for the earlier loop-based
# version: roughly 20% faster than formatc with GCC 4.8.5, slower than
# formatc with GCC 12.1.
#-----------------------------------------------------------------------
        .global formatx_asm
formatx_asm:
        test    rdx, rdx
        jz      .Lformatx_asm_ret               # no room even for the terminator
        dec     rdx
        shr     rdx, 1                          # rdx = bytes that fit next to the NUL
        cmp     rcx, rdx
        cmova   rcx, rdx                        # rcx = n, number of bytes to convert
        test    rcx, rcx
        jz      .Lformatx_asm_done              # nothing to convert: emit "" only
        lea     r8, [rip + hex_map]             # r8 = digit table
.Lformatx_asm_byte:
        movzx   eax, byte ptr [rsi]
        mov     edx, eax
        shr     eax, 4                          # high nibble
        and     edx, 15                         # low nibble
        movzx   eax, byte ptr [r8 + rax]
        movzx   edx, byte ptr [r8 + rdx]
        mov     [rdi], al
        mov     [rdi + 1], dl
        inc     rsi
        add     rdi, 2
        dec     rcx
        jnz     .Lformatx_asm_byte
.Lformatx_asm_done:
        mov     byte ptr [rdi], 0               # terminate the string
.Lformatx_asm_ret:
        ret