如图,今天在工作的时候,要把老代码迁移到新的编译工具上,结果两边跑同样的案例,输出不一致。经排查,不一致发生在一个只根据参数计算结果的函数里;单独把变量摘出来后可以复现,并且和debug时看到的现象一致。怀疑是05的exp标准库出于性能考虑、或者受当时指令集有限等原因,牺牲了精度。
10的汇编代码
; ---------------------------------------------------------------------------
; exp -- debugger disassembly of the exp routine from the "10" build.
; In:  xmm0 = x (double).  Out: xmm0 = result on the paths that end at
;      __final_check; the paths that call _exp_special return whatever that
;      helper leaves behind (its body is not part of this listing).
; Stack: 88h bytes reserved; [rsp+30h] is used as scratch to move bit
;      patterns between xmm and general-purpose registers.
; NOTE(review): the meaning given to constants below (1/2, 1/6, 64/ln2, ...)
;      is inferred from their symbol names only -- the values themselves are
;      not visible here; confirm against the data section.
; Register roles in the main path:
;      rdx = raw bits of x, ecx = m (n with its low 6 bits removed, >> 6),
;      eax = j (low 6 bits of n), xmm2 = reduced argument r.
; ---------------------------------------------------------------------------
exp:
000000005AB2D010 sub rsp,88h
; Spill x so its raw bits can be inspected in integer registers.
000000005AB2D017 movsd mmword ptr [rsp+30h],xmm0
000000005AB2D01D mov rax,qword ptr [__real_inf (5AB31090h)]
000000005AB2D024 mov rdx,qword ptr [rsp+30h]
; If every bit set in __real_inf is also set in x, take the inf/NaN path.
000000005AB2D029 and rax,rdx
000000005AB2D02C mov r9,rdx
000000005AB2D02F cmp rax,qword ptr [__real_inf (5AB31090h)]
000000005AB2D036 je __x_is_inf_or_nan (5AB2D260h)
; Mask x's bits and compare (signed) against the near-zero threshold; below it
; the result is the constant at __real_one.
000000005AB2D03C and r9,qword ptr [__exp_mant_mask (5AB31070h)]
000000005AB2D043 cmp r9,qword ptr [__real_x_near0_threshold (5AB31080h)]
; movsd does not touch flags, so the jl below still tests the cmp above.
000000005AB2D04A movsd xmm3,mmword ptr [__real_64_by_log2 (5AB310E0h)]
000000005AB2D052 jl __process_result_one (5AB2D180h)
; xmm3 = x * __real_64_by_log2; range-check it before converting to integer.
000000005AB2D058 mulsd xmm3,xmm0
000000005AB2D05C comisd xmm3,mmword ptr [__real_p65536 (5AB310C0h)]
000000005AB2D064 ja __y_is_inf (5AB2D240h)
000000005AB2D06A comisd xmm3,mmword ptr [__real_m68800 (5AB310D0h)]
000000005AB2D072 jbe __y_is_zero (5AB2D220h)
; n = scaled value converted to a 32-bit integer (xmm4); xmm1 = (double)n.
000000005AB2D078 cvtpd2dq xmm4,xmm3
000000005AB2D07C lea r10,[__two_to_jby64_head_table (5AB31360h)]
000000005AB2D083 lea r11,[__two_to_jby64_tail_table (5AB31560h)]
000000005AB2D08A cvtdq2pd xmm1,xmm4
; Two-step argument reduction: r = (x - n*log2_by_64_head) + n*log2_by_64_tail.
000000005AB2D08E movsd xmm2,mmword ptr [__real_log2_by_64_head (5AB310F0h)]
000000005AB2D096 mulsd xmm2,xmm1
000000005AB2D09A movd ecx,xmm4
; eax = j = n & 3Fh (table index); ecx becomes m = (n - j) >> 6 (arithmetic).
000000005AB2D09E mov rax,3Fh
000000005AB2D0A5 and eax,ecx
000000005AB2D0A7 subsd xmm0,xmm2
000000005AB2D0AB mulsd xmm1,mmword ptr [__real_log2_by_64_tail (5AB31100h)]
000000005AB2D0B3 movsd xmm2,xmm0
000000005AB2D0B7 sub ecx,eax
000000005AB2D0B9 sar ecx,6
000000005AB2D0BC addsd xmm2,xmm1
; From here xmm2 = r. The next block evaluates, with interleaved independent
; multiplies/adds:
;   q = (1 + r*c2)*r + (c6 + r*c24)*r^3 + (c120 + r*c720)*r^5
; where c2/c6/c24/c120/c720 are the __real_1_by_* constants
; (presumably 1/2, 1/6, 1/24, 1/120, 1/720 -- confirm).
000000005AB2D0C0 movsd xmm1,xmm2
000000005AB2D0C4 movsd xmm0,mmword ptr [__real_1_by_2 (5AB31150h)]
000000005AB2D0CC movsd xmm3,mmword ptr [__real_1_by_24 (5AB31130h)]
000000005AB2D0D4 movsd xmm4,mmword ptr [__real_1_by_720 (5AB31110h)]
; xmm1 = r^2
000000005AB2D0DC mulsd xmm1,xmm2
000000005AB2D0E0 mulsd xmm0,xmm2
000000005AB2D0E4 mulsd xmm3,xmm2
000000005AB2D0E8 mulsd xmm4,xmm2
; xmm5 = r^2, then xmm1 = r^3, then xmm5 = r^2 * r^3 = r^5
000000005AB2D0EC movsd xmm5,xmm1
000000005AB2D0F0 mulsd xmm1,xmm2
000000005AB2D0F4 addsd xmm0,mmword ptr [__real_one (5AB31040h)]
000000005AB2D0FC addsd xmm3,mmword ptr [__real_1_by_6 (5AB31140h)]
000000005AB2D104 mulsd xmm5,xmm1
000000005AB2D108 addsd xmm4,mmword ptr [__real_1_by_120 (5AB31120h)]
000000005AB2D110 mulsd xmm0,xmm2
000000005AB2D114 mulsd xmm3,xmm1
000000005AB2D118 mulsd xmm4,xmm5
; r9d = 0 unless m <= __denormal_threshold (signed), in which case r9d = m.
; The addsd instructions between cmp and cmovle do not modify the flags.
000000005AB2D11C xor r9d,r9d
000000005AB2D11F cmp ecx,dword ptr [__denormal_threshold (5AB31010h)]
000000005AB2D125 addsd xmm3,xmm4
000000005AB2D129 addsd xmm0,xmm3
000000005AB2D12D cmovle r9d,ecx
; rcx = (m + 3FFh) << 52: m placed, biased, in the exponent field of a double
; (any higher bits are shifted out).
000000005AB2D131 add rcx,3FFh
000000005AB2D138 shl rcx,34h
; xmm0 = q * table[j] + tail[j] + head[j]  (tables indexed by j, 8 bytes each).
000000005AB2D13C lea r8,[__two_to_jby64_table (5AB31160h)]
000000005AB2D143 mulsd xmm0,mmword ptr [r8+rax*8]
; If the exponent field built above equals the bits of __real_inf, the plain
; multiply below cannot be used; the je after the two adds handles that case.
000000005AB2D149 cmp rcx,qword ptr [__real_inf (5AB31090h)]
000000005AB2D150 addsd xmm0,mmword ptr [r11+rax*8]
000000005AB2D156 addsd xmm0,mmword ptr [r10+rax*8]
000000005AB2D15C je __process_almost_inf (5AB2D190h)
; Non-zero r9d means m was at or below the denormal threshold.
; The mov between test and jne does not modify the flags.
000000005AB2D15E test r9d,r9d
000000005AB2D161 mov qword ptr [rsp+30h],rcx
000000005AB2D166 jne __process_denormal (5AB2D1A0h)
; Common case: scale by the double whose bit pattern was stored at [rsp+30h].
000000005AB2D168 mulsd xmm0,mmword ptr [rsp+30h]
; Shared epilogue for every path that has its result in xmm0.
__final_check:
000000005AB2D16E add rsp,88h
000000005AB2D175 ret
000000005AB2D176 nop word ptr [rax+rax]
; |x| below the near-zero threshold: return the constant at __real_one.
__process_result_one:
000000005AB2D180 movsd xmm0,mmword ptr [__real_one (5AB31040h)]
000000005AB2D188 jmp __final_check (5AB2D16Eh)
000000005AB2D18A nop word ptr [rax+rax]
; Exponent field would equal that of __real_inf: OR the bits at
; __enable_almost_inf into the unscaled result instead of multiplying.
__process_almost_inf:
000000005AB2D190 orpd xmm0,xmmword ptr [__enable_almost_inf (5AB31020h)]
000000005AB2D198 jmp __final_check (5AB2D16Eh)
000000005AB2D19A nop word ptr [rax+rax]
; m <= __denormal_threshold. r11d = m if the unscaled result in xmm0 is >= the
; value at __real_one, else 0. Only when r11d equals the threshold is the
; ordinary scaling multiply used; everything else goes to the true-denormal path.
__process_denormal:
000000005AB2D1A0 mov ecx,r9d
000000005AB2D1A3 xor r11d,r11d
000000005AB2D1A6 comisd xmm0,mmword ptr [__real_one (5AB31040h)]
000000005AB2D1AE cmovae r11d,ecx
000000005AB2D1B2 cmp r11d,dword ptr [__denormal_threshold (5AB31010h)]
000000005AB2D1B9 jne __process_true_denormal (5AB2D1D0h)
000000005AB2D1BB mulsd xmm0,mmword ptr [rsp+30h]
000000005AB2D1C1 jmp __final_check (5AB2D16Eh)
000000005AB2D1C3 nop word ptr [rax+rax]
; Build the scale factor by hand: a 64-bit pattern with a single bit set at
; position (m + 432h), clamped to position 0 when that sum is negative, then
; reinterpret that pattern as a double and multiply.
__process_true_denormal:
000000005AB2D1D0 xor r8,r8
; rdx still holds the raw bits of x; the comparison is signed on those bits.
000000005AB2D1D3 cmp rdx,qword ptr [__denormal_tiny_threshold (5AB31060h)]
; mov does not modify the flags, so jg tests the cmp above.
000000005AB2D1DA mov r9,1
000000005AB2D1E1 jg __process_denormal_tiny (5AB2D210h)
000000005AB2D1E3 add ecx,432h
000000005AB2D1E9 cmovs rcx,r8
000000005AB2D1ED shl r9,cl
000000005AB2D1F0 mov rcx,r9
000000005AB2D1F3 mov qword ptr [rsp+30h],rcx
000000005AB2D1F8 mulsd xmm0,mmword ptr [rsp+30h]
000000005AB2D1FE jmp __final_check (5AB2D16Eh)
000000005AB2D203 nop word ptr [rax+rax]
; Return the constant at __real_smallest_denormal.
__process_denormal_tiny:
000000005AB2D210 movsd xmm0,mmword ptr [__real_smallest_denormal (5AB31050h)]
000000005AB2D218 jmp __final_check (5AB2D16Eh)
000000005AB2D21D nop dword ptr [rax]
; Scaled argument <= __real_m68800: hand off to _exp_special with
; xmm0 = original x bits, xmm1 = __real_zero, r8d = __flag_y_zero.
; NOTE(review): presumably _exp_special reports underflow and returns xmm1 --
; its body is not shown; confirm.
__y_is_zero:
000000005AB2D220 movsd xmm1,mmword ptr [__real_zero (5AB31030h)]
000000005AB2D228 movd xmm0,rdx
000000005AB2D22D mov r8d,dword ptr [__flag_y_zero (5AB31004h)]
000000005AB2D234 call _exp_special (5AAB46B0h)
000000005AB2D239 jmp __finish (5AB2D2A0h)
000000005AB2D23E xchg ax,ax
; Scaled argument > __real_p65536: same hand-off with xmm1 = __real_inf and
; r8d = __flag_y_inf.
__y_is_inf:
000000005AB2D240 movsd xmm1,mmword ptr [__real_inf (5AB31090h)]
000000005AB2D248 movd xmm0,rdx
000000005AB2D24D mov r8d,dword ptr [__flag_y_inf (5AB31008h)]
000000005AB2D254 call _exp_special (5AAB46B0h)
000000005AB2D259 jmp __finish (5AB2D2A0h)
000000005AB2D25B nop dword ptr [rax+rax]
; x has all __real_inf bits set. Bits equal to __real_inf: return x unchanged
; (xmm0 has not been modified yet). Bits equal to __real_ninf: return
; __real_zero. Anything else: OR in __real_qnanbit and pass the result in xmm1
; to _exp_special with r8d = __flag_x_nan.
__x_is_inf_or_nan:
000000005AB2D260 cmp rdx,qword ptr [__real_inf (5AB31090h)]
000000005AB2D267 je __finish (5AB2D2A0h)
000000005AB2D269 cmp rdx,qword ptr [__real_ninf (5AB310A0h)]
000000005AB2D270 je __process_zero (5AB2D290h)
000000005AB2D272 or rdx,qword ptr [__real_qnanbit (5AB310B0h)]
000000005AB2D279 movd xmm1,rdx
000000005AB2D27E mov r8d,dword ptr [__flag_x_nan (5AB31000h)]
000000005AB2D285 call _exp_special (5AAB46B0h)
000000005AB2D28A jmp __finish (5AB2D2A0h)
000000005AB2D28C nop dword ptr [rax]
; Return the constant at __real_zero.
__process_zero:
000000005AB2D290 movsd xmm0,mmword ptr [__real_zero (5AB31030h)]
000000005AB2D298 jmp __final_check (5AB2D16Eh)
000000005AB2D29D nop dword ptr [rax]
; Second copy of the epilogue, used by the special-case paths.
__finish:
000000005AB2D2A0 add rsp,88h
000000005AB2D2A7 ret
05的汇编
; exp ("05" build) -- this entry is only an indirect jump through the pointer
; stored at __imp_exp. The instructions that actually compute exp live at the
; jump target and are NOT part of this listing.
exp:
00000001400010BA jmp qword ptr [__imp_exp (14000A518h)]
; _RTC_InitBase -- a separate function that happens to follow the exp jump in
; the listing; nothing here is reached from exp.
; Runs its body once, guarded by the byte flag `init`: calls _CRT_RTC_INITW and
; passes that call's return value (rax) in rcx to _RTC_SetErrorFuncW.
_RTC_InitBase:
00000001400010C0 sub rsp,38h
; Already initialised? Skip straight to the epilogue.
00000001400010C4 cmp byte ptr [init (1400081A0h)],0
00000001400010CB jne _RTC_InitBase+36h (1400010F6h)
; Set up ecx=0, edx=0, r8d=0, r9d=1 and a zero dword at [rsp+20h] for the call.
; NOTE(review): presumably these are five arguments under the Microsoft x64
; convention (four in registers, the fifth just above the shadow space) -- confirm.
00000001400010CD mov r9d,1
00000001400010D3 xor r8d,r8d
00000001400010D6 xor edx,edx
00000001400010D8 xor ecx,ecx
; Mark as initialised before making the calls.
00000001400010DA mov byte ptr [init (1400081A0h)],1
00000001400010E1 mov dword ptr [rsp+20h],0
00000001400010E9 call _CRT_RTC_INITW (140001658h)
00000001400010EE mov rcx,rax
00000001400010F1 call _RTC_SetErrorFuncW (140001620h)
00000001400010F6 add rsp,38h
00000001400010FA ret
两边的汇编有很明显的差异(不过需要注意:05这边截到的只是一条跳转到 __imp_exp 的 jmp,后面的 _RTC_InitBase 是另一个函数,exp 真正的实现并没有列出来),我估计10的精度应该高很多
经过性能测试,05的性能只有10的三分之一左右(耗时约为10的2~4倍),
调用100万次,05需要0.007~0.008秒,10需要0.002~0.003秒。
由于还在上班,暂时没法上高精度计时,但是多次重复差不多都是这么个值,还是10又快又准。