我们知道,任何编译型语言的编译器都不是直接生成 “目标CPU平台汇编机器字节码”(机器码),哪怕是 C/C++ 编译器也是如此:需要先把源码编译为 ASM 汇编文本源码,再提交给 ASM 汇编器进行静态汇编(VC++ 的 CL 内部集成了 MASM 汇编器)。
而本文列出的不同语言的汇编代码,就是编译器在这个步骤生成的汇编源码。
原型表达式:C/C++
int mul(int x, int y) { return x * y; }
原型表达式:Go
package main
func mul(x int, y int) int {
return x * y
}
func main() {}
原型表达式:C#
class Program
{
static int mul(int x, int y) => x * y;
}
原型表达式:F#
module Program
let mul(x, y) = x * y
C/C++ on x86-64 clang 6.0.0
mul(int, int): # @mul(int, int)
imul edi, esi
mov eax, edi
ret
C/C++ on MSVC v19.10 WINE(VC++ 2019;注:VC++ 2022 的编译器版本号大于 19.29.30140.0)
x$ = 8
y$ = 16
int mul(int,int) PROC ; mul
imul ecx, edx
mov eax, ecx
ret 0
int mul(int,int) ENDP ; mul
C# .NET 6.0(dotnet core)
Program:.ctor():this:
ret
Program:mul(int,int):int:
mov eax, edi
imul eax, esi
ret
C# .NET Framework 4.0 JIT (Intel x86-32) DEBUG【会有冗余(无实际作用)的CPU指令】
static int mul(int x, int y) => x * y;
## 完整函数实现(相当于上面其它语言的 ASM 被汇编为最终机器代码之后的形式)
0516B040 55 push ebp
0516B041 8B EC mov ebp,esp // 部署函数堆栈
0516B043 57 push edi
0516B044 56 push esi
0516B045 53 push ebx
0516B046 83 EC 34 sub esp,34h // 扩大52字节计算堆栈
0516B049 33 C0 xor eax,eax // 置空(位运算同值异或)
## 初始化函数的局部栈空间(清零),并把通过 ECX、EDX 寄存器传入的(X,Y)两个参数保存到函数的局部栈空间上
0516B04B 89 45 F0 mov dword ptr [ebp-10h],eax
0516B04E 89 45 E4 mov dword ptr [ebp-1Ch],eax
0516B051 89 4D C4 mov dword ptr [ebp-3Ch],ecx
0516B054 89 55 C0 mov dword ptr [ebp-40h],edx
0516B057 83 3D F0 42 E8 00 00 cmp dword ptr ds:[0E842F0h],0
0516B05E 74 05 je Ppp.Windows.PppApplication+Program.mul(Int32, Int32)+025h (0516B065h)
0516B060 E8 0B 2C EB 6D call 7301DC70
## 该C#函数真正执行乘法运算的汇编指令(很清晰):
0516B065 8B 45 C0 mov eax,dword ptr [ebp-40h]
0516B068 0F AF 45 C4 imul eax,dword ptr [ebp-3Ch]
## 平衡函数堆栈并返回,注:EAX累加寄存器在X86汇编中常用于代表返回值
0516B06C 8D 65 F4 lea esp,[ebp-0Ch]
0516B06F 5B pop ebx
0516B070 5E pop esi
0516B071 5F pop edi
0516B072 5D pop ebp
0516B073 C3 ret ## 等价于:RETN 0(也就是上面尚未汇编为机器码的汇编源文本中的:ret 0)
F# on dotNET Native AOT
Program:mul(int,int):int:
mov eax, edi
imul eax, esi
ret
Golang on x86 gccgo 12.2.0(已开启编译器最高代码优化级别)
剔除其它代码后,至少也需要执行以下汇编指令,而这仅仅是做一次 X、Y 的简单乘法运算而已……
main.mul:
cmp rsp, QWORD PTR fs:112
jb .L125
.L124:
mov rax, rdi
imul rax, rsi
ret
.L125:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L124
编译器生成的完整汇编源代码:
main.struct_4runtime_0gList_cruntime_0n_bint32_5..eq:
cmp rsp, QWORD PTR fs:112
jb .L6
.L4:
mov rdx, QWORD PTR [rsi]
xor eax, eax
cmp QWORD PTR [rdi], rdx
jne .L1
mov eax, DWORD PTR [rsi+8]
cmp DWORD PTR [rdi+8], eax
sete al
.L1:
ret
.L6:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L4
main._661_7struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq:
cmp rsp, QWORD PTR fs:112
jb .L16
.L15:
xor edx, edx
.L9:
mov r9, QWORD PTR [rdi+8+rdx]
mov r8, QWORD PTR [rdi+16+rdx]
mov rax, QWORD PTR [rsi+8+rdx]
mov rcx, QWORD PTR [rsi+16+rdx]
mov r10d, DWORD PTR [rsi+rdx]
cmp DWORD PTR [rdi+rdx], r10d
je .L17
xor eax, eax
.L7:
ret
.L17:
cmp r9, rax
sete al
cmp r8, rcx
sete cl
and al, cl
je .L7
add rdx, 24
cmp rdx, 1464
jne .L9
ret
.L16:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L15
main.struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq:
cmp rsp, QWORD PTR fs:112
jb .L23
.L22:
mov edx, DWORD PTR [rsi]
xor eax, eax
cmp DWORD PTR [rdi], edx
jne .L18
mov rcx, QWORD PTR [rsi+8]
cmp QWORD PTR [rdi+8], rcx
je .L24
.L18:
ret
.L24:
mov rax, QWORD PTR [rsi+16]
cmp QWORD PTR [rdi+16], rax
sete al
ret
.L23:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L22
main._633_7float64..eq:
cmp rsp, QWORD PTR fs:112
jb .L33
.L32:
xor eax, eax
jmp .L28
.L35:
add rax, 8
cmp rax, 264
je .L34
.L28:
movsd xmm0, QWORD PTR [rdi+rax]
ucomisd xmm0, QWORD PTR [rsi+rax]
jp .L29
je .L35
.L29:
xor eax, eax
ret
.L34:
mov eax, 1
ret
.L33:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L32
main._632_7uintptr..eq:
cmp rsp, QWORD PTR fs:112
jb .L39
.L37:
sub rsp, 8
mov edx, 256
call runtime.memequal
add rsp, 8
ret
.L39:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L37
main._6256_7uint64..eq:
cmp rsp, QWORD PTR fs:112
jb .L43
.L41:
sub rsp, 8
mov edx, 2048
call runtime.memequal
add rsp, 8
ret
.L43:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L41
main._6122_7uintptr..eq:
cmp rsp, QWORD PTR fs:112
jb .L47
.L45:
sub rsp, 8
mov edx, 976
call runtime.memequal
add rsp, 8
ret
.L47:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L45
main._68_7uint64..eq:
cmp rsp, QWORD PTR fs:112
jb .L51
.L49:
sub rsp, 8
mov edx, 64
call runtime.memequal
add rsp, 8
ret
.L51:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L49
main._6128_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L55
.L53:
sub rsp, 8
mov edx, 128
call runtime.memequal
add rsp, 8
ret
.L55:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L53
main._64096_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L59
.L57:
sub rsp, 8
mov edx, 4096
call runtime.memequal
add rsp, 8
ret
.L59:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L57
main._668_7uint16..eq:
cmp rsp, QWORD PTR fs:112
jb .L63
.L61:
sub rsp, 8
mov edx, 136
call runtime.memequal
add rsp, 8
ret
.L63:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L61
main._665_7uint32..eq:
cmp rsp, QWORD PTR fs:112
jb .L67
.L65:
sub rsp, 8
mov edx, 260
call runtime.memequal
add rsp, 8
ret
.L67:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L65
main._64_7uintptr..eq:
cmp rsp, QWORD PTR fs:112
jb .L71
.L69:
sub rsp, 8
mov edx, 32
call runtime.memequal
add rsp, 8
ret
.L71:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L69
main._65_7uint..eq:
cmp rsp, QWORD PTR fs:112
jb .L75
.L73:
sub rsp, 8
mov edx, 40
call runtime.memequal
add rsp, 8
ret
.L75:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L73
main._6512_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L79
.L77:
sub rsp, 8
mov edx, 512
call runtime.memequal
add rsp, 8
ret
.L79:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L77
main._6249_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L83
.L81:
sub rsp, 8
mov edx, 249
call runtime.memequal
add rsp, 8
ret
.L83:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L81
main._6129_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L87
.L85:
sub rsp, 8
mov edx, 129
call runtime.memequal
add rsp, 8
ret
.L87:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L85
main._632_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L91
.L89:
sub rsp, 8
mov edx, 32
call runtime.memequal
add rsp, 8
ret
.L91:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L89
main._627_7string..eq:
cmp rsp, QWORD PTR fs:112
jb .L104
.L102:
push r12
mov r12, rdi
push rbp
mov rbp, rsi
push rbx
xor ebx, ebx
jmp .L95
.L106:
cmp rdi, rsi
je .L97
call memcmp
test eax, eax
jne .L93
.L97:
add rbx, 16
cmp rbx, 432
je .L105
.L95:
movdqu xmm0, XMMWORD PTR [r12+rbx]
mov rdi, QWORD PTR [r12+rbx]
movdqu xmm0, XMMWORD PTR [rbp+0+rbx]
mov rsi, QWORD PTR [rbp+0+rbx]
mov rdx, QWORD PTR [rbp+8+rbx]
cmp rdx, QWORD PTR [r12+8+rbx]
je .L106
.L93:
xor eax, eax
pop rbx
pop rbp
pop r12
ret
.L105:
mov eax, 1
pop rbx
pop rbp
pop r12
ret
.L104:
mov r10d, 24
xor r11d, r11d
call __morestack
ret
jmp .L102
main._61024_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L110
.L108:
sub rsp, 8
mov edx, 1024
call runtime.memequal
add rsp, 8
ret
.L110:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L108
main._62_7int32..eq:
cmp rsp, QWORD PTR fs:112
jb .L114
.L112:
sub rsp, 8
mov edx, 8
call runtime.memequal
add rsp, 8
ret
.L114:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L112
main._664_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L118
.L116:
sub rsp, 8
mov edx, 64
call runtime.memequal
add rsp, 8
ret
.L118:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L116
main._6256_7uint8..eq:
cmp rsp, QWORD PTR fs:112
jb .L122
.L120:
sub rsp, 8
mov edx, 256
call runtime.memequal
add rsp, 8
ret
.L122:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L120
main.mul:
cmp rsp, QWORD PTR fs:112
jb .L125
.L124:
mov rax, rdi
imul rax, rsi
ret
.L125:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
jmp .L124
main.main:
cmp rsp, QWORD PTR fs:112
jb .L128
ret
.L128:
xor r10d, r10d
xor r11d, r11d
call __morestack
ret
ret
__go_init_main:
cmp rsp, QWORD PTR fs:112
jb .L132
.L130:
sub rsp, 8
mov esi, OFFSET FLAT:go..typelists
mov edi, 11
call runtime.registerTypeDescriptors
call internal_1cpu..import
call runtime..import
add rsp, 8
ret
.L132:
mov r10d, 8
xor r11d, r11d
call __morestack
ret
jmp .L130
go..typelists:
.quad internal_1cpu..types
.quad runtime..types
.quad internal_1abi..types
.quad internal_1bytealg..types
.quad internal_1goarch..types
.quad internal_1goexperiment..types
.quad internal_1goos..types
.quad runtime_1internal_1atomic..types
.quad runtime_1internal_1math..types
.quad runtime_1internal_1sys..types
.quad main..types
main..types:
.zero 16
main._6256_7uint8..eq..f:
.quad main._6256_7uint8..eq
main._664_7uint8..eq..f:
.quad main._664_7uint8..eq
main._62_7int32..eq..f:
.quad main._62_7int32..eq
main._61024_7uint8..eq..f:
.quad main._61024_7uint8..eq
main._627_7string..eq..f:
.quad main._627_7string..eq
main._632_7uint8..eq..f:
.quad main._632_7uint8..eq
main._6129_7uint8..eq..f:
.quad main._6129_7uint8..eq
main._6249_7uint8..eq..f:
.quad main._6249_7uint8..eq
main._6512_7uint8..eq..f:
.quad main._6512_7uint8..eq
main._65_7uint..eq..f:
.quad main._65_7uint..eq
main._64_7uintptr..eq..f:
.quad main._64_7uintptr..eq
main._665_7uint32..eq..f:
.quad main._665_7uint32..eq
main._633_7float64..eq..f:
.quad main._633_7float64..eq
main._668_7uint16..eq..f:
.quad main._668_7uint16..eq
main._64096_7uint8..eq..f:
.quad main._64096_7uint8..eq
main._6128_7uint8..eq..f:
.quad main._6128_7uint8..eq
main._68_7uint64..eq..f:
.quad main._68_7uint64..eq
main._6122_7uintptr..eq..f:
.quad main._6122_7uintptr..eq
main.struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq..f:
.quad main.struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq
main._661_7struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq..f:
.quad main._661_7struct_4Size_buint32_cMallocs_buint64_cFrees_buint64_5..eq
main._6256_7uint64..eq..f:
.quad main._6256_7uint64..eq
main._632_7uintptr..eq..f:
.quad main._632_7uintptr..eq
main.struct_4runtime_0gList_cruntime_0n_bint32_5..eq..f:
.quad main.struct_4runtime_0gList_cruntime_0n_bint32_5..eq
从上述内容,读者可以自行看出很多门道;如果不严格地从目标平台 CPU 实际执行的机器汇编代码出发,去判断某种编程语言编译后代码的执行效率,是没有意义的。