上面大佬已经解释得很棒了, 我来分享一个让 Go 变得跟 C 差不多快的魔(作)法(弊)手段:
benchmark_amd64.s
// func _benchmark(buf unsafe.Pointer)
//
// Hand-encoded x86-64 machine code: each instruction is emitted as raw bytes
// via WORD/LONG/BYTE, and the trailing comment on each line is the intended
// disassembly.
//
// Three nested counters each run from 0 through 1000: r11 (printed as "a" by
// the Go caller), r14 ("b") and rcx ("c"). For every combination where
// a*a + b*b == c*c and a + b + c == 1000, the triple is stored through three
// consecutive pointers taken from buf, and the slot index in rdx advances by 3.
//
// buf is read as an array of 8-byte pointers, each pointing at a 64-bit
// destination. No bounds check is performed on the slot index, so the caller
// must provide three pointer slots for every match.
//
// Registers written: AX, BX, CX, DX, SI, DI, R8, R9, R10, R11, R14.
// NOTE(review): R14 is overwritten here; confirm this is acceptable for the
// Go toolchain version and calling convention in use.
TEXT ·_benchmark(SB), $0-8
// DI = base address of the caller's pointer array.
MOVQ buf+0(FP), DI
// a = 0; r8 = 1000 - a (kept in step with a by the dec at the loop tail);
// rdx = index of the next free pointer slot.
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
LONG $0x03e8b841; WORD $0x0000 // mov r8d, 1000
WORD $0xd231 // xor edx, edx
// Outer loop over a: cache a*a in r9, seed r10 = 1000 - a, reset b to 0.
LBB0_1:
WORD $0x894d; BYTE $0xd9 // mov r9, r11
LONG $0xc9af0f4d // imul r9, r9
WORD $0x894d; BYTE $0xc2 // mov r10, r8
WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
// Middle loop over b: rsi = a*a + b*b, seed rax = 1000 - a - b, reset c to 0.
LBB0_2:
WORD $0x894c; BYTE $0xf6 // mov rsi, r14
LONG $0xf6af0f48 // imul rsi, rsi
WORD $0x014c; BYTE $0xce // add rsi, r9
WORD $0x894c; BYTE $0xd0 // mov rax, r10
WORD $0xc931 // xor ecx, ecx
// Inner loop over c: rbx = c*c. Skip to the loop tail unless
// a*a + b*b == c*c, and then unless rax (1000 - a - b - c) is zero.
LBB0_3:
WORD $0x8948; BYTE $0xcb // mov rbx, rcx
LONG $0xdbaf0f48 // imul rbx, rbx
WORD $0x3948; BYTE $0xde // cmp rsi, rbx
JNE LBB0_6
WORD $0x8548; BYTE $0xc0 // test rax, rax
JNE LBB0_6
// Match: load three destination pointers from slots rdx, rdx+1 and rdx+2 and
// store a, b and c through them as 64-bit values, then claim the three slots.
LONG $0xd71c8b48 // mov rbx, qword [rdi + 8*rdx]
WORD $0x894c; BYTE $0x1b // mov qword [rbx], r11
LONG $0xd75c8b48; BYTE $0x08 // mov rbx, qword [rdi + 8*rdx + 8]
WORD $0x894c; BYTE $0x33 // mov qword [rbx], r14
LONG $0xd75c8b48; BYTE $0x10 // mov rbx, qword [rdi + 8*rdx + 16]
WORD $0x8948; BYTE $0x0b // mov qword [rbx], rcx
LONG $0x03c28348 // add rdx, 3
// Inner loop tail: c++, keep rax equal to 1000 - a - b - c, repeat until c == 1001.
LBB0_6:
WORD $0xff48; BYTE $0xc1 // inc rcx
WORD $0xff48; BYTE $0xc8 // dec rax
LONG $0xe9f98148; WORD $0x0003; BYTE $0x00 // cmp rcx, 1001
JNE LBB0_3
// Middle loop tail: b++, keep r10 equal to 1000 - a - b, repeat until b == 1001.
WORD $0xff49; BYTE $0xc6 // inc r14
WORD $0xff49; BYTE $0xca // dec r10
LONG $0xe9fe8149; WORD $0x0003; BYTE $0x00 // cmp r14, 1001
JNE LBB0_2
// Outer loop tail: a++, keep r8 equal to 1000 - a, repeat until a == 1001.
WORD $0xff49; BYTE $0xc3 // inc r11
WORD $0xff49; BYTE $0xc8 // dec r8
LONG $0xe9fb8149; WORD $0x0003; BYTE $0x00 // cmp r11, 1001
JNE LBB0_1
RET
benchmark_amd64.go
package main
import (
"unsafe"
"fmt"
)
// _benchmark is implemented in benchmark_amd64.s. buf must point at the first
// element of an array of pointers to uint64; for every result it finds, the
// routine stores three values through three consecutive pointers. It performs
// no bounds checking, so the array must be large enough for all results.
//
//go:noescape
func _benchmark(buf unsafe.Pointer)

// benchmark runs the assembly search and prints the (a, b, c) triples it
// reported, one triple per line.
func benchmark() {
	// The search over 0..1000 reports four triples, i.e. 12 values, so both
	// arrays hold exactly 12 entries. slots[i] is where the assembly routine
	// finds the address of vals[i].
	var vals [12]uint64
	var slots [12]unsafe.Pointer
	for i := range vals {
		slots[i] = unsafe.Pointer(&vals[i])
	}

	// emit
	_benchmark(unsafe.Pointer(&slots[0]))

	// output
	fmt.Println("-----------------------------")
	for i := 0; i < len(vals); i += 3 {
		fmt.Printf("a:%d \t b:%d \t c:%d\n", vals[i], vals[i+1], vals[i+2])
	}
}
// main is the program entry point; it runs the benchmark a single time.
func main() {
	benchmark()
}
go build:
BOOM !
在我机器上这个"优化"后的结果是:
[root@m01 benchmark]# time ./benchmark
-----------------------------
a:0 b:500 c:500
a:200 b:375 c:425
a:375 b:200 c:425
a:500 b:0 c:500
real    0m0.801s
user    0m0.798s
sys     0m0.006s
纯go实现的结果是:
[root@m01 go]# time ./go
[0 500 500 200 375 425 375 200 425 500 0 500]
real    0m1.429s
user    0m1.428s
sys     0m0.005s
c语言版本clang -O3结果是:
[root@m01 _lib]# time ./original_benchmark
a:0 b:500 c:500
a:200 b:375 c:425
a:375 b:200 c:425
a:500 b:0 c:500
real    0m0.791s
user    0m0.790s
sys     0m0.001s
感兴趣的同学可以下载看看在你的电脑上大概是什么结果.
更新: 下面是改用 32 位寄存器的 benchmark_amd64.s 版本:
// func _benchmark(buf unsafe.Pointer)
//
// Updated variant of the routine using 32-bit registers for the counters and
// arithmetic. Each instruction is emitted as raw bytes via WORD/LONG/BYTE, and
// the trailing comment on each line is the intended disassembly.
//
// Three nested counters each run from 0 through 1000: r11d (printed as "a" by
// the Go caller), r14d ("b") and ecx ("c"). For every combination where
// a*a + b*b == c*c and a + b + c == 1000, the triple is stored through three
// consecutive pointers taken from buf, and the slot index in edx advances by 3.
//
// buf is read as an array of 8-byte pointers. The stores through those
// pointers are 32-bit (dword), so only the low four bytes of each destination
// are written; the remaining bytes keep whatever the caller put there.
// No bounds check is performed on the slot index.
//
// Registers written: AX, BX, CX, DX, SI, DI, R8, R9, R10, R11, R14.
// NOTE(review): R14 is overwritten here; confirm this is acceptable for the
// Go toolchain version and calling convention in use.
TEXT ·_benchmark(SB), $0-8
// DI = base address of the caller's pointer array.
MOVQ buf+0(FP), DI
// a = 0; r8d = 1000 - a (kept in step with a by the dec at the loop tail);
// edx = index of the next free pointer slot.
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
LONG $0x03e8b841; WORD $0x0000 // mov r8d, 1000
WORD $0xd231 // xor edx, edx
// Outer loop over a: cache a*a in r9d, seed r10d = 1000 - a, reset b to 0.
LBB0_1:
WORD $0x8945; BYTE $0xd9 // mov r9d, r11d
LONG $0xc9af0f45 // imul r9d, r9d
WORD $0x8945; BYTE $0xc2 // mov r10d, r8d
WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
// Middle loop over b: esi = a*a + b*b, reset c to 0, seed ebx = 1000 - a - b.
LBB0_2:
WORD $0x8944; BYTE $0xf6 // mov esi, r14d
WORD $0xaf0f; BYTE $0xf6 // imul esi, esi
WORD $0x0144; BYTE $0xce // add esi, r9d
WORD $0xc931 // xor ecx, ecx
WORD $0x8944; BYTE $0xd3 // mov ebx, r10d
// Inner loop over c: eax = c*c. Skip to the loop tail unless
// a*a + b*b == c*c, and then unless ebx (1000 - a - b - c) is zero.
LBB0_3:
WORD $0xc889 // mov eax, ecx
WORD $0xaf0f; BYTE $0xc0 // imul eax, eax
WORD $0xc639 // cmp esi, eax
JNE LBB0_6
WORD $0xdb85 // test ebx, ebx
JNE LBB0_6
// Match: for slots edx, edx+1 and edx+2, compute the slot index in eax, load
// the destination pointer from buf, and store a, b and c through it as 32-bit
// values. Then claim the three slots.
WORD $0xd089 // mov eax, edx
LONG $0xc7048b48 // mov rax, qword [rdi + 8*rax]
WORD $0x8944; BYTE $0x18 // mov dword [rax], r11d
WORD $0x428d; BYTE $0x01 // lea eax, [rdx + 1]
LONG $0xc7048b48 // mov rax, qword [rdi + 8*rax]
WORD $0x8944; BYTE $0x30 // mov dword [rax], r14d
WORD $0x428d; BYTE $0x02 // lea eax, [rdx + 2]
LONG $0xc7048b48 // mov rax, qword [rdi + 8*rax]
WORD $0x0889 // mov dword [rax], ecx
WORD $0xc283; BYTE $0x03 // add edx, 3
// Inner loop tail: c++, keep ebx equal to 1000 - a - b - c, repeat until c == 1001.
LBB0_6:
WORD $0xc1ff // inc ecx
WORD $0xcbff // dec ebx
LONG $0x03e9f981; WORD $0x0000 // cmp ecx, 1001
JNE LBB0_3
// Middle loop tail: b++, keep r10d equal to 1000 - a - b, repeat until b == 1001.
WORD $0xff41; BYTE $0xc6 // inc r14d
WORD $0xff41; BYTE $0xca // dec r10d
LONG $0xe9fe8141; WORD $0x0003; BYTE $0x00 // cmp r14d, 1001
JNE LBB0_2
// Outer loop tail: a++, keep r8d equal to 1000 - a, repeat until a == 1001.
WORD $0xff41; BYTE $0xc3 // inc r11d
WORD $0xff41; BYTE $0xc8 // dec r8d
LONG $0xe9fb8141; WORD $0x0003; BYTE $0x00 // cmp r11d, 1001
JNE LBB0_1
RET