# -----------------------------------------------------------------
# uint64_t mul384(uint64_t c[12], uint64_t a[6], uint64_t b[6])
#
# Full-width multiply of two 6-limb operands into a 12-limb result:
#     c[0..11] = a[0..5] * b[0..5]
# Limbs are 64 bits wide, least-significant limb first.
#
# ABI      : System V AMD64, GAS / AT&T syntax
# ISA      : SSE2 plus SSE4.1 (pinsrq / pextrq)
# In       : rdi = c, rsi = a, rdx = b
# Out      : rax = 0 (rdx is zeroed as well)
# Memory   : a, b and c need no particular alignment (movdqu is used).
#            a and b are read completely before c is written, so c
#            may overlap a or b.
# Clobbers : rcx, rdx, rsi, r8-r11, xmm0-xmm11, xmm14, xmm15, flags
# Preserved: r12-r15 are used as scratch but parked in xmm14/xmm15 and
#            restored before returning; the stack is not touched.
#
# Method: the result is produced one column at a time. Column k sums
# every a[i] * b[j] with i + j == k into a three-register accumulator
# (low, mid, high). The low register becomes c[k]; the other two carry
# over as (low, mid) of column k + 1. r8, r9 and rsi rotate through
# these three roles. Finished limbs are collected in xmm0-xmm5.
# -----------------------------------------------------------------

# -----------------------------------------------------------------
# load (xmm6 , xmm7 , xmm8 ) = (a[0], a[1], a[2], a[3], a[4], a[5])
# load (xmm9 , xmm10, xmm11) = (b[0], b[1], b[2], b[3], b[4], b[5])
# Unaligned loads: a uint64_t array is only guaranteed to be 8-byte
# aligned, and movdqa would fault on such an address.
# -----------------------------------------------------------------
    movdqu  (%rsi), %xmm6
    movdqu  16(%rsi), %xmm7
    movdqu  32(%rsi), %xmm8
    movdqu  (%rdx), %xmm9
    movdqu  16(%rdx), %xmm10
    movdqu  32(%rdx), %xmm11
# ---------------------------------------------------------
# backup (r12, r13, r14, r15) into xmm14 / xmm15
# xmm14 = r13:r12, xmm15 = r15:r14 (high:low)
# ---------------------------------------------------------
    movq    %r12, %xmm14
    movq    %r14, %xmm15
    pinsrq  $1, %r13, %xmm14
    pinsrq  $1, %r15, %xmm15
# -----------------------------------------------------------------
# odd limbs live in general-purpose registers for the whole routine:
# r10 = a[1] r11 = a[3] r12 = a[5] r13 = b[1] r14 = b[3] r15 = b[5]
# even limbs stay in the low halves of xmm6-xmm11 and are fetched
# with movq when needed.
# -----------------------------------------------------------------
    pextrq  $1, %xmm6, %r10
    pextrq  $1, %xmm7, %r11
    pextrq  $1, %xmm8, %r12
    pextrq  $1, %xmm9, %r13
    pextrq  $1, %xmm10, %r14
    pextrq  $1, %xmm11, %r15
# ---------------------------------------------------------
# init (r9, rsi) = 0
# r8 needs no clearing: column 0 overwrites it before any read.
# rsi (pointer to a) is free because a is already loaded.
# ---------------------------------------------------------
    xorq    %r9, %r9
    xorq    %rsi, %rsi
# -----------
# column 0 -> c[0]
# a[0] * b[0]
# -----------
    movq    %xmm6, %rcx
    movq    %xmm9, %rax
    mulq    %rcx                    # also overwrites rdx (pointer to b, already loaded)
# ----
    movq    %rax, %xmm0             # c[0]
    movq    %rdx, %r8               # carry into column 1
# -----------
# column 1 -> c[1]      accumulator (low, mid, high) = (r8, r9, rsi)
# a[0] * b[1]
# a[1] * b[0]
# -----------
    movq    %xmm6, %rax
    mulq    %r13
    addq    %rax, %r8
    adcq    %rdx, %r9               # r9 was 0, so this cannot carry out
# ----
    movq    %xmm9, %rax
    mulq    %r10
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    pinsrq  $1, %r8, %xmm0          # c[1]
    xorq    %r8, %r8
# -----------
# column 2 -> c[2]      accumulator (low, mid, high) = (r9, rsi, r8)
# a[0] * b[2]
# a[1] * b[1]
# a[2] * b[0]
# -----------
    movq    %xmm10, %rcx
    movq    %xmm6, %rax
    mulq    %rcx
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %r10, %rax
    mulq    %r13
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm9, %rcx
    movq    %xmm7, %rax
    mulq    %rcx
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %r9, %xmm1              # c[2]
    xorq    %r9, %r9
# -----------
# column 3 -> c[3]      accumulator (low, mid, high) = (rsi, r8, r9)
# a[0] * b[3]
# a[1] * b[2]
# a[2] * b[1]
# a[3] * b[0]
# -----------
    movq    %xmm6, %rax
    mulq    %r14
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm10, %rax
    mulq    %r10
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm7, %rax
    mulq    %r13
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm9, %rax
    mulq    %r11
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    pinsrq  $1, %rsi, %xmm1         # c[3]
    xorq    %rsi, %rsi
# -----------
# column 4 -> c[4]      accumulator (low, mid, high) = (r8, r9, rsi)
# a[0] * b[4]
# a[1] * b[3]
# a[2] * b[2]
# a[3] * b[1]
# a[4] * b[0]
# -----------
    movq    %xmm11, %rcx
    movq    %xmm6, %rax
    mulq    %rcx
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %r10, %rax
    mulq    %r14
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %xmm7, %rcx
    movq    %xmm10, %rax
    mulq    %rcx
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %r11, %rax
    mulq    %r13
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %xmm9, %rcx
    movq    %xmm8, %rax
    mulq    %rcx
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %r8, %xmm2              # c[4]
    xorq    %r8, %r8
# -----------
# column 5 -> c[5]      accumulator (low, mid, high) = (r9, rsi, r8)
# a[0] * b[5]
# a[1] * b[4]
# a[2] * b[3]
# a[3] * b[2]
# a[4] * b[1]
# a[5] * b[0]
# -----------
    movq    %xmm6, %rax
    mulq    %r15
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm11, %rax
    mulq    %r10
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm7, %rax
    mulq    %r14
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm10, %rax
    mulq    %r11
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm8, %rax
    mulq    %r13
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm9, %rax
    mulq    %r12
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    pinsrq  $1, %r9, %xmm2          # c[5]
    xorq    %r9, %r9
# -----------
# column 6 -> c[6]      accumulator (low, mid, high) = (rsi, r8, r9)
# a[1] * b[5]
# a[2] * b[4]
# a[3] * b[3]
# a[4] * b[2]
# a[5] * b[1]
# -----------
    movq    %r10, %rax
    mulq    %r15
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm11, %rcx
    movq    %xmm7, %rax
    mulq    %rcx
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %r11, %rax
    mulq    %r14
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm10, %rcx
    movq    %xmm8, %rax
    mulq    %rcx
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %r12, %rax
    mulq    %r13
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %rsi, %xmm3             # c[6]
    xorq    %rsi, %rsi
# -----------
# column 7 -> c[7]      accumulator (low, mid, high) = (r8, r9, rsi)
# a[2] * b[5]
# a[3] * b[4]
# a[4] * b[3]
# a[5] * b[2]
# -----------
    movq    %xmm7, %rax
    mulq    %r15
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %xmm11, %rax
    mulq    %r11
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %xmm8, %rax
    mulq    %r14
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    movq    %xmm10, %rax
    mulq    %r12
    addq    %rax, %r8
    adcq    %rdx, %r9
    adcq    $0, %rsi
# ----
    pinsrq  $1, %r8, %xmm3          # c[7]
    xorq    %r8, %r8
# -----------
# column 8 -> c[8]      accumulator (low, mid, high) = (r9, rsi, r8)
# a[3] * b[5]
# a[4] * b[4]
# a[5] * b[3]
# -----------
    movq    %r11, %rax
    mulq    %r15
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %xmm11, %r11            # a[3] is not used again; r11 now holds b[4]
    movq    %xmm8, %rax
    mulq    %r11
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %r12, %rax
    mulq    %r14
    addq    %rax, %r9
    adcq    %rdx, %rsi
    adcq    $0, %r8
# ----
    movq    %r9, %xmm4              # c[8]
    xorq    %r9, %r9
# -----------
# column 9 -> c[9]      accumulator (low, mid, high) = (rsi, r8, r9)
# a[4] * b[5]
# a[5] * b[4]
# -----------
    movq    %xmm8, %rax
    mulq    %r15
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    movq    %xmm11, %rax
    mulq    %r12
    addq    %rax, %rsi
    adcq    %rdx, %r8
    adcq    $0, %r9
# ----
    pinsrq  $1, %rsi, %xmm4         # c[9]
# -----------
# column 10 -> c[10], c[11]
# a[5] * b[5]
# the last product is added into (r8, r9), which become the top limbs
# -----------
    movq    %r12, %rax
    mulq    %r15
    addq    %rax, %r8
    adcq    %rdx, %r9
# ----
    movq    %r8, %xmm5              # c[10]
    pinsrq  $1, %r9, %xmm5          # c[11]
# --------------------------------------
# restore (r12, r13, r14, r15)
# --------------------------------------
    movq    %xmm14, %r12
    movq    %xmm15, %r14
    pextrq  $1, %xmm14, %r13
    pextrq  $1, %xmm15, %r15
# -------------------------------------------
# output (xmm0, xmm1, xmm2, xmm3, xmm4, xmm5)
# unaligned stores, for the same reason as the loads
# -------------------------------------------
    movdqu  %xmm0, (%rdi)
    movdqu  %xmm1, 16(%rdi)
    movdqu  %xmm2, 32(%rdi)
    movdqu  %xmm3, 48(%rdi)
    movdqu  %xmm4, 64(%rdi)
    movdqu  %xmm5, 80(%rdi)
# ------
# return 0
# No emms here: this routine never touches the MMX registers, so
# there is no MMX state to clear.
# ------
    xorl    %eax, %eax              # writing eax zero-extends: rax = 0
    xorl    %edx, %edx              # rdx = 0 (drops the high half of a[5] * b[5])
    ret
将上述完整的汇编代码插入上篇生成的汇编模板文件 mul384.s 中,即得到完整的汇编源文件;配合头文件(见上篇),即可在 C 程序中调用。