.file "mp_mod_sm2.c"
.text
.p2align 4,,15
.globl mp_mod_sm2
.type mp_mod_sm2, @function
mp_mod_sm2:
.LFB0:
.cfi_startproc
# uint64_t mp_mod_sm2(uint64_t r[4], uint64_t a[8])
# rdi = r
# rsi = a
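# --------------------------------------------------------------
# reduction identity (SM2 prime), used throughout:
#   p     = 2^256 - 2^224 - 2^96 + 2^64 - 1
#   2^256 = 2^224 + 2^96 - 2^64 + 1   (mod p)
# a00 ~ a15 below denote the 16 little-endian 32-bit words of the
# 512-bit input; "up" denotes the carry (upper 32 bits of rax)
# propagated from the previous output word
# --------------------------------------------------------------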
# ---------------------------
# backup (r12, r13, r14, r15)
# ---------------------------
movq %r12, %xmm14
movq %r14, %xmm15
pinsrq $1, %r13, %xmm14
pinsrq $1, %r15, %xmm15
# ------------------------------------
# load (xmm10 ~ xmm13) = (a[0] ~ a[7])
# ------------------------------------
movdqa (%rsi), %xmm10
movdqa 16(%rsi), %xmm11
movdqa 32(%rsi), %xmm12
movdqa 48(%rsi), %xmm13
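# (32-bit word view: xmm10 = {a00..a03}, xmm11 = {a04..a07},
#  xmm12 = {a08..a11}, xmm13 = {a12..a15}; note movdqa requires
#  the a and r pointers to be 16-byte aligned)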
# ---------
# r15 = a15
# ---------
pextrd $3, %xmm13, %r15d
# ---------------
# r14 = a14 + a15
# ---------------
pextrd $2, %xmm13, %r14d
addq %r15, %r14
# ---------------------
# r13 = a13 + a14 + a15
# ---------------------
pextrd $1, %xmm13, %r13d
addq %r14, %r13
# ---------------------------
# r12 = a12 + a13 + a14 + a15
# ---------------------------
movd %xmm13, %r12d
addq %r13, %r12
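# (r15, r14, r13, r12 now hold the suffix sums of a12 ~ a15,
#  computed once and reused by several word sums below)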
# --------
# r8 = a08
# --------
movd %xmm12, %r8d
# --------
# r9 = a09
# --------
pextrd $1, %xmm12, %r9d
# --------
# r10 = a10
# --------
pextrd $2, %xmm12, %r10d
# --------
# r11 = a11
# --------
pextrd $3, %xmm12, %r11d
# ---------------------------
# rsi = a08 + a09 + a10 + a11
# ---------------------------
movq %r8, %rsi
addq %r9, %rsi
addq %r10, %rsi
addq %r11, %rsi
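# (rsi is reused by the word sums for words 0, 1 and 7 below)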
# ---------------------------------------------------------------------------
# a00 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a13 + a14 + a15)
# ---------------------------------------------------------------------------
movd %xmm10, %eax
addq %rsi, %rax
addq %r12, %rax
addq %r13, %rax
movd %eax, %xmm0
# --------------------------------------------------------------------------------
# up + a01 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a14 + a15) - a08
# --------------------------------------------------------------------------------
shr $32, %rax
pextrd $1, %xmm10, %edx
addq %rdx, %rax
addq %rsi, %rax
addq %r12, %rax
addq %r14, %rax
subq %r8, %rax
pinsrd $1, %eax, %xmm0
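# (the subtraction cannot underflow: rax already contains rsi,
#  and rsi >= a08)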
# -------------------------------------------
# up + a02 + (2 ^ 34) - a08 - a09 - a13 - a14
# -------------------------------------------
shr $32, %rax
pextrd $2, %xmm10, %edx
addq %rdx, %rax
movq $1, %rdx
shl $34, %rdx
addq %rdx, %rax
subq %r8, %rax
subq %r9, %rax
pextrd $1, %xmm13, %edx
subq %rdx, %rax
pextrd $2, %xmm13, %edx
subq %rdx, %rax
pinsrd $2, %eax, %xmm0
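# --------------------------------------------------------------
# the constants 2^34 (above) and 2^32, -4, -1 (in the next two
# words) only keep the running value non-negative; weighted by
# their word positions they cancel exactly:
#   +2^34 * 2^64 = +2^98    vs    -4 * 2^96  = -2^98
#   +2^32 * 2^96 = +2^128   vs    -1 * 2^128 = -2^128
# --------------------------------------------------------------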
# -------------------------------------------------------------------
# up + a03 + (2 ^ 32) + (a12 + a13 + a14 + a15) + a08 + a11 + a13 - 4
# -------------------------------------------------------------------
shr $32, %rax
pextrd $3, %xmm10, %edx
addq %rdx, %rax
movq $1, %rdx
shl $32, %rdx
addq %rdx, %rax
addq %r12, %rax
addq %r8, %rax
addq %r11, %rax
pextrd $1, %xmm13, %edx
addq %rdx, %rax
subq $4, %rax
pinsrd $3, %eax, %xmm0
# --------------------------------------------------
# up + a04 + (a12 + a13 + a14 + a15) + a09 + a14 - 1
# --------------------------------------------------
shr $32, %rax
movd %xmm11, %edx
addq %rdx, %rax
addq %r12, %rax
addq %r9, %rax
pextrd $2, %xmm13, %edx
addq %rdx, %rax
decq %rax
movd %eax, %xmm1
# ----------------------------------------
# up + a05 + (a13 + a14 + a15) + a10 + a15
# ----------------------------------------
shr $32, %rax
pextrd $1, %xmm11, %edx
addq %rdx, %rax
addq %r13, %rax
addq %r10, %rax
addq %r15, %rax
pinsrd $1, %eax, %xmm1
# ----------------------------
# up + a06 + (a14 + a15) + a11
# ----------------------------
shr $32, %rax
pextrd $2, %xmm11, %edx
addq %rdx, %rax
addq %r14, %rax
addq %r11, %rax
pinsrd $2, %eax, %xmm1
# ---------------------------------------------------------------------------------------------
# up + a07 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a12 + a13 + a14 + a15) + a15
# ---------------------------------------------------------------------------------------------
shr $32, %rax
pextrd $3, %xmm11, %edx
addq %rdx, %rax
addq %rsi, %rax
addq %r12, %rax
addq %r12, %rax
addq %r15, %rax
pinsrd $3, %eax, %xmm1
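# (worst case: a word sum is at most ~14 * 2^32 plus the 2^34
#  offset and a small carry, far below 2^64, so rax never
#  overflows and up = rax >> 32 is always the exact carry)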
# -----
# final
# -----
movq %xmm0, %r12
movq %xmm1, %r14
pextrq $1, %xmm0, %r13
pextrq $1, %xmm1, %r15
shr $32, %rax
movq %rax, %rdx
shl $32, %rdx
movq %rdx, %rsi
subq %rax, %rsi
addq %rax, %r12
adcq %rsi, %r13
adcq $0, %r14
adcq %rdx, %r15
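# --------------------------------------------------------------
# the adc chain above folds the final carry c = rax back in via
#   c * 2^256 = c * (2^224 + 2^96 - 2^64 + 1)   (mod p)
# as:
#   +c             at bit   0  (addq %rax, %r12)
#   +c*(2^32 - 1)  at bit  64  (adcq %rsi, %r13: +c*2^96 - c*2^64)
#   +c << 32       at bit 192  (adcq %rdx, %r15: +c*2^224)
# a carry out of the last adcq, and a result >= p, are not
# handled here (see the note at the end of the file)
# --------------------------------------------------------------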
movq %r12, %xmm0
movq %r14, %xmm1
pinsrq $1, %r13, %xmm0
pinsrq $1, %r15, %xmm1
# ----------------------------
# restore (r12, r13, r14, r15)
# ----------------------------
movq %xmm14, %r12
movq %xmm15, %r14
pextrq $1, %xmm14, %r13
pextrq $1, %xmm15, %r15
# -------------------
# output (xmm0, xmm1)
# -------------------
movdqa %xmm0, (%rdi)
movdqa %xmm1, 16(%rdi)
# return 0 (rax); rdx, which held intermediate data, is cleared too
xorq %rax, %rax
xorq %rdx, %rdx
ret
.cfi_endproc
.LFE0:
.size mp_mod_sm2, .-mp_mod_sm2
.ident "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-4)"
.section .note.GNU-stack,"",@progbits
# NOTE: the final carry-fold above is, strictly speaking, not fully
# rigorous mathematically; it would be easy to fix.
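# a minimal sketch of a rigorous finish (untested, assumes it is
# placed right after "adcq %rdx, %r15" while CF is still live):
# capture the carry out of bit 255 and fold it once more with the
# same identity; because c is only a few bits wide here, the second
# fold cannot carry out again, and one conditional subtraction of p
# then gives a result < p:
#
#     movq  $0, %rax          # mov does not disturb CF
#     adcq  $0, %rax          # rax = c2 (0 or 1), carry out of bit 255
#     movq  %rax, %rdx
#     shlq  $32, %rdx         # rdx = c2 * 2^32
#     movq  %rdx, %rsi
#     subq  %rax, %rsi        # rsi = c2 * (2^32 - 1)
#     addq  %rax, %r12        # fold c2 * 2^256 mod p, as before
#     adcq  %rsi, %r13
#     adcq  $0, %r14
#     adcq  %rdx, %r15
#     # finally, compare (r15:r14:r13:r12) with p and subtract p
#     # once if the value is >= p (constant-time select preferred)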