本文介绍的是一种特定场景下的大数快速取模算法,对于
先将
如下所示:
于是
令
并且易知
特别有意思的是,当
我们可以得到很多黑魔法,比如对整数每部分进行累加、二进制读取。
![38e6dd3509f907da6d7725b35157e2b3.png](https://i-blog.csdnimg.cn/blog_migrate/bbb6e1ce3cff1ff8f38e55561b64bcc2.jpeg)
令函数
于是有了最终结论:
特别是当
举个栗子,在secp256k1中p=0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f,经常需要模p乘法。有2个大整数
a=0xb5003f7d80f965825706b2c4bbbf1c70b3b02cf65141c6e9d4006205526e919a
b=0xa95780689fd0168ae72b563711bd226bce465dda6d7fca7d64d4e64f26f8a081
求a*b%p。
a*b = 0x77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a
f(a*b) = 0x8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a + 0x1000003d1 * 0x77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df = 0x77bb099300fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9
f(f(a*b)) = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9 + 0x1000003d1 * 0x77bb0993 = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac
a*b%p = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac
用python验算一下:
![62a6b5fc0acc05164fe4fe85aa7bbb4d.png](https://i-blog.csdnimg.cn/blog_migrate/d6383a723a37e4fbca9546d26b974f20.png)
在secp256k1中,
最后直接上代码。
// secp256k1的uint256模乘
#include <cinttypes>
inline static void mul(uint64_t x, uint64_t y, uint64_t& low, uint64_t& high) {
__asm(R"(
mulq %3
)" : "=a"(low), "=d"(high) : "a"(x), "r"(y));
}
inline static void square(uint64_t x, uint64_t& low, uint64_t& high) {
__asm(R"(
mulq %[x]
)" : "=a"(low), "=d"(high) : [x] "a"(x));
}
inline static void add(uint64_t x, uint64_t& y, uint64_t& of1) {
__asm(R"(
addq %[x], %[y]
adcq $0, %[of1]
)" : [y] "+r"(y), [of1]"+r"(of1) : [x] "r"(x));
}
inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2) {
__asm(R"(
addq %[x], %[y]
adcq $0, %[of1]
adcq $0, %[of2]
)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2) : [x] "r"(x));
}
inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3) {
__asm(R"(
addq %[x], %[y]
adcq $0, %[of1]
adcq $0, %[of2]
adcq $0, %[of3]
)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3) : [x] "r"(x));
}
inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3, uint64_t& of4) {
__asm(R"(
addq %[x], %[y]
adcq $0, %[of1]
adcq $0, %[of2]
adcq $0, %[of3]
adcq $0, %[of4]
)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3), [of4]"+r"(of4) : [x] "r"(x));
}
inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3, uint64_t& of4, uint64_t& of5) {
__asm(R"(
addq %[x], %[y]
adcq $0, %[of1]
adcq $0, %[of2]
adcq $0, %[of3]
adcq $0, %[of4]
adcq $0, %[of5]
)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3), [of4]"+r"(of4), [of5]"+r"(of5) : [x] "r"(x));
}
inline static void add_u512_offset1(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5, uint64_t x6, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5, uint64_t& y6, uint64_t& y7) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq %[x3], %[y3]
adcq %[x4], %[y4]
adcq %[x5], %[y5]
adcq %[x6], %[y6]
adcq $0, %[y7]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5), [y6]"+r"(y6), [y7]"+r"(y7) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4), [x5] "r"(x5), [x6] "r"(x6));
}
inline static void add_u512_offset2(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5, uint64_t& y6) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq %[x3], %[y3]
adcq %[x4], %[y4]
adcq $0, %[y5]
adcq $0, %[y6]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5), [y6]"+r"(y6) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4));
}
inline static void add_u512_offset3(uint64_t x1, uint64_t x2, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq $0, %[y3]
adcq $0, %[y4]
adcq $0, %[y5]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2));
}
inline static void add_u320_offset1(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq %[x3], %[y3]
adcq $0, %[y4]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3));
}
inline static void add_u320_u256(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq %[x3], %[y3]
adcq %[x4], %[y4]
adcq $0, %[y5]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4));
}
inline static void add_u320_u128(uint64_t x1, uint64_t x2, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
__asm(R"(
addq %[x1], %[y1]
adcq %[x2], %[y2]
adcq $0, %[y3]
adcq $0, %[y4]
adcq $0, %[y5]
)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2));
}
#define ALWAYS_INLINE __attribute__((always_inline))
ALWAYS_INLINE
inline static void u320_mod_p(uint64_t& x1, uint64_t& x2, uint64_t& x3, uint64_t& x4, uint64_t x5) {
constexpr uint64_t negP = 0x1000003d1; // 2^256 - p
uint64_t t0, t1;
mul(x5, negP, t0, t1);
x5 = 0;
add_u320_u128(t0, t1, x1, x2, x3, x4, x5);
if (x5 != 0 || !(~x4 || ~x3 || ~x2 || x1 < ~negP + 1)) {
add(negP, x1, x2, x3, x4); // 加-p相当于减p
}
}
ALWAYS_INLINE
inline static void u512_mod_p(uint64_t& x1, uint64_t& x2, uint64_t& x3, uint64_t& x4, uint64_t x5, uint64_t x6, uint64_t x7, uint64_t x8) {
constexpr uint64_t negP = 0x1000003d1; // 2^256 - p
/*
x8 x7 x6 x5
-p
-------------------------------------------------------------
H3 <- x8' H2 <- x7' H1 <- x6' H0 <- x5'
-------------------------------------------------------------
H3 x8' x7' x6' x5'
H2 H1 H0
*/
uint64_t H0, H1, H2, H3;
mul(x5, negP, x5, H0);
mul(x6, negP, x6, H1);
mul(x7, negP, x7, H2);
mul(x8, negP, x8, H3);
add_u320_offset1(H0, H1, H2, x6, x7, x8, H3); // 用 [x5,x6,x7,x8,H3] 存 uint320
add_u320_u256(x5, x6, x7, x8, x1, x2, x3, x4, H3);
u320_mod_p(x1, x2, x3, x4, H3);
}
extern "C" __declspec(dllexport)
void u256_x_u256(const uint8_t a[32], const uint8_t b[32], uint8_t r[32]) {
/*
3 2 1 0
3 2 1 0
------------------------------------------------------------------------------------------------------
H30 <- L30 H20 <- L20 H10 <- L10 H00 <- L00
H31 <- L31 H21 <- L21 H11 <- L11 H01 <- L01
H32 <- L32 H22 <- L22 H12 <- L12 H02 <- L02
H33 <- L33 H23 <- L23 H13 <- L13 H03 <- L03
------------------------------------------------------------------------------------------------------
H33 L33 L32 L31 L30 L20 L10 L00
H32 L23 L22 L21 L11 L01
H23 H31 L13 L12 L02 H00
H22 H30 L03 H10
H13 H21 H20 H01
H12 H11
H03 H02
*/
uint64_t t0, t1, t2, t3, t4, t5, t6, t7;
const uint64_t* pa = (const uint64_t*)a, * pb = (const uint64_t*)b;
uint64_t* pr = (uint64_t*)r;
uint64_t L00, L01, L02, L03, L10, L11, L12, L13, L20, L21, L22, L23, L30, L31, L32, L33;
uint64_t H00, H01, H02, H03, H10, H11, H12, H13, H20, H21, H22, H23, H30, H31, H32, H33;
mul(pa[0], pb[0], L00, H00);
mul(pa[0], pb[1], L01, H01);
mul(pa[0], pb[2], L02, H02);
mul(pa[0], pb[3], L03, H03);
mul(pa[1], pb[0], L10, H10);
mul(pa[1], pb[1], L11, H11);
mul(pa[1], pb[2], L12, H12);
mul(pa[1], pb[3], L13, H13);
mul(pa[2], pb[0], L20, H20);
mul(pa[2], pb[1], L21, H21);
mul(pa[2], pb[2], L22, H22);
mul(pa[2], pb[3], L23, H23);
mul(pa[3], pb[0], L30, H30);
mul(pa[3], pb[1], L31, H31);
mul(pa[3], pb[2], L32, H32);
mul(pa[3], pb[3], L33, H33);
t0 = L00;
t1 = L10;
t2 = L20;
t3 = L30;
t4 = L31;
t5 = L32;
t6 = L33;
t7 = H33;
add_u512_offset1(L01, L11, L21, L22, L23, H32, t1, t2, t3, t4, t5, t6, t7);
add_u512_offset1(H00, L02, L12, L13, H31, H23, t1, t2, t3, t4, t5, t6, t7);
add_u512_offset2(H10, L03, H30, H22, t2, t3, t4, t5, t6, t7);
add_u512_offset2(H01, H20, H21, H13, t2, t3, t4, t5, t6, t7);
add_u512_offset3(H11, H12, t3, t4, t5, t6, t7);
add_u512_offset3(H02, H03, t3, t4, t5, t6, t7);
u512_mod_p(t0, t1, t2, t3, t4, t5, t6, t7);
pr[0] = t0;
pr[1] = t1;
pr[2] = t2;
pr[3] = t3;
}