大数加法_大数快速取模魔法

最新推荐文章于 2022-10-16 01:34:05 发布

weixin_40006965

最新推荐文章于 2022-10-16 01:34:05 发布

阅读量296

点赞数

文章标签：大数加法

本文链接：https://blog.csdn.net/weixin_40006965/article/details/112673748

版权

本文介绍的是一种特定场景下的大数快速取模算法，对于

当

非常接近2的整数次幂时，该算法十分高效。

先将

以二进制的形式表示出来，从低位开始取出每

位，得到一个数列

。其中

是一个满足

的整数，即

。

如下所示：

于是

。

令

，可以得到以下结论：

并且易知

。当且仅当

时，等号成立。

特别有意思的是，当

：

我们可以得到很多黑魔法，比如对整数每部分进行累加、二进制读取。

的情况很多小伙伴应该都知道了。

令函数

，经过有限次的迭代，最终将得到

。

于是有了最终结论：

特别是当

很小时，需要的迭代次数非常少，这个算法变得非常高效，并且只有乘法和加法。通常的模乘场景中

的二进制位数不会超过

，因此

。

举个栗子，在secp256k1中p=0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f，经常需要模p乘法。有2个大整数

a=0xb5003f7d80f965825706b2c4bbbf1c70b3b02cf65141c6e9d4006205526e919a

b=0xa95780689fd0168ae72b563711bd226bce465dda6d7fca7d64d4e64f26f8a081

求a*b%p。

a*b = 0x77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a

f(a*b) = 0x8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a + 0x1000003d1 * 0x77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df = 0x77bb099300fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9

f(f(a*b)) = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9 + 0x1000003d1 * 0x77bb0993 = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

a*b%p = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

用python验算一下：

在secp256k1中，

迭代次数不会超过3次，并且在第3次，

只能是0或1。

最后直接上代码。

// secp256k1的uint256模乘
#include <cinttypes>

inline static void mul(uint64_t x, uint64_t y, uint64_t& low, uint64_t& high) {
	__asm(R"(
		mulq %3
	)" : "=a"(low), "=d"(high) : "a"(x), "r"(y));
}

inline static void square(uint64_t x, uint64_t& low, uint64_t& high) {
	__asm(R"(
		mulq %[x]
	)" : "=a"(low), "=d"(high) : [x] "a"(x));
}

inline static void add(uint64_t x, uint64_t& y, uint64_t& of1) {
	__asm(R"(
		addq %[x], %[y]
		adcq $0, %[of1]
	)" : [y] "+r"(y), [of1]"+r"(of1) : [x] "r"(x));
}

inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2) {
	__asm(R"(
		addq %[x], %[y]
		adcq $0, %[of1]
		adcq $0, %[of2]
	)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2) : [x] "r"(x));
}

inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3) {
	__asm(R"(
		addq %[x], %[y]
		adcq $0, %[of1]
		adcq $0, %[of2]
		adcq $0, %[of3]
	)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3) : [x] "r"(x));
}

inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3, uint64_t& of4) {
	__asm(R"(
		addq %[x], %[y]
		adcq $0, %[of1]
		adcq $0, %[of2]
		adcq $0, %[of3]
		adcq $0, %[of4]
	)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3), [of4]"+r"(of4) : [x] "r"(x));
}

inline static void add(uint64_t x, uint64_t& y, uint64_t& of1, uint64_t& of2, uint64_t& of3, uint64_t& of4, uint64_t& of5) {
	__asm(R"(
		addq %[x], %[y]
		adcq $0, %[of1]
		adcq $0, %[of2]
		adcq $0, %[of3]
		adcq $0, %[of4]
		adcq $0, %[of5]
	)" : [y] "+r"(y), [of1]"+r"(of1), [of2]"+r"(of2), [of3]"+r"(of3), [of4]"+r"(of4), [of5]"+r"(of5) : [x] "r"(x));
}

inline static void add_u512_offset1(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5, uint64_t x6, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5, uint64_t& y6, uint64_t& y7) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq %[x3], %[y3]
		adcq %[x4], %[y4]
		adcq %[x5], %[y5]
		adcq %[x6], %[y6]
		adcq $0, %[y7]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5), [y6]"+r"(y6), [y7]"+r"(y7) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4), [x5] "r"(x5), [x6] "r"(x6));
}

inline static void add_u512_offset2(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5, uint64_t& y6) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq %[x3], %[y3]
		adcq %[x4], %[y4]
		adcq $0, %[y5]
		adcq $0, %[y6]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5), [y6]"+r"(y6) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4));
}

inline static void add_u512_offset3(uint64_t x1, uint64_t x2, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq $0, %[y3]
		adcq $0, %[y4]
		adcq $0, %[y5]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2));
}

inline static void add_u320_offset1(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq %[x3], %[y3]
		adcq $0, %[y4]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3));
}

inline static void add_u320_u256(uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq %[x3], %[y3]
		adcq %[x4], %[y4]
		adcq $0, %[y5]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [x4] "r"(x4));
}

inline static void add_u320_u128(uint64_t x1, uint64_t x2, uint64_t& y1, uint64_t& y2, uint64_t& y3, uint64_t& y4, uint64_t& y5) {
	__asm(R"(
		addq %[x1], %[y1]
		adcq %[x2], %[y2]
		adcq $0, %[y3]
		adcq $0, %[y4]
		adcq $0, %[y5]
	)" : [y1] "+r"(y1), [y2]"+r"(y2), [y3]"+r"(y3), [y4]"+r"(y4), [y5]"+r"(y5) : [x1] "r"(x1), [x2] "r"(x2));
}


#define ALWAYS_INLINE __attribute__((always_inline))


ALWAYS_INLINE
inline static void u320_mod_p(uint64_t& x1, uint64_t& x2, uint64_t& x3, uint64_t& x4, uint64_t x5) {
	constexpr uint64_t negP = 0x1000003d1; // 2^256 - p
	uint64_t t0, t1;
	mul(x5, negP, t0, t1);
	x5 = 0;
	add_u320_u128(t0, t1, x1, x2, x3, x4, x5);

	if (x5 != 0 || !(~x4 || ~x3 || ~x2 || x1 < ~negP + 1)) {
		add(negP, x1, x2, x3, x4); // 加-p相当于减p
	}
}

ALWAYS_INLINE
inline static void u512_mod_p(uint64_t& x1, uint64_t& x2, uint64_t& x3, uint64_t& x4, uint64_t x5, uint64_t x6, uint64_t x7, uint64_t x8) {
	constexpr uint64_t negP = 0x1000003d1; // 2^256 - p
/*
               x8            x7            x6            x5
                                                         -p
-------------------------------------------------------------
         H3 <- x8'     H2 <- x7'     H1 <- x6'     H0 <- x5'
-------------------------------------------------------------
 H3            x8'           x7'           x6'           x5'
               H2            H1            H0
*/
	uint64_t H0, H1, H2, H3;
	mul(x5, negP, x5, H0);
	mul(x6, negP, x6, H1);
	mul(x7, negP, x7, H2);
	mul(x8, negP, x8, H3);
	add_u320_offset1(H0, H1, H2, x6, x7, x8, H3); // 用 [x5,x6,x7,x8,H3] 存 uint320

	add_u320_u256(x5, x6, x7, x8, x1, x2, x3, x4, H3);

	u320_mod_p(x1, x2, x3, x4, H3);
}

extern "C" __declspec(dllexport)
void u256_x_u256(const uint8_t a[32], const uint8_t b[32], uint8_t r[32]) {
	/*
                                                         3             2             1             0
                                                         3             2             1             0
------------------------------------------------------------------------------------------------------
                                                 H30 <- L30    H20 <- L20    H10 <- L10    H00 <- L00
                                   H31 <- L31    H21 <- L21    H11 <- L11    H01 <- L01
                     H32 <- L32    H22 <- L22    H12 <- L12    H02 <- L02
       H33 <- L33    H23 <- L23    H13 <- L13    H03 <- L03
------------------------------------------------------------------------------------------------------
H33           L33           L32           L31           L30           L20           L10           L00
              H32           L23           L22           L21           L11           L01
              H23           H31           L13           L12           L02           H00
                            H22           H30           L03           H10
                            H13           H21           H20           H01
                                          H12           H11
                                          H03           H02
	*/

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7;
	const uint64_t* pa = (const uint64_t*)a, * pb = (const uint64_t*)b;
	uint64_t* pr = (uint64_t*)r;
	uint64_t L00, L01, L02, L03, L10, L11, L12, L13, L20, L21, L22, L23, L30, L31, L32, L33;
	uint64_t H00, H01, H02, H03, H10, H11, H12, H13, H20, H21, H22, H23, H30, H31, H32, H33;

	mul(pa[0], pb[0], L00, H00);
	mul(pa[0], pb[1], L01, H01);
	mul(pa[0], pb[2], L02, H02);
	mul(pa[0], pb[3], L03, H03);
	mul(pa[1], pb[0], L10, H10);
	mul(pa[1], pb[1], L11, H11);
	mul(pa[1], pb[2], L12, H12);
	mul(pa[1], pb[3], L13, H13);
	mul(pa[2], pb[0], L20, H20);
	mul(pa[2], pb[1], L21, H21);
	mul(pa[2], pb[2], L22, H22);
	mul(pa[2], pb[3], L23, H23);
	mul(pa[3], pb[0], L30, H30);
	mul(pa[3], pb[1], L31, H31);
	mul(pa[3], pb[2], L32, H32);
	mul(pa[3], pb[3], L33, H33);

	t0 = L00;
	t1 = L10;
	t2 = L20;
	t3 = L30;
	t4 = L31;
	t5 = L32;
	t6 = L33;
	t7 = H33;

	add_u512_offset1(L01, L11, L21, L22, L23, H32, t1, t2, t3, t4, t5, t6, t7);
	add_u512_offset1(H00, L02, L12, L13, H31, H23, t1, t2, t3, t4, t5, t6, t7);
	add_u512_offset2(H10, L03, H30, H22, t2, t3, t4, t5, t6, t7);
	add_u512_offset2(H01, H20, H21, H13, t2, t3, t4, t5, t6, t7);
	add_u512_offset3(H11, H12, t3, t4, t5, t6, t7);
	add_u512_offset3(H02, H03, t3, t4, t5, t6, t7);

	u512_mod_p(t0, t1, t2, t3, t4, t5, t6, t7);

	pr[0] = t0;
	pr[1] = t1;
	pr[2] = t2;
	pr[3] = t3;
}

weixin_40006965

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
大数加法_大数快速取模魔法

本文介绍的是一种特定场景下的大数快速取模算法，对于当非常接近2的整数次幂时，该算法十分高效。先将以二进制的形式表示出来，从低位开始取出每位，得到一个数列。其中是一个满足的整数，即。如下所示：于是。令，可以得到以下结论：并且易知。当且仅当时，等号成立。特别有意思的是，当：我们可以得到很多黑魔法，比如对整数每部分进行累加、二进制读取。的情况很多...
复制链接

扫一扫