大數快速取模魔法

作者：由九條可憐發表于書法時間：2019-08-15

本文介紹的是一種特定場景下的大數快速取模演算法，對於

$x\bmod n$

當 $n$ 非常接近2的整數次冪時，該演算法十分高效。

先將 $x$ 以二進位制的形式表示出來，從低位開始取出每 $M$ 位，得到一個數列 $x_0, x_1, x_2, ...$。其中 $M$ 是一個滿足

$2^{M-1}<n\leq 2^{M}$

的整數，即

$M=\lceil \log_2 n\rceil$

。

如下所示：

$\begin{matrix} ...&\underbrace{bb...bb} &\underbrace{bb...bb} &\underbrace{bb...bb}\\ &{^{M位}} &{^{M位}} &{^{M位}}\\ ...&x_2&x_1&x_0\\ \end{matrix}\\$

於是

$x=x_0+2^{M}x_1+2^{2M}x_2+...$

。

令 $k=2^M-n$（於是 $2^M\equiv k\ (mod\ n)$），可以得到以下結論：

$\begin{align} x_0+2^{M}x_1+2^{2M}x_2+...&\equiv x\ (mod\ n)\\ x_0+(2^M-n)^1x_1+(2^M-n)^2x_2+...&\equiv x\ (mod\ n)\\ x_0+k^1x_1+k^2x_2+...&\equiv x\ (mod\ n)\\ \end{align}\\$

並且易知

$x_0+k^1x_1+k^2x_2+...\leq x_0+2^{M}x_1+2^{2M}x_2+...=x$

。當且僅當 $x<2^M$（即 $x_1=x_2=...=0$）時，等號成立。

特別有意思的是，當 $k=0,1,2$ 時：

$\begin{align} x_0&\equiv x\ (mod\ 2^M)\\ x_0+x_1+x_2+...&\equiv x\ (mod\ 2^M-1)\\ x_0+2^1x_1+2^2x_2+...&\equiv x\ (mod\ 2^M-2)\\ \end{align}\\$

我們可以得到很多黑魔法，比如對整數每部分進行累加、二進位制讀取。

$k=0$ 的情況很多小夥伴應該都知道了。

令函式 $f(x)=x_0+k^1x_1+k^2x_2+...$，由於當 $x\geq 2^M$ 時 $f(x)<x$，經過有限次的迭代，最終將得到 $y=f(f(...f(x)...))<2^M$。

於是有了最終結論：

$x\bmod n = \begin{cases} y-n & \text{, } y \geq n \\ y & \text{, } y < n \end{cases}\\$

特別是當 $k$ 很小時，需要的迭代次數非常少，這個演算法變得非常高效，並且只有乘法和加法。通常的模乘場景中 $x$ 的二進位制位數不會超過 $2M$，因此 $f(x)=x_0+kx_1$。

舉個栗子，在secp256k1中p=0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f，經常需要模p乘法。有2個大整數

a=0xb5003f7d80f965825706b2c4bbbf1c70b3b02cf65141c6e9d4006205526e919a

b=0xa95780689fd0168ae72b563711bd226bce465dda6d7fca7d64d4e64f26f8a081

求a*b%p。

a*b = 0x

77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a

f（a*b） = 0x

8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a

+ 0x

1000003d1

* 0x

77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df

= 0x

77bb099300fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9

f（f（a*b）） = 0x

fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9

+ 0x

1000003d1

* 0x

77bb0993

= 0x

fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

a*b%p = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

用python驗算一下：

在secp256k1中，$M=256$，$k=2^{256}-p=\text{0x1000003d1}$，迭代次數不會超過3次，並且在第3次，$x_1$ 只能是0或1。

最後直接上程式碼。

// secp256k1的uint256模乘

#include

inline

static

void

mul

（

uint64_t

，

uint64_t

，

uint64_t

low

，

uint64_t

high

）

{

__asm

（

“（

mulq

）

” ： “

”（low）， “

”（high）： “

”（x）， “

”（y））；

}

inline

static

void

square

（

uint64_t

，

uint64_t

low

，

uint64_t

high

）

{

__asm

（

“（

mulq

［

］

）

” ： “

”（low）， “

”（high）：［x］ “

”（x））；

}

inline

static

void

add

（

uint64_t

，

uint64_t

，

uint64_t

of1

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

，

［

of1

］

）

” ：［y］ “

”（y），［of1］“

”（of1）：［x］ “

”（x））；

}

inline

static

void

add

（

uint64_t

，

uint64_t

，

uint64_t

of1

，

uint64_t

of2

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

，

［

of1

］

adcq

，

［

of2

］

）

” ：［y］ “

”（y），［of1］“

”（of1），［of2］“

”（of2）：［x］ “

”（x））；

}

inline

static

void

add

（

uint64_t

，

uint64_t

，

uint64_t

of1

，

uint64_t

of2

，

uint64_t

of3

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

，

［

of1

］

adcq

，

［

of2

］

adcq

，

［

of3

］

）

” ：［y］ “

”（y），［of1］“

”（of1），［of2］“

”（of2），［of3］“

”（of3）：［x］ “

”（x））；

}

inline

static

void

add

（

uint64_t

，

uint64_t

，

uint64_t

of1

，

uint64_t

of2

，

uint64_t

of3

，

uint64_t

of4

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

，

［

of1

］

adcq

，

［

of2

］

adcq

，

［

of3

］

adcq

，

［

of4

］

）

” ：［y］ “

”（y），［of1］“

”（of1），［of2］“

”（of2），［of3］“

”（of3），［of4］“

”（of4）：［x］ “

”（x））；

}

inline

static

void

add

（

uint64_t

，

uint64_t

，

uint64_t

of1

，

uint64_t

of2

，

uint64_t

of3

，

uint64_t

of4

，

uint64_t

of5

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

，

［

of1

］

adcq

，

［

of2

］

adcq

，

［

of3

］

adcq

，

［

of4

］

adcq

，

［

of5

］

）

” ：［y］ “

”（y），［of1］“

”（of1），［of2］“

”（of2），［of3］“

”（of3），［of4］“

”（of4），［of5］“

”（of5）：［x］ “

”（x））；

}

inline

static

void

add_u512_offset1

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4），［y5］“

”（y5），［y6］“

”（y6），［y7］“

”（y7）：［x1］ “

”（x1），［x2］ “

”（x2），［x3］ “

”（x3），［x4］ “

”（x4），［x5］ “

”（x5），［x6］ “

”（x6））；

}

inline

static

void

add_u512_offset2

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4），［y5］“

”（y5），［y6］“

”（y6）：［x1］ “

”（x1），［x2］ “

”（x2），［x3］ “

”（x3），［x4］ “

”（x4））；

}

inline

static

void

add_u512_offset3

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

adcq

，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4），［y5］“

”（y5）：［x1］ “

”（x1），［x2］ “

”（x2））；

}

inline

static

void

add_u320_offset1

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4）：［x1］ “

”（x1），［x2］ “

”（x2），［x3］ “

”（x3））；

}

inline

static

void

add_u320_u256

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4），［y5］“

”（y5）：［x1］ “

”（x1），［x2］ “

”（x2），［x3］ “

”（x3），［x4］ “

”（x4））；

}

inline

static

void

add_u320_u128

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

__asm

（

“（

addq

［

］，

［

］

adcq

［

］，

［

］

adcq

，

［

］

adcq

，

［

］

adcq

，

［

］

）

” ：［y1］ “

”（y1），［y2］“

”（y2），［y3］“

”（y3），［y4］“

”（y4），［y5］“

”（y5）：［x1］ “

”（x1），［x2］ “

”（x2））；

}

#define ALWAYS_INLINE __attribute__（（always_inline））

ALWAYS_INLINE

inline

static

void

u320_mod_p

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

constexpr

uint64_t

negP

0x1000003d1

；

// 2^256 - p

uint64_t

，

；

mul

（

，

negP

，

）；

；

add_u320_u128

（

，

）；

（

！=

！

（

negP

））

{

add

（

negP

，

）；

// 加-p相當於減p

}

ALWAYS_INLINE

inline

static

void

u512_mod_p

（

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

，

uint64_t

）

{

constexpr

uint64_t

negP

0x1000003d1

；

// 2^256 - p

x8 x7 x6 x5

-p

——————————————————————————————-

H3 <- x8‘ H2 <- x7’ H1 <- x6‘ H0 <- x5’

——————————————————————————————-

H3 x8‘ x7’ x6‘ x5’

H2 H1 H0

uint64_t

，

；

mul

（

，

negP

，

）；

mul

（

，

negP

，

）；

mul

（

，

negP

，

）；

mul

（

，

negP

，

）；

add_u320_offset1

（

，

）；

// 用［x5，x6，x7，x8，H3］存 uint320

add_u320_u256

（

，

）；

u320_mod_p

（

，

）；

}

extern

“C”

__declspec

（

dllexport

）

void

u256_x_u256

（

const

uint8_t

［

］，

const

uint8_t

［

］，

uint8_t

［

］）

{

3 2 1 0

————————————————————————————————————————————————————

H30 <- L30 H20 <- L20 H10 <- L10 H00 <- L00

H31 <- L31 H21 <- L21 H11 <- L11 H01 <- L01

H32 <- L32 H22 <- L22 H12 <- L12 H02 <- L02

H33 <- L33 H23 <- L23 H13 <- L13 H03 <- L03

————————————————————————————————————————————————————

H33 L33 L32 L31 L30 L20 L10 L00

H32 L23 L22 L21 L11 L01

H23 H31 L13 L12 L02 H00

H22 H30 L03 H10

H13 H21 H20 H01

H12 H11

H03 H02

uint64_t

，

；

const

uint64_t

（

const

uint64_t

）

，

（

const

uint64_t

）

；

uint64_t

（

uint64_t

）

；

uint64_t

L00

，

L01

，

L02

，

L03

，

L10

，

L11

，

L12

，

L13

，

L20

，

L21

，

L22

，

L23

，

L30

，

L31

，

L32

，

L33

；

uint64_t

H00

，

H01

，

H02

，

H03

，

H10

，

H11

，

H12

，

H13

，

H20

，

H21

，

H22

，

H23

，

H30

，

H31

，

H32

，

H33

；

mul

（

［

］，

［

］，

L00

，

H00

）；

mul

（

［

］，

［

］，

L01

，

H01

）；

mul

（

［

］，

［

］，

L02

，

H02

）；

mul

（

［

］，

［

］，

L03

，

H03

）；

mul

（

［

］，

［

］，

L10

，

H10

）；

mul

（

［

］，

［

］，

L11

，

H11

）；

mul

（

［

］，

［

］，

L12

，

H12

）；

mul

（

［

］，

［

］，

L13

，

H13

）；

mul

（

［

］，

［

］，

L20

，

H20

）；

mul

（

［

］，

［

］，

L21

，

H21

）；

mul

（

［

］，

［

］，

L22

，

H22

）；

mul

（

［

］，

［

］，

L23

，

H23

）；

mul

（

［

］，

［

］，

L30

，

H30

）；

mul

（

［

］，

［

］，

L31

，

H31

）；

mul

（

［

］，

［

］，

L32

，

H32

）；

mul

（

［

］，

［

］，

L33

，

H33

）；

L00

；

L10

；

L20

；

L30

；

L31

；

L32

；

L33

；

H33

；

add_u512_offset1

（

L01

，

L11

，

L21

，

L22

，

L23

，

H32

，

）；

add_u512_offset1

（

H00

，

L02

，

L12

，

L13

，

H31

，

H23

，

）；

add_u512_offset2

（

H10

，

L03

，

H30

，

H22

，

）；

add_u512_offset2

（

H01

，

H20

，

H21

，

H13

，

）；

add_u512_offset3

（

H11

，

H12

，

）；

add_u512_offset3

（

H02

，

H03

，

）；

u512_mod_p

（

，

）；

［

］

；

［

］

；

［

］

；

［

］

；

}

標簽： uint64 adcq x1 x2 y1

上一篇:八關齋戒具體內容和受持功德

下一篇：手把手教你在PCB上新增淚滴

大數快速取模魔法

猜你喜歡

30mm的半幅鏡頭用在半幅相機上和30mm的全幅鏡頭用在全幅相機上是一樣的嗎？

如何求正交矩陣？

Find X2 Pro竹青配色太吸引人，這高階質感

【許曉笛】49行程式碼就能發幣？而且EOS連例子都給你了

評估區分效度的三大方法