您當前的位置:首頁 > 書法

大數快速取模魔法

作者:由 九條可憐 發表于 書法時間:2019-08-15

本文介紹的是一種特定場景下的大數快速取模演算法,對於

x\bmod n

n

非常接近2的整數次冪時,該演算法十分高效。

先將

x

以二進位制的形式表示出來,從低位開始取出每

M

位,得到一個數列

x_0,x_1,x_2,...

。其中

M

是一個滿足

2^{M-1}<n\leq 2^{M}

的整數,即

M=\lceil log_2 n\rceil

如下所示:

\begin{matrix} ...&\underbrace{bb...bb} &\underbrace{bb...bb} &\underbrace{bb...bb}\\ &{^{M位}} &{^{M位}} &{^{M位}}\\ ...&x_2&x_1&x_0\\ \end{matrix}\\

於是

x=x_0+2^{M}x_1+2^{2M}x_2+...

k=2^M-n

,可以得到以下結論:

\begin{align} x_0+2^{M}x_1+2^{2M}x_2+...&\equiv x\ (mod\ n)\\ x_0+(2^M-n)^1x_1+(2^M-n)^2x_2+...&\equiv x\ (mod\ n)\\ x_0+k^1x_1+k^2x_2+...&\equiv x\ (mod\ n)\\ \end{align}\\

並且易知

x_0+k^1x_1+k^2x_2+...\leq x_0+2^{M}x_1+2^{2M}x_2+...=x

。當且僅當

x_1=x_2=x_3=...=0

時,等號成立。

特別有意思的是,當

k=0,1,2

\begin{align} x_0&\equiv x\ (mod\ 2^M)\\ x_0+x_1+x_2+...&\equiv x\ (mod\ 2^M-1)\\ x_0+2^1x_1+2^2x_2+...&\equiv x\ (mod\ 2^M-2)\\ \end{align}\\

我們可以得到很多黑魔法,比如對整數每部分進行累加、二進位制讀取。

大數快速取模魔法

k=0

的情況很多小夥伴應該都知道了。

令函式

f(x)=x_0+k^1x_1+k^2x_2+...

,經過有限次的迭代,最終將得到

y=f(f(f(...f(x))))< 2^M

於是有了最終結論:

x\bmod n =  \begin{cases}     y-n & \text{, } y \geq n \\    y    & \text{, } y < n \end{cases}\\

特別是當

k

很小時,需要的迭代次數非常少,這個演算法變得非常高效,並且只有乘法和加法。通常的模乘場景中

x

的二進位制位數不會超過

2M

,因此

f(x)=x_0+kx_1

舉個栗子,在secp256k1中p=0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f,經常需要模p乘法。有2個大整數

a=0xb5003f7d80f965825706b2c4bbbf1c70b3b02cf65141c6e9d4006205526e919a

b=0xa95780689fd0168ae72b563711bd226bce465dda6d7fca7d64d4e64f26f8a081

求a*b%p。

a*b = 0x

77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a

f(a*b) = 0x

8991cf4e4fa8c706ddd413e6f3b95940d2733b04c785e796535047738de79e9a

+ 0x

1000003d1

* 0x

77bb07c986a24bd066edf876a667ff3f6fe9fbf3b684e1828f946199862395df

= 0x

77bb099300fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9

f(f(a*b)) = 0x

fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec75cebc583b7bb696a9

+ 0x

1000003d1

* 0x

77bb0993

= 0x

fcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

a*b%p = 0xfcd33987fa15d6566d4ff77688764ea4f2a9a2e83aec76467763976c8620ac

用python驗算一下:

大數快速取模魔法

在secp256k1中,

f(x)

迭代次數不會超過3次,並且在第3次,

x_1

只能是0或1。

最後直接上程式碼。

// secp256k1的uint256模乘

#include

inline

static

void

mul

uint64_t

x

uint64_t

y

uint64_t

&

low

uint64_t

&

high

{

__asm

R

“(

mulq

%

3

” : “

=

a

”(low), “

=

d

”(high) : “

a

”(x), “

r

”(y));

}

inline

static

void

square

uint64_t

x

uint64_t

&

low

uint64_t

&

high

{

__asm

R

“(

mulq

%

x

” : “

=

a

”(low), “

=

d

”(high) : [x] “

a

”(x));

}

inline

static

void

add

uint64_t

x

uint64_t

&

y

uint64_t

&

of1

{

__asm

R

“(

addq

%

x

],

%

y

adcq

$

0

%

of1

” : [y] “

+

r

”(y), [of1]“

+

r

”(of1) : [x] “

r

”(x));

}

inline

static

void

add

uint64_t

x

uint64_t

&

y

uint64_t

&

of1

uint64_t

&

of2

{

__asm

R

“(

addq

%

x

],

%

y

adcq

$

0

%

of1

adcq

$

0

%

of2

” : [y] “

+

r

”(y), [of1]“

+

r

”(of1), [of2]“

+

r

”(of2) : [x] “

r

”(x));

}

inline

static

void

add

uint64_t

x

uint64_t

&

y

uint64_t

&

of1

uint64_t

&

of2

uint64_t

&

of3

{

__asm

R

“(

addq

%

x

],

%

y

adcq

$

0

%

of1

adcq

$

0

%

of2

adcq

$

0

%

of3

” : [y] “

+

r

”(y), [of1]“

+

r

”(of1), [of2]“

+

r

”(of2), [of3]“

+

r

”(of3) : [x] “

r

”(x));

}

inline

static

void

add

uint64_t

x

uint64_t

&

y

uint64_t

&

of1

uint64_t

&

of2

uint64_t

&

of3

uint64_t

&

of4

{

__asm

R

“(

addq

%

x

],

%

y

adcq

$

0

%

of1

adcq

$

0

%

of2

adcq

$

0

%

of3

adcq

$

0

%

of4

” : [y] “

+

r

”(y), [of1]“

+

r

”(of1), [of2]“

+

r

”(of2), [of3]“

+

r

”(of3), [of4]“

+

r

”(of4) : [x] “

r

”(x));

}

inline

static

void

add

uint64_t

x

uint64_t

&

y

uint64_t

&

of1

uint64_t

&

of2

uint64_t

&

of3

uint64_t

&

of4

uint64_t

&

of5

{

__asm

R

“(

addq

%

x

],

%

y

adcq

$

0

%

of1

adcq

$

0

%

of2

adcq

$

0

%

of3

adcq

$

0

%

of4

adcq

$

0

%

of5

” : [y] “

+

r

”(y), [of1]“

+

r

”(of1), [of2]“

+

r

”(of2), [of3]“

+

r

”(of3), [of4]“

+

r

”(of4), [of5]“

+

r

”(of5) : [x] “

r

”(x));

}

inline

static

void

add_u512_offset1

uint64_t

x1

uint64_t

x2

uint64_t

x3

uint64_t

x4

uint64_t

x5

uint64_t

x6

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

uint64_t

&

y5

uint64_t

&

y6

uint64_t

&

y7

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

%

x3

],

%

y3

adcq

%

x4

],

%

y4

adcq

%

x5

],

%

y5

adcq

%

x6

],

%

y6

adcq

$

0

%

y7

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4), [y5]“

+

r

”(y5), [y6]“

+

r

”(y6), [y7]“

+

r

”(y7) : [x1] “

r

”(x1), [x2] “

r

”(x2), [x3] “

r

”(x3), [x4] “

r

”(x4), [x5] “

r

”(x5), [x6] “

r

”(x6));

}

inline

static

void

add_u512_offset2

uint64_t

x1

uint64_t

x2

uint64_t

x3

uint64_t

x4

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

uint64_t

&

y5

uint64_t

&

y6

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

%

x3

],

%

y3

adcq

%

x4

],

%

y4

adcq

$

0

%

y5

adcq

$

0

%

y6

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4), [y5]“

+

r

”(y5), [y6]“

+

r

”(y6) : [x1] “

r

”(x1), [x2] “

r

”(x2), [x3] “

r

”(x3), [x4] “

r

”(x4));

}

inline

static

void

add_u512_offset3

uint64_t

x1

uint64_t

x2

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

uint64_t

&

y5

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

$

0

%

y3

adcq

$

0

%

y4

adcq

$

0

%

y5

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4), [y5]“

+

r

”(y5) : [x1] “

r

”(x1), [x2] “

r

”(x2));

}

inline

static

void

add_u320_offset1

uint64_t

x1

uint64_t

x2

uint64_t

x3

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

%

x3

],

%

y3

adcq

$

0

%

y4

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4) : [x1] “

r

”(x1), [x2] “

r

”(x2), [x3] “

r

”(x3));

}

inline

static

void

add_u320_u256

uint64_t

x1

uint64_t

x2

uint64_t

x3

uint64_t

x4

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

uint64_t

&

y5

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

%

x3

],

%

y3

adcq

%

x4

],

%

y4

adcq

$

0

%

y5

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4), [y5]“

+

r

”(y5) : [x1] “

r

”(x1), [x2] “

r

”(x2), [x3] “

r

”(x3), [x4] “

r

”(x4));

}

inline

static

void

add_u320_u128

uint64_t

x1

uint64_t

x2

uint64_t

&

y1

uint64_t

&

y2

uint64_t

&

y3

uint64_t

&

y4

uint64_t

&

y5

{

__asm

R

“(

addq

%

x1

],

%

y1

adcq

%

x2

],

%

y2

adcq

$

0

%

y3

adcq

$

0

%

y4

adcq

$

0

%

y5

” : [y1] “

+

r

”(y1), [y2]“

+

r

”(y2), [y3]“

+

r

”(y3), [y4]“

+

r

”(y4), [y5]“

+

r

”(y5) : [x1] “

r

”(x1), [x2] “

r

”(x2));

}

#define ALWAYS_INLINE __attribute__((always_inline))

ALWAYS_INLINE

inline

static

void

u320_mod_p

uint64_t

&

x1

uint64_t

&

x2

uint64_t

&

x3

uint64_t

&

x4

uint64_t

x5

{

constexpr

uint64_t

negP

=

0x1000003d1

// 2^256 - p

uint64_t

t0

t1

mul

x5

negP

t0

t1

);

x5

=

0

add_u320_u128

t0

t1

x1

x2

x3

x4

x5

);

if

x5

!=

0

||

~

x4

||

~

x3

||

~

x2

||

x1

<

~

negP

+

1

))

{

add

negP

x1

x2

x3

x4

);

// 加-p相當於減p

}

}

ALWAYS_INLINE

inline

static

void

u512_mod_p

uint64_t

&

x1

uint64_t

&

x2

uint64_t

&

x3

uint64_t

&

x4

uint64_t

x5

uint64_t

x6

uint64_t

x7

uint64_t

x8

{

constexpr

uint64_t

negP

=

0x1000003d1

// 2^256 - p

/*

x8 x7 x6 x5

-p

——————————————————————————————-

H3 <- x8‘ H2 <- x7’ H1 <- x6‘ H0 <- x5’

——————————————————————————————-

H3 x8‘ x7’ x6‘ x5’

H2 H1 H0

*/

uint64_t

H0

H1

H2

H3

mul

x5

negP

x5

H0

);

mul

x6

negP

x6

H1

);

mul

x7

negP

x7

H2

);

mul

x8

negP

x8

H3

);

add_u320_offset1

H0

H1

H2

x6

x7

x8

H3

);

// 用 [x5,x6,x7,x8,H3] 存 uint320

add_u320_u256

x5

x6

x7

x8

x1

x2

x3

x4

H3

);

u320_mod_p

x1

x2

x3

x4

H3

);

}

extern

“C”

__declspec

dllexport

void

u256_x_u256

const

uint8_t

a

32

],

const

uint8_t

b

32

],

uint8_t

r

32

])

{

/*

3 2 1 0

3 2 1 0

————————————————————————————————————————————————————

H30 <- L30 H20 <- L20 H10 <- L10 H00 <- L00

H31 <- L31 H21 <- L21 H11 <- L11 H01 <- L01

H32 <- L32 H22 <- L22 H12 <- L12 H02 <- L02

H33 <- L33 H23 <- L23 H13 <- L13 H03 <- L03

————————————————————————————————————————————————————

H33 L33 L32 L31 L30 L20 L10 L00

H32 L23 L22 L21 L11 L01

H23 H31 L13 L12 L02 H00

H22 H30 L03 H10

H13 H21 H20 H01

H12 H11

H03 H02

*/

uint64_t

t0

t1

t2

t3

t4

t5

t6

t7

const

uint64_t

*

pa

=

const

uint64_t

*

a

*

pb

=

const

uint64_t

*

b

uint64_t

*

pr

=

uint64_t

*

r

uint64_t

L00

L01

L02

L03

L10

L11

L12

L13

L20

L21

L22

L23

L30

L31

L32

L33

uint64_t

H00

H01

H02

H03

H10

H11

H12

H13

H20

H21

H22

H23

H30

H31

H32

H33

mul

pa

0

],

pb

0

],

L00

H00

);

mul

pa

0

],

pb

1

],

L01

H01

);

mul

pa

0

],

pb

2

],

L02

H02

);

mul

pa

0

],

pb

3

],

L03

H03

);

mul

pa

1

],

pb

0

],

L10

H10

);

mul

pa

1

],

pb

1

],

L11

H11

);

mul

pa

1

],

pb

2

],

L12

H12

);

mul

pa

1

],

pb

3

],

L13

H13

);

mul

pa

2

],

pb

0

],

L20

H20

);

mul

pa

2

],

pb

1

],

L21

H21

);

mul

pa

2

],

pb

2

],

L22

H22

);

mul

pa

2

],

pb

3

],

L23

H23

);

mul

pa

3

],

pb

0

],

L30

H30

);

mul

pa

3

],

pb

1

],

L31

H31

);

mul

pa

3

],

pb

2

],

L32

H32

);

mul

pa

3

],

pb

3

],

L33

H33

);

t0

=

L00

t1

=

L10

t2

=

L20

t3

=

L30

t4

=

L31

t5

=

L32

t6

=

L33

t7

=

H33

add_u512_offset1

L01

L11

L21

L22

L23

H32

t1

t2

t3

t4

t5

t6

t7

);

add_u512_offset1

H00

L02

L12

L13

H31

H23

t1

t2

t3

t4

t5

t6

t7

);

add_u512_offset2

H10

L03

H30

H22

t2

t3

t4

t5

t6

t7

);

add_u512_offset2

H01

H20

H21

H13

t2

t3

t4

t5

t6

t7

);

add_u512_offset3

H11

H12

t3

t4

t5

t6

t7

);

add_u512_offset3

H02

H03

t3

t4

t5

t6

t7

);

u512_mod_p

t0

t1

t2

t3

t4

t5

t6

t7

);

pr

0

=

t0

pr

1

=

t1

pr

2

=

t2

pr

3

=

t3

}

標簽: uint64  adcq  x1  x2  y1