Nettle
nettle
Commits
fa269b6a
Commit
fa269b6a
authored
Dec 15, 2015
by
Niels Möller
Fix carry folding bug in x86_64 ecc_384_modp. Problem reported by Hanno Böck.
parent
c71d2c9d
ChangeLog
fa269b6a
20151215 Niels Möller <nisse@lysator.liu.se>
* x86_64/ecc384modp.asm: Fixed carry propagation bug. Problem
reported by Hanno Böck. Simplified the folding to always use a
nonnegative carry. The old code attempted to add in a carry which
could be either positive or negative, but didn't get that case
right.
20151210 Niels Möller <nisse@lysator.liu.se>
* ecc256.c (ecc_256_modp): Fixed carry propagation bug. Problem
x86_64/ecc384modp.asm
fa269b6a
C
x86_64
/
ecc

384

modp.asm
ifelse
(
<
Copyright
(
C
)
2013
Niels
M
ö
ller
Copyright
(
C
)
2013
,
2015
Niels
M
ö
ller
This
file
is
part
of
GNU
Nettle.
@@ 33,7 +33,7 @@ ifelse(<
.file
"ecc384modp.asm"
define
(
<
RP
>
,
<%
rsi
>
)
define
(
<
D
4
>
,
<%
rax
>
)
define
(
<
D
5
>
,
<%
rax
>
)
define
(
<
T0
>
,
<%
rbx
>
)
define
(
<
T1
>
,
<%
rcx
>
)
define
(
<
T2
>
,
<%
rdx
>
)
@@ 48,8 +48,8 @@ define(<H4>, <%r13>)
define
(
<
H5
>
,
<%
r14
>
)
define
(
<
C2
>
,
<%
r15
>
)
define
(
<
C0
>
,
H5
)
C
Overlap
define
(
<
D0
>
,
RP
)
C
Overlap
define
(
<
TMP
>
,
H4
)
C
Overlap
define
(
<
TMP
>
,
RP
)
C
Overlap
PROLOGUE
(
nettle_ecc_384_modp
)
W64_ENTRY
(
2
,
0
)
@@ 61,34 +61,38 @@ PROLOGUE(nettle_ecc_384_modp)
push
%
r14
push
%
r15
C
First
get
top
2
limbs
,
which
need
folding
twice
C
First
get
top
2
limbs
,
which
need
folding
twice.
C
B
^
10
=
B
^
6
+
B
^
4
+
2
^
32
(
B
-
1
)
B
^
4
.
C
We
handle
the
terms
as
follows
:
C
C
H5
H4
C

H5
C

C
H0
D4
C
B
^
6
:
Folded
immediately.
C
C
Then
shift
right
,
(
H1
,
H0
,
D4
)
<
(
H0
,
D4
)
<<
32
C
and
add
C
B
^
4
:
Delayed
,
added
in
in
the
next
folding.
C
C
H5
H4
C
H1
H0
C

C
C2
H1
H0
mov
80
(
RP
),
D4
mov
88
(
RP
),
H0
mov
D4
,
H4
mov
H0
,
H5
sub
H0
,
D4
sbb
$
0
,
H0
mov
D4
,
T2
mov
H0
,
H1
shl
$
32
,
H0
shr
$
32
,
T2
C
2
^
32
(
B
-
1
)
B
^
4
:
Low
half
limb
delayed
until
the
next
C
folding.
Top
1.5
limbs
subtracted
and
shifted
now
,
resulting
C
in
2.5
limbs.
The
low
limb
saved
in
D5
,
high
1.5
limbs
added
C
in.
mov
80
(
RP
),
H4
mov
88
(
RP
),
H5
C
Shift
right
32
bits
,
into
H1
,
H0
mov
H4
,
H0
mov
H5
,
H1
mov
H5
,
D5
shr
$
32
,
H1
or
T2
,
H0
shl
$
32
,
D5
shr
$
32
,
H0
or
D5
,
H0
C
H1
H0
C

H1
H0
C

C
H1
H0
D5
mov
H0
,
D5
neg
D5
sbb
H1
,
H0
sbb
$
0
,
H1
xor
C2
,
C2
add
H4
,
H0
@@ 127,118 +131,95 @@ PROLOGUE(nettle_ecc_384_modp)
adc
H3
,
T5
adc
$
0
,
C0
C
H3
H2
H1
H0
0
C

H4
H3
H2
H1
H0
C

C
H3
H2
H1
H0
D0
mov
XREG
(
D4
),
XREG
(
D4
)
mov
H0
,
D0
neg
D0
sbb
H1
,
H0
sbb
H2
,
H1
sbb
H3
,
H2
sbb
H4
,
H3
sbb
$
0
,
D4
C
Shift
right.
High
bits
are
sign
,
to
be
added
to
C0.
mov
D4
,
TMP
sar
$
32
,
TMP
shl
$
32
,
D4
add
TMP
,
C0
C
Shift
left
,
including
low
half
of
H4
mov
H3
,
TMP
shl
$
32
,
H4
shr
$
32
,
TMP
shl
$
32
,
H3
or
TMP
,
D4
or
TMP
,
H4
mov
H2
,
TMP
shl
$
32
,
H3
shr
$
32
,
TMP
shl
$
32
,
H2
or
TMP
,
H3
mov
H1
,
TMP
shl
$
32
,
H2
shr
$
32
,
TMP
shl
$
32
,
H1
or
TMP
,
H2
mov
H0
,
TMP
shl
$
32
,
H1
shr
$
32
,
TMP
shl
$
32
,
H0
or
TMP
,
H1
mov
D0
,
TMP
shr
$
32
,
TMP
shl
$
32
,
D0
or
TMP
,
H0
shl
$
32
,
H0
C
H4
H3
H2
H1
H0
0
C

H4
H3
H2
H1
H0
C

C
H4
H3
H2
H1
H0
TMP
add
D0
,
T0
mov
H0
,
TMP
neg
TMP
sbb
H1
,
H0
sbb
H2
,
H1
sbb
H3
,
H2
sbb
H4
,
H3
sbb
$
0
,
H4
add
TMP
,
T0
adc
H0
,
T1
adc
H1
,
T2
adc
H2
,
T3
adc
H3
,
T4
adc
D
4
,
T5
adc
H
4
,
T5
adc
$
0
,
C0
C
Remains
to
add
in
C2
and
C0
C
C0
C0
<<
32
(
-
2
^
32
+
1
)
C0
C
C2
C2
<<
32
(
-
2
^
32
+
1
)
C2
C
where
C2
is
always
positive
,
while
C0
may
be
-1
.
C
Set
H1
,
H0
=
(
2
^
96
-
2
^
32
+
1
)
C0
mov
C0
,
H0
mov
C0
,
H1
mov
C0
,
H2
sar
$
63
,
C0
C
Get
sign
shl
$
32
,
H1
sub
H1
,
H0
C
Gives
borrow
iff
C0
>
0
sub
H1
,
H0
sbb
$
0
,
H1
add
C0
,
H2
C
Set
H3
,
H2
=
(
2
^
96
-
2
^
32
+
1
)
C2
mov
C2
,
H2
mov
C2
,
H3
shl
$
32
,
H3
sub
H3
,
H2
sbb
$
0
,
H3
add
C0
,
H2
C
No
carry.
Could
use
lea
trick
xor
C0
,
C0
add
H0
,
T0
adc
H1
,
T1
adc
$
0
,
H2
adc
$
0
,
C0
C
Set
(
H1
H0
)
<
C2
<<
96
-
C2
<<
32
+
1
mov
C2
,
H0
mov
C2
,
H1
shl
$
32
,
H1
sub
H1
,
H0
sbb
$
0
,
H1
add
H2
,
H0
adc
C0
,
H1
adc
C2
,
C0
mov
C0
,
H2
sar
$
63
,
C0
add
H0
,
T2
adc
H1
,
T3
adc
H2
,
T4
adc
C0
,
T5
sbb
C0
,
C0
adc
H2
,
T2
adc
H3
,
T3
adc
C2
,
T4
adc
D5
,
T5
C
Value
delayed
from
initial
folding
adc
$
0
,
C0
C
Use
sbb
and
switch
sign?
C
Final
unlikely
carry
mov
C0
,
H0
mov
C0
,
H1
mov
C0
,
H2
sar
$
63
,
C0
shl
$
32
,
H1
sub
H1
,
H0
sbb
$
0
,
H1
add
C0
,
H2
pop
RP
sub
H0
,
T0
add
H0
,
T0
mov
T0
,
(
RP
)
sbb
H1
,
T1
adc
H1
,
T1
mov
T1
,
8
(
RP
)
sbb
H2
,
T2
adc
C0
,
T2
mov
T2
,
16
(
RP
)
sbb
C
0
,
T3
adc
$
0
,
T3
mov
T3
,
24
(
RP
)
sbb
C
0
,
T4
adc
$
0
,
T4
mov
T4
,
32
(
RP
)
sbb
C
0
,
T5
adc
$
0
,
T5
mov
T5
,
40
(
RP
)
pop
%
r15
