Commit 51ff7924 authored by Niels Möller

Improved ARM sha512 assembly.

parent 084733ae
2013-03-14  Niels Möller  <nisse@lysator.liu.se>

        * armv7/sha512-compress.asm: Optimized. Keep expanded data in
        registers, exploit parallelism. Another 70% speedup.
        * testsuite/sha512-test.c (test_main): Additional test vectors,
        including some longer than 128 bytes.
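The new code below realizes the "keep expanded data in registers" part as follows: the whole 16-word message schedule now lives in NEON registers d16-d31 (the DW/QW defines), the first 16 rounds are fully unrolled with no expansion step, and the remaining 64 rounds run as four identical passes of 16 expand-and-round steps. A rough C outline of that control flow (an illustrative sketch only; expand_step and round_step are placeholder names standing in for the EXPN and ROUND macros, and are sketched after their definitions below):

    #include <stdint.h>

    /* Placeholders for the EXPN and ROUND macros; C sketches of their
       bodies follow the corresponding macro definitions further down. */
    void expand_step(uint64_t w[16], unsigned i);
    void round_step(uint64_t state[8], uint64_t w, uint64_t k);

    void sha512_compress_outline(uint64_t state[8], uint64_t w[16],
                                 const uint64_t k[80])
    {
      unsigned i, pass;

      /* Rounds 0-15: the freshly loaded message words are used as-is. */
      for (i = 0; i < 16; i++)
        round_step(state, w[i], k[i]);

      /* Rounds 16-79: four passes of the .Loop body; each pass expands the
         16-word ring buffer in place (EXPN) and consumes it (ROUND). */
      for (pass = 0; pass < 4; pass++)
        for (i = 0; i < 16; i++)
          {
            expand_step(w, i);
            round_step(state, w[i], k[16 + 16 * pass + i]);
          }
    }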
@@ -23,56 +23,93 @@ C MA 02111-1301, USA.
 define(<STATE>, <r0>)
 define(<INPUT>, <r1>)
 define(<K>, <r2>)
-define(<SA>, <d16>)
-define(<SB>, <d17>)
-define(<SC>, <d18>)
-define(<SD>, <d19>)
-define(<SE>, <d20>)
-define(<SF>, <d21>)
-define(<SG>, <d22>)
-define(<SH>, <d23>)
-define(<W>, <d24>)
-define(<T0>, <d25>)
 define(<COUNT>, <r3>)
-C Used for data load
-define(<I0>, <r4>)
-define(<I1>, <r5>)
-define(<I2>, <r6>)
-define(<I3>, <r7>)
-define(<I4>, <r8>)
-define(<DST>, <r10>)
 define(<SHIFT>, <r12>)
-define(<IT>, <r14>)
-C FIXME: More opportunities for parallelism, at least do s0 and s1 xors,
-C or expand two words at a time.
+define(<SA>, <d0>)
+define(<SB>, <d1>)
+define(<SC>, <d2>)
+define(<SD>, <d3>)
+define(<SE>, <d4>)
+define(<SF>, <d5>)
+define(<SG>, <d6>)
+define(<SH>, <d7>)
+define(<QSAB>, <q0>)
+define(<QSCD>, <q1>)
+define(<QSEF>, <q2>)
+define(<QSGH>, <q3>)
+C d8-d15 are callee-save
+define(<DT0>, <d8>)
+define(<DT1>, <d9>)
+define(<QT01>, <q4>)
+define(<DT2>, <d10>)
+define(<DT3>, <d11>)
+define(<QT23>, <q5>)
+define(<DT4>, <d12>)
+define(<DT5>, <d13>)
+define(<QT45>, <q6>)
+C Used only when reading the input, can overlap with state
+define(<DT6>, <d0>)
+define(<DT7>, <d1>)
+define(<QT67>, <q0>)
+define(<DW0>, <d16>)
+define(<DW1>, <d17>)
+define(<DW2>, <d18>)
+define(<DW3>, <d19>)
+define(<DW4>, <d20>)
+define(<DW5>, <d21>)
+define(<DW6>, <d22>)
+define(<DW7>, <d23>)
+define(<DW8>, <d24>)
+define(<DW9>, <d25>)
+define(<DW10>, <d26>)
+define(<DW11>, <d27>)
+define(<DW12>, <d28>)
+define(<DW13>, <d29>)
+define(<DW14>, <d30>)
+define(<DW15>, <d31>)
+define(<QW0001>, <q8>)
+define(<QW0203>, <q9>)
+define(<QW0405>, <q10>)
+define(<QW0607>, <q11>)
+define(<QW0809>, <q12>)
+define(<QW1011>, <q13>)
+define(<QW1213>, <q14>)
+define(<QW1415>, <q15>)
+define(<EXPAND_ME>, <$1>)
+define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>)
+C If x = W(i+14), y = w(i+1), we xor in parallel
+C
+C      x << 45        y << 63
+C      x >> 19        y >> 1
+C      x << 3         y << 56
+C      x >> 61        y >> 8
+C  xor x >> 6         y >> 7
+C  -----------------------------
+C        DT0            DT1
 define(<EXPN>, <
-        vldr            W, [sp, #+eval(8*$1)]
-        vldr            T0, [sp, #+eval(8*(($1 + 14) % 16))]
-        vshl.i64        d0, T0, #45
-        vshr.u64        d2, T0, #19
-        vshl.i64        d1, T0, #3
-        vshr.u64        d3, T0, #61
-        vadd.i64        q0, q0, q1
-        vshr.u64        T0, T0, #6
-        veor            T0, T0, d0
-        veor            T0, T0, d1
-        vadd.i64        W, W, T0
-        vldr            T0, [sp, #+eval(8*(($1 + 9) % 16))]
-        vadd.i64        W, W, T0
-        vldr            T0, [sp, #+eval(8*(($1 + 1) % 16))]
-        vshl.i64        d0, T0, #63
-        vshr.u64        d2, T0, #1
-        vshl.i64        d1, T0, #56
-        vshr.u64        d3, T0, #8
-        vadd.i64        q0, q0, q1
-        vshr.u64        T0, T0, #7
-        veor            T0, T0, d0
-        veor            T0, T0, d1
-        vadd.i64        W, W, T0
-        vstr            W, [sp, #+eval(8*$1)]
+        vshl.i64        DT0, W($1+14), #45
+        vshl.i64        DT1, W($1 + 1), #63
+        vshr.u64        DT2, W($1+14), #19
+        vshr.u64        DT3, W($1 + 1), #1
+        vshl.i64        DT4, W($1+14), #3
+        vshl.i64        DT5, W($1 + 1), #56
+        veor.i64        QT01, QT01, QT23
+        vshr.u64        DT2, W($1+14), #61
+        vshr.u64        DT3, W($1 + 1), #8
+        veor.i64        QT01, QT01, QT45
+        vshr.u64        DT4, W($1+14), #6
+        vshr.u64        DT5, W($1 + 1), #7
+        veor.i64        QT01, QT01, QT23
+        vadd.i64        W($1), W($1), W($1 + 9)
+        veor.i64        QT01, QT01, QT45
+        vadd.i64        W($1), W($1), DT0
+        vadd.i64        W($1), W($1), DT1
 >)
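In portable C, the schedule update that the new EXPN macro performs on the 16-word ring buffer is roughly the following (an illustrative sketch; rotr64 and expand_step are placeholder names, not nettle code):

    #include <stdint.h>

    static uint64_t rotr64(uint64_t x, unsigned n)
    {
      return (x >> n) | (x << (64 - n));
    }

    /* One EXPN($1) step, indices taken mod 16: with i = $1, W(i+14) is the
       word usually written w[i-2], W(i+1) is w[i-15] and W(i+9) is w[i-7]. */
    void expand_step(uint64_t w[16], unsigned i)
    {
      uint64_t x = w[(i + 14) % 16];   /* the #45/#19, #3/#61 and #6 shifts */
      uint64_t y = w[(i + 1) % 16];    /* the #63/#1, #56/#8 and #7 shifts */
      uint64_t s1 = rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
      uint64_t s0 = rotr64(y, 1) ^ rotr64(y, 8) ^ (y >> 7);

      w[i % 16] += w[(i + 9) % 16] + s1 + s0;
    }

Each rotation appears in the macro as a left/right shift pair (rotr64(x, 19), for instance, is the x << 45 and x >> 19 lines), and the left and right halves are paired up in Q registers so that one veor.i64 combines two of the xors at a time.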
 C ROUND(A,B,C,D,E,F,G,H,i)
@@ -88,48 +125,48 @@ C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
 C Choice (E, F, G) = G^(E&(F^G))
 C Majority (A,B,C) = (A&B) + (C&(A^B))
-C FIXME: More opportunities for parallelism, at least do S0 and S1 xors.
+C Do S1 and S0 in parallel
+C
+C      e << 50        a << 36
+C      e >> 14        a >> 28
+C      e << 46        a << 30
+C      e >> 18        a >> 34
+C      e << 23        a << 25
+C  xor e >> 41        a >> 39
+C  ----------------------------
+C        DT0            DT1
 define(<ROUND>, <
-        vshl.i64        d0, $5, #50
-        vshr.u64        d2, $5, #14
-        vshl.i64        d1, $5, #46
-        vshr.u64        d3, $5, #18
-        vadd.i64        q0, q0, q1
-        vshl.i64        d2, $5, #23
-        vshr.u64        d3, $5, #41
-        vadd.i64        d2, d2, d3
-        veor            d0, d0, d1
-        veor            d0, d0, d2
-        vadd.i64        $8, $8, d0
-        veor            d0, $6, $7
-        vand            d0, d0, $5
-        veor            d0, d0, $7
-        vadd.i64        $8, $8, d0
-        vldr            d0, [K,#eval(8*$9)]
-        vadd.i64        $8, $8, W
-        vadd.i64        $8, $8, d0
+        vshl.i64        DT0, $5, #50
+        vshl.i64        DT1, $1, #36
+        vshr.u64        DT2, $5, #14
+        vshr.u64        DT3, $1, #28
+        vshl.i64        DT4, $5, #46
+        vshl.i64        DT5, $1, #30
+        veor            QT01, QT01, QT23
+        vshr.u64        DT2, $5, #18
+        vshr.u64        DT3, $1, #34
+        veor            QT01, QT01, QT45
+        vshl.i64        DT4, $5, #23
+        vshl.i64        DT5, $1, #25
+        veor            QT01, QT01, QT23
+        vshr.u64        DT2, $5, #41
+        vshr.u64        DT3, $1, #39
+        veor            QT01, QT01, QT45
+        veor            DT4, $6, $7
+        veor            DT5, $1, $2
+        vand            DT4, DT4, $5
+        vand            DT5, DT5, $3
+        veor            DT4, DT4, $7
+        veor            QT01, QT01, QT23
+        vand            DT2, $1, $2
+        vldr            DT3, [K,#eval(8*$9)]
+        vadd.i64        $8, $8, W($9)
+        vadd.i64        QT01, QT01, QT45
+        vadd.i64        $8, $8, DT3
+        vadd.i64        $8, $8, DT0
+        vadd.i64        DT1, DT1, DT2
         vadd.i64        $4, $4, $8
-        vshl.i64        d0, $1, #36
-        vshr.u64        d2, $1, #28
-        vshl.i64        d1, $1, #30
-        vshr.u64        d3, $1, #34
-        vadd.i64        q0, q0, q1
-        vshl.i64        d2, $1, #25
-        vshr.u64        d3, $1, #39
-        vadd.i64        d2, d2, d3
-        veor            d0, d0, d1
-        veor            d0, d0, d2
-        vadd.i64        $8, $8, d0
-        vand            d0, $1, $2
-        veor            d1, $1, $2
-        vadd.i64        $8, $8, d0
-        vand            d1, d1, $3
-        vadd.i64        $8, $8, d1
->)
-define(<NOEXPN>, <
-        vldr            W, [INPUT, #eval(8*$1)]
+        vadd.i64        $8, $8, DT1
 >)
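The corresponding C for one ROUND invocation looks roughly like this (again only a sketch with placeholder names; the assembly never moves the state between registers, it rotates the macro arguments instead, which has the same per-round effect):

    #include <stdint.h>

    static uint64_t rotr64(uint64_t x, unsigned n)
    {
      return (x >> n) | (x << (64 - n));
    }

    /* ROUND(a,b,c,d,e,f,g,h, i) with w = W(i) and k = K[i]. */
    void round_step(uint64_t state[8], uint64_t w, uint64_t k)
    {
      uint64_t a = state[0], b = state[1], c = state[2], d = state[3];
      uint64_t e = state[4], f = state[5], g = state[6], h = state[7];

      uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);  /* e<<50 ^ e>>14, ... */
      uint64_t ch  = g ^ (e & (f ^ g));                              /* "Choice" */
      uint64_t t1  = h + w + k + S1 + ch;                            /* accumulated in $8 */
      uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);  /* a<<36 ^ a>>28, ... */
      uint64_t maj = (a & b) + (c & (a ^ b));                        /* "Majority"; + is safe
                                                                        since the two terms
                                                                        never share a set bit */

      state[7] = g;  state[6] = f;  state[5] = e;
      state[4] = d + t1;                       /* the unchanged vadd.i64 $4, $4, $8 */
      state[3] = c;  state[2] = b;  state[1] = a;
      state[0] = t1 + S0 + maj;
    }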
 C void
@@ -139,69 +176,100 @@ define(<NOEXPN>, <
         .align 2
 PROLOGUE(_nettle_sha512_compress)
-        push    {r4,r5,r6,r7,r8,r10,r14}
-        sub     sp, sp, #128
+        vpush   {d8,d9,d10,d11,d12,d13}
         ands    SHIFT, INPUT, #7
         and     INPUT, INPUT, #-8
-        vld1.8  {d0}, [INPUT :64]
+        vld1.8  {DT5}, [INPUT :64]
         addne   INPUT, INPUT, #8
         addeq   SHIFT, SHIFT, #8
         lsl     SHIFT, SHIFT, #3
-C Put right shift in d2 and d3, aka q1
+C Put right shift in DT0 and DT1, aka QT01
         neg     SHIFT, SHIFT
-        vmov.i32        d2, #0
-        vmov.32         d2[0], SHIFT
-        vmov            d3, d2
-C Put left shift in d4 and d5, aka q2
+        vmov.i32        DT0, #0
+        vmov.32         DT0[0], SHIFT
+        vmov            DT1, DT0
+C Put left shift in DT2 and DT3, aka QT23
         add     SHIFT, SHIFT, #64
-        vmov.i32        d4, #0
-        vmov.32         d4[0], SHIFT
-        vmov            d5, d4
-        vshl.u64        d0, d0, d2
-        mov     DST, sp
-        mov     COUNT, #4
-.Lcopy:
+        vmov.i32        DT2, #0
+        vmov.32         DT2[0], SHIFT
+        vmov            DT3, DT2
+        vshl.u64        DT5, DT5, DT0
 C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
-        vld1.8  {d16,d17,d18,d19}, [INPUT :64]!
-        vshl.u64        q3, q8, q1      C Right shift
-        vshl.u64        q8, q8, q2      C Left shift
-        veor    d16, d16, d0
-        veor    d17, d17, d6
-        vrev64.8        q8, q8
-        vshl.u64        q0, q9, q1      C Right shift
-        vshl.u64        q9, q9, q2      C Left shift
-        veor    d18, d18, d7
-        veor    d19, d19, d0
-        vrev64.8        q9, q9
-        subs    COUNT, COUNT, #1
-        vst1.64 {d16,d17,d18,d19}, [DST]!
-        vmov    d0, d1
-        bne     .Lcopy
-        mov     COUNT, #2
-        mov     INPUT, sp
+        vld1.8  {W(0),W(1),W(2),W(3)}, [INPUT :64]!
+        vshl.u64        QT67, QW0001, QT01      C Right shift
+        vshl.u64        QW0001, QW0001, QT23    C Left shift
+        veor    W(0), W(0), DT5
+        veor    W(1), W(1), DT6
+        vrev64.8        QW0001, QW0001
+        vshl.u64        QT45, QW0203, QT01      C Right shift
+        vshl.u64        QW0203, QW0203, QT23    C Left shift
+        veor    W(2), W(2), DT7
+        veor    W(3), W(3), DT4
+        vrev64.8        QW0203, QW0203
+        vld1.8  {W(4),W(5),W(6),W(7)}, [INPUT :64]!
+        vshl.u64        QT67, QW0405, QT01      C Right shift
+        vshl.u64        QW0405, QW0405, QT23    C Left shift
+        veor    W(4), W(4), DT5
+        veor    W(5), W(5), DT6
+        vrev64.8        QW0405, QW0405
+        vshl.u64        QT45, QW0607, QT01      C Right shift
+        vshl.u64        QW0607, QW0607, QT23    C Left shift
+        veor    W(6), W(6), DT7
+        veor    W(7), W(7), DT4
+        vrev64.8        QW0607, QW0607
+        vld1.8  {W(8),W(9),W(10),W(11)}, [INPUT :64]!
+        vshl.u64        QT67, QW0809, QT01      C Right shift
+        vshl.u64        QW0809, QW0809, QT23    C Left shift
+        veor    W(8), W(8), DT5
+        veor    W(9), W(9), DT6
+        vrev64.8        QW0809, QW0809
+        vshl.u64        QT45, QW1011, QT01      C Right shift
+        vshl.u64        QW1011, QW1011, QT23    C Left shift
+        veor    W(10), W(10), DT7
+        veor    W(11), W(11), DT4
+        vrev64.8        QW1011, QW1011
+        vld1.8  {W(12),W(13),W(14),W(15)}, [INPUT :64]!
+        vshl.u64        QT67, QW1213, QT01      C Right shift
+        vshl.u64        QW1213, QW1213, QT23    C Left shift
+        veor    W(12), W(12), DT5
+        veor    W(13), W(13), DT6
+        vrev64.8        QW1213, QW1213
+        vshl.u64        QT45, QW1415, QT01      C Right shift
+        vshl.u64        QW1415, QW1415, QT23    C Left shift
+        veor    W(14), W(14), DT7
+        veor    W(15), W(15), DT4
+        vrev64.8        QW1415, QW1415
         vldm    STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
-.Loop1:
-        NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
-        NOEXPN(1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
-        NOEXPN(2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
-        NOEXPN(3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
-        NOEXPN(4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
-        NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
-        NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
-        NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
-        subs    COUNT, #1
-        add     INPUT, INPUT, #64
-        add     K, K, #64
-        bne     .Loop1
+        ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
+        ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
+        ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
+        ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
+        ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
+        ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
+        ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
+        ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
+        ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
+        ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
+        ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
+        ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
+        ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
+        ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
+        ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
+        ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
+        add     K, K, #128
         mov     COUNT, #4
-.Loop2:
+.Loop:
         EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
         EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
@@ -221,33 +289,29 @@ PROLOGUE(_nettle_sha512_compress)
         subs    COUNT, COUNT, #1
         EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
         add     K, K, #128
-        bne     .Loop2
+        bne     .Loop
-        vld1.64 {d24,d25,d26,d27}, [STATE]
-        vadd.i64        SA, SA, d24
-        vadd.i64        SB, SB, d25
-        vadd.i64        SC, SC, d26
-        vadd.i64        SD, SD, d27
+        vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
+        vadd.i64        QSAB, QSAB, QW0001
+        vadd.i64        QSCD, QSCD, QW0203
         vst1.64 {SA,SB,SC,SD}, [STATE]!
-        vld1.64 {d24,d25,d26,d27}, [STATE]
-        vadd.i64        SE, SE, d24
-        vadd.i64        SF, SF, d25
-        vadd.i64        SG, SG, d26
-        vadd.i64        SH, SH, d27
+        vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
+        vadd.i64        QSEF, QSEF, QW0001
+        vadd.i64        QSGH, QSGH, QW0203
         vst1.64 {SE,SF,SG,SH}, [STATE]!
-        add     sp, sp, #128
-        pop     {r4,r5,r6,r7,r8,r10,pc}
+        vpop    {d8,d9,d10,d11,d12,d13}
+        bx      lr
 EPILOGUE(_nettle_sha512_compress)
 divert(-1)
 define shastate
-p/x $d16.u64
-p/x $d17.u64
-p/x $d18.u64
-p/x $d19.u64
-p/x $d20.u64
-p/x $d21.u64
-p/x $d22.u64
-p/x $d23.u64
+p/x $d0.u64
+p/x $d1.u64
+p/x $d2.u64
+p/x $d3.u64
+p/x $d4.u64
+p/x $d5.u64
+p/x $d6.u64
+p/x $d7.u64
 end
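Finally, a note on the input handling at the top of the function (the scheme itself is unchanged by this commit; only the destination moved from the stack to registers): INPUT may be unaligned, so the code rounds it down to an 8-byte boundary, loads whole doublewords, reconstructs each message word from two neighbouring loads with the precomputed left/right shift pair, and byte-swaps with vrev64.8. In C the same idea looks roughly like this (an illustrative sketch for a little-endian host; like the assembly it reads whole aligned doublewords, so for unaligned input it touches up to 7 bytes past the 128-byte block, which portable C code would normally avoid):

    #include <stdint.h>

    static uint64_t bswap64(uint64_t x)        /* the vrev64.8 byte reversal */
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        r = (r << 8) | ((x >> (8 * i)) & 0xff);
      return r;
    }

    /* Load 16 big-endian 64-bit words from a possibly unaligned pointer
       using only 8-byte aligned loads. */
    void load_input(uint64_t w[16], const unsigned char *input)
    {
      unsigned offset = (uintptr_t) input & 7;            /* ands SHIFT, INPUT, #7 */
      const uint64_t *p = (const uint64_t *)((uintptr_t) input - offset);
      unsigned rshift = 8 * offset;                       /* bits taken from the lower doubleword */

      for (unsigned i = 0; i < 16; i++)
        {
          uint64_t v;
          if (offset == 0)
            v = p[i];                                     /* aligned: plain loads */
          else
            v = (p[i] >> rshift) | (p[i + 1] << (64 - rshift));
          w[i] = bswap64(v);
        }
    }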