Commit 084733ae authored by Niels Möller

Use neon registers for loading the input. Slight slowdown.

parent 45040019
...@@ -141,38 +141,44 @@ define(<NOEXPN>, < ...@@ -141,38 +141,44 @@ define(<NOEXPN>, <
PROLOGUE(_nettle_sha512_compress) PROLOGUE(_nettle_sha512_compress)
push {r4,r5,r6,r7,r8,r10,r14} push {r4,r5,r6,r7,r8,r10,r14}
sub sp, sp, #128 sub sp, sp, #128
C Load data up front. FIXME: Use aligned vld1, and vshl. ands SHIFT, INPUT, #7
and INPUT, INPUT, #-8
ands SHIFT, INPUT, #3 vld1.8 {d0}, [INPUT :64]
and INPUT, INPUT, $-4 addne INPUT, INPUT, #8
addeq SHIFT, SHIFT, #8
lsl SHIFT, SHIFT, #3 lsl SHIFT, SHIFT, #3
mov I0, #0
movne I0, #-1 C Put right shift in d2 and d3, aka q1
lsl I1, I0, SHIFT neg SHIFT, SHIFT
uadd8 I0, I0, I1 C Sets APSR.GE bits vmov.i32 d2, #0
ldr I0, [INPUT] vmov.32 d2[0], SHIFT
addne INPUT, INPUT, #4 vmov d3, d2
C Put left shift in d4 and d5, aka q2
add SHIFT, SHIFT, #64
vmov.i32 d4, #0
vmov.32 d4[0], SHIFT
vmov d5, d4
vshl.u64 d0, d0, d2
mov DST, sp mov DST, sp
mov COUNT, #8 mov COUNT, #4
.Lcopy: .Lcopy:
ldm INPUT!, {I1,I2,I3,I4} C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
sel IT, I0, I1 vld1.8 {d16,d17,d18,d19}, [INPUT :64]!
ror IT, IT, SHIFT vshl.u64 q3, q8, q1 C Right shift
sel I0, I1, I2 vshl.u64 q8, q8, q2 C Left shift
ror I0, I0, SHIFT veor d16, d16, d0
rev I0, I0 veor d17, d17, d6
rev I1, IT vrev64.8 q8, q8
sel IT, I2, I3 vshl.u64 q0, q9, q1 C Right shift
ror IT, IT, SHIFT vshl.u64 q9, q9, q2 C Left shift
sel I2, I3, I4 veor d18, d18, d7
ror I2, I2, SHIFT veor d19, d19, d0
rev I2, I2 vrev64.8 q9, q9
rev I3, IT
subs COUNT, COUNT, #1 subs COUNT, COUNT, #1
stm DST!, {I0,I1,I2,I3} vst1.64 {d16,d17,d18,d19}, [DST]!
mov I0, I4 vmov d0, d1
bne .Lcopy bne .Lcopy
mov COUNT,#2 mov COUNT,#2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment