Commit 2644d1ed authored by Michael Weiser, committed by Niels Möller

Support big-endian arm in assembly code

Adjust sha1-compress, sha256-compress, umac-nh, chacha-core-internal,
salsa20-core-internal and memxor for arm to work in big-endian mode.
parent d5738a57
@@ -44,6 +44,11 @@ define(<N>, <r2>)
define(<CNT>, <r6>)
define(<TNC>, <r12>)
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
.syntax unified
.file "memxor.asm"
@@ -99,6 +104,8 @@ PROLOGUE(nettle_memxor)
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
C With big-endian, we need to do
C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
push {r4,r5,r6}
@@ -117,14 +124,14 @@ PROLOGUE(nettle_memxor)
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r4, lsr CNT
eor r3, r3, r5, lsl TNC
eor r3, r3, r4, S0ADJ CNT
eor r3, r3, r5, S1ADJ TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r5, lsr CNT
eor r3, r3, r4, lsl TNC
eor r3, r3, r5, S0ADJ CNT
eor r3, r3, r4, S1ADJ TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
@@ -132,10 +139,14 @@ PROLOGUE(nettle_memxor)
beq .Lmemxor_odd_done
C We have TNC/8 left-over bytes in r4, high end
lsr r4, CNT
S0ADJ r4, CNT
ldr r3, [DST]
eor r3, r4
C memxor_leftover does an LSB store,
C so we need to byte-reverse the word on BE
IF_BE(< rev r3, r3>)
pop {r4,r5,r6}
C Store bytes, one by one.
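
As an aside for readers of this hunk: the following C sketch (a hypothetical combine_unaligned helper, not Nettle code) shows the word-combining trick that S0ADJ/S1ADJ parameterize. The bytes at the lower addresses live in a word's low bits on little-endian but in its high bits on big-endian, so the two shift directions swap.

    #include <stdint.h>

    /* Hypothetical sketch of the S0ADJ/S1ADJ selection, not Nettle code.
       w0 and w1 are consecutive aligned words; cnt = 8 * (byte offset),
       0 < cnt < 32, and tnc = 32 - cnt, as in the assembly. */
    static uint32_t
    combine_unaligned(uint32_t w0, uint32_t w1, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
    #if defined(__ARMEB__)
      /* big-endian: the low addresses are the high bits, so the shifts swap */
      return (w0 << cnt) | (w1 >> tnc);
    #else
      /* little-endian: the low addresses are the low bits */
      return (w0 >> cnt) | (w1 << tnc);
    #endif
    }
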
@@ -49,6 +49,11 @@ define(<ATNC>, <r10>)
define(<BCNT>, <r11>)
define(<BTNC>, <r12>)
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
.syntax unified
.file "memxor3.asm"
@@ -124,6 +129,8 @@ PROLOGUE(nettle_memxor3)
C
C With little-endian, we need to do
C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
C With big-endian, we need to do
C DST[i-1] ^= (SRC[i-1] << CNT) ^ (SRC[i] >> TNC)
rsb ATNC, ACNT, #32
bic BP, #3
@@ -138,14 +145,14 @@ PROLOGUE(nettle_memxor3)
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r4, lsl ATNC
eor r6, r6, r5, lsr ACNT
eor r6, r6, r4, S1ADJ ATNC
eor r6, r6, r5, S0ADJ ACNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r5, lsl ATNC
eor r6, r6, r4, lsr ACNT
eor r6, r6, r5, S1ADJ ATNC
eor r6, r6, r4, S0ADJ ACNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
@@ -154,7 +161,11 @@ PROLOGUE(nettle_memxor3)
C Leftover bytes in r4, low end
ldr r5, [AP, #-4]
eor r4, r5, r4, lsl ATNC
eor r4, r5, r4, S1ADJ ATNC
C leftover does an LSB store,
C so we need to byte-reverse the word on BE
IF_BE(< rev r4, r4>)
.Lmemxor3_au_leftover:
C Store a byte at a time
@@ -247,21 +258,25 @@ PROLOGUE(nettle_memxor3)
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
lsl r4, ATNC
eor r4, r4, r5, lsr ACNT
S1ADJ r4, ATNC
eor r4, r4, r5, S0ADJ ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
lsl r5, ATNC
eor r5, r5, r4, lsr ACNT
S1ADJ r5, ATNC
eor r5, r5, r4, S0ADJ ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
C leftover does an LSB store,
C so we need to byte-reverse the word on BE
IF_BE(< rev r4, r4>)
C Leftover bytes in r4, low end
ror r4, ACNT
.Lmemxor3_uu_leftover:
@@ -290,18 +305,18 @@ PROLOGUE(nettle_memxor3)
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
lsl r4, ATNC
eor r4, r4, r6, lsl BTNC
eor r4, r4, r5, lsr ACNT
eor r4, r4, r7, lsr BCNT
S1ADJ r4, ATNC
eor r4, r4, r6, S1ADJ BTNC
eor r4, r4, r5, S0ADJ ACNT
eor r4, r4, r7, S0ADJ BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
lsl r5, ATNC
eor r5, r5, r7, lsl BTNC
eor r5, r5, r4, lsr ACNT
eor r5, r5, r6, lsr BCNT
S1ADJ r5, ATNC
eor r5, r5, r7, S1ADJ BTNC
eor r5, r5, r4, S0ADJ ACNT
eor r5, r5, r6, S0ADJ BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
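
The IF_BE(<rev ...>) lines added above exist because the leftover paths store the result one byte at a time, starting from the register's least significant byte. A rough C sketch of that interaction (hypothetical store_lsb_first helper, assuming __ARMEB__ marks a big-endian build):

    #include <stdint.h>

    /* Hypothetical sketch, not the actual leftover code: emit a register
       one byte at a time, least significant byte to the lowest address.
       That matches memory order only on little-endian; on big-endian the
       register is byte-reversed first, which is what the added rev does. */
    static void
    store_lsb_first(uint8_t *dst, uint32_t w, unsigned nbytes)
    {
    #if defined(__ARMEB__)
      w = __builtin_bswap32(w);   /* rev */
    #endif
      while (nbytes-- > 0)
        {
          *dst++ = (uint8_t) w;   /* strb stores the register's low byte */
          w >>= 8;                /* advance to the next byte */
        }
    }
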
@@ -90,31 +90,52 @@ PROLOGUE(_nettle_chacha_core)
vmov S2, X2
vmov S3, X3
C Input rows:
C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
C 12 13 14 15 X3
C Input rows big-endian:
C 1 0 3 2 X0
C 5 4 7 6 X1
C 9 8 11 10 X2
C 13 12 15 14 X3
C even and odd columns are switched because
C vldm loads consecutive doublewords and, in
C big-endian mode, swaps the two words inside each
.Loop:
QROUND(X0, X1, X2, X3)
C Rotate rows, to get
C In little-endian rotate rows, to get
C 0 1 2 3
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
vext.32 X1, X1, X1, #1
C In big-endian rotate rows, to get
C 1 0 3 2
C 6 5 4 7 >>> 1
C 11 10 9 8 >>> 2
C 12 15 14 13 >>> 3
C a different number of elements needs to be
C extracted on BE because of the different column order
IF_LE(< vext.32 X1, X1, X1, #1>)
IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
vext.32 X3, X3, X3, #3
IF_LE(< vext.32 X3, X3, X3, #3>)
IF_BE(< vext.32 X3, X3, X3, #1>)
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
vext.32 X1, X1, X1, #3
IF_LE(< vext.32 X1, X1, X1, #3>)
IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
vext.32 X3, X3, X3, #1
IF_LE(< vext.32 X3, X3, X3, #1>)
IF_BE(< vext.32 X3, X3, X3, #3>)
bhi .Loop
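
The "even and odd columns are switched" remark can be made concrete with a small C sketch (hypothetical load_doubleword_lanes helper, big-endian host assumed for the comments): vldm moves whole 64-bit doublewords, so the two 32-bit lanes of each D register come out in the opposite order from memory.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch, big-endian host assumed: a vldm-style 64-bit
       load of two consecutive 32-bit state words.  Lane 0 is the
       doubleword's low half, which on BE is the second word in memory. */
    static void
    load_doubleword_lanes(const uint32_t src[2], uint32_t lane[2])
    {
      uint64_t d;
      memcpy(&d, src, sizeof d);       /* one 64-bit access in host order */
      lane[0] = (uint32_t) d;          /* = src[1] on big-endian */
      lane[1] = (uint32_t) (d >> 32);  /* = src[0] on big-endian */
    }

With lanes 0/1 and 2/3 exchanged in every row, a row rotation done with vext #1 in little-endian mode needs vext #3 in big-endian mode (and vice versa), while #2 is its own counterpart, which is exactly the IF_LE/IF_BE split above.
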
@@ -123,6 +144,12 @@ PROLOGUE(_nettle_chacha_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
C caller expects result little-endian
IF_BE(< vrev32.u8 X0, X0
vrev32.u8 X1, X1
vrev32.u8 X2, X2
vrev32.u8 X3, X3>)
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_chacha_core)
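
Whatever the lane order, the block _nettle_chacha_core writes out is consumed as a little-endian byte stream, which is why the new vrev32.u8 instructions byte-swap every word before the final vstm on big-endian. A hypothetical store_le32 helper expresses the same rule in C:

    #include <stdint.h>

    /* Hypothetical helper: store one 32-bit word as little-endian bytes.
       A plain store already has this layout on a little-endian host; a
       big-endian host must byte-swap first, which is the job vrev32.u8
       does for four words at a time in a q register. */
    static void
    store_le32(uint8_t *dst, uint32_t w)
    {
      dst[0] = (uint8_t) (w & 0xff);
      dst[1] = (uint8_t) ((w >> 8) & 0xff);
      dst[2] = (uint8_t) ((w >> 16) & 0xff);
      dst[3] = (uint8_t) ((w >> 24) & 0xff);
    }
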
@@ -88,7 +88,7 @@ define(<QROUND>, <
PROLOGUE(_nettle_salsa20_core)
vldm SRC, {X0,X1,X2,X3}
C Input rows:
C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
@@ -99,6 +99,20 @@ PROLOGUE(_nettle_salsa20_core)
C 8 13 2 7
C 12 1 6 11
C Input rows big-endian:
C 1 0 3 2 X0
C 5 4 7 6 X1
C 9 8 11 10 X2
C 13 12 15 14 X3
C even and odd columns are switched because
C vldm loads consecutive doublewords and, in
C big-endian mode, swaps the two words inside each
C Permuted to:
C 5 0 15 10
C 9 4 3 14
C 13 8 7 2
C 1 12 11 6
C FIXME: Construct in some other way?
adr r12, .Lmasks
vldm r12, {M0101, M0110, M0011}
@@ -112,6 +126,7 @@ PROLOGUE(_nettle_salsa20_core)
C 4 1 6 3 T0 v
C 8 13 10 15 T1 ^
C 12 9 14 11 X3 v
C same in big endian just with transposed rows
vmov T0, X1
vmov T1, X2
vbit T0, X0, M0101
@@ -140,22 +155,34 @@ PROLOGUE(_nettle_salsa20_core)
.Loop:
QROUND(X0, X1, X2, X3)
C Rotate rows, to get
C In little-endian rotate rows, to get
C 0 5 10 15
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
vext.32 X1, X1, X1, #3
C In big-endian rotate rows, to get
C 5 0 15 10
C 4 3 14 9 >>> 3
C 7 2 13 8 >>> 2
C 6 1 12 11 >>> 1
C a different number of elements needs to be
C extracted on BE because of the different column order
IF_LE(< vext.32 X1, X1, X1, #3>)
IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
vext.32 X3, X3, X3, #1
IF_LE(< vext.32 X3, X3, X3, #1>)
IF_BE(< vext.32 X3, X3, X3, #3>)
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
vext.32 X1, X1, X1, #1
IF_LE(< vext.32 X1, X1, X1, #1>)
IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
vext.32 X3, X3, X3, #3
IF_LE(< vext.32 X3, X3, X3, #3>)
IF_BE(< vext.32 X3, X3, X3, #1>)
bhi .Loop
@@ -181,6 +208,12 @@ PROLOGUE(_nettle_salsa20_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
C caller expects result little-endian
IF_BE(< vrev32.u8 X0, X0
vrev32.u8 X1, X1
vrev32.u8 X2, X2
vrev32.u8 X3, X3>)
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_salsa20_core)
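
The salsa20 changes mirror the chacha ones. The relation between the vext immediates, #k in little-endian versus #((4 - k) mod 4) in big-endian, can be checked with a throwaway C test (vext4 and swap_pairs are illustration-only models of vext.32 and of the lane swap caused by the big-endian vldm):

    #include <assert.h>
    #include <stdint.h>

    /* Model of vext.32 q,q,q,#k: rotate four lanes by k positions. */
    static void
    vext4(const uint32_t in[4], uint32_t out[4], unsigned k)
    {
      for (unsigned i = 0; i < 4; i++)
        out[i] = in[(i + k) & 3];
    }

    /* Model of the lane-pair swap a big-endian vldm produces. */
    static void
    swap_pairs(const uint32_t in[4], uint32_t out[4])
    {
      out[0] = in[1]; out[1] = in[0]; out[2] = in[3]; out[3] = in[2];
    }

    int
    main(void)
    {
      uint32_t row[4] = {4, 5, 6, 7};        /* an arbitrary logical row */
      for (unsigned k = 1; k < 4; k++)
        {
          uint32_t le[4], be_in[4], be_out[4], expect[4];
          vext4(row, le, k);                 /* little-endian: vext #k */
          swap_pairs(le, expect);            /* same result in BE lane order */
          swap_pairs(row, be_in);            /* row as it sits in BE lanes */
          vext4(be_in, be_out, (4 - k) & 3); /* big-endian: vext #(4-k) */
          for (unsigned i = 0; i < 4; i++)
            assert(be_out[i] == expect[i]);
        }
      return 0;
    }
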
@@ -97,6 +97,8 @@ PROLOGUE(_nettle_umac_nh)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
vmov r0, r1, D0REG(QY)
C return value needs to respect word order mandated by AAPCS
IF_LE(< vmov r0, r1, D0REG(QY)>)
IF_BE(< vmov r1, r0, D0REG(QY)>)
bx lr
EPILOGUE(_nettle_umac_nh)
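
The IF_LE/IF_BE vmov variants above implement the AAPCS rule that a 64-bit result travels in r0:r1 laid out like its in-memory image, so r0 carries the least significant word on little-endian but the most significant word on big-endian. A hypothetical C sketch of that split (reg_pair and split_return are illustration only):

    #include <stdint.h>

    /* Illustration only: how a uint64_t return value maps onto r0/r1
       under the AAPCS, which lays the register pair out like the
       value's memory image. */
    struct reg_pair { uint32_t r0, r1; };

    static struct reg_pair
    split_return(uint64_t y)
    {
      struct reg_pair p;
    #if defined(__ARMEB__)
      p.r0 = (uint32_t) (y >> 32);  /* big-endian: high word in r0 */
      p.r1 = (uint32_t) y;
    #else
      p.r0 = (uint32_t) y;          /* little-endian: low word in r0 */
      p.r1 = (uint32_t) (y >> 32);
    #endif
      return p;
    }
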
@@ -52,7 +52,7 @@ define(<LOAD>, <
sel W, WPREV, T0
ror W, W, SHIFT
mov WPREV, T0
rev W, W
IF_LE(< rev W, W>)
str W, [SP,#eval(4*$1)]
>)
define(<EXPN>, <
@@ -127,8 +127,12 @@ PROLOGUE(_nettle_sha1_compress)
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
lsl W, T0, SHIFT
IF_LE(< lsl W, T0, SHIFT>)
IF_BE(< lsr W, T0, SHIFT>)
uadd8 T0, T0, W C Sets APSR.GE bits
C on BE rotate right by 32-SHIFT bits
C because there is no rotate left
IF_BE(< rsb SHIFT, SHIFT, #32>)
ldr K, .LK1
ldm STATE, {SA,SB,SC,SD,SE}
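
The setup above builds a per-byte mask (via uadd8 setting the GE flags) that the later sel uses to stitch unaligned input words together; the mask's shift direction flips between LE and BE just as in memxor, and on BE SHIFT is additionally replaced with 32-SHIFT because ror can only rotate right. The rotate identity, with hypothetical rotr32/rotl32 helpers:

    #include <stdint.h>

    /* Hypothetical helpers illustrating "rsb SHIFT, SHIFT, #32": ARM's
       barrel shifter has no rotate-left, so rotating left by n bits is
       done as rotating right by 32 - n. */
    static inline uint32_t
    rotr32(uint32_t x, unsigned n)
    {
      n &= 31;
      return (x >> n) | (x << ((32 - n) & 31));
    }

    static inline uint32_t
    rotl32(uint32_t x, unsigned n)
    {
      return rotr32(x, (32 - n) & 31);
    }
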
@@ -137,8 +137,12 @@ PROLOGUE(_nettle_sha256_compress)
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
lsl I1, T0, SHIFT
IF_LE(< lsl I1, T0, SHIFT>)
IF_BE(< lsr I1, T0, SHIFT>)
uadd8 T0, T0, I1 C Sets APSR.GE bits
C on BE rotate right by 32-SHIFT bits
C because there is no rotate left
IF_BE(< rsb SHIFT, SHIFT, #32>)
mov DST, sp
mov ILEFT, #4
@@ -146,16 +150,16 @@ PROLOGUE(_nettle_sha256_compress)
ldm INPUT!, {I1,I2,I3,I4}
sel I0, I0, I1
ror I0, I0, SHIFT
rev I0, I0
IF_LE(< rev I0, I0>)
sel I1, I1, I2
ror I1, I1, SHIFT
rev I1, I1
IF_LE(< rev I1, I1>)
sel I2, I2, I3
ror I2, I2, SHIFT
rev I2, I2
IF_LE(< rev I2, I2>)
sel I3, I3, I4
ror I3, I3, SHIFT
rev I3, I3
IF_LE(< rev I3, I3>)
subs ILEFT, ILEFT, #1
stm DST!, {I0,I1,I2,I3}
mov I0, I4
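
Finally, the IF_LE guards around the rev instructions in both the sha1 LOAD macro and the sha256 copy loop reflect that SHA message words are big-endian in the input stream: a little-endian host byte-swaps each word after loading, a big-endian host can use it as is. A hypothetical load_be32 helper shows the same idea for an aligned word:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: read one 32-bit big-endian message word.
       The byte swap corresponds to the rev that is now executed only
       when the host is little-endian. */
    static uint32_t
    load_be32(const uint8_t *p)
    {
      uint32_t w;
      memcpy(&w, p, sizeof w);    /* 32-bit load in host byte order */
    #if !defined(__ARMEB__)
      w = __builtin_bswap32(w);   /* IF_LE: rev */
    #endif
      return w;
    }
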