Commit 14ee65fc authored by Niels Möller
Initial ARM assembly for sha3_permute.

parent 12003c62
2013-03-15 Niels Möller <>
* armv7/sha3-permute.asm: New file. 4.5 times speedup.
* armv7/machine.m4 (QREG): New macro.
2013-03-14 Niels Möller <>
C nettle, low-level cryptographics library
C Copyright (C) 2013 Niels Möller
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
.file "sha3-permute.asm"
.fpu neon

C Core registers.
C CTX is the state pointer, COUNT the remaining-round counter and RC the
C cursor into the round constant table.
define(<CTX>, <r0>)
define(<COUNT>, <r1>)
define(<RC>, <r2>)

C State words A0..A24, one 64-bit d register each.
C
C The code applies 128-bit operations through QREG() to pairs of words,
C so the pairs below must stay in adjacent even/odd d registers:
C   (A5,A10) (A15,A20)                  first column
C   (A1,A2) (A3,A4) (A6,A7) (A8,A9) (A11,A12) (A13,A14)
C   (A16,A17) (A18,A19) (A21,A22) (A23,A24)
C A0 has no partner; its odd neighbour d1 is used for C0.

C Row 0
define(<A0>, <d0>)
define(<A1>, <d6>)
define(<A2>, <d7>)
define(<A3>, <d8>)
define(<A4>, <d9>)

C Row 1
define(<A5>, <d2>)
define(<A6>, <d16>)
define(<A7>, <d17>)
define(<A8>, <d18>)
define(<A9>, <d19>)

C Row 2
define(<A10>, <d3>)
define(<A11>, <d20>)
define(<A12>, <d21>)
define(<A13>, <d22>)
define(<A14>, <d23>)

C Row 3
define(<A15>, <d4>)
define(<A16>, <d24>)
define(<A17>, <d25>)
define(<A18>, <d26>)
define(<A19>, <d27>)

C Row 4
define(<A20>, <d5>)
define(<A21>, <d28>)
define(<A22>, <d29>)
define(<A23>, <d30>)
define(<A24>, <d31>)

C Column parities for the theta step, also reused as scratch in chi.
C (C1,C2) and (C3,C4) are adjacent pairs for QREG() use.
define(<C0>, <d1>)
define(<C1>, <d12>)
define(<C2>, <d13>)
define(<C3>, <d14>)
define(<C4>, <d15>)

C Temporaries; (T0,T1) is an adjacent pair for QREG() use.
define(<T0>, <d10>)
define(<T1>, <d11>)
C ROL(DST, SRC, N)
C Rotate the 64-bit value in SRC left by N bits and leave it in DST.
C vshr puts the top N bits of SRC at the bottom of DST, then vsli
C shifts SRC left by N and inserts it above them, keeping the low N
C bits of DST. DST is written before SRC is read for the second time,
C hence the restriction below.
C Must have SRC != DST
define(<ROL>, <
	vshr.u64	$1, $2, #eval(64-$3)
	vsli.i64	$1, $2, #$3
	>)
C sha3_permute(struct sha3_ctx *ctx)
C Round constants, one 64-bit word per round, read in order through
C the RC pointer. The table lives in the same section as the code
C because it is addressed with a pc-relative adr, and it is aligned to
C 8 bytes because it is loaded with a :64 alignment qualifier.
	.text
	.align 3
.Lrc:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808A
	.quad	0x8000000080008000
	.quad	0x000000000000808B
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008A
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000A
	.quad	0x000000008000808B
	.quad	0x800000000000008B
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800A
	.quad	0x800000008000000A
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
C Entry point for the permutation.
C In:    r0 (CTX) points at 25 consecutive 64-bit state words.
C Out:   the same 25 words, updated in place after 24 rounds.
C Clobb: r0, r1, r2, flags, d0-d7, d16-d31.
C        d8-d15 are used but saved and restored.
C NOTE(review): the symbol name follows nettle's nettle_ prefix
C convention for sha3_permute; confirm against the C header.
PROLOGUE(nettle_sha3_permute)
	C d8-d15 are callee-saved and are used for A3, A4, T0, T1, C1-C4.
	vpush	{d8-d15}

	C Load the state. The first word of each row sits in a register
	C of its own, the other four in consecutive d registers, so each
	C row takes one single load and one vldm.
	vld1.64	{A0}, [CTX]!
	vldm	CTX!, {A1,A2,A3,A4}
	vld1.64	{A5}, [CTX]!
	vldm	CTX!, {A6,A7,A8,A9}
	vld1.64	{A10}, [CTX]!
	vldm	CTX!, {A11,A12,A13,A14}
	vld1.64	{A15}, [CTX]!
	vldm	CTX!, {A16,A17,A18,A19}
	vld1.64	{A20}, [CTX]!
	vldm	CTX, {A21,A22,A23,A24}
	C Writeback moved CTX past 21 words (168 bytes); rewind it so the
	C stores at the end start from the first word again.
	sub	CTX, CTX, #168

	mov	COUNT, #24
	adr	RC, .Lrc

	.align 3
.Loop:
	C Theta step, column parities.
	C T0, T1 = A5 ^ A15, A10 ^ A20, then C0 = A0 ^ T0 ^ T1.
	veor	QREG(T0), QREG(A5), QREG(A15)
	veor	C0, A0, T0
	veor	C0, C0, T1

	C C1, C2 = xor of columns 1 and 2, two words per instruction.
	veor	QREG(C1), QREG(A1), QREG(A6)
	veor	QREG(C1), QREG(C1), QREG(A11)
	veor	QREG(C1), QREG(C1), QREG(A16)
	veor	QREG(C1), QREG(C1), QREG(A21)

	C C3, C4 = xor of columns 3 and 4.
	veor	QREG(C3), QREG(A3), QREG(A8)
	veor	QREG(C3), QREG(C3), QREG(A13)
	veor	QREG(C3), QREG(C3), QREG(A18)
	veor	QREG(C3), QREG(C3), QREG(A23)

	C FIXME: Can we make use of 128-bit xors?
	C One more register would help. Or the VSLI instruction?

	C D0 = C4 ^ (C1 <<< 1)
	vshl.i64	T0, C1, #1
	vshr.u64	T1, C1, #63
	veor	T0, T0, C4
	veor	T0, T0, T1
	veor	A0, A0, T0
	veor	A5, A5, T0
	veor	A10, A10, T0
	veor	A15, A15, T0
	veor	A20, A20, T0

	C D1 = C0 ^ (C2 <<< 1)
	vshl.i64	T0, C2, #1
	vshr.u64	T1, C2, #63
	veor	T0, T0, C0
	veor	T0, T0, T1
	veor	A1, A1, T0
	veor	A6, A6, T0
	veor	A11, A11, T0
	veor	A16, A16, T0
	veor	A21, A21, T0

	C D2 = C1 ^ (C3 <<< 1)
	vshl.i64	T0, C3, #1
	vshr.u64	T1, C3, #63
	veor	T0, T0, C1
	veor	T0, T0, T1
	veor	A2, A2, T0
	veor	A7, A7, T0
	veor	A12, A12, T0
	veor	A17, A17, T0
	veor	A22, A22, T0

	C D3 = C2 ^ (C4 <<< 1)
	vshl.i64	T0, C4, #1
	vshr.u64	T1, C4, #63
	veor	T0, T0, C2
	veor	T0, T0, T1
	veor	A3, A3, T0
	veor	A8, A8, T0
	veor	A13, A13, T0
	veor	A18, A18, T0
	veor	A23, A23, T0

	C D4 = C3 ^ (C0 <<< 1)
	vshl.i64	T0, C0, #1
	vshr.u64	T1, C0, #63
	veor	T0, T0, C3
	veor	T0, T0, T1
	veor	A4, A4, T0
	veor	A9, A9, T0
	veor	A14, A14, T0
	veor	A19, A19, T0
	veor	A24, A24, T0

	C Rotate and permute the 24 words other than A0 along one chain.
	C Each ROL reads the word that the next ROL overwrites, so the
	C order matters. The first result is parked in T0 because its
	C destination, A10, is still needed as the last source.
	ROL( T0,  A1,  1)
	ROL( A1,  A6, 44)
	ROL( A6,  A9, 20)
	ROL( A9, A22, 61)
	ROL(A22, A14, 39)
	ROL(A14, A20, 18)
	ROL(A20,  A2, 62)
	ROL( A2, A12, 43)
	ROL(A12, A13, 25)
	ROL(A13, A19,  8)
	ROL(A19, A23, 56)
	ROL(A23, A15, 41)
	ROL(A15,  A4, 27)
	ROL( A4, A24, 14)
	ROL(A24, A21,  2)
	ROL(A21,  A8, 55)
	ROL( A8, A16, 45)
	ROL(A16,  A5, 36)
	ROL( A5,  A3, 28)
	ROL( A3, A18, 21)
	ROL(A18, A17, 15)
	ROL(A17, A11, 10)
	ROL(A11,  A7,  6)
	ROL( A7, A10,  3)
	C New A10 value left in T0
	vmov	A10, T0

	C Chi step, row 0. All five and-not terms are computed from the
	C old row values before any word of the row is updated.
	vbic	C0, A2, A1
	vbic	C1, A3, A2
	vbic	C2, A4, A3
	vbic	C3, A0, A4
	vbic	C4, A1, A0

	veor	A0, A0, C0
	C C0 is free again: fetch this round's constant and step RC.
	vld1.64	{C0}, [RC :64]!
	veor	QREG(A1), QREG(A1), QREG(C1)
	veor	QREG(A3), QREG(A3), QREG(C3)
	C Xor the round constant into A0.
	veor	A0, A0, C0

	C Chi step, row 1.
	vbic	C0, A7, A6
	vbic	C1, A8, A7
	vbic	C2, A9, A8
	vbic	C3, A5, A9
	vbic	C4, A6, A5

	veor	A5, A5, C0
	veor	QREG(A6), QREG(A6), QREG(C1)
	veor	QREG(A8), QREG(A8), QREG(C3)

	C Chi step, row 2.
	vbic	C0, A12, A11
	vbic	C1, A13, A12
	vbic	C2, A14, A13
	vbic	C3, A10, A14
	vbic	C4, A11, A10

	veor	A10, A10, C0
	veor	QREG(A11), QREG(A11), QREG(C1)
	veor	QREG(A13), QREG(A13), QREG(C3)

	C Chi step, row 3.
	vbic	C0, A17, A16
	vbic	C1, A18, A17
	vbic	C2, A19, A18
	vbic	C3, A15, A19
	vbic	C4, A16, A15

	veor	A15, A15, C0
	veor	QREG(A16), QREG(A16), QREG(C1)
	veor	QREG(A18), QREG(A18), QREG(C3)

	C Chi step, row 4.
	vbic	C0, A22, A21
	vbic	C1, A23, A22
	vbic	C2, A24, A23
	vbic	C3, A20, A24
	vbic	C4, A21, A20

	C Count down here; the veor instructions that follow do not
	C write the condition flags, so bne still sees this result.
	subs	COUNT, COUNT, #1
	veor	A20, A20, C0
	veor	QREG(A21), QREG(A21), QREG(C1)
	veor	QREG(A23), QREG(A23), QREG(C3)

	bne	.Loop

	C Store the state back, mirroring the load sequence.
	vst1.64	{A0}, [CTX]!
	vstm	CTX!, {A1,A2,A3,A4}
	vst1.64	{A5}, [CTX]!
	vstm	CTX!, {A6,A7,A8,A9}
	vst1.64	{A10}, [CTX]!
	vstm	CTX!, {A11,A12,A13,A14}
	vst1.64	{A15}, [CTX]!
	vstm	CTX!, {A16,A17,A18,A19}
	vst1.64	{A20}, [CTX]!
	vstm	CTX, {A21,A22,A23,A24}

	vpop	{d8-d15}
	bx	lr
EPILOGUE(nettle_sha3_permute)
