C nettle, low-level cryptographic library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02110-1301, USA.

	.file "sha3-permute.asm"
	.fpu	neon

define(<CTX>, <r0>)
define(<COUNT>, <r1>)
define(<RC>, <r2>)

C First column of the state matrix: A0, A5, A10, A15, A20
define(<A0>, <d0>)
define(<A5>, <d2>)
define(<A10>, <d3>)
define(<A15>, <d4>)
define(<A20>, <d5>)

define(<A1>, <d6>)
define(<A2>, <d7>)
define(<A3>, <d8>)
define(<A4>, <d9>)

define(<A6>, <d16>)
define(<A7>, <d17>)
define(<A8>, <d18>)
define(<A9>, <d19>)

define(<A11>, <d20>)
define(<A12>, <d21>)
define(<A13>, <d22>)
define(<A14>, <d23>)

define(<A16>, <d24>)
define(<A17>, <d25>)
define(<A18>, <d26>)
define(<A19>, <d27>)

define(<A21>, <d28>)
define(<A22>, <d29>)
define(<A23>, <d30>)
define(<A24>, <d31>)

define(<T0>, <d10>)
define(<T1>, <d11>)

define(<C0>, <d1>)
define(<C1>, <d12>)
define(<C2>, <d13>)
define(<C3>, <d14>)
define(<C4>, <d15>)

C ROL(DST, SRC, COUNT)
C Must have SRC != DST
define(<ROL>, <
	vshr.u64	$1, $2, #eval(64-$3)
	vsli.i64	$1, $2, #$3
>)

C sha3_permute(struct sha3_ctx *ctx)
	.text
	.align	3
.Lrc:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808A
	.quad	0x8000000080008000
	.quad	0x000000000000808B
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008A
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000A
	.quad	0x000000008000808B
	.quad	0x800000000000008B
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800A
	.quad	0x800000008000000A
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

PROLOGUE(nettle_sha3_permute)
	vpush	{d8-d15}

	vld1.64	{A0}, [CTX]!
	vldm	CTX!, {A1,A2,A3,A4}
	vld1.64	{A5}, [CTX]!
	vldm	CTX!, {A6,A7,A8,A9}
	vld1.64	{A10}, [CTX]!
	vldm	CTX!, {A11,A12,A13,A14}
	vld1.64	{A15}, [CTX]!
	vldm	CTX!, {A16,A17,A18,A19}
	vld1.64	{A20}, [CTX]!
	vldm	CTX, {A21,A22,A23,A24}
	sub	CTX, CTX, #168

	mov	COUNT, #24
	adr	RC, .Lrc

	.align	3
.Loop:
	C Theta: compute the column parities,
	C   C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20]
	C C1,C2 and C3,C4 are accumulated with 128-bit xors; the C0
	C column is folded through the T0,T1 pair.
	veor	QREG(T0), QREG(A5), QREG(A15)
	veor	C0, A0, T0
	veor	C0, C0, T1
	veor	QREG(C1), QREG(A1), QREG(A6)
	veor	QREG(C1), QREG(C1), QREG(A11)
	veor	QREG(C1), QREG(C1), QREG(A16)
	veor	QREG(C1), QREG(C1), QREG(A21)

	veor	QREG(C3), QREG(A3), QREG(A8)
	veor	QREG(C3), QREG(C3), QREG(A13)
	veor	QREG(C3), QREG(C3), QREG(A18)
	veor	QREG(C3), QREG(C3), QREG(A23)

	C FIXME: Can we make use of 128-bit xors?
	C One more register would help. Or the VSLI instruction?
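
	C Theta continues below by forming and applying the D values,
	C   D[x] = C[(x+4) % 5] ^ (C[(x+1) % 5] <<< 1)
	C   A[x + 5*y] ^= D[x],  y = 0..4
	C The rotate is open-coded as a vshl/vshr pair rather than via
	C ROL: vsli reads its destination, while the two plain shifts
	C into T0 and T1 are independent of each other.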

	C D0 = C4 ^ (C1 <<< 1)
	vshl.i64	T0, C1, #1
	vshr.u64	T1, C1, #63
	veor	T0, T0, C4
	veor	T0, T0, T1
	veor	A0, A0, T0
	veor	A5, A5, T0
	veor	A10, A10, T0
	veor	A15, A15, T0
	veor	A20, A20, T0

	C D1 = C0 ^ (C2 <<< 1)
	vshl.i64	T0, C2, #1
	vshr.u64	T1, C2, #63
	veor	T0, T0, C0
	veor	T0, T0, T1
	veor	A1, A1, T0
	veor	A6, A6, T0
	veor	A11, A11, T0
	veor	A16, A16, T0
	veor	A21, A21, T0

	C D2 = C1 ^ (C3 <<< 1)
	vshl.i64	T0, C3, #1
	vshr.u64	T1, C3, #63
	veor	T0, T0, C1
	veor	T0, T0, T1
	veor	A2, A2, T0
	veor	A7, A7, T0
	veor	A12, A12, T0
	veor	A17, A17, T0
	veor	A22, A22, T0

	C D3 = C2 ^ (C4 <<< 1)
	vshl.i64	T0, C4, #1
	vshr.u64	T1, C4, #63
	veor	T0, T0, C2
	veor	T0, T0, T1
	veor	A3, A3, T0
	veor	A8, A8, T0
	veor	A13, A13, T0
	veor	A18, A18, T0
	veor	A23, A23, T0

	C D4 = C3 ^ (C0 <<< 1)
	vshl.i64	T0, C0, #1
	vshr.u64	T1, C0, #63
	veor	T0, T0, C3
	veor	T0, T0, T1
	veor	A4, A4, T0
	veor	A9, A9, T0
	veor	A14, A14, T0
	veor	A19, A19, T0
	veor	A24, A24, T0

	C Rho and pi combined: rotate each lane by its offset and move
	C it to its new position, following a single 24-element cycle.
	C T0 buffers the value destined for A10 until the old A10 has
	C been consumed by the last step.
	ROL( T0,  A1,  1)
	ROL( A1,  A6, 44)
	ROL( A6,  A9, 20)
	ROL( A9, A22, 61)
	ROL(A22, A14, 39)
	ROL(A14, A20, 18)
	ROL(A20,  A2, 62)
	ROL( A2, A12, 43)
	ROL(A12, A13, 25)
	ROL(A13, A19,  8)
	ROL(A19, A23, 56)
	ROL(A23, A15, 41)
	ROL(A15,  A4, 27)
	ROL( A4, A24, 14)
	ROL(A24, A21,  2)
	ROL(A21,  A8, 55)
	ROL( A8, A16, 45)
	ROL(A16,  A5, 36)
	ROL( A5,  A3, 28)
	ROL( A3, A18, 21)
	ROL(A18, A17, 15)
	ROL(A17, A11, 10)
	ROL(A11,  A7,  6)
	ROL( A7, A10,  3)
	vmov	A10, T0

	C Chi: A[x] ^= ~A[x+1] & A[x+2], row by row (indices mod 5
	C within the row), with the and-not done by vbic. For the first
	C row, the round constant is xored in as well (iota).
	vbic	C0, A2, A1
	vbic	C1, A3, A2
	vbic	C2, A4, A3
	vbic	C3, A0, A4
	vbic	C4, A1, A0
	veor	A0, A0, C0
	vld1.64	{C0}, [RC :64]!
	veor	QREG(A1), QREG(A1), QREG(C1)
	veor	QREG(A3), QREG(A3), QREG(C3)
	veor	A0, A0, C0	C iota

	vbic	C0, A7, A6
	vbic	C1, A8, A7
	vbic	C2, A9, A8
	vbic	C3, A5, A9
	vbic	C4, A6, A5
	veor	A5, A5, C0
	veor	QREG(A6), QREG(A6), QREG(C1)
	veor	QREG(A8), QREG(A8), QREG(C3)

	vbic	C0, A12, A11
	vbic	C1, A13, A12
	vbic	C2, A14, A13
	vbic	C3, A10, A14
	vbic	C4, A11, A10
	veor	A10, A10, C0
	veor	QREG(A11), QREG(A11), QREG(C1)
	veor	QREG(A13), QREG(A13), QREG(C3)

	vbic	C0, A17, A16
	vbic	C1, A18, A17
	vbic	C2, A19, A18
	vbic	C3, A15, A19
	vbic	C4, A16, A15
	veor	A15, A15, C0
	veor	QREG(A16), QREG(A16), QREG(C1)
	veor	QREG(A18), QREG(A18), QREG(C3)

	vbic	C0, A22, A21
	vbic	C1, A23, A22
	vbic	C2, A24, A23
	vbic	C3, A20, A24
	vbic	C4, A21, A20
	subs	COUNT, COUNT, #1	C decrement round count
	veor	A20, A20, C0
	veor	QREG(A21), QREG(A21), QREG(C1)
	veor	QREG(A23), QREG(A23), QREG(C3)

	bne	.Loop

	vst1.64	{A0}, [CTX]!
	vstm	CTX!, {A1,A2,A3,A4}
	vst1.64	{A5}, [CTX]!
	vstm	CTX!, {A6,A7,A8,A9}
	vst1.64	{A10}, [CTX]!
	vstm	CTX!, {A11,A12,A13,A14}
	vst1.64	{A15}, [CTX]!
	vstm	CTX!, {A16,A17,A18,A19}
	vst1.64	{A20}, [CTX]!
	vstm	CTX, {A21,A22,A23,A24}

	vpop	{d8-d15}
	bx	lr
EPILOGUE(nettle_sha3_permute)
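
C A minimal usage sketch in C, assuming nettle's public sha3.h
C interface, where struct sha3_state holds the 25 64-bit lanes:
C
C   #include <nettle/sha3.h>
C
C   struct sha3_state state = { { 0 } };  /* all-zero initial state */
C   sha3_permute(&state);                 /* all 24 rounds, in place */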