C sha3-permute.asm
C nettle, low-level cryptographics library
C 
C Copyright (C) 2013 Niels Möller
C  
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C 
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C 
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

	.file "sha3-permute.asm"
	.fpu	neon

C Core registers used by the function in this file.
C   CTX    base address for the state loads and stores
C   COUNT  loop counter, counts down from 24
C   RC     address of the next entry in the .Lrc table
define(<CTX>, <r0>)
define(<COUNT>, <r1>)
define(<RC>, <r2>)
C The 25 state words A0..A24 each live in their own 64-bit D register.
C The allocation is deliberate: A5/A10 (d2/d3), A15/A20 (d4/d5) and,
C in every row, the words at positions 1,2 and 3,4 (d6/d7, d8/d9,
C d16/d17, d18/d19, ...) occupy consecutive even/odd D registers.  The
C function body hands the even-numbered name of such a couple to QREG
C to process both halves with one instruction, so changing any single
C assignment below requires re-checking every QREG use.
C A3 and A4 (d8, d9) fall in the d8-d15 range that the function saves
C with vpush and restores with vpop.
C First column
define(<A0>, <d0>)
define(<A5>, <d2>)
define(<A10>, <d3>)
define(<A15>, <d4>)
define(<A20>, <d5>)

C Rest of the first row.
define(<A1>, <d6>)
define(<A2>, <d7>)
define(<A3>, <d8>)
define(<A4>, <d9>)

C Rest of the second row.
define(<A6>, <d16>)
define(<A7>, <d17>)
define(<A8>, <d18>)
define(<A9>, <d19>)

C Rest of the third row.
define(<A11>, <d20>)
define(<A12>, <d21>)
define(<A13>, <d22>)
define(<A14>, <d23>)

C Rest of the fourth row.
define(<A16>, <d24>)
define(<A17>, <d25>)
define(<A18>, <d26>)
define(<A19>, <d27>)

C Rest of the fifth row.
define(<A21>, <d28>)
define(<A22>, <d29>)
define(<A23>, <d30>)
define(<A24>, <d31>)

C Scratch couple (d10/d11); also used together through QREG(T0).
define(<T0>, <d10>)
define(<T1>, <d11>)

C Five per-column temporaries.  C0 shares q0 with A0 and is only ever
C used as a D register; C1/C2 (d12/d13) and C3/C4 (d14/d15) are couples
C used through QREG.  T0, T1 and C1..C4 are all in the d8-d15 range
C that the function saves and restores.
define(<C0>, <d1>)
define(<C1>, <d12>)
define(<C2>, <d13>)
define(<C3>, <d14>)
define(<C4>, <d15>)


C ROL(DST, SRC, COUNT)
C Rotate the 64-bit value in SRC left by COUNT bits and put the result
C in DST; SRC is left unchanged.
C   vshr.u64  DST = SRC >> (64 - COUNT)   the bits that wrap around
C   vsli.i64  inserts SRC << COUNT into DST, keeping DST's low COUNT bits
C Must have SRC != DST, since DST is overwritten by the first
C instruction and SRC is read again by the second.
C Every use in this file has 1 <= COUNT <= 62.
define(<ROL>, <
	vshr.u64	$1, $2, #eval(64-$3)
	vsli.i64	$1, $2, #$3
	>)
C sha3_permute(struct sha3_ctx *ctx)

	.text
	.align	3
C Table of 24 64-bit round constants, one per pass through .Loop.
C The function points RC at the first entry with adr and then reads
C the entries strictly in order with a post-incremented 64-bit load.
C The table sits in .text directly ahead of the function so that the
C pc-relative adr can reach it, and it is 8-byte aligned (.align 3)
C to match the :64 alignment qualifier on that load.
C The values are the round constants of the 24-round Keccak-f[1600]
C permutation specified for SHA-3.
.Lrc:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808A
	.quad	0x8000000080008000
	.quad	0x000000000000808B
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008A
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000A
	.quad	0x000000008000808B
	.quad	0x800000000000008B
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800A
	.quad	0x800000008000000A
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
	
C nettle_sha3_permute
C
C Loads 25 consecutive 64-bit words from the address in CTX into
C A0..A24 (in memory order), runs the .Loop body 24 times, consuming
C one .Lrc constant per pass, and stores A0..A24 back over the same
C 200 bytes.
C
C Registers: r0, r1, r2 and the condition flags are modified.  All of
C d0-d31 are used; d8-d15 are pushed on entry and popped before the
C return.  On return r0 points 168 bytes past its entry value.
C
C NOTE(review): QREG is not defined in this file.  The comments below
C assume QREG(dN), for even N, names the 128-bit register formed by dN
C and dN+1, so that a QREG instruction acts on two adjacent aliases at
C once -- confirm against the macro's definition.
PROLOGUE(nettle_sha3_permute)
	C d8-d15 hold A3, A4, T0, T1 and C1..C4; save the caller's values.
	vpush	{d8-d15}

	C Load the state.  The first word of each row goes to its own
	C register with vld1; the other four are in consecutive D
	C registers and are fetched with one vldm.
	vld1.64	{A0}, [CTX]!
	vldm	CTX!, {A1,A2,A3,A4}
	vld1.64	{A5}, [CTX]!
	vldm	CTX!, {A6,A7,A8,A9}
	vld1.64	{A10}, [CTX]!
	vldm	CTX!, {A11,A12,A13,A14}
	vld1.64	{A15}, [CTX]!
	vldm	CTX!, {A16,A17,A18,A19}
	vld1.64	{A20}, [CTX]!
	vldm	CTX, {A21,A22,A23,A24}
	C The nine write-back loads advanced CTX by 5*8 + 4*32 = 168 bytes
	C (the last vldm has no write-back); rewind to the start of the
	C state for the stores at the end.
	sub	CTX, CTX, #168

	mov	COUNT, #24
	adr	RC, .Lrc

	C Start the loop body on an 8-byte boundary.
	.align 3
.Loop:
	C Column parities: Cx = Ax ^ A(x+5) ^ A(x+10) ^ A(x+15) ^ A(x+20).
	C One 128-bit xor gives T0 = A5 ^ A15 and T1 = A10 ^ A20, which
	C are then folded into C0 together with A0.
	veor	QREG(T0), QREG(A5), QREG(A15)
	veor	C0, A0, T0
	veor	C0, C0, T1
	C C1 and C2 together, from the A1/A2, A6/A7, ... couples.
	veor	QREG(C1), QREG(A1), QREG(A6)
	veor	QREG(C1), QREG(C1), QREG(A11)
	veor	QREG(C1), QREG(C1), QREG(A16)
	veor	QREG(C1), QREG(C1), QREG(A21)

	C C3 and C4 together, from the A3/A4, A8/A9, ... couples.
	veor	QREG(C3), QREG(A3), QREG(A8)
	veor	QREG(C3), QREG(C3), QREG(A13)
	veor	QREG(C3), QREG(C3), QREG(A18)
	veor	QREG(C3), QREG(C3), QREG(A23)

	C Each Dx below is built in T0 (T1 holds the bit rotated out of
	C the top) and xored into the five words of column x.  C0..C4 are
	C only read here, so the order of the five groups does not matter.
	C	FIXME: Can we make use of 128-bit xors?
	C	One more register would help. Or the VSLI instruction?
	C 	D0 = C4 ^ (C1 <<< 1)
	vshl.i64	T0, C1, #1
	vshr.u64	T1, C1, #63
	veor		T0, T0, C4
	veor		T0, T0, T1
	veor		A0, A0, T0
	veor		A5, A5, T0
	veor		A10, A10, T0
	veor		A15, A15, T0
	veor		A20, A20, T0

	C 	D1 = C0 ^ (C2 <<< 1)
	vshl.i64	T0, C2, #1
	vshr.u64	T1, C2, #63
	veor		T0, T0, C0
	veor		T0, T0, T1
	veor		A1, A1, T0
	veor		A6, A6, T0
	veor		A11, A11, T0
	veor		A16, A16, T0
	veor		A21, A21, T0

	C 	D2 = C1 ^ (C3 <<< 1)
	vshl.i64	T0, C3, #1
	vshr.u64	T1, C3, #63
	veor		T0, T0, C1
	veor		T0, T0, T1
	veor		A2, A2, T0
	veor		A7, A7, T0
	veor		A12, A12, T0
	veor		A17, A17, T0
	veor		A22, A22, T0

	C 	D3 = C2 ^ (C4 <<< 1)
	vshl.i64	T0, C4, #1
	vshr.u64	T1, C4, #63
	veor		T0, T0, C2
	veor		T0, T0, T1
	veor		A3, A3, T0
	veor		A8, A8, T0
	veor		A13, A13, T0
	veor		A18, A18, T0
	veor		A23, A23, T0

	C 	D4 = C3 ^ (C0 <<< 1)
	vshl.i64	T0, C0, #1
	vshr.u64	T1, C0, #63
	veor		T0, T0, C3
	veor		T0, T0, T1
	veor		A4, A4, T0
	veor		A9, A9, T0
	veor		A14, A14, T0
	veor		A19, A19, T0
	veor		A24, A24, T0

	C Rotate and move 24 of the words in place along a single cycle.
	C Each ROL writes the register that the previous ROL just read,
	C so the order of the lines must not change.  The rotated A1 is
	C parked in T0 until A10 has been consumed by the last ROL.
	C A0 is neither rotated nor moved.
	ROL( T0,  A1,  1)
	ROL( A1,  A6, 44)
	ROL( A6,  A9, 20)
	ROL( A9, A22, 61)
	ROL(A22, A14, 39)
	ROL(A14, A20, 18)
	ROL(A20,  A2, 62)
	ROL( A2, A12, 43)
	ROL(A12, A13, 25)
	ROL(A13, A19,  8)
	ROL(A19, A23, 56)
	ROL(A23, A15, 41)
	ROL(A15,  A4, 27)
	ROL( A4, A24, 14)
	ROL(A24, A21,  2)
	ROL(A21,  A8, 55)
	ROL( A8, A16, 45)
	ROL(A16,  A5, 36)
	ROL( A5,  A3, 28)
	ROL( A3, A18, 21)
	ROL(A18, A17, 15)
	ROL(A17, A11, 10)
	ROL(A11,  A7,  6)
	ROL( A7, A10,  3)
	vmov	A10, T0

	C Row by row: word i ^= (word i+2) & ~(word i+1), indices taken
	C modulo 5 within the row.  All five and-not terms are computed
	C into C0..C4 before any word of the row is updated, so they see
	C the old values.
	C First row, A0..A4.
	vbic	C0, A2, A1
	vbic	C1, A3, A2
	vbic	C2, A4, A3
	vbic	C3, A0, A4
	vbic	C4, A1, A0

	veor	A0, A0, C0
	C C0 is free again: reuse it for this pass's round constant and
	C step RC to the next table entry.
	vld1.64	{C0}, [RC :64]!
	veor	QREG(A1), QREG(A1), QREG(C1)
	veor	QREG(A3), QREG(A3), QREG(C3)
	C Only A0 receives the round constant.
	veor	A0, A0, C0

	C Second row, A5..A9.
	vbic	C0, A7, A6
	vbic	C1, A8, A7
	vbic	C2, A9, A8
	vbic	C3, A5, A9
	vbic	C4, A6, A5

	veor	A5, A5, C0
	veor	QREG(A6), QREG(A6), QREG(C1)
	veor	QREG(A8), QREG(A8), QREG(C3)

	C Third row, A10..A14.
	vbic	C0, A12, A11
	vbic	C1, A13, A12
	vbic	C2, A14, A13
	vbic	C3, A10, A14
	vbic	C4, A11, A10

	veor	A10, A10, C0
	veor	QREG(A11), QREG(A11), QREG(C1)
	veor	QREG(A13), QREG(A13), QREG(C3)

	C Fourth row, A15..A19.
	vbic	C0, A17, A16
	vbic	C1, A18, A17
	vbic	C2, A19, A18
	vbic	C3, A15, A19
	vbic	C4, A16, A15

	veor	A15, A15, C0
	veor	QREG(A16), QREG(A16), QREG(C1)
	veor	QREG(A18), QREG(A18), QREG(C3)

	C Fifth row, A20..A24.
	vbic	C0, A22, A21
	vbic	C1, A23, A22
	vbic	C2, A24, A23
	vbic	C3, A20, A24
	vbic	C4, A21, A20

	C The counter is decremented ahead of the last three xors; those
	C NEON instructions leave the flags alone, so the Z flag set here
	C is still valid at the bne.
	subs	COUNT, COUNT, #1
	veor	A20, A20, C0
	veor	QREG(A21), QREG(A21), QREG(C1)
	veor	QREG(A23), QREG(A23), QREG(C3)

	bne	.Loop

	C Write the state back in the same layout it was loaded from.
	C CTX is not rewound afterwards.
	vst1.64	{A0}, [CTX]!
	vstm	CTX!, {A1,A2,A3,A4}
	vst1.64	{A5}, [CTX]!
	vstm	CTX!, {A6,A7,A8,A9}
	vst1.64	{A10}, [CTX]!
	vstm	CTX!, {A11,A12,A13,A14}
	vst1.64	{A15}, [CTX]!
	vstm	CTX!, {A16,A17,A18,A19}
	vst1.64	{A20}, [CTX]!
	vstm	CTX, {A21,A22,A23,A24}
	
	C Restore the caller's d8-d15 and return.
	vpop	{d8-d15}
	bx	lr
EPILOGUE(nettle_sha3_permute)