Compare revisions
Showing 2869 additions and 0 deletions
C arm/fat/aes-encrypt-internal.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
define(`fat_transform', `$1_arm')
include_src(`arm/aes-encrypt-internal.asm')
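C The fat_transform define above rewrites the symbol that PROLOGUE
C emits (here adding an `_arm' suffix), so this generic-ARM variant can
C be linked next to other variants of the same routine and selected at
C run time.  As a rough, hypothetical C sketch of that kind of runtime
C dispatch (names invented for illustration, not nettle's actual fat
C machinery):
C
C   typedef void crypt_func(void *ctx, size_t length,
C                           uint8_t *dst, const uint8_t *src);
C   extern crypt_func _aes_encrypt_arm, _aes_encrypt_neon;
C   static crypt_func *aes_encrypt_vec;
C
C   static void fat_init(void)
C   {
C     /* probe CPU features once, then call through the pointer */
C     aes_encrypt_vec = cpu_has_neon () ? &_aes_encrypt_neon
C                                       : &_aes_encrypt_arm;
C   }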
C arm/fat/chacha-3core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_fat_chacha_3core) picked up by configure
include_src(`arm/neon/chacha-3core.asm')
C arm/fat/salsa20-2core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_fat_salsa20_2core) picked up by configure
include_src(`arm/neon/salsa20-2core.asm')
C arm/fat/sha1-compress-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_sha1_compress) picked up by configure
define(`fat_transform', `_$1_armv6')
include_src(`arm/v6/sha1-compress.asm')
C arm/fat/sha256-compress-n-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
define(`fat_transform', `$1_armv6')
include_src(`arm/v6/sha256-compress-n.asm')
C arm/fat/sha3-permute-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha3_permute) picked up by configure
define(`fat_transform', `_$1_neon')
include_src(`arm/neon/sha3-permute.asm')
C arm/fat/sha512-compress-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/sha512-compress.asm')
C arm/fat/umac-nh-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_umac_nh) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/umac-nh.asm')
C arm/fat/umac-nh-n-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_umac_nh_n) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/umac-nh-n.asm')
define(`QREG', `ifelse(
$1, d0, q0,
$1, d2, q1,
$1, d4, q2,
$1, d6, q3,
$1, d8, q4,
$1, d10, q5,
$1, d12, q6,
$1, d14, q7,
$1, d16, q8,
$1, d18, q9,
$1, d20, q10,
$1, d22, q11,
$1, d24, q12,
$1, d26, q13,
$1, d28, q14,
$1, d30, q15,
`NO REGISTER')')dnl
define(`D0REG', `ifelse(
$1, q0, d0,
$1, q1, d2,
$1, q2, d4,
$1, q3, d6,
$1, q4, d8,
$1, q5, d10,
$1, q6, d12,
$1, q7, d14,
$1, q8, d16,
$1, q9, d18,
$1, q10, d20,
$1, q11, d22,
$1, q12, d24,
$1, q13, d26,
$1, q14, d28,
$1, q15, d30,
`NO REGISTER')')dnl
define(`D1REG', `ifelse(
$1, q0, d1,
$1, q1, d3,
$1, q2, d5,
$1, q3, d7,
$1, q4, d9,
$1, q5, d11,
$1, q6, d13,
$1, q7, d15,
$1, q8, d17,
$1, q9, d19,
$1, q10, d21,
$1, q11, d23,
$1, q12, d25,
$1, q13, d27,
$1, q14, d29,
$1, q15, d31,
`NO REGISTER')')dnl
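C On 32-bit ARM NEON, quad register qN overlaps the doubleword pair
C d(2N) (low half) and d(2N+1) (high half).  The macros above map
C between the two views, e.g. QREG(d4) expands to q2, while D0REG(q2)
C and D1REG(q2) expand to d4 and d5.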
C arm/memxor.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.
C Register usage:
define(`DST', `r0')
define(`SRC', `r1')
define(`N', `r2')
define(`CNT', `r6')
define(`TNC', `r12')
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(`S0ADJ', IF_LE(`lsr', `lsl'))
define(`S1ADJ', IF_LE(`lsl', `lsr'))
.syntax unified
.file "memxor.asm"
.text
.arm
C memxor(void *dst, const void *src, size_t n)
.align 4
PROLOGUE(nettle_memxor)
cmp N, #0
beq .Lmemxor_done
cmp N, #7
bcs .Lmemxor_large
C Simple byte loop
.Lmemxor_bytes:
ldrb r3, [SRC], #+1
ldrb r12, [DST]
eor r3, r12
strb r3, [DST], #+1
subs N, #1
bne .Lmemxor_bytes
.Lmemxor_done:
bx lr
.Lmemxor_align_loop:
ldrb r3, [SRC], #+1
ldrb r12, [DST]
eor r3, r12
strb r3, [DST], #+1
sub N, #1
.Lmemxor_large:
tst DST, #3
bne .Lmemxor_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands r3, SRC, #3
beq .Lmemxor_same
C Different alignment case.
C v original SRC
C +-------+------+
C |SRC |SRC+4 |
C +---+---+------+
C |DST |
C +-------+
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
C With big-endian, we need to do
C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
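C As a C reference for the word loop below (little-endian case,
C assuming 0 < CNT < 32, TNC = 32 - CNT, and srcw pointing at SRC
C rounded down to word alignment):
C
C   uint32_t prev = srcw[0];
C   for (size_t i = 0; i < nwords; i++)
C     {
C       uint32_t next = srcw[i + 1];
C       dst[i] ^= (prev >> CNT) ^ (next << TNC);
C       prev = next;
C     }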
push {r4,r5,r6}
lsl CNT, r3, #3
bic SRC, #3
rsb TNC, CNT, #32
ldr r4, [SRC], #+4
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor_odd
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r4, S0ADJ CNT
eor r3, r3, r5, S1ADJ TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r5, S0ADJ CNT
eor r3, r3, r4, S1ADJ TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
adds N, #8
beq .Lmemxor_odd_done
C We have TNC/8 left-over bytes in r4, high end on LE and low end on
C BE, excess bits to be discarded by alignment adjustment at the other end.
S0ADJ r4, CNT
C now byte-aligned at low end on LE and high end on BE
ldr r3, [DST]
eor r3, r4
pop {r4,r5,r6}
C Store bytes, one by one.
.Lmemxor_leftover:
C bring uppermost byte down for saving while preserving lower ones
IF_BE(` ror r3, #24')
strb r3, [DST], #+1
subs N, #1
beq .Lmemxor_done
subs TNC, #8
C bring down next byte, no need to preserve
IF_LE(` lsr r3, #8')
bne .Lmemxor_leftover
b .Lmemxor_bytes
.Lmemxor_odd_done:
pop {r4,r5,r6}
bx lr
.Lmemxor_same:
push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
subs N, #8
bcc .Lmemxor_same_end
ldmia SRC!, {r3, r4, r5}
C Keep address for loads in r14
mov r14, DST
ldmia r14!, {r6, r7, r8}
subs N, #12
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
bcc .Lmemxor_same_final_store
subs N, #12
ldmia r14!, {r6, r7, r8}
bcc .Lmemxor_same_wind_down
C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
C loop starts at offset 0x11c in the object file.
.Lmemxor_same_loop:
C r10-r12 contains values to be stored at DST
C r6-r8 contains values read from r14, in advance
ldmia SRC!, {r3, r4, r5}
subs N, #12
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
ldmia r14!, {r6, r7, r8}
bcs .Lmemxor_same_loop
.Lmemxor_same_wind_down:
C Wind down code
ldmia SRC!, {r3, r4, r5}
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
.Lmemxor_same_final_store:
stmia DST!, {r10, r11, r12}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds N, #4
bcc .Lmemxor_same_lt_8
C Do 8 bytes more, leftover is in N
ldmia SRC!, {r3, r4}
ldmia DST, {r6, r7}
eor r3, r6
eor r4, r7
stmia DST!, {r3, r4}
pop {r4,r5,r6,r7,r8,r10,r11,r14}
beq .Lmemxor_done
b .Lmemxor_bytes
.Lmemxor_same_lt_8:
pop {r4,r5,r6,r7,r8,r10,r11,r14}
adds N, #4
bcc .Lmemxor_same_lt_4
ldr r3, [SRC], #+4
ldr r12, [DST]
eor r3, r12
str r3, [DST], #+4
beq .Lmemxor_done
b .Lmemxor_bytes
.Lmemxor_same_lt_4:
adds N, #4
beq .Lmemxor_done
b .Lmemxor_bytes
EPILOGUE(nettle_memxor)
C arm/memxor3.asm
ifelse(`
Copyright (C) 2013, 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.
C Register usage:
define(`DST', `r0')
define(`AP', `r1')
define(`BP', `r2')
define(`N', `r3')
C Temporaries r4-r7
define(`ACNT', `r8')
define(`ATNC', `r10')
define(`BCNT', `r11')
define(`BTNC', `r12')
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(`S0ADJ', IF_LE(`lsr', `lsl'))
define(`S1ADJ', IF_LE(`lsl', `lsr'))
.syntax unified
.file "memxor3.asm"
.text
.arm
C memxor3(void *dst, const void *a, const void *b, size_t n)
.align 2
PROLOGUE(nettle_memxor3)
cmp N, #0
beq .Lmemxor3_ret
push {r4,r5,r6,r7,r8,r10,r11}
cmp N, #7
add AP, N
add BP, N
add DST, N
bcs .Lmemxor3_large
C Simple byte loop
.Lmemxor3_bytes:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r4, r5
strb r4, [DST, #-1]!
subs N, #1
bne .Lmemxor3_bytes
.Lmemxor3_done:
pop {r4,r5,r6,r7,r8,r10,r11}
.Lmemxor3_ret:
bx lr
.Lmemxor3_align_loop:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r5, r4
strb r5, [DST, #-1]!
sub N, #1
.Lmemxor3_large:
tst DST, #3
bne .Lmemxor3_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands ACNT, AP, #3
lsl ACNT, #3
beq .Lmemxor3_a_aligned
ands BCNT, BP, #3
lsl BCNT, #3
bne .Lmemxor3_uu
C Swap
mov r4, AP
mov AP, BP
mov BP, r4
.Lmemxor3_au:
C NOTE: We have the relevant shift count in ACNT, not BCNT
C AP is aligned, BP is not
C v original SRC
C +-------+------+
C |SRC-4 |SRC |
C +---+---+------+
C |DST-4 |
C +-------+
C
C With little-endian, we need to do
C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
C With big-endian, we need to do
C DST[i-1] ^= (SRC[i-1] << CNT) ^ (SRC[i] >> TNC)
rsb ATNC, ACNT, #32
bic BP, #3
ldr r4, [BP]
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_au_odd
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r4, S1ADJ ATNC
eor r6, r6, r5, S0ADJ ACNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r5, S1ADJ ATNC
eor r6, r6, r4, S0ADJ ACNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end on LE and high end on BE before
C preparatory alignment correction
ldr r5, [AP, #-4]
eor r4, r5, r4, S1ADJ ATNC
C now byte-aligned in high end on LE and low end on BE because we're
C working downwards in saving the very first bytes of the buffer
.Lmemxor3_au_leftover:
C Store a byte at a time
C bring uppermost byte down for saving while preserving lower ones
IF_LE(` ror r4, #24')
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs ACNT, #8
C bring down next byte, no need to preserve
IF_BE(` lsr r4, #8')
sub AP, #1
bne .Lmemxor3_au_leftover
b .Lmemxor3_bytes
.Lmemxor3_a_aligned:
ands ACNT, BP, #3
lsl ACNT, #3
bne .Lmemxor3_au
C a, b and dst all have the same alignment.
subs N, #8
bcc .Lmemxor3_aligned_word_end
C This loop runs at 8 cycles per iteration. It has been
C observed running at only 7 cycles, for this speed, the loop
C started at offset 0x2ac in the object file.
C FIXME: consider software pipelining, similarly to the memxor
C loop.
.Lmemxor3_aligned_word_loop:
ldmdb AP!, {r4,r5,r6}
ldmdb BP!, {r7,r8,r10}
subs N, #12
eor r4, r7
eor r5, r8
eor r6, r10
stmdb DST!, {r4, r5,r6}
bcs .Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds N, #4
bcc .Lmemxor3_aligned_lt_8
C Do 8 bytes more, leftover is in N
ldmdb AP!, {r4, r5}
ldmdb BP!, {r6, r7}
eor r4, r6
eor r5, r7
stmdb DST!, {r4,r5}
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_aligned_lt_8:
adds N, #4
bcc .Lmemxor3_aligned_lt_4
ldr r4, [AP,#-4]!
ldr r5, [BP,#-4]!
eor r4, r5
str r4, [DST,#-4]!
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds N, #4
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_uu:
cmp ACNT, BCNT
bic AP, #3
bic BP, #3
rsb ATNC, ACNT, #32
bne .Lmemxor3_uud
C AP and BP are unaligned in the same way
ldr r4, [AP]
ldr r6, [BP]
eor r4, r6
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_uu_odd
.Lmemxor3_uu_loop:
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
S1ADJ r4, ATNC
eor r4, r4, r5, S0ADJ ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
S1ADJ r5, ATNC
eor r5, r5, r4, S0ADJ ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end on LE and high end on BE before
C preparatory alignment correction
IF_LE(` ror r4, ACNT')
IF_BE(` ror r4, ATNC')
C now byte-aligned in high end on LE and low end on BE because we're
C working downwards in saving the very first bytes of the buffer
.Lmemxor3_uu_leftover:
C bring uppermost byte down for saving while preserving lower ones
IF_LE(` ror r4, #24')
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs ACNT, #8
C bring down next byte, no need to preserve
IF_BE(` lsr r4, #8')
bne .Lmemxor3_uu_leftover
b .Lmemxor3_bytes
.Lmemxor3_uud:
C Both AP and BP unaligned, and in different ways
rsb BTNC, BCNT, #32
ldr r4, [AP]
ldr r6, [BP]
tst N, #4
ittet eq
moveq r5, r4
moveq r7, r6
subne N, #4
beq .Lmemxor3_uud_odd
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
S1ADJ r4, ATNC
eor r4, r4, r6, S1ADJ BTNC
eor r4, r4, r5, S0ADJ ACNT
eor r4, r4, r7, S0ADJ BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
S1ADJ r5, ATNC
eor r5, r5, r7, S1ADJ BTNC
eor r5, r5, r4, S0ADJ ACNT
eor r5, r5, r6, S0ADJ BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
adds N, #8
beq .Lmemxor3_done
C FIXME: More clever left-over handling? For now, just adjust pointers.
add AP, AP, ACNT, lsr #3
add BP, BP, BCNT, lsr #3
b .Lmemxor3_bytes
EPILOGUE(nettle_memxor3)
C arm/neon/chacha-3core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "chacha-3core.asm"
.fpu neon
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
define(`SRCp32', `r3')
C State, X, Y and Z representing consecutive blocks
define(`X0', `q0')
define(`X1', `q1')
define(`X2', `q2')
define(`X3', `q3')
define(`Y0', `q8')
define(`Y1', `q9')
define(`Y2', `q10')
define(`Y3', `q11')
define(`Z0', `q12')
define(`Z1', `q13')
define(`Z2', `q14')
define(`Z3', `q15')
define(`T0', `q4')
define(`T1', `q5')
define(`T2', `q6')
define(`T3', `q7')
.text
.align 4
.Lcount1:
.int 1,0,0,0
C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
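C The loop below runs the standard ChaCha quarter-round on whole rows
C of the 4x4 state, for three blocks at once; the vshl/vsri pairs and
C vrev32.16 implement the rotations.  Per 32-bit word, in plain C:
C
C   #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
C   a += b; d ^= a; d = ROTL32(d, 16);
C   c += d; b ^= c; b = ROTL32(b, 12);
C   a += b; d ^= a; d = ROTL32(d, 8);
C   c += d; b ^= c; b = ROTL32(b, 7);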
PROLOGUE(_nettle_chacha_3core)
C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
vld1.32 {Z3}, [r12]
vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
vadd.i64 Z3, Y3, Z3
.Lshared_entry:
vmov Y0, X0
vmov Z0, X0
vmov Y1, X1
vmov Z1, X1
vmov Y2, X2
vmov Z2, X2
C Save initial values for the words including the counters.
vmov T2, Y3
vmov T3, Z3
.Loop:
C Interleave three blocks. Note that with this scheduling,
C only two temporaries, T0 and T1, are needed.
vadd.i32 X0, X0, X1
veor X3, X3, X0
vadd.i32 Y0, Y0, Y1
vrev32.16 X3, X3 C lrot 16
veor Y3, Y3, Y0
vadd.i32 Z0, Z0, Z1
vadd.i32 X2, X2, X3
vrev32.16 Y3, Y3 C lrot 16
veor Z3, Z3, Z0
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vrev32.16 Z3, Z3 C lrot 16
vshl.i32 X1, T0, #12
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #20
vshl.i32 Y1, T1, #12
veor T0, Z1, Z2
vadd.i32 X0, X0, X1
vsri.u32 Y1, T1, #20
vshl.i32 Z1, T0, #12
veor T1, X3, X0
vadd.i32 Y0, Y0, Y1
vsri.u32 Z1, T0, #20
vshl.i32 X3, T1, #8
veor T0, Y3, Y0
vadd.i32 Z0, Z0, Z1
vsri.u32 X3, T1, #24
vshl.i32 Y3, T0, #8
veor T1, Z3, Z0
vadd.i32 X2, X2, X3
vsri.u32 Y3, T0, #24
vext.32 X3, X3, X3, #3
vshl.i32 Z3, T1, #8
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vsri.u32 Z3, T1, #24
vext.32 Y3, Y3, Y3, #3
vshl.i32 X1, T0, #7
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #25
vshl.i32 Y1, T1, #7
veor T0, Z1, Z2
vext.32 X1, X1, X1, #1
vsri.u32 Y1, T1, #25
vshl.i32 Z1, T0, #7
vext.32 Y2, Y2, Y2, #2
vext.32 Y1, Y1, Y1, #1
vsri.u32 Z1, T0, #25
vext.32 X2, X2, X2, #2
C Second QROUND
vadd.i32 X0, X0, X1
vext.32 Z2, Z2, Z2, #2
vext.32 Z1, Z1, Z1, #1
veor X3, X3, X0
vadd.i32 Y0, Y0, Y1
vext.32 Z3, Z3, Z3, #3
vrev32.16 X3, X3 C lrot 16
veor Y3, Y3, Y0
vadd.i32 Z0, Z0, Z1
vadd.i32 X2, X2, X3
vrev32.16 Y3, Y3 C lrot 16
veor Z3, Z3, Z0
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vrev32.16 Z3, Z3 C lrot 16
vshl.i32 X1, T0, #12
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #20
vshl.i32 Y1, T1, #12
veor T0, Z1, Z2
vadd.i32 X0, X0, X1
vsri.u32 Y1, T1, #20
vshl.i32 Z1, T0, #12
veor T1, X3, X0
vadd.i32 Y0, Y0, Y1
vsri.u32 Z1, T0, #20
vshl.i32 X3, T1, #8
veor T0, Y3, Y0
vadd.i32 Z0, Z0, Z1
vsri.u32 X3, T1, #24
vshl.i32 Y3, T0, #8
veor T1, Z3, Z0
vadd.i32 X2, X2, X3
vsri.u32 Y3, T0, #24
vext.32 X3, X3, X3, #1
vshl.i32 Z3, T1, #8
veor T0, X1, X2
vext.32 X2, X2, X2, #2
vadd.i32 Y2, Y2, Y3
vext.32 Y3, Y3, Y3, #1
vsri.u32 Z3, T1, #24
vshl.i32 X1, T0, #7
veor T1, Y1, Y2
vext.32 Y2, Y2, Y2, #2
vadd.i32 Z2, Z2, Z3
vext.32 Z3, Z3, Z3, #1
vsri.u32 X1, T0, #25
vshl.i32 Y1, T1, #7
veor T0, Z1, Z2
vext.32 Z2, Z2, Z2, #2
vext.32 X1, X1, X1, #3
vsri.u32 Y1, T1, #25
vshl.i32 Z1, T0, #7
vext.32 Y1, Y1, Y1, #3
vsri.u32 Z1, T0, #25
subs ROUNDS, ROUNDS, #2
vext.32 Z1, Z1, Z1, #3
bhi .Loop
C Add updated counters
vadd.i32 Y3, Y3, T2
vadd.i32 Z3, Z3, T3
vld1.32 {T0,T1}, [SRC]
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
C vst1.8 because caller expects results little-endian
C interleave loads, calculations and stores to save cycles on stores
C use vstm when little-endian for some additional speedup
IF_BE(` vst1.8 {X0,X1}, [DST]!')
vld1.32 {T2,T3}, [SRCp32]
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
IF_BE(` vst1.8 {X2,X3}, [DST]!')
IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vadd.i32 Y0, Y0, T0
vadd.i32 Y1, Y1, T1
IF_BE(` vst1.8 {Y0,Y1}, [DST]!')
vadd.i32 Y2, Y2, T2
IF_BE(` vst1.8 {Y2,Y3}, [DST]!')
IF_LE(` vstmia DST!, {Y0,Y1,Y2,Y3}')
vadd.i32 Z0, Z0, T0
vadd.i32 Z1, Z1, T1
IF_BE(` vst1.8 {Z0,Z1}, [DST]!')
vadd.i32 Z2, Z2, T2
vpop {q4,q5,q6,q7}
IF_BE(` vst1.8 {Z2,Z3}, [DST]')
IF_LE(` vstm DST, {Z0,Z1,Z2,Z3}')
bx lr
EPILOGUE(_nettle_chacha_3core)
PROLOGUE(_nettle_chacha_3core32)
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
vld1.32 {Z3}, [r12]
vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
vadd.i32 Z3, Y3, Z3
b .Lshared_entry
EPILOGUE(_nettle_chacha_3core32)
C arm/neon/salsa20-2core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "salsa20-2core.asm"
.fpu neon
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
define(`SRCp32', `r3')
C State, even elements in X, odd elements in Y
define(`X0', `q0')
define(`X1', `q1')
define(`X2', `q2')
define(`X3', `q3')
define(`Y0', `q8')
define(`Y1', `q9')
define(`Y2', `q10')
define(`Y3', `q11')
define(`T0', `q12')
define(`T1', `q13')
define(`T2', `q14')
define(`T3', `q15')
.text
.align 4
.Lcount1:
.int 1,0,0,0
C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
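C The main loop below applies the standard Salsa20 quarter-round to
C four state words at a time, for two blocks at once; the vshl/vsri
C pairs implement the rotations by 7, 9, 13 and 18.  On the first
C column, in plain C:
C
C   #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
C   x4  ^= ROTL32(x0  + x12,  7);
C   x8  ^= ROTL32(x4  + x0,   9);
C   x12 ^= ROTL32(x8  + x4,  13);
C   x0  ^= ROTL32(x12 + x8,  18);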
PROLOGUE(_nettle_salsa20_2core)
C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
adr r12, .Lcount1
vmov Y3, X0
vld1.32 {Y1}, [r12]
vmov Y0, X1
vadd.i64 Y1, Y1, X2 C Increment counter
vmov Y2, X3
vtrn.32 X0, Y3 C X0: 0 0 2 2 Y3: 1 1 3 3
vtrn.32 X1, Y0 C X1: 4 4 6 6 Y0: 5 5 7 7
vtrn.32 X2, Y1 C X2: 8 8 10 10 Y1: 9 9 11 11
vtrn.32 X3, Y2 C X3: 12 12 14 14 Y2: 13 13 15 15
C Swap, to get
C X0: 0 10 Y0: 5 15
C X1: 4 14 Y1: 9 3
C X2: 8 2 Y2: 13 7
C X3: 12 6 Y3: 1 11
vswp D1REG(X0), D1REG(X2)
vswp D1REG(X1), D1REG(X3)
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
.Loop:
C Register layout (A is first block, B is second block)
C
C X0: A0 B0 A10 B10 Y0: A5 B5 A15 B15
C X1: A4 B4 A14 B14 Y1: A9 B9 A3 B3
C X2: A8 B8 A2 B2 Y2: A13 B13 A7 B7
C X3: A12 B12 A6 B6 Y3: A1 B1 A11 B11
vadd.i32 T0, X0, X3
vshl.i32 T1, T0, #7
vadd.i32 T2, Y0, Y3
vsri.u32 T1, T0, #25
vshl.i32 T3, T2, #7
veor X1, X1, T1
vsri.u32 T3, T2, #25
vadd.i32 T0, X1, X0
veor Y1, Y1, T3
vshl.i32 T1, T0, #9
vadd.i32 T2, Y1, Y0
vsri.u32 T1, T0, #23
vshl.i32 T3, T2, #9
veor X2, X2, T1
vsri.u32 T3, T2, #23
vadd.i32 T0, X2, X1
veor Y2, Y2, T3
vshl.i32 T1, T0, #13
vadd.i32 T2, Y2, Y1
vsri.u32 T1, T0, #19
vshl.i32 T3, T2, #13
veor X3, X3, T1
vsri.u32 T3, T2, #19
vadd.i32 T0, X3, X2
veor Y3, Y3, T3
vshl.i32 T1, T0, #18
vadd.i32 T2, Y3, Y2
vext.32 Y1, Y1, Y1, #2
vsri.u32 T1, T0, #14
vshl.i32 T3, T2, #18
vext.32 Y2, Y2, Y2, #2
veor X0, X0, T1
vsri.u32 T3, T2, #14
vext.32 X3, X3, X3, #2
veor Y0, Y0, T3
C Register layout:
C X0: A0 B0 A10 B10 Y0: A5 B5 A15 B15
C Y1: A3 B3 A9 B9 X1: A4 B4 A14 B14 (Y1 swapped)
C X2: A2 B2 A8 B8 Y2: A7 B7 A13 B13 (X2, Y2 swapped)
C Y3: A1 B1 A11 B11 X3: A6 B6 A12 B12 (X3 swapped)
vadd.i32 T0, X0, Y1
vext.32 X2, X2, X2, #2
vshl.i32 T1, T0, #7
vadd.i32 T2, Y0, X1
vsri.u32 T1, T0, #25
vshl.i32 T3, T2, #7
veor Y3, Y3, T1
vsri.u32 T3, T2, #25
vadd.i32 T0, Y3, X0
veor X3, X3, T3
vshl.i32 T1, T0, #9
vadd.i32 T2, X3, Y0
vsri.u32 T1, T0, #23
vshl.i32 T3, T2, #9
veor X2, X2, T1
vsri.u32 T3, T2, #23
vadd.i32 T0, X2, Y3
veor Y2, Y2, T3
vshl.i32 T1, T0, #13
vadd.i32 T2, Y2, X3
vsri.u32 T1, T0, #19
vshl.i32 T3, T2, #13
veor Y1, Y1, T1
vsri.u32 T3, T2, #19
vadd.i32 T0, Y1, X2
veor X1, X1, T3
vext.32 X2, X2, X2, #2
vshl.i32 T1, T0, #18
vadd.i32 T2, X1, Y2
vext.32 Y1, Y1, Y1, #2
vsri.u32 T1, T0, #14
subs ROUNDS, ROUNDS, #2
vshl.i32 T3, T2, #18
vext.32 X3, X3, X3, #2
veor X0, X0, T1
vsri.u32 T3, T2, #14
vext.32 Y2, Y2, Y2, #2
veor Y0, Y0, T3
bhi .Loop
C Inverse swaps and transpositions
vswp D1REG(X0), D1REG(X2)
vswp D1REG(X1), D1REG(X3)
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
vld1.32 {T0,T1}, [SRC]
vld1.32 {T2,T3}, [SRCp32]
vtrn.32 X0, Y3
vtrn.32 X1, Y0
vtrn.32 X2, Y1
vtrn.32 X3, Y2
C Add in the original context
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
C vst1.8 because caller expects results little-endian
C interleave loads, calculations and stores to save cycles on stores
C use vstm when little-endian for some additional speedup
IF_BE(` vst1.8 {X0,X1}, [DST]!')
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
IF_BE(` vst1.8 {X2,X3}, [DST]!')
IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vld1.32 {X0}, [r12]
vadd.i32 T0, T0, Y3
vadd.i64 T2, T2, X0
vadd.i32 T1, T1, Y0
IF_BE(` vst1.8 {T0,T1}, [DST]!')
vadd.i32 T2, T2, Y1
vadd.i32 T3, T3, Y2
IF_BE(` vst1.8 {T2,T3}, [DST]')
IF_LE(` vstm DST, {T0,T1,T2,T3}')
bx lr
EPILOGUE(_nettle_salsa20_2core)
C arm/neon/sha3-permute.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha3-permute.asm"
.fpu neon
define(`CTX', `r0')
define(`COUNT', `r1')
define(`RC', `r2')
C First column
define(`A0', `d0')
define(`A5', `d2')
define(`A10', `d3')
define(`A15', `d4')
define(`A20', `d5')
define(`A1', `d6')
define(`A2', `d7')
define(`A3', `d8')
define(`A4', `d9')
define(`A6', `d16')
define(`A7', `d17')
define(`A8', `d18')
define(`A9', `d19')
define(`A11', `d20')
define(`A12', `d21')
define(`A13', `d22')
define(`A14', `d23')
define(`A16', `d24')
define(`A17', `d25')
define(`A18', `d26')
define(`A19', `d27')
define(`A21', `d28')
define(`A22', `d29')
define(`A23', `d30')
define(`A24', `d31')
define(`T0', `d10')
define(`T1', `d11')
define(`C0', `d1')
define(`C1', `d12')
define(`C2', `d13')
define(`C3', `d14')
define(`C4', `d15')
C ROL(DST, SRC, COUNT)
C Must have SRC != DST
define(`ROL', `
vshr.u64 $1, $2, #eval(64-$3)
vsli.i64 $1, $2, #$3
')
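C In C terms, ROL(DST, SRC, COUNT) is a 64-bit rotate left:
C
C   static inline uint64_t rol64(uint64_t x, unsigned n)
C   {
C     /* assumes 0 < n < 64 */
C     return (x << n) | (x >> (64 - n));
C   }
C
C vshr.u64 writes the high bits and vsli.i64 shifts in the low bits
C without clobbering them, hence the SRC != DST requirement.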
C sha3_permute(struct sha3_ctx *ctx)
.text
.align 3
.Lrc:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808A
.quad 0x8000000080008000
.quad 0x000000000000808B
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008A
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000A
.quad 0x000000008000808B
.quad 0x800000000000008B
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800A
.quad 0x800000008000000A
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
PROLOGUE(nettle_sha3_permute)
vpush {d8-d15}
vld1.64 {A0}, [CTX]!
vldm CTX!, {A1,A2,A3,A4}
vld1.64 {A5}, [CTX]!
vldm CTX!, {A6,A7,A8,A9}
vld1.64 {A10}, [CTX]!
vldm CTX!, {A11,A12,A13,A14}
vld1.64 {A15}, [CTX]!
vldm CTX!, {A16,A17,A18,A19}
vld1.64 {A20}, [CTX]!
vldm CTX, {A21,A22,A23,A24}
sub CTX, CTX, #168
mov COUNT, #24
adr RC, .Lrc
.align 3
.Loop:
veor QREG(T0), QREG(A5), QREG(A15)
veor C0, A0, T0
veor C0, C0, T1
veor QREG(C1), QREG(A1), QREG(A6)
veor QREG(C1), QREG(C1), QREG(A11)
veor QREG(C1), QREG(C1), QREG(A16)
veor QREG(C1), QREG(C1), QREG(A21)
veor QREG(C3), QREG(A3), QREG(A8)
veor QREG(C3), QREG(C3), QREG(A13)
veor QREG(C3), QREG(C3), QREG(A18)
veor QREG(C3), QREG(C3), QREG(A23)
C D0 = C4 ^ (C1 <<< 1)
C NOTE: Using ROL macro (and vsli) is slightly slower.
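C This is the theta step of Keccak: with column parities
C C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4], each lane gets
C
C   D[x] = C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1);
C   A[x,y] ^= D[x];
C
C Here D0 is built with explicit vshl/vshr instead of the ROL macro.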
vshl.i64 T0, C1, #1
vshr.u64 T1, C1, #63
veor T0, T0, C4
veor T0, T0, T1
vmov T1, T0
veor A0, A0, T0
veor QREG(A5), QREG(A5), QREG(T0)
veor QREG(A15), QREG(A15), QREG(T0)
C D1 = C0 ^ (C2 <<< 1)
C D2 = C1 ^ (C3 <<< 1)
ROL(T0, C2, 1)
ROL(T1, C3, 1)
veor T0, T0, C0
veor T1, T1, C1
veor QREG(A1), QREG(A1), QREG(T0)
veor QREG(A6), QREG(A6), QREG(T0)
veor QREG(A11), QREG(A11), QREG(T0)
veor QREG(A16), QREG(A16), QREG(T0)
veor QREG(A21), QREG(A21), QREG(T0)
C D3 = C2 ^ (C4 <<< 1)
C D4 = C3 ^ (C0 <<< 1)
ROL(T0, C4, 1)
ROL(T1, C0, 1)
veor T0, T0, C2
veor T1, T1, C3
veor QREG(A3), QREG(A3), QREG(T0)
veor QREG(A8), QREG(A8), QREG(T0)
veor QREG(A13), QREG(A13), QREG(T0)
veor QREG(A18), QREG(A18), QREG(T0)
veor QREG(A23), QREG(A23), QREG(T0)
ROL( T0, A1, 1)
ROL( A1, A6, 44)
ROL( A6, A9, 20)
ROL( A9, A22, 61)
ROL(A22, A14, 39)
ROL(A14, A20, 18)
ROL(A20, A2, 62)
ROL( A2, A12, 43)
ROL(A12, A13, 25)
ROL(A13, A19, 8)
ROL(A19, A23, 56)
ROL(A23, A15, 41)
ROL(A15, A4, 27)
ROL( A4, A24, 14)
ROL(A24, A21, 2)
ROL(A21, A8, 55)
ROL( A8, A16, 45)
ROL(A16, A5, 36)
ROL( A5, A3, 28)
ROL( A3, A18, 21)
ROL(A18, A17, 15)
ROL(A17, A11, 10)
ROL(A11, A7, 6)
ROL( A7, A10, 3)
C New A10 value left in T0
vbic C0, A2, A1
vbic C1, A3, A2
vbic C2, A4, A3
vbic C3, A0, A4
vbic C4, A1, A0
veor A0, A0, C0
vld1.64 {C0}, [RC :64]!
veor QREG(A1), QREG(A1), QREG(C1)
veor QREG(A3), QREG(A3), QREG(C3)
veor A0, A0, C0
vbic C0, A7, A6
vbic C1, A8, A7
vbic C2, A9, A8
vbic C3, A5, A9
vbic C4, A6, A5
veor A5, A5, C0
veor QREG(A6), QREG(A6), QREG(C1)
veor QREG(A8), QREG(A8), QREG(C3)
vbic C0, A12, A11
vbic C1, A13, A12
vbic C2, A14, A13
vbic C3, T0, A14
vbic C4, A11, T0
veor A10, T0, C0
veor QREG(A11), QREG(A11), QREG(C1)
veor QREG(A13), QREG(A13), QREG(C3)
vbic C0, A17, A16
vbic C1, A18, A17
vbic C2, A19, A18
vbic C3, A15, A19
vbic C4, A16, A15
veor A15, A15, C0
veor QREG(A16), QREG(A16), QREG(C1)
veor QREG(A18), QREG(A18), QREG(C3)
vbic C0, A22, A21
vbic C1, A23, A22
vbic C2, A24, A23
vbic C3, A20, A24
vbic C4, A21, A20
subs COUNT, COUNT, #1
veor A20, A20, C0
veor QREG(A21), QREG(A21), QREG(C1)
veor QREG(A23), QREG(A23), QREG(C3)
bne .Loop
vst1.64 {A0}, [CTX]!
vstm CTX!, {A1,A2,A3,A4}
vst1.64 {A5}, [CTX]!
vstm CTX!, {A6,A7,A8,A9}
vst1.64 {A10}, [CTX]!
vstm CTX!, {A11,A12,A13,A14}
vst1.64 {A15}, [CTX]!
vstm CTX!, {A16,A17,A18,A19}
vst1.64 {A20}, [CTX]!
vstm CTX, {A21,A22,A23,A24}
vpop {d8-d15}
bx lr
EPILOGUE(nettle_sha3_permute)
C arm/neon/sha512-compress.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha512-compress.asm"
.fpu neon
define(`STATE', `r0')
define(`INPUT', `r1')
define(`K', `r2')
define(`COUNT', `r3')
define(`SHIFT', `r12')
define(`SA', `d0')
define(`SB', `d1')
define(`SC', `d2')
define(`SD', `d3')
define(`SE', `d4')
define(`SF', `d5')
define(`SG', `d6')
define(`SH', `d7')
define(`QSAB', `q0')
define(`QSCD', `q1')
define(`QSEF', `q2')
define(`QSGH', `q3')
C d8-d15 are callee-save
define(`DT0', `d8')
define(`DT1', `d9')
define(`QT01', `q4')
define(`DT2', `d10')
define(`DT3', `d11')
define(`QT23', `q5')
define(`DT4', `d12')
define(`DT5', `d13')
define(`QT45', `q6')
C Used only when reading the input, can overlap with state
define(`DT6', `d0')
define(`DT7', `d1')
define(`QT67', `q0')
define(`DW0', `d16')
define(`DW1', `d17')
define(`DW2', `d18')
define(`DW3', `d19')
define(`DW4', `d20')
define(`DW5', `d21')
define(`DW6', `d22')
define(`DW7', `d23')
define(`DW8', `d24')
define(`DW9', `d25')
define(`DW10', `d26')
define(`DW11', `d27')
define(`DW12', `d28')
define(`DW13', `d29')
define(`DW14', `d30')
define(`DW15', `d31')
define(`QW0001', `q8')
define(`QW0203', `q9')
define(`QW0405', `q10')
define(`QW0607', `q11')
define(`QW0809', `q12')
define(`QW1011', `q13')
define(`QW1213', `q14')
define(`QW1415', `q15')
define(`EXPAND_ME', `$1')
define(`W', `EXPAND_ME(`DW'eval(($1) % 16))')
C If x = W(i+14), y = W(i+1), we xor in parallel
C
C x << 45 y << 63
C x >> 19 y >> 1
C x << 3 y << 56
C x >> 61 y >> 8
C xor x >> 6 y >> 7
C -----------------------------
C DT0 DT1
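C In C, with the shift pairs above folded into rotates, this is the
C usual SHA-512 schedule step on a 16-word circular buffer:
C
C   #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
C   uint64_t x = w[(i + 14) % 16], y = w[(i + 1) % 16];
C   uint64_t s1 = ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6);
C   uint64_t s0 = ROTR64(y, 1) ^ ROTR64(y, 8) ^ (y >> 7);
C   w[i % 16] += w[(i + 9) % 16] + s0 + s1;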
define(`EXPN', `
vshl.i64 DT0, W($1+14), #45
vshl.i64 DT1, W($1 + 1), #63
vshr.u64 DT2, W($1+14), #19
vshr.u64 DT3, W($1 + 1), #1
vshl.i64 DT4, W($1+14), #3
vshl.i64 DT5, W($1 + 1), #56
veor.i64 QT01, QT01, QT23
vshr.u64 DT2, W($1+14), #61
vshr.u64 DT3, W($1 + 1), #8
veor.i64 QT01, QT01, QT45
vshr.u64 DT4, W($1+14), #6
vshr.u64 DT5, W($1 + 1), #7
veor.i64 QT01, QT01, QT23
vadd.i64 W($1), W($1), W($1 + 9)
veor.i64 QT01, QT01, QT45
vadd.i64 W($1), W($1), DT0
vadd.i64 W($1), W($1), DT1
')
C ROUND(A,B,C,D,E,F,G,H,i)
C
C H += S1(E) + Choice(E,F,G) + K + W
C D += H
C H += S0(A) + Majority(A,B,C)
C
C Where
C
C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
C Choice (E, F, G) = G^(E&(F^G))
C Majority (A,B,C) = (A&B) + (C&(A^B))
C Do S1 and S0 in parallel
C
C e << 50 a << 36
C e >> 14 a >> 28
C e << 46 a << 30
C e >> 18 a >> 34
C e << 23 a << 25
C xor e >> 41 a >> 39
C ----------------------------
C DT0 DT1
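C One round in plain C, with the same rotating assignment of the
C variables a-h that the ROUND invocations below use:
C
C   #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
C   uint64_t S1 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41);
C   uint64_t S0 = ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39);
C   uint64_t ch = g ^ (e & (f ^ g));
C   uint64_t maj = (a & b) + (c & (a ^ b));  /* terms never overlap */
C   h += S1 + ch + k[i] + w[i % 16];
C   d += h;
C   h += S0 + maj;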
define(`ROUND', `
vshl.i64 DT0, $5, #50
vshl.i64 DT1, $1, #36
vshr.u64 DT2, $5, #14
vshr.u64 DT3, $1, #28
vshl.i64 DT4, $5, #46
vshl.i64 DT5, $1, #30
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #18
vshr.u64 DT3, $1, #34
veor QT01, QT01, QT45
vshl.i64 DT4, $5, #23
vshl.i64 DT5, $1, #25
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #41
vshr.u64 DT3, $1, #39
veor QT01, QT01, QT45
veor DT4, $6, $7
veor DT5, $1, $2
vand DT4, DT4, $5
vand DT5, DT5, $3
veor DT4, DT4, $7
veor QT01, QT01, QT23
vand DT2, $1, $2
vldr DT3, [K,#eval(8*$9)]
vadd.i64 $8, $8, W($9)
vadd.i64 QT01, QT01, QT45
vadd.i64 $8, $8, DT3
vadd.i64 $8, $8, DT0
vadd.i64 DT1, DT1, DT2
vadd.i64 $4, $4, $8
vadd.i64 $8, $8, DT1
')
C void
C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
.text
.align 2
PROLOGUE(_nettle_sha512_compress)
vpush {d8,d9,d10,d11,d12,d13}
ands SHIFT, INPUT, #7
and INPUT, INPUT, #-8
vld1.8 {DT5}, [INPUT :64]
addne INPUT, INPUT, #8
addeq SHIFT, SHIFT, #8
lsl SHIFT, SHIFT, #3
C Put right shift in DT0 and DT1, aka QT01
neg SHIFT, SHIFT
vmov.i32 DT0, #0
vmov.32 DT0[0], SHIFT
vmov DT1, DT0
C Put left shift in DT2 and DT3, aka QT23
add SHIFT, SHIFT, #64
vmov.i32 DT2, #0
vmov.32 DT2[0], SHIFT
vmov DT3, DT2
vshl.u64 DT5, DT5, DT0
C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]!
vshl.u64 QT67, QW0001, QT01 C Right shift
vshl.u64 QW0001, QW0001, QT23 C Left shift
veor W(0), W(0), DT5
veor W(1), W(1), DT6
vrev64.8 QW0001, QW0001
vshl.u64 QT45, QW0203, QT01 C Right shift
vshl.u64 QW0203, QW0203, QT23 C Left shift
veor W(2), W(2), DT7
veor W(3), W(3), DT4
vrev64.8 QW0203, QW0203
vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]!
vshl.u64 QT67, QW0405, QT01 C Right shift
vshl.u64 QW0405, QW0405, QT23 C Left shift
veor W(4), W(4), DT5
veor W(5), W(5), DT6
vrev64.8 QW0405, QW0405
vshl.u64 QT45, QW0607, QT01 C Right shift
vshl.u64 QW0607, QW0607, QT23 C Left shift
veor W(6), W(6), DT7
veor W(7), W(7), DT4
vrev64.8 QW0607, QW0607
vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]!
vshl.u64 QT67, QW0809, QT01 C Right shift
vshl.u64 QW0809, QW0809, QT23 C Left shift
veor W(8), W(8), DT5
veor W(9), W(9), DT6
vrev64.8 QW0809, QW0809
vshl.u64 QT45, QW1011, QT01 C Right shift
vshl.u64 QW1011, QW1011, QT23 C Left shift
veor W(10), W(10), DT7
veor W(11), W(11), DT4
vrev64.8 QW1011, QW1011
vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]!
vshl.u64 QT67, QW1213, QT01 C Right shift
vshl.u64 QW1213, QW1213, QT23 C Left shift
veor W(12), W(12), DT5
veor W(13), W(13), DT6
vrev64.8 QW1213, QW1213
vshl.u64 QT45, QW1415, QT01 C Right shift
vshl.u64 QW1415, QW1415, QT23 C Left shift
veor W(14), W(14), DT7
veor W(15), W(15), DT4
vrev64.8 QW1415, QW1415
vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
add K, K, #128
mov COUNT, #4
.Loop:
EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
subs COUNT, COUNT, #1
EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
add K, K, #128
bne .Loop
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSAB, QSAB, QW0001
vadd.i64 QSCD, QSCD, QW0203
vst1.64 {SA,SB,SC,SD}, [STATE]!
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSEF, QSEF, QW0001
vadd.i64 QSGH, QSGH, QW0203
vst1.64 {SE,SF,SG,SH}, [STATE]!
vpop {d8,d9,d10,d11,d12,d13}
bx lr
EPILOGUE(_nettle_sha512_compress)
divert(-1)
define shastate
p/x $d0.u64
p/x $d1.u64
p/x $d2.u64
p/x $d3.u64
p/x $d4.u64
p/x $d5.u64
p/x $d6.u64
p/x $d7.u64
end
C arm/neon/umac-nh-n.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "umac-nh-n.asm"
.fpu neon
define(`OUT', `r0')
define(`ITERS', `r1')
define(`KEY', `r2')
define(`LENGTH', `r3')
define(`MSG', `r12')
define(`SHIFT', `r14')
define(`QA', `q0')
define(`QB', `q1')
define(`QY0', `q3') C Accumulates for the first two iterations.
define(`DM', `d4')
define(`QY1', `q4') C Used for 3 and 4 iterations.
define(`QC', `q5')
define(`QD', `q6')
define(`QLEFT', `q8')
define(`QRIGHT', `q9')
define(`QT0', `q10')
define(`QT1', `q11')
define(`QT2', `q12')
define(`QK0', `q13')
define(`QK1', `q14')
define(`QK2', `q15')
C FIXME: Try permuting subkeys using vld4, vzip or similar.
.text
.align 3
PROLOGUE(_nettle_umac_nh_n)
ldr MSG, [sp]
str lr, [sp, #-4]!
C Setup for 64-bit aligned reads
ands SHIFT, MSG, #7
and MSG, MSG, #-8
vld1.8 {DM}, [MSG :64]
addne MSG, MSG, #8
addeq SHIFT, SHIFT, #8
C FIXME: Combine as rsb ?
lsl SHIFT, SHIFT, #3
neg SHIFT, SHIFT
C Right shift in QRIGHT (both halves)
vmov.i32 D0REG(QRIGHT)[0], SHIFT
vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
add SHIFT, SHIFT, #64
vmov.i32 D0REG(QLEFT)[0], SHIFT
vmov.32 D1REG(QLEFT), D0REG(QLEFT)
cmp r1, #3
vmov.i64 QY0, #0
vshl.u64 DM, DM, D0REG(QRIGHT)
bcc .Lnh2
beq .Lnh3
.Lnh4:
C Permute key words, so we in each iteration have them in order
C
C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
C
C Also arrange the message words, so we get them as
C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
C
C Then, accumulate Y0 (first two "iters") using
C
C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
C
C Next iteration is then
C
C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
C
C So we can reuse P4, P5, P6, P7 from the previous iteration.
C How to fit this in registers? We need 4 Q regs for P0-P3, and one
C more for the last read key. We need at least two registers
C for the message (QA and QB, more if we want to expand only
C once). For the Y0 update, we can let the factors overwrite
C P0-P3, and for the Y1 update, we can overwrite M0-M3.
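C As a scalar C reference, the vmlal accumulation below computes UMAC
C NH with four Toeplitz iterations, the key offset by 4 words per
C iteration (the key array is 12 words longer than the message):
C
C   uint64_t y[4] = {0, 0, 0, 0};
C   for (size_t i = 0; i < msg_words; i += 8)
C     for (unsigned j = 0; j < 4; j++)
C       for (unsigned k = 0; k < 4; k++)
C         y[j] += (uint64_t) (uint32_t) (msg[i+k] + key[i + 4*j + k])
C               * (uint32_t) (msg[i+k+4] + key[i + 4*j + k + 4]);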
vpush {q4,q5,q6}
vld1.32 {QK0,QK1}, [KEY]!
vld1.32 {QK2}, [KEY]!
vmov QT0, QK1
vmov QT1, QK2
C Permute keys. QK2 is untouched, permuted subkeys are put in QK0,QK1,QT0,QT1
vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11]
vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11]
vmov.i64 QY1, #0
.Loop4:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QC, QA, QRIGHT
vshl.u64 QD, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QC)
veor D0REG(QB), D0REG(QB), D1REG(QC)
veor D1REG(QB), D1REG(QB), D0REG(QD)
vmov DM, D1REG(QD)
C Explode message (too bad there's no vadd with scalar)
vdup.32 D1REG(QD), D1REG(QB)[1]
vdup.32 D0REG(QD), D1REG(QB)[0]
vdup.32 D1REG(QC), D0REG(QB)[1]
vdup.32 D0REG(QC), D0REG(QB)[0]
vdup.32 D1REG(QB), D1REG(QA)[1]
vdup.32 D0REG(QB), D1REG(QA)[0]
vdup.32 D1REG(QA), D0REG(QA)[1]
vdup.32 D0REG(QA), D0REG(QA)[0]
vadd.i32 QK0, QK0, QA
vadd.i32 QK1, QK1, QB
vadd.i32 QT0, QT0, QC
vadd.i32 QT1, QT1, QD
vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
C Next 4 subkeys
vld1.32 {QT0,QT1}, [KEY]!
vmov QK0, QK2
vmov QK1, QT0
vmov QK2, QT1 C Save
vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15]
vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15]
vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19]
vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19]
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
vadd.i32 QC, QC, QT0
vadd.i32 QD, QD, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QC)
vmlal.u32 QY1, D1REG(QA), D1REG(QC)
vmlal.u32 QY1, D0REG(QB), D0REG(QD)
vmlal.u32 QY1, D1REG(QB), D1REG(QD)
bhi .Loop4
vst1.64 {QY0, QY1}, [OUT]
vpop {q4,q5,q6}
ldr pc, [sp], #+4
.Lnh3:
vpush {q4}
vld1.32 {QK0,QK1}, [KEY]!
vmov.i64 QY1, #0
.Loop3:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
vld1.32 {QK1}, [KEY]!
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QB)
vmlal.u32 QY1, D1REG(QA), D1REG(QB)
bhi .Loop3
vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
vpop {q4}
ldr pc, [sp], #+4
.Lnh2:
vld1.32 {QK0}, [KEY]!
.Loop2:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK1,QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
bhi .Loop2
vst1.64 {QY0}, [OUT]
.Lend:
ldr pc, [sp], #+4
EPILOGUE(_nettle_umac_nh_n)
C arm/neon/umac-nh.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "umac-nh.asm"
.fpu neon
define(`KEY', `r0')
define(`LENGTH', `r1')
define(`MSG', `r2')
define(`SHIFT', `r3')
define(`QA', `q0')
define(`QB', `q1')
define(`DM', `d16')
define(`QLEFT', `q9')
define(`QRIGHT', `q10')
define(`QY', `q11')
define(`QT0', `q12')
define(`QT1', `q13')
define(`QK0', `q14')
define(`QK1', `q15')
.text
.align 3
PROLOGUE(_nettle_umac_nh)
C Setup for 64-bit aligned reads
ands SHIFT, MSG, #7
and MSG, MSG, #-8
vld1.8 {DM}, [MSG :64]
addne MSG, MSG, #8
addeq SHIFT, SHIFT, #8
C FIXME: Combine as rsb ?
lsl SHIFT, SHIFT, #3
neg SHIFT, SHIFT
C Right shift in QRIGHT (both halves)
vmov.i32 D0REG(QRIGHT)[0], SHIFT
vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
add SHIFT, SHIFT, #64
vmov.i32 D0REG(QLEFT)[0], SHIFT
vmov.32 D1REG(QLEFT), D0REG(QLEFT)
vmov.i64 QY, #0
vshl.u64 DM, DM, D0REG(QRIGHT)
.Loop:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
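C In C, for a pointer misaligned by a = msg & 7, with p the 8-byte
C aligned base, rshift = 8*a and lshift = 64 - rshift, this is roughly
C (little-endian, 0 < rshift < 64; the already-aligned case is handled
C by the addeq/addne setup above):
C
C   uint64_t prev = p[0] >> rshift;
C   for (size_t i = 0; i < n; i++)
C     {
C       uint64_t cur = p[i + 1];
C       m[i] = prev ^ (cur << lshift);
C       prev = cur >> rshift;
C     }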
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.i32 {QK0, QK1}, [KEY]!
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
subs LENGTH, LENGTH, #32
vmlal.u32 QY, D0REG(QA), D0REG(QB)
vmlal.u32 QY, D1REG(QA), D1REG(QB)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
C The return value must respect the word order mandated by the AAPCS
IF_LE(` vmov r0, r1, D0REG(QY)')
IF_BE(` vmov r1, r0, D0REG(QY)')
bx lr
EPILOGUE(_nettle_umac_nh)
C arm/v6/aes-decrypt-internal.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.arch armv6
include_src(`arm/aes.m4')
define(`PARAM_ROUNDS', `r0')
define(`PARAM_KEYS', `r1')
define(`TABLE', `r2')
define(`LENGTH', `r3')
C On stack: DST, SRC
define(`W0', `r4')
define(`W1', `r5')
define(`W2', `r6')
define(`W3', `r7')
define(`T0', `r8')
define(`COUNT', `r10')
define(`KEY', `r11')
define(`X0', `r0') C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(`X1', `r1')
define(`X2', `r12')
define(`X3', `r14') C lr
define(`FRAME_ROUNDS', `[sp]')
define(`FRAME_KEYS', `[sp, #+4]')
C 8 saved registers
define(`FRAME_DST', `[sp, #+40]')
define(`FRAME_SRC', `[sp, #+44]')
define(`SRC', `r12') C Overlap registers used in inner loop.
define(`DST', `COUNT')
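C Stack frame after the initial push {r0,r1, r4-r8,r10,r11,lr}:
C rounds and keys are saved at [sp] and [sp, #+4] so they can be
C reloaded for each block, while the dst and src arguments from the
C caller end up at [sp, #+40] and [sp, #+44] (40 bytes of saved
C registers in between).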
C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(`AES_DECRYPT_ROUND', `
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $4, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $2, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
sub $9, $9, #16
eor $7, $7, $3
eor $8, $8, $4
')
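C As a rough C sketch of one invocation (byte(w,n) being a
C hypothetical helper returning byte n of the 32-bit word w):
C
C   w0 = T0[byte(x0,0)] ^ T1[byte(x3,1)] ^ T2[byte(x2,2)] ^ T3[byte(x1,3)] ^ k[0];
C   w1 = T0[byte(x1,0)] ^ T1[byte(x0,1)] ^ T2[byte(x3,2)] ^ T3[byte(x2,3)] ^ k[1];
C   w2 = T0[byte(x2,0)] ^ T1[byte(x1,1)] ^ T2[byte(x0,2)] ^ T3[byte(x3,3)] ^ k[2];
C   w3 = T0[byte(x3,0)] ^ T1[byte(x2,1)] ^ T2[byte(x1,2)] ^ T3[byte(x0,3)] ^ k[3];
C
C where T0..T3 are the four 1 KB tables at TABLE, TABLE+1024, ...,
C and k[] is the round key read from the key pointer argument, which
C is then stepped back by 16 bytes for the next round key.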
.file "aes-decrypt-internal.asm"
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD_INCR(SRC,KEY,W3, -28)
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_decrypt)
C arm/v6/aes-encrypt-internal.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.arch armv6
include_src(`arm/aes.m4')
C Benchmarked at 706, 870, 963 cycles/block on Cortex-A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
define(`PARAM_ROUNDS', `r0')
define(`PARAM_KEYS', `r1')
define(`TABLE', `r2')
define(`LENGTH', `r3')
C On stack: DST, SRC
define(`W0', `r4')
define(`W1', `r5')
define(`W2', `r6')
define(`W3', `r7')
define(`T0', `r8')
define(`COUNT', `r10')
define(`KEY', `r11')
define(`X0', `r0') C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(`X1', `r1')
define(`X2', `r12')
define(`X3', `r14') C lr
define(`FRAME_ROUNDS', `[sp]')
define(`FRAME_KEYS', `[sp, #+4]')
C 8 saved registers
define(`FRAME_DST', `[sp, #+40]')
define(`FRAME_SRC', `[sp, #+44]')
define(`SRC', `r12') C Overlap registers used in inner loop.
define(`DST', `COUNT')
C 53 instr.
C It's tempting to use eor with rotation, but that's slower.
C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(`AES_ENCRYPT_ROUND', `
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $2, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $4, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9!, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
eor $7, $7, $3
eor $8, $8, $4
')
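C As a rough C sketch of one invocation (byte(w,n) being a
C hypothetical helper returning byte n of the 32-bit word w):
C
C   w0 = T0[byte(x0,0)] ^ T1[byte(x1,1)] ^ T2[byte(x2,2)] ^ T3[byte(x3,3)] ^ k[0];
C   w1 = T0[byte(x1,0)] ^ T1[byte(x2,1)] ^ T2[byte(x3,2)] ^ T3[byte(x0,3)] ^ k[1];
C   w2 = T0[byte(x2,0)] ^ T1[byte(x3,1)] ^ T2[byte(x0,2)] ^ T3[byte(x1,3)] ^ k[2];
C   w3 = T0[byte(x3,0)] ^ T1[byte(x0,1)] ^ T2[byte(x1,2)] ^ T3[byte(x2,3)] ^ k[3];
C
C where T0..T3 are the four 1 KB tables at TABLE, TABLE+1024, ...,
C and k[] is the round key loaded from the key pointer argument,
C which advances by 16 bytes (ldm with writeback).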
.file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_encrypt)