From db9b8594e4caa5459483359567fd077025a0cb65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 3 Jan 2018 19:00:19 +0100
Subject: [PATCH] Unroll x86_64 aesni loops.

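Read all subkeys into xmm registers (KEY0-KEY13, KEYLAST) once, before
the block loop, and replace the inner round loop by a fully unrolled
instruction sequence. AES uses 10, 12 or 14 rounds depending on key
size, so both the key setup and the block loop branch past the
instructions that 128-bit and 192-bit keys do not need. Using the
aesenc/aesdec mnemonics directly also makes the hand-assembled .byte
sequences unnecessary. On W64 the wider register usage means more xmm
registers to save and restore, hence W64_ENTRY(6, 16).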
---
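Not part of the patch, for review only: a C intrinsics sketch of the
same technique (all subkeys loaded into registers before the block
loop, round loop fully unrolled), simplified to the 10-round AES-128
case. The function name and signature are made up for illustration,
not Nettle API; compile with -maes.

#include <stddef.h>
#include <stdint.h>
#include <wmmintrin.h>	/* AES-NI intrinsics */

void
aes128_encrypt_blocks (const uint8_t subkeys[11*16],
		       size_t length, uint8_t *dst, const uint8_t *src)
{
  /* Load all 11 subkeys into registers, outside the block loop. */
  __m128i k0  = _mm_loadu_si128 ((const __m128i *) (subkeys +   0));
  __m128i k1  = _mm_loadu_si128 ((const __m128i *) (subkeys +  16));
  __m128i k2  = _mm_loadu_si128 ((const __m128i *) (subkeys +  32));
  __m128i k3  = _mm_loadu_si128 ((const __m128i *) (subkeys +  48));
  __m128i k4  = _mm_loadu_si128 ((const __m128i *) (subkeys +  64));
  __m128i k5  = _mm_loadu_si128 ((const __m128i *) (subkeys +  80));
  __m128i k6  = _mm_loadu_si128 ((const __m128i *) (subkeys +  96));
  __m128i k7  = _mm_loadu_si128 ((const __m128i *) (subkeys + 112));
  __m128i k8  = _mm_loadu_si128 ((const __m128i *) (subkeys + 128));
  __m128i k9  = _mm_loadu_si128 ((const __m128i *) (subkeys + 144));
  __m128i k10 = _mm_loadu_si128 ((const __m128i *) (subkeys + 160));

  for (; length >= 16; length -= 16, src += 16, dst += 16)
    {
      __m128i block = _mm_loadu_si128 ((const __m128i *) src);
      block = _mm_xor_si128 (block, k0);	/* initial key addition */
      block = _mm_aesenc_si128 (block, k1);	/* rounds 1-9, unrolled */
      block = _mm_aesenc_si128 (block, k2);
      block = _mm_aesenc_si128 (block, k3);
      block = _mm_aesenc_si128 (block, k4);
      block = _mm_aesenc_si128 (block, k5);
      block = _mm_aesenc_si128 (block, k6);
      block = _mm_aesenc_si128 (block, k7);
      block = _mm_aesenc_si128 (block, k8);
      block = _mm_aesenc_si128 (block, k9);
      block = _mm_aesenclast_si128 (block, k10);	/* final round */
      _mm_storeu_si128 ((__m128i *) dst, block);
    }
}

The 12- and 14-round cases would add further _mm_aesenc_si128 steps;
the assembly below handles them by branching past the unused
instructions rather than providing separate functions.
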
 ChangeLog                             |   7 ++
 x86_64/aesni/aes-decrypt-internal.asm | 106 +++++++++++++++++---------
 x86_64/aesni/aes-encrypt-internal.asm | 106 +++++++++++++++++---------
 3 files changed, 147 insertions(+), 72 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 231cb8ce..24f0cc1a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2018-01-03  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/aesni/aes-encrypt-internal.asm: Read subkeys into xmm
+	registers before the block loop, and completely unroll the round
+	loop.
+	* x86_64/aesni/aes-decrypt-internal.asm: Likewise.
+
 2017-11-19  Niels Möller  <nisse@lysator.liu.se>
 
 	* Released nettle-3.4.
diff --git a/x86_64/aesni/aes-decrypt-internal.asm b/x86_64/aesni/aes-decrypt-internal.asm
index 412e8d31..3d6d6e30 100644
--- a/x86_64/aesni/aes-decrypt-internal.asm
+++ b/x86_64/aesni/aes-decrypt-internal.asm
@@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm
 
 
 ifelse(<
-   Copyright (C) 2015 Niels Möller
+   Copyright (C) 2015, 2018 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>)
 define(<DST>,	<%r8>)
 define(<SRC>,	<%r9>)
 
-C Round counter
-define(<CNT>, <%rdx>)
-C Subkey pointer
-define(<KEY>, <%rax>)
-
-dnl aesdec %xmm1, %xmm0
-define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>)
-dnl aesdeclast %xmm1, %xmm0
-define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
+define(<KEY0>, <%xmm0>)
+define(<KEY1>, <%xmm1>)
+define(<KEY2>, <%xmm2>)
+define(<KEY3>, <%xmm3>)
+define(<KEY4>, <%xmm4>)
+define(<KEY5>, <%xmm5>)
+define(<KEY6>, <%xmm6>)
+define(<KEY7>, <%xmm7>)
+define(<KEY8>, <%xmm8>)
+define(<KEY9>, <%xmm9>)
+define(<KEY10>, <%xmm10>)
+define(<KEY11>, <%xmm11>)
+define(<KEY12>, <%xmm12>)
+define(<KEY13>, <%xmm13>)
+define(<KEYLAST>, <%xmm14>)
+define(<BLOCK>, <%xmm15>)
 
 	.file "aes-decrypt-internal.asm"
 
@@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
 	.text
 	ALIGN(16)
 PROLOGUE(_nettle_aes_decrypt)
-	W64_ENTRY(6, 2)
+	W64_ENTRY(6, 16)
 	shr	$4, LENGTH
 	test	LENGTH, LENGTH
 	jz	.Lend
 
-	decl	XREG(ROUNDS)
+	movups	(KEYS), KEY0
+	movups	16(KEYS), KEY1
+	movups	32(KEYS), KEY2
+	movups	48(KEYS), KEY3
+	movups	64(KEYS), KEY4
+	movups	80(KEYS), KEY5
+	movups	96(KEYS), KEY6
+	movups	112(KEYS), KEY7
+	movups	128(KEYS), KEY8
+	movups	144(KEYS), KEY9
+	lea	160(KEYS), KEYS
+	sub	$10, XREG(ROUNDS)	C Also clears high half
+	je	.Lkey_last	C 10 rounds (AES-128)
+
+	movups	(KEYS), KEY10
+	movups	16(KEYS), KEY11
+	lea	(KEYS, ROUNDS, 8), KEYS	C Point KEYS at the last subkey; two
+	lea	(KEYS, ROUNDS, 8), KEYS	C steps, since lea scales by at most 8
+
+	cmpl	$2, XREG(ROUNDS)	C 12 rounds (AES-192)?
+	je	.Lkey_last
+	movups	-32(KEYS), KEY12
+	movups	-16(KEYS), KEY13
+
+.Lkey_last:
+	movups	(KEYS), KEYLAST
 
 .Lblock_loop:
-	mov	ROUNDS, CNT
-	mov	KEYS, KEY
-	movups	(SRC), %xmm0
-	C FIXME: Better alignment of subkeys, so we can use movaps.
-	movups	(KEY), %xmm1
-	pxor	%xmm1, %xmm0
-
-	C FIXME: Could use some unrolling. Also all subkeys fit in
-	C registers, so they could be loaded once (on W64 we would
-	C need to save and restore some xmm registers, though).
-
-.Lround_loop:
-	add	$16, KEY
-
-	movups	(KEY), %xmm1
-	AESDEC	C %xmm1, %xmm0
-	decl	XREG(CNT)
-	jnz	.Lround_loop
-
-	movups	16(KEY), %xmm1
-	AESDECLAST	C %xmm1, %xmm0
-
-	movups	%xmm0, (DST)
+	movups	(SRC), BLOCK
+	pxor	KEY0, BLOCK
+	aesdec	KEY1, BLOCK
+	aesdec	KEY2, BLOCK
+	aesdec	KEY3, BLOCK
+	aesdec	KEY4, BLOCK
+	aesdec	KEY5, BLOCK
+	aesdec	KEY6, BLOCK
+	aesdec	KEY7, BLOCK
+	aesdec	KEY8, BLOCK
+	aesdec	KEY9, BLOCK
+	testl	XREG(ROUNDS), XREG(ROUNDS)	C 10 rounds?
+	je	.Lblock_end
+	aesdec	KEY10, BLOCK
+	aesdec	KEY11, BLOCK
+	cmpl	$2, XREG(ROUNDS)	C 12 rounds?
+	je	.Lblock_end
+
+	aesdec	KEY12, BLOCK
+	aesdec	KEY13, BLOCK
+
+.Lblock_end:
+	aesdeclast KEYLAST, BLOCK
+
+	movups	BLOCK, (DST)
 	add	$16, SRC
 	add	$16, DST
 	dec	LENGTH
 	jnz	.Lblock_loop
 
 .Lend:
-	W64_EXIT(6, 2)
+	W64_EXIT(6, 16)
 	ret
 EPILOGUE(_nettle_aes_decrypt)
diff --git a/x86_64/aesni/aes-encrypt-internal.asm b/x86_64/aesni/aes-encrypt-internal.asm
index 07f17b25..99caf1f8 100644
--- a/x86_64/aesni/aes-encrypt-internal.asm
+++ b/x86_64/aesni/aes-encrypt-internal.asm
@@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm
 
 
 ifelse(<
-   Copyright (C) 2015 Niels Möller
+   Copyright (C) 2015, 2018 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>)
 define(<DST>,	<%r8>)
 define(<SRC>,	<%r9>)
 
-C Round counter
-define(<CNT>, <%rdx>)
-C Subkey pointer
-define(<KEY>, <%rax>)
+define(<KEY0>, <%xmm0>)
+define(<KEY1>, <%xmm1>)
+define(<KEY2>, <%xmm2>)
+define(<KEY3>, <%xmm3>)
+define(<KEY4>, <%xmm4>)
+define(<KEY5>, <%xmm5>)
+define(<KEY6>, <%xmm6>)
+define(<KEY7>, <%xmm7>)
+define(<KEY8>, <%xmm8>)
+define(<KEY9>, <%xmm9>)
+define(<KEY10>, <%xmm10>)
+define(<KEY11>, <%xmm11>)
+define(<KEY12>, <%xmm12>)
+define(<KEY13>, <%xmm13>)
+define(<KEYLAST>, <%xmm14>)
+define(<BLOCK>, <%xmm15>)
 
-dnl aesenc %xmm1, %xmm0
-define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>)
-dnl aesenclast %xmm1, %xmm0
-define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
-	
 	.file "aes-encrypt-internal.asm"
 
 	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
@@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
 	.text
 	ALIGN(16)
 PROLOGUE(_nettle_aes_encrypt)
-	W64_ENTRY(6, 2)
+	W64_ENTRY(6, 16)
 	shr	$4, LENGTH
 	test	LENGTH, LENGTH
 	jz	.Lend
 
-	decl	XREG(ROUNDS)
+	movups	(KEYS), KEY0
+	movups	16(KEYS), KEY1
+	movups	32(KEYS), KEY2
+	movups	48(KEYS), KEY3
+	movups	64(KEYS), KEY4
+	movups	80(KEYS), KEY5
+	movups	96(KEYS), KEY6
+	movups	112(KEYS), KEY7
+	movups	128(KEYS), KEY8
+	movups	144(KEYS), KEY9
+	lea	160(KEYS), KEYS
+	sub	$10, XREG(ROUNDS)	C Also clears high half
+	je	.Lkey_last	C 10 rounds (AES-128)
+
+	movups	(KEYS), KEY10
+	movups	16(KEYS), KEY11
+	lea	(KEYS, ROUNDS, 8), KEYS	C Point KEYS at the last subkey; two
+	lea	(KEYS, ROUNDS, 8), KEYS	C steps, since lea scales by at most 8
+
+	cmpl	$2, XREG(ROUNDS)	C 12 rounds (AES-192)?
+	je	.Lkey_last
+	movups	-32(KEYS), KEY12
+	movups	-16(KEYS), KEY13
+
+.Lkey_last:
+	movups	(KEYS), KEYLAST
 
 .Lblock_loop:
-	mov	ROUNDS, CNT
-	mov	KEYS, KEY
-	movups	(SRC), %xmm0
-	C FIXME: Better alignment of subkeys, so we can use movaps.
-	movups	(KEY), %xmm1
-	pxor	%xmm1, %xmm0
-
-	C FIXME: Could use some unrolling. Also all subkeys fit in
-	C registers, so they could be loaded once (on W64 we would
-	C need to save and restore some xmm registers, though).
-
-.Lround_loop:
-	add	$16, KEY
-
-	movups	(KEY), %xmm1
-	AESENC	C %xmm1, %xmm0
-	decl	XREG(CNT)
-	jnz	.Lround_loop
-
-	movups	16(KEY), %xmm1
-	AESENCLAST	C %xmm1, %xmm0
-
-	movups	%xmm0, (DST)
+	movups	(SRC), BLOCK
+	pxor	KEY0, BLOCK
+	aesenc	KEY1, BLOCK
+	aesenc	KEY2, BLOCK
+	aesenc	KEY3, BLOCK
+	aesenc	KEY4, BLOCK
+	aesenc	KEY5, BLOCK
+	aesenc	KEY6, BLOCK
+	aesenc	KEY7, BLOCK
+	aesenc	KEY8, BLOCK
+	aesenc	KEY9, BLOCK
+	testl	XREG(ROUNDS), XREG(ROUNDS)	C 10 rounds?
+	je	.Lblock_end
+	aesenc	KEY10, BLOCK
+	aesenc	KEY11, BLOCK
+	cmpl	$2, XREG(ROUNDS)	C 12 rounds?
+	je	.Lblock_end
+
+	aesenc	KEY12, BLOCK
+	aesenc	KEY13, BLOCK
+
+.Lblock_end:
+	aesenclast KEYLAST, BLOCK
+
+	movups	BLOCK, (DST)
 	add	$16, SRC
 	add	$16, DST
 	dec	LENGTH
 	jnz	.Lblock_loop
 
 .Lend:
-	W64_EXIT(6, 2)
+	W64_EXIT(6, 16)
 	ret
 EPILOGUE(_nettle_aes_encrypt)
-- 
GitLab