From 58fd657743fe29f3c4dacd50a402fb598f204399 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Tue, 16 May 2023 21:11:04 +0200
Subject: [PATCH] x86_64: Fix alignment bug in _nettle_ghash_update.

---
 ChangeLog               |  7 +++++++
 x86_64/ghash-update.asm | 22 ++++++++++++++++------
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 507e0438..6312ac00 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2023-05-16  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/ghash-update.asm: Use separate unaligned load
+	instructions (movups) to load the tabulated values, since they are
+	only 8-byte aligned and pand memory operands require 16-byte
+	alignment.
+
 2023-05-15  Niels Möller  <nisse@lysator.liu.se>
 
 	* eccdata.c (output_bignum_redc): Add missing mpz_clear, reported
diff --git a/x86_64/ghash-update.asm b/x86_64/ghash-update.asm
index b3417e45..0a72f071 100644
--- a/x86_64/ghash-update.asm
+++ b/x86_64/ghash-update.asm
@@ -44,6 +44,8 @@ define(`M0', `%xmm2')
 define(`M1', `%xmm3')
 define(`M2', `%xmm4')
 define(`M3', `%xmm5')
+define(`T0', `%xmm6')
+define(`T1', `%xmm7')
 
 	.file "ghash-update.asm"
 
@@ -54,7 +56,7 @@ define(`M3', `%xmm5')
 	.text
 	ALIGN(16)
 PROLOGUE(_nettle_ghash_update)
-	W64_ENTRY(4, 6)
+	W64_ENTRY(4, 8)
 	sub	$1, BLOCKS
 	movups	(XP), X
 	jc	.Ldone
@@ -77,10 +79,18 @@ ALIGN(16)
 	pshufd	$0xaa, M3, M2
 	pshufd	$0xff, M3, M3
 	pslld	$1, X
-	pand	(KEY, CNT), M0
-	pand	(KEY32, CNT), M1
-	pand	16(KEY, CNT), M2
-	pand	16(KEY32, CNT), M3
+	C Tabulated values are only guaranteed to be 8-byte aligned, but
+	C pand requires 16-byte alignment of a memory operand, so load
+	C them with movups first. It would be nice if we could force
+	C 16-byte alignment on nettle_block16, using C11 alignas.
+	movups	(KEY, CNT), T0
+	movups	(KEY32, CNT), T1
+	pand	T0, M0
+	pand	T1, M1
+	movups	16(KEY, CNT), T0
+	movups	16(KEY32, CNT), T1
+	pand	T0, M2
+	pand	T1, M3
 	pxor	M0, M1
 	pxor	M2, M3
 	pxor	M1, R
@@ -98,6 +108,6 @@ ALIGN(16)
 .Ldone:
 	movups	X, (XP)
 	mov	SRC, %rax
-	W64_EXIT(4, 6)
+	W64_EXIT(4, 8)
 	ret
 EPILOGUE(_nettle_ghash_update)
-- 
GitLab