From f0105607a70b298b06ad73a88f03f1bc28f1b020 Mon Sep 17 00:00:00 2001
From: Maamoun TK <maamoun.tk@googlemail.com>
Date: Fri, 24 Mar 2023 00:23:09 +0200
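Subject: [PATCH] [x86_64] Use 2-way GHASH pclmul update

_ghash_set_key additionally stores H^2 and x^{-64} H^2 at offsets 32 and
48 of the key. _ghash_update uses them to process two blocks per loop
iteration, followed by a single-block step when the block count is odd.
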
---
 x86_64/pclmul/ghash-set-key.asm | 35 ++++++++++++++--
 x86_64/pclmul/ghash-update.asm  | 72 +++++++++++++++++++++++++++------
 2 files changed, 91 insertions(+), 16 deletions(-)

diff --git a/x86_64/pclmul/ghash-set-key.asm b/x86_64/pclmul/ghash-set-key.asm
index 3a7a976a..863ee244 100644
--- a/x86_64/pclmul/ghash-set-key.asm
+++ b/x86_64/pclmul/ghash-set-key.asm
@@ -39,12 +39,15 @@ define(`BSWAP', `%xmm1')
 define(`H', `%xmm2')
 define(`D', `%xmm3')
 define(`T', `%xmm4')
-define(`MASK', `%xmm5')
+define(`R', `%xmm5')	C NOTE(review): not referenced in the hunks shown here; confirm it is needed
+define(`M', `%xmm6')	C Temporary for the H^2 computation
+define(`F', `%xmm7')	C Temporary for the H^2 computation
+define(`MASK', `%xmm7')	C Same register as F. NOTE(review): confirm MASK is dead before F is written
 
     C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
 
 PROLOGUE(_nettle_ghash_set_key)
-	W64_ENTRY(2, 6)
+	W64_ENTRY(2, 8)	C 8: presumably the xmm register count (%xmm0-%xmm7) -- confirm
 	movdqa	.Lpolynomial(%rip), P
 	movdqa	.Lbswap(%rip), BSWAP
 	movups	(KEY), H
@@ -63,11 +66,35 @@ PROLOGUE(_nettle_ghash_set_key)
 	movups	H, (CTX)
 
 	C Set D = x^{-64} H = {H0, H1} + P1 H0
+	movdqa	H, T		C Multiply a copy; H itself is still needed for H^2 below
+	pshufd	$0x4e, H, D	C Swap H0, H1
+	pclmullqhqdq P, T
+	pxor	T, D
+	movups	D, 16(CTX)
+
+	movdqa		H, M	C M, F, T: three copies of H, one per product below
+	movdqa		H, F
+	movdqa		H, T
+	pclmulhqlqdq	H, T	C H0 * M1, where the M operand is H itself
+	pclmulhqhqdq	H, M	C H1 * M1
+	pclmullqlqdq	D, F 	C D0 * M0
+	pclmullqhqdq	D, H	C D1 * M0; overwrites H, so it follows the products that read H
+	pxor		T, F
+	pxor		M, H
+
+	pshufd		$0x4e, F, T		C Swap halves of F
+	pxor		T, H
+	pclmullqhqdq	P, F
+	pxor		F, H
+	movups	H, 32(CTX)	C Store H^2
+
+	C Set D2 = x^{-64} H^2 = {H0, H1} + P1 H0, with H0, H1 now the halves of H^2
 	pshufd	$0x4e, H, D	C Swap H0, H1
 	pclmullqhqdq P, H
 	pxor	H, D
-	movups	D, 16(CTX)
-	W64_EXIT(2, 6)
+	movups	D, 48(CTX)	C Store D2
+
+	W64_EXIT(2, 8)	C Same arguments as the W64_ENTRY at function entry
 	ret
 EPILOGUE(_nettle_ghash_set_key)
 
diff --git a/x86_64/pclmul/ghash-update.asm b/x86_64/pclmul/ghash-update.asm
index 93cc002c..1c30e7d8 100644
--- a/x86_64/pclmul/ghash-update.asm
+++ b/x86_64/pclmul/ghash-update.asm
@@ -41,11 +41,16 @@ define(`P', `%xmm0')
 define(`BSWAP', `%xmm1')
 define(`H', `%xmm2')
 define(`D', `%xmm3')
-define(`T', `%xmm4')
-
-define(`R', `%xmm5')
-define(`M', `%xmm6')
-define(`F', `%xmm7')
+define(`H2', `%xmm4')	C Loaded from 32(CTX); multiplies the first block of each pair
+define(`D2', `%xmm5')	C Loaded from 48(CTX); used with H2 the way D is used with H
+define(`T', `%xmm6')	C T, R, M, F: running state R and temporaries for the first block
+define(`R', `%xmm7')
+define(`M', `%xmm8')
+define(`F', `%xmm9')
+define(`T2', `%xmm10')	C T2, R2, M2, F2: temporaries for the second block of each pair
+define(`R2', `%xmm11')
+define(`M2', `%xmm12')
+define(`F2', `%xmm13')
 
 C Use pclmulqdq, doing one 64x64 --> 127 bit carry-less multiplication,
 C with source operands being selected from the halves of two 128-bit registers.
@@ -80,21 +85,66 @@ C registers left for temporaries.
 	C				size_t blocks, const uint8_t *data)
 
 PROLOGUE(_nettle_ghash_update)
-	W64_ENTRY(4, 8)
+	W64_ENTRY(4, 14)	C 14: presumably the xmm register count (%xmm0-%xmm13) -- confirm
 	movdqa		.Lpolynomial(%rip), P
 	movdqa		.Lbswap(%rip), BSWAP
 	movups		(CTX), H
 	movups		16(CTX), D
+	movups		32(CTX), H2	C Second key pair; _ghash_set_key stores it at offsets 32 and 48
+	movups		48(CTX), D2
 	movups		(X), R
 	pshufb		BSWAP, R
 
-	sub		$1, BLOCKS
-	jc		.Ldone
+	mov		BLOCKS, %rax
+	shr		$1, %rax	C %rax = BLOCKS / 2 = number of two-block iterations
+	jz		.L1_block	C Fewer than two blocks: skip the 2-way loop
 
 .Loop:
 	movups		(DATA), M
 	pshufb		BSWAP, M
-.Lblock:
+	pxor		M, R		C First block of the pair is added to the running state R
+	movdqa		R, M
+	movdqa		R, F
+	movdqa		R, T
+	pclmullqlqdq	D2, F 	C {D^2}0 * M1_0
+	pclmullqhqdq	D2, R	C {D^2}1 * M1_0
+	pclmulhqlqdq	H2, T	C {H^2}0 * M1_1
+	pclmulhqhqdq	H2, M	C {H^2}1 * M1_1
+	C Second block: multiplied by H and D, touching only M2, R2, F2 and T2
+
+	movups		16(DATA), M2
+	pshufb		BSWAP, M2
+	movdqa		M2, R2
+	movdqa		M2, F2
+	movdqa		M2, T2
+	pclmullqlqdq	D, F2 	C D0 * M2_0
+	pclmullqhqdq	D, R2	C D1 * M2_0
+	pclmulhqlqdq	H, T2	C H0 * M2_1
+	pclmulhqhqdq	H, M2	C H1 * M2_1
+
+	pxor		T, F		C Pairwise combine the partial products of each block ...
+	pxor		M, R
+	pxor		T2, F2
+	pxor		M2, R2
+
+	pxor		F2, F		C ... then add the two blocks, so the fold with P below runs once per pair
+	pxor		R2, R
+
+	pshufd		$0x4e, F, T		C Swap halves of F
+	pxor		T, R
+	pclmullqhqdq	P, F
+	pxor		F, R
+
+	add		$32, DATA	C Two 16-byte blocks consumed
+	dec		%rax
+	jnz		.Loop
+
+.L1_block:
+	and		$1, BLOCKS	C An odd count leaves one trailing block
+	jz		.Ldone
+
+	movups		(DATA), M
+	pshufb		BSWAP, M
 	pxor		M, R
 	movdqa		R, M
 	movdqa		R, F
@@ -112,14 +162,12 @@ PROLOGUE(_nettle_ghash_update)
 	pxor		F, R
 
 	add		$16, DATA
-	sub		$1, BLOCKS
-	jnc		.Loop
 
 .Ldone:
 	pshufb		BSWAP, R
 	movups		R, (X)
 	mov		DATA, %rax
-	W64_EXIT(4, 8)
+	W64_EXIT(4, 14)	C Same arguments as the W64_ENTRY at function entry
 	ret
 EPILOGUE(_nettle_ghash_update)
 
-- 
GitLab