diff --git a/ChangeLog b/ChangeLog
index dc1b999d268779a61eb0ce922f2481850b2055d3..eac25a0d969c8484bbcc5e0179fab0bd94943c88 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2022-01-27  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/poly1305-internal.asm: Rewrote. Rearranged folding, so
+	that all the multiply instructions needed to process a block are
+	independent of each other. Measured speedup of 16% on AMD Zen2
+	and 28% on Intel Broadwell; expected to be generally faster.
+	* asm.m4 (P1305): Rearranged struct contents, to fit 64-bit
+	entries S0 and H2. Total struct size unchanged.
+
 2022-01-25  Niels Möller  <nisse@lysator.liu.se>
 
 	Chacha implementation for arm64, contributed by Mamone Tarsha.
diff --git a/asm.m4 b/asm.m4
index 4ac21c20f33c74b05e4e00f585c747795c41dc6b..60c66c25366fa0ab2fb64b58cbec85e4866bca78 100644
--- a/asm.m4
+++ b/asm.m4
@@ -94,10 +94,11 @@ C For 64-bit implementation
 STRUCTURE(P1305)
   STRUCT(R0, 8)
   STRUCT(R1, 8)
+  STRUCT(S0, 8)
   STRUCT(S1, 8)
-  STRUCT(PAD, 12)
-  STRUCT(H2, 4)
   STRUCT(H0, 8)
   STRUCT(H1, 8)
+  STRUCT(H2, 8)
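+C S0 = 5*R0 and S1 = 5*(R1 >> 2), precomputed by poly1305_set_key.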
 
 divert
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
index 1e2c60b6a0c406a4303e831fa4d44d3119f4326f..ef2f38e4fde1c3df24147eaace19f96816dceaa2 100644
--- a/x86_64/poly1305-internal.asm
+++ b/x86_64/poly1305-internal.asm
@@ -33,153 +33,201 @@ ifelse(`
 	.file "poly1305-internal.asm"
 
 C Registers mainly used by poly1305_block
-define(`CTX', `%rdi')
-define(`T0', `%rcx')
-define(`T1', `%rsi')
-define(`T2', `%r8')
-define(`H0', `%r9')
-define(`H1', `%r10')
-define(`H2', `%r11')
-	
+define(`CTX', `%rdi') C First argument to all functions
+
+define(`KEY', `%rsi')
+define(`MASK', `%r8')
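+
+C MASK is the standard poly1305 key clamp for the first qword: it
+C clears the high 4 bits of each 32-bit half and bits 32-33. The
+C "and $-4, MASK" below extends it for the second qword, clearing
+C bits 0-1 too, so R1 is divisible by 4; that makes the precomputed
+C S1 = 5*(R1 >> 2) an exact fold (see comments at _poly1305_block).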
 	C _poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
 	.text
-	C Registers:
-	C  %rdi: ctx
-	C  %rsi: key
-	C  %r8: mask
 	ALIGN(16)
 PROLOGUE(_nettle_poly1305_set_key)
 	W64_ENTRY(2,0)
-	mov	$0x0ffffffc0fffffff, %r8
-	mov	(%rsi), %rax
-	and	%r8, %rax
-	and	$-4, %r8
-	mov	%rax, (CTX)
-	mov	8(%rsi), %rax
-	and	%r8, %rax
+	mov	$0x0ffffffc0fffffff, MASK
+	mov	(KEY), %rax
+	and	MASK, %rax
+	and	$-4, MASK
+	mov	%rax, P1305_R0 (CTX)
+	imul	$5, %rax
+	mov	%rax, P1305_S0 (CTX)	C 5*R0
+	mov	8(KEY), %rax
+	and	MASK, %rax
 	mov	%rax, P1305_R1 (CTX)
 	shr	$2, %rax
 	imul	$5, %rax
-	mov	%rax, P1305_S1 (CTX)
+	mov	%rax, P1305_S1 (CTX)	C 5*(R1>>2)
 	xor	XREG(%rax), XREG(%rax)
 	mov	%rax, P1305_H0 (CTX)
 	mov	%rax, P1305_H1 (CTX)
-	mov	XREG(%rax), P1305_H2 (CTX)
+	mov	%rax, P1305_H2 (CTX)
 	
 	W64_EXIT(2,0)
 	ret
 
+undefine(`KEY')
+undefine(`MASK')
+
 EPILOGUE(_nettle_poly1305_set_key)
 
-C 64-bit multiplication mod 2^130 - 5
+define(`T0', `%rcx')
+define(`T1', `%rsi')	C Overlaps message input pointer.
+define(`T2', `%r8')
+define(`H0', `%r9')
+define(`H1', `%r10')
+define(`F0', `%r11')
+define(`F1', `%r12')
+
+C First accumulate the independent products
+C
+C {H1,H0} = R0 T0 + S1 T1 + S0 (T2 >> 2)
+C {F1,F0} = R1 T0 + R0 T1 + S1 T2
+C T = R0 * (T2 & 3)
 C
-C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) =
-C     1   B B^2 B^3 
-C   x_0 r_0
-C       x_0 r_1
-C	x_1 r_0
-C	    x_1 r_1
-C	    x_2 r_0
-C               x_2 r_1
-C Then r_1 B^2 = r_1/4 (2^130) = 5/4 r_1.
-C and  r_1 B^3 = 5/4 B r_1
-C So we get
+C Then add together as
 C
-C  x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0)
-C     1   B B^2 B^3 
-C   x_0 r_0
-C   x_1 r'_1
-C       x_0 r_1
-C	x_1 r_0
-C       x_2 r'_1
-C           x_2 r_0
+C     +--+--+--+
+C     |T |H1|H0|
+C     +--+--+--+
+C   + |F1|F0|
+C   --+--+--+--+
+C     |H2|H1|H0|
+C     +--+--+--+
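+C
+C The folds are justified by B = 2^64 and 2^130 = 5 (mod p), so that
+C B^2 = 5/4 (mod p). In the schoolbook product
+C
+C   (T0 + T1 B + T2 B^2) (R0 + R1 B)
+C
+C the terms at B^2 and above reduce as
+C
+C   T1 R1 B^2 = 5 (R1/4) T1    = S1 T1        (exact, since 4 | R1)
+C   T2 R1 B^3 = 5 (R1/4) T2 B  = S1 T2 B
+C   T2 R0 B^2 = 5 R0 (T2 >> 2) + R0 (T2 & 3) B^2
+C             = S0 (T2 >> 2) + T B^2
+C
+C which gives the three accumulation lines above.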
 
 	C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi)
 	
 PROLOGUE(_nettle_poly1305_block)
 	W64_ENTRY(3, 0)
+	push	%r12
 	mov	(%rsi), T0
 	mov	8(%rsi), T1
-	mov	XREG(%rdx),	XREG(T2)
-
-	C Registers:
-	C Inputs:  CTX, T0, T1, T2,
-	C Outputs: H0, H1, H2, stored into the context.
+	mov	XREG(%rdx), XREG(T2)	C Also zero extends
 
 	add	P1305_H0 (CTX), T0
 	adc	P1305_H1 (CTX), T1
-	adc	P1305_H2 (CTX), XREG(T2)
-	mov	P1305_R0 (CTX), %rax
-	mul	T0			C x0*r0
+	adc	P1305_H2 (CTX), T2
+
+	mov	P1305_R1 (CTX), %rax
+	mul	T0			C R1*T0
+	mov	%rax, F0
+	mov	%rdx, F1
+
+	mov	T0, %rax		C Last use of T0 input
+	mov	P1305_R0 (CTX), T0
+	mul	T0			C R0*T0
 	mov	%rax, H0
 	mov	%rdx, H1
-	mov	P1305_S1 (CTX), %rax	C 5/4 r1
-	mov	%rax, H2
-	mul	T1			C x1*r1'
-	imul	T2, H2			C x2*r1'
-	imul	P1305_R0 (CTX), T2	C x2*r0
+
+	mov	T1, %rax
+	mul	T0			C R0*T1
+	add	%rax, F0
+	adc	%rdx, F1
+
+	mov	P1305_S1 (CTX), T0
+	mov	T1, %rax		C Last use of T1 input
+	mul	T0			C S1*T1
 	add	%rax, H0
 	adc	%rdx, H1
-	mov	P1305_R0 (CTX), %rax
-	mul	T1			C x1*r0
-	add	%rax, H2
-	adc	%rdx, T2
-	mov	P1305_R1 (CTX), %rax
-	mul	T0			C x0*r1
-	add	%rax, H2
-	adc	%rdx, T2
+
 	mov	T2, %rax
-	shr	$2, %rax
-	imul	$5, %rax
-	and	$3, XREG(T2)
+	mul	T0			C S1*T2
+	add	%rax, F0
+	adc	%rdx, F1
+
+	mov	$3, XREG(T1)
+	and	T2, T1			C T1 = T2 & 3
+
+	shr	$2, T2
+	mov	P1305_S0 (CTX), %rax
+	mul	T2			C S0*(T2 >> 2)
 	add	%rax, H0
-	adc	H2, H1
-	adc	$0, XREG(T2)
+	adc	%rdx, H1
+
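+	C Combine per the diagram: H1 += F0, and the new H2 is
+	C F1 + R0*(T2 & 3) + carry.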
+	imul	P1305_R0 (CTX), T1	C R0*(T2 & 3)
+	add	F0, H1
+	adc	T1, F1
+
 	mov	H0, P1305_H0 (CTX)
 	mov	H1, P1305_H1 (CTX)
-	mov	XREG(T2), P1305_H2 (CTX)
+	mov	F1, P1305_H2 (CTX)
+	pop	%r12
 	W64_EXIT(3, 0)
 	ret
 EPILOGUE(_nettle_poly1305_block)
+undefine(`T0')
+undefine(`T1')
+undefine(`T2')
+undefine(`H0')
+undefine(`H1')
+undefine(`F0')
+undefine(`F1')
 
 	C _poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s)
-	C Registers:
-	C   %rdi: ctx
-	C   %rsi: s
-	
+define(`S', `%rsi')
+
+define(`T0', `%rcx')
+define(`T1', `%r8')
+define(`H0', `%r9')
+define(`H1', `%r10')
+define(`F0', `%r11')
+define(`F1', `%rdi')	C Overlaps CTX
+
 PROLOGUE(_nettle_poly1305_digest)
 	W64_ENTRY(2, 0)
 
 	mov	P1305_H0 (CTX), H0
 	mov	P1305_H1 (CTX), H1
-	mov	P1305_H2 (CTX), XREG(H2)
-	mov	XREG(H2), XREG(%rax)
-	shr	$2, XREG(%rax)
-	and	$3, H2
-	imul	$5, XREG(%rax)
-	add	%rax, H0
+	mov	P1305_H2 (CTX), F0
+
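+	C Reset the accumulator, so the context can be reused.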
+	xor	XREG(%rax), XREG(%rax)
+	mov	%rax, P1305_H0 (CTX)
+	mov	%rax, P1305_H1 (CTX)
+	mov	%rax, P1305_H2 (CTX)
+
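+	C Fold the high bits of H2:
+	C B^2 H2 = B^2 (H2 & 3) + 5 (H2 >> 2) (mod p)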
+	mov	$3, XREG(%rax)
+	and	XREG(F0), XREG(%rax)
+	shr	$2, F0
+	imul	$5, F0
+	add	F0, H0
 	adc	$0, H1
-	adc	$0, XREG(H2)
+	adc	$0, XREG(%rax)
 
-C Use %rax instead of %rsi
-define(`T1', `%rax')
 	C Add 5, use result if >= 2^130
 	mov	$5, T0
 	xor	T1, T1
 	add	H0, T0
 	adc	H1, T1
-	adc	$0, XREG(H2)
-	cmp	$4, XREG(H2)
-	cmovnc	T0, H0
-	cmovnc	T1, H1
+	adc	$-4, XREG(%rax)		C Carry if %rax + c >= 4
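+	C Carry set means H >= p; {T1,T0} then holds H + 5 = H - p (mod 2^128).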
+	cmovc	T0, H0
+	cmovc	T1, H1
 
-	add	H0, (%rsi)
-	adc	H1, 8(%rsi)
+	add	H0, (S)
+	adc	H1, 8(S)
 
-	xor	XREG(%rax), XREG(%rax)
-	mov	%rax, P1305_H0 (CTX)
-	mov	%rax, P1305_H1 (CTX)
-	mov	XREG(%rax), P1305_H2 (CTX)
 	W64_EXIT(2, 0)
 	ret
 EPILOGUE(_nettle_poly1305_digest)