diff --git a/x86_64/avx2/poly1305-blocks.asm b/x86_64/avx2/poly1305-blocks.asm
index e5a0ae7b157d38162082a1f7d9f6922ac1808c82..85a21b86a8ebd988f3bff2c4d223ea8a098e8e05 100644
--- a/x86_64/avx2/poly1305-blocks.asm
+++ b/x86_64/avx2/poly1305-blocks.asm
@@ -224,7 +224,7 @@ PROLOGUE(_nettle_poly1305_blocks)
 	lea		.Lmask26(%rip), %rax
 	vmovdqa	0(%rax), MASK26
 
-	cmp		$POLY1305_BLOCK_THRESHOLD, BLOCKS
+	cmp		$POLY1305_BLOCK_THRESHOLD_AVX2, BLOCKS
 	jb		L1B
 
 	vmovq	P1305_R0 (CTX), XMM(C0)		C R0
diff --git a/x86_64/poly1305.m4 b/x86_64/poly1305.m4
index fe4a760c25012c92e0f1d7f3ce455276a10ebe3b..df59ba5e61a656cc1f98a9e46ffb812ec77c28cd 100644
--- a/x86_64/poly1305.m4
+++ b/x86_64/poly1305.m4
@@ -1,45 +1,38 @@
 C Threshold of processing multiple blocks in parallel
-C of a multiple of 4 with a minimum 8 blocks
-define(`POLY1305_BLOCK_THRESHOLD', `32')
-
-C CTX is the address of context where key and pre-computed values are stored
-C DATA is the address of input block
-C PADBYTE is padding byte for input block
-C GPR0 is the starting register of sequential general-purpose registers
-C used in the macro of following layout
-C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64
-C GPR3, GPR4 are temporary registers
-C VR0 is the starting register of sequential vector resigers used in
-C the macro of following layout
-C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows
-C (low 64-bit of VR0) + (low 64-bit of VR1) + (high 64-bit of VR1)
-C VR2..VR12 are temporary registers
-C BLOCK_R64(CTX, PAD, H0, H1, H2, T0, T1, F0, F1)
+C using AVX2 code; must be a multiple of 4 and at least 8 blocks
+define(`POLY1305_BLOCK_THRESHOLD_AVX2', `32')
+
+C CTX contains key and pre-computed values
+C Inputs T0, T1, T2 hold the sum of the previous hash and the padded
+C input message in radix 2^64
+C The output hash in radix 2^64 is stored in H0, H1, T2 respectively
+C F0, F1 are temporary general registers
+C BLOCK_R64(CTX, H0, H1, T0, T1, T2, F0, F1)
 define(`BLOCK_R64', `
 	mov	P1305_R1 ($1), %rax
-	mul	$4			C R1*$4
+	mul	$4			C R1*T0
 	mov	%rax, $7
 	mov	%rdx, $8
 
-	mov	$4, %rax		C Last use of $4 input
+	mov	$4, %rax		C Last use of T0 input
 	mov	P1305_R0 ($1), $4
-	mul	$4			C R0*$4
+	mul	$4			C R0*T0
 	mov	%rax, $2
 	mov	%rdx, $3
 
 	mov	$5, %rax
-	mul	$4			C R0*$5
+	mul	$4			C R0*T1
 	add	%rax, $7
 	adc	%rdx, $8
 
 	mov	P1305_S1 ($1), $4
-	mov	$5, %rax		C Last use of $5 input
-	mul	$4			C S1*$5
+	mov	$5, %rax		C Last use of T1 input
+	mul	$4			C S1*T1
 	add	%rax, $2
 	adc	%rdx, $3
 
 	mov	$6, %rax
-	mul	$4			C S1*$6
+	mul	$4			C S1*T2
 	add	%rax, $7
 	adc	%rdx, $8
 
@@ -48,11 +41,11 @@ define(`BLOCK_R64', `
 
 	shr	`$'2, $4
 	mov	P1305_S0 ($1), %rax
-	mul	$4			C S0*($6 >> 2)
+	mul	$4			C S0*(T2 >> 2)
 	add	%rax, $2
 	adc	%rdx, $3
 
-	imul	P1305_R0 ($1), $6	C R0*($6 & 3)
+	imul	P1305_R0 ($1), $6	C R0*(T2 & 3)
 	add 	$7, $3
 	adc	$8, $6
 	')