diff --git a/x86_64/avx2/poly1305-blocks.asm b/x86_64/avx2/poly1305-blocks.asm index e5a0ae7b157d38162082a1f7d9f6922ac1808c82..85a21b86a8ebd988f3bff2c4d223ea8a098e8e05 100644 --- a/x86_64/avx2/poly1305-blocks.asm +++ b/x86_64/avx2/poly1305-blocks.asm @@ -224,7 +224,7 @@ PROLOGUE(_nettle_poly1305_blocks) lea .Lmask26(%rip), %rax vmovdqa 0(%rax), MASK26 - cmp $POLY1305_BLOCK_THRESHOLD, BLOCKS + cmp $POLY1305_BLOCK_THRESHOLD_AVX2, BLOCKS jb L1B vmovq P1305_R0 (CTX), XMM(C0) C R0 diff --git a/x86_64/poly1305.m4 b/x86_64/poly1305.m4 index fe4a760c25012c92e0f1d7f3ce455276a10ebe3b..df59ba5e61a656cc1f98a9e46ffb812ec77c28cd 100644 --- a/x86_64/poly1305.m4 +++ b/x86_64/poly1305.m4 @@ -1,45 +1,38 @@ C Threshold of processing multiple blocks in parallel -C of a multiple of 4 with a minimum 8 blocks -define(`POLY1305_BLOCK_THRESHOLD', `32') - -C CTX is the address of context where key and pre-computed values are stored -C DATA is the address of input block -C PADBYTE is padding byte for input block -C GPR0 is the starting register of sequential general-purpose registers -C used in the macro of following layout -C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64 -C GPR3, GPR4 are temporary registers -C VR0 is the starting register of sequential vector resigers used in -C the macro of following layout -C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows -C (low 64-bit of VR0) + (low 64-bit of VR1) + (high 64-bit of VR1) -C VR2..VR12 are temporary registers -C BLOCK_R64(CTX, PAD, H0, H1, H2, T0, T1, F0, F1) +C using AVX2 code of a multiple of 4 with a minimum 8 blocks +define(`POLY1305_BLOCK_THRESHOLD_AVX2', `32') + +C CTX contains key and pre-computed values +C Inputs T0, T1, T2 expect sum of previous hash with padded input +C message in radix 2^64 +C Output hash results of radix 2^64 are stored in H0, H1, T2 respectively +C F0, F1 are temporary general registers +C BLOCK_R64(CTX, H0, H1, T0, T1, T2, F0, F1) define(`BLOCK_R64', ` 
mov P1305_R1 ($1), %rax - mul $4 C R1*$4 + mul $4 C R1*T0 mov %rax, $7 mov %rdx, $8 - mov $4, %rax C Last use of $4 input + mov $4, %rax C Last use of T0 input mov P1305_R0 ($1), $4 - mul $4 C R0*$4 + mul $4 C R0*T0 mov %rax, $2 mov %rdx, $3 mov $5, %rax - mul $4 C R0*$5 + mul $4 C R0*T1 add %rax, $7 adc %rdx, $8 mov P1305_S1 ($1), $4 - mov $5, %rax C Last use of $5 input - mul $4 C S1*$5 + mov $5, %rax C Last use of T1 input + mul $4 C S1*T1 add %rax, $2 adc %rdx, $3 mov $6, %rax - mul $4 C S1*$6 + mul $4 C S1*T2 add %rax, $7 adc %rdx, $8 @@ -48,11 +41,11 @@ define(`BLOCK_R64', ` shr `$'2, $4 mov P1305_S0 ($1), %rax - mul $4 C S0*($6 >> 2) + mul $4 C S0*(T2 >> 2) add %rax, $2 adc %rdx, $3 - imul P1305_R0 ($1), $6 C R0*($6 & 3) + imul P1305_R0 ($1), $6 C R0*(T2 & 3) add $7, $3 adc $8, $6 ')