From 03b8ba39e367c6d0738e108d7899b8abeb8c96e6 Mon Sep 17 00:00:00 2001
From: Maamoun TK <maamoun.tk@gmail.com>
Date: Sun, 21 Mar 2021 20:45:38 +0200
Subject: [PATCH] [AArch64] Use m4 macros in gcm-hash.asm and add
 documentation comments

---
 arm64/crypto/gcm-hash.asm | 220 ++++++++++++++++++++------------------
 1 file changed, 113 insertions(+), 107 deletions(-)

diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/gcm-hash.asm
index b77b08d6..b088360b 100644
--- a/arm64/crypto/gcm-hash.asm
+++ b/arm64/crypto/gcm-hash.asm
@@ -1,4 +1,4 @@
-C arm/v8/gcm-hash.asm
+C arm64/crypto/gcm-hash.asm
 
 ifelse(`
    Copyright (C) 2020 Niels Möller and Mamone Tarsha
@@ -38,30 +38,42 @@ ifelse(`
 C gcm_set_key() assigns H value in the middle element of the table
 define(`H_Idx', `128')
 
-C common register usage:
+C common SIMD register usage:
 define(`POLY', `v6')
+C temporary register that assists the reduction procedure
 define(`T', `v7')
+C permanent register that holds the 16-byte result of pmull
 define(`F', `v16')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is immediately accumulated into the 'F' register
 define(`F1', `v17')
+C permanent register that holds the 16-byte result of pmull
 define(`R', `v18')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is immediately accumulated into the 'R' register
 define(`R1', `v19')
 
 C common macros:
-.macro PMUL in, param1, param2
-    pmull F.1q,\param2\().1d,\in\().1d
-    pmull2 F1.1q,\param2\().2d,\in\().2d
-    pmull R.1q,\param1\().1d,\in\().1d
-    pmull2 R1.1q,\param1\().2d,\in\().2d
+C long multiply of six 64-bit polynomials and sum the products:
+C R = (in.l × param1.l) + (in.h × param1.h)
+C F = (in.l × param2.l) + (in.h × param2.h)
+C PMUL(in, param1, param2)
+define(`PMUL', m4_assert_numargs(3)`
+    pmull F.1q,$3.1d,$1.1d
+    pmull2 F1.1q,$3.2d,$1.2d
+    pmull R.1q,$2.1d,$1.1d
+    pmull2 R1.1q,$2.2d,$1.2d
     eor F.16b,F.16b,F1.16b
     eor R.16b,R.16b,R1.16b
-.endm
-
-.macro REDUCTION out
+')
+C Reduce 'R' and 'F' values to a 128-bit output
+C REDUCTION(out)
+define(`REDUCTION', m4_assert_numargs(1)`
     pmull T.1q,F.1d,POLY.1d
     eor R.16b,R.16b,T.16b
     ext R.16b,R.16b,R.16b,#8
-    eor \out\().16b,F.16b,R.16b
-.endm
+    eor $1.16b,F.16b,R.16b
+')
 
 C void gcm_init_key (union gcm_block *table)
 
@@ -101,13 +113,14 @@ define(`H3L', `v28')
 define(`H4M', `v29')
 define(`H4L', `v30')
 
-.macro PMUL_PARAM in, param1, param2
-    pmull2 Hp.1q,\in\().2d,POLY.2d
-    eor Hm.16b,\in\().16b,Hp.16b
-    ext \param1\().16b,Hm.16b,\in\().16b,#8
-    ext \param2\().16b,\in\().16b,Hm.16b,#8
-    ext \param1\().16b,\param1\().16b,\param1\().16b,#8
-.endm
+C PMUL_PARAM(in, param1, param2)
+define(`PMUL_PARAM', m4_assert_numargs(3)`
+    pmull2 Hp.1q,$1.2d,POLY.2d
+    eor Hm.16b,$1.16b,Hp.16b
+    ext $2.16b,Hm.16b,$1.16b,#8
+    ext $3.16b,$1.16b,Hm.16b,#8
+    ext $2.16b,$2.16b,$2.16b,#8
+')
 
 PROLOGUE(_nettle_gcm_init_key)
     add x1,TABLE,#16*H_Idx
@@ -120,6 +133,8 @@ PROLOGUE(_nettle_gcm_init_key)
 IF_LE(`
     rev64 H.16b,H.16b
 ')
+    C --- calculate H = H × x mod R(X); R(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
+
     dup EMSB.16b,H.b[7]
     mov x1,#0xC200000000000000
     mov x2,#1
@@ -136,36 +151,36 @@ IF_LE(`
 
     dup POLY.2d,POLY.d[0]
 
-    C --- calculate H^2 = H*H ---
+    C --- calculate H^2 = H × H ---
 
-    PMUL_PARAM H,H1M,H1L
+    PMUL_PARAM(H,H1M,H1L)
 
-    PMUL H,H1M,H1L
+    PMUL(H,H1M,H1L)
 
-    REDUCTION H2
+    REDUCTION(H2)
 
-    PMUL_PARAM H2,H2M,H2L
+    PMUL_PARAM(H2,H2M,H2L)
 
     C we store to the table as doubleword-vectors in current memory endianness
    C because it's our own strictly internal data structure and what gcm_hash
    C can most naturally use
    st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
 
-    C --- calculate H^3 = H^1*H^2 ---
+    C --- calculate H^3 = H^1 × H^2 ---
 
-    PMUL H2,H1M,H1L
+    PMUL(H2,H1M,H1L)
 
-    REDUCTION H3
+    REDUCTION(H3)
 
-    PMUL_PARAM H3,H3M,H3L
+    PMUL_PARAM(H3,H3M,H3L)
 
-    C --- calculate H^4 = H^2*H^2 ---
+    C --- calculate H^4 = H^2 × H^2 ---
 
-    PMUL H2,H2M,H2L
+    PMUL(H2,H2M,H2L)
 
-    REDUCTION H4
+    REDUCTION(H4)
 
-    PMUL_PARAM H4,H4M,H4L
+    PMUL_PARAM(H4,H4M,H4L)
 
     st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
 
@@ -180,7 +195,6 @@ define(`DATA', `x3')
 
 define(`D', `v0')
 define(`C0', `v1')
-define(`C0D', `d1')
 define(`C1', `v2')
 define(`C2', `v3')
 define(`C3', `v4')
@@ -197,16 +211,52 @@ define(`H3L', `v29')
 define(`H4M', `v30')
 define(`H4L', `v31')
 
-.macro PMUL_SUM in, param1, param2
-    pmull F2.1q,\param2\().1d,\in\().1d
-    pmull2 F3.1q,\param2\().2d,\in\().2d
-    pmull R2.1q,\param1\().1d,\in\().1d
-    pmull2 R3.1q,\param1\().2d,\in\().2d
+C PMUL_SUM(in, param1, param2)
+define(`PMUL_SUM', m4_assert_numargs(3)`
+    pmull F2.1q,$3.1d,$1.1d
+    pmull2 F3.1q,$3.2d,$1.2d
+    pmull R2.1q,$2.1d,$1.1d
+    pmull2 R3.1q,$2.2d,$1.2d
     eor F2.16b,F2.16b,F3.16b
     eor R2.16b,R2.16b,R3.16b
     eor F.16b,F.16b,F2.16b
     eor R.16b,R.16b,R2.16b
-.endm
+')
+
+C Load the final partial block into a SIMD register,
+C stored in little-endian order for each 64-bit part
+C LOAD_REV_PARTIAL_BLOCK(out)
+define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)`
+    tbz LENGTH,3,Lless_8_bytes
+    ldr `d'substr($1,1,len($1)),[DATA],#8
+IF_LE(`
+    rev64 $1.16b,$1.16b
+')
+    mov x7,#0
+    mov $1.d[1],x7
+    tst LENGTH,#7
+    b.eq Lload_done
+Lless_8_bytes:
+    mov x6,#0
+    mov x5,#64
+    and x4,LENGTH,#7
+Lload_byte_loop:
+    mov x7,#0
+    ldrb w7,[DATA],#1
+    sub x5,x5,#8
+    lsl x7,x7,x5
+    orr x6,x6,x7
+    subs x4,x4,#1
+    b.ne Lload_byte_loop
+    tbz LENGTH,3,Lstore_hi_dw
+    mov $1.d[1],x6
+    b Lload_done
+Lstore_hi_dw:
+    mov x7,#0
+    mov $1.d[0],x6
+    mov $1.d[1],x7
+Lload_done:
+')
 
 C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
 C                size_t length, const uint8_t *data)
@@ -221,13 +271,13 @@ IF_LE(`
 ')
 
     ands x4,LENGTH,#-64
-    b.eq L2x
+    b.eq L1_block
 
    add x5,TABLE,#64
    ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
    ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
 
-L4x_loop:
+L4_blocks_loop:
     ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
 IF_LE(`
     rev64 C0.16b,C0.16b
@@ -238,45 +288,25 @@ IF_LE(`
 
     eor C0.16b,C0.16b,D.16b
 
-    PMUL C1,H3M,H3L
-    PMUL_SUM C2,H2M,H2L
-    PMUL_SUM C3,H1M,H1L
-    PMUL_SUM C0,H4M,H4L
+    PMUL(C1,H3M,H3L)
+    PMUL_SUM(C2,H2M,H2L)
+    PMUL_SUM(C3,H1M,H1L)
+    PMUL_SUM(C0,H4M,H4L)
 
-    REDUCTION D
+    REDUCTION(D)
 
     subs x4,x4,#64
-    b.ne L4x_loop
+    b.ne L4_blocks_loop
 
     and LENGTH,LENGTH,#63
 
-L2x:
-    tst LENGTH,#-32
-    b.eq L1x
-
-    ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
-
-    ld1 {C0.2d,C1.2d},[DATA],#32
-IF_LE(`
-    rev64 C0.16b,C0.16b
-    rev64 C1.16b,C1.16b
-')
-
-    eor C0.16b,C0.16b,D.16b
-
-    PMUL C1,H1M,H1L
-    PMUL_SUM C0,H2M,H2L
-
-    REDUCTION D
-
-    and LENGTH,LENGTH,#31
-
-L1x:
-    tst LENGTH,#-16
-    b.eq Lmod
+L1_block:
+    ands x4,LENGTH,#-16
+    b.eq Lpartial
 
     ld1 {H1M.2d,H1L.2d},[TABLE]
 
+L1_block_loop:
     ld1 {C0.2d},[DATA],#16
 IF_LE(`
     rev64 C0.16b,C0.16b
@@ -284,52 +314,28 @@ IF_LE(`
 
     eor C0.16b,C0.16b,D.16b
 
-    PMUL C0,H1M,H1L
+    PMUL(C0,H1M,H1L)
+
+    REDUCTION(D)
 
-    REDUCTION D
+    subs x4,x4,#16
+    b.ne L1_block_loop
 
-Lmod:
+Lpartial:
     tst LENGTH,#15
-    b.eq Ldone
+    b.eq Lghash_done
 
     ld1 {H1M.2d,H1L.2d},[TABLE]
+
+    LOAD_REV_PARTIAL_BLOCK(C0)
 
-    tbz LENGTH,3,Lmod_8
-    ldr C0D,[DATA],#8
-IF_LE(`
-    rev64 C0.16b,C0.16b
-')
-    mov x7,#0
-    mov C0.d[1],x7
-Lmod_8:
-    tst LENGTH,#7
-    b.eq Lmod_8_done
-    mov x6,#0
-    mov x5,#64
-    and x4,LENGTH,#7
-Lmod_8_loop:
-    mov x7,#0
-    ldrb w7,[DATA],#1
-    sub x5,x5,#8
-    lsl x7,x7,x5
-    orr x6,x6,x7
-    subs x4,x4,#1
-    b.ne Lmod_8_loop
-    tbz LENGTH,3,Lmod_8_load
-    mov C0.d[1],x6
-    b Lmod_8_done
-Lmod_8_load:
-    mov x7,#0
-    mov C0.d[0],x6
-    mov C0.d[1],x7
-Lmod_8_done:
     eor C0.16b,C0.16b,D.16b
 
-    PMUL C0,H1M,H1L
+    PMUL(C0,H1M,H1L)
 
-    REDUCTION D
+    REDUCTION(D)
 
-Ldone:
+Lghash_done:
 IF_LE(`
     rev64 D.16b,D.16b
 ')
-- 
GitLab
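
Note for reviewers (illustration only, not part of the patch): the PMUL comment above describes sums of 64x64-bit carry-less products, which is the operation the pmull/pmull2 instructions perform on the low and high .1d lanes. The following minimal, hypothetical C sketch models that operation; all names are illustrative and none of this code exists in nettle.

/* Portable model of a 64x64 -> 128-bit carry-less (polynomial) multiply
   and of the XOR-sum of two such products, mirroring
   R = (in.l x param1.l) + (in.h x param1.h) from the PMUL comment.
   Illustrative sketch only; not part of the patch. */
#include <stdint.h>
#include <stdio.h>

struct poly128 { uint64_t lo, hi; };

/* Schoolbook multiplication with XOR instead of addition (no carries):
   XOR a shifted copy of 'a' for every set bit of 'b'. */
static struct poly128
clmul64 (uint64_t a, uint64_t b)
{
  struct poly128 r = { 0, 0 };
  for (int i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i > 0)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

/* Addition of binary polynomials is XOR of the halves, like the
   "eor F.16b,F.16b,F1.16b" accumulation step in PMUL. */
static struct poly128
poly128_add (struct poly128 x, struct poly128 y)
{
  struct poly128 r = { x.lo ^ y.lo, x.hi ^ y.hi };
  return r;
}

int
main (void)
{
  /* Arbitrary example operands standing in for in.l/in.h and param1.l/param1.h. */
  uint64_t in_l = 0x0123456789abcdefULL, in_h = 0xfedcba9876543210ULL;
  uint64_t p1_l = 0x1111111111111111ULL, p1_h = 0x2222222222222222ULL;

  struct poly128 r = poly128_add (clmul64 (in_l, p1_l), clmul64 (in_h, p1_h));
  printf ("R = %016llx%016llx\n",
          (unsigned long long) r.hi, (unsigned long long) r.lo);
  return 0;
}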