From c7030b8788fb6aaab21ed93bd54942e631c40bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 21 Jan 2024 15:49:53 +0100 Subject: [PATCH] Trim number of used registers --- powerpc64/machine.m4 | 11 ++++--- powerpc64/p8/ghash-update.asm | 54 ++++++++++++++--------------------- 2 files changed, 26 insertions(+), 39 deletions(-) diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 index 8bbc299c..8caa9584 100644 --- a/powerpc64/machine.m4 +++ b/powerpc64/machine.m4 @@ -80,13 +80,12 @@ ifelse(eval($# > 3), 1, `OPN_XXXY($1, $2, shift(shift(shift($@))))dnl ')') -C FIXME: If we allow clobber of F, no need for T register. -C Polynomial reduction D = R + x^{-64} F mod P +C Polynomial reduction R += x^{-64} F mod P C where x^{-64} = x^{64} + P1 (mod P) -C GHASH_REDUCE(D, R, F, P1, T) +C GHASH_REDUCE(R, F, P1, T1, T2) define(`GHASH_REDUCE', ` - vpmsumd $5,$3,$4 - xxswapd VSR($1),VSR($3) - vxor $1, $1, $2 + vpmsumd $4, $2, $3 + xxswapd VSR($5),VSR($2) vxor $1, $1, $5 + vxor $1, $1, $4 ') diff --git a/powerpc64/p8/ghash-update.asm b/powerpc64/p8/ghash-update.asm index 054f035d..515705b7 100644 --- a/powerpc64/p8/ghash-update.asm +++ b/powerpc64/p8/ghash-update.asm @@ -46,10 +46,10 @@ define(`BLOCKS', `r5') define(`DATA', `r6') define(`ZERO', `v16') -define(`POLY', `v17') -define(`POLY_L', `v0') +define(`LE_TEMP', `v17') -define(`D', `v1') +define(`POLY_L', `v0') +define(`LE_MASK', `v1') define(`C0', `v2') define(`C1', `v3') define(`C2', `v4') @@ -66,14 +66,10 @@ define(`R', `v14') define(`F', `v15') define(`R2', `v16') define(`F2', `v17') -define(`T', `v18') -define(`R3', `v20') -define(`F3', `v21') -define(`R4', `v22') -define(`F4', `v23') - -define(`LE_TEMP', `v18') -define(`LE_MASK', `v19') +define(`R3', `v18') +define(`F3', `v19') +define(`R4', `v20') +define(`F4', `v21') C const uint8_t *_ghash_update (const struct gcm_key *ctx, C union nettle_block16 *x, @@ -82,19 +78,19 @@ define(`LE_MASK', `v19') define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_ghash_update) vxor ZERO,ZERO,ZERO - DATA_LOAD_VEC(POLY,.polynomial,r7) + DATA_LOAD_VEC(POLY_L,.polynomial,r7) IF_LE(` li r8,0 lvsl LE_MASK,0,r8 vspltisb LE_TEMP,0x07 vxor LE_MASK,LE_MASK,LE_TEMP ') - xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY_L) - lxvd2x VSR(D),0,X C load 'X' pointer + lxvd2x VSR(R),0,X C load 'X' pointer C byte-reverse of each doubleword permuting on little-endian mode IF_LE(` - vperm D,D,D,LE_MASK + vperm R,R,R,LE_MASK ') C --- process 4 blocks '128-bit each' per one loop --- @@ -105,14 +101,10 @@ IF_LE(` mtctr r7 C assign counter register to loop count C store non-volatile vector registers - addi r8,SP,-64 + addi r8,SP,-32 stvx v20,0,r8 addi r8,r8,16 stvx v21,0,r8 - addi r8,r8,16 - stvx v22,0,r8 - addi r8,r8,16 - stvx v23,0,r8 C load table elements li r8,1*16 @@ -150,7 +142,7 @@ IF_LE(` ') C previous digest combining - vxor C0,C0,D + vxor C0,C0,R C polynomial multiplication vpmsumd F2,H3L,C1 @@ -170,20 +162,16 @@ IF_LE(` vxor F,F,F3 vxor R,R,R3 - GHASH_REDUCE(D, R, F, POLY_L, T) + GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries addi DATA,DATA,0x40 bdnz L4x_loop C restore non-volatile vector registers - addi r8,SP,-64 + addi r8,SP,-32 lvx v20,0,r8 addi r8,r8,16 lvx v21,0,r8 - addi r8,r8,16 - lvx v22,0,r8 - addi r8,r8,16 - lvx v23,0,r8 clrldi BLOCKS,BLOCKS,62 C 'set the high-order 62 bits to zeros' L2x: @@ -212,7 +200,7 @@ IF_LE(` ') C previous digest combining - vxor C0,C0,D + vxor C0,C0,R C polynomial multiplication vpmsumd F2,H1L,C1 @@ -224,7 +212,7 @@ IF_LE(` vxor F,F,F2 vxor R,R,R2 - GHASH_REDUCE(D, R, F, POLY_L, T) + GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries addi DATA,DATA,0x20 clrldi BLOCKS,BLOCKS,63 C 'set the high-order 63 bits to zeros' @@ -247,13 +235,13 @@ IF_LE(` ') C previous digest combining - vxor C0,C0,D + vxor C0,C0,R C polynomial multiplication vpmsumd F,H1L,C0 vpmsumd R,H1M,C0 - GHASH_REDUCE(D, R, F, POLY_L, T) + GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries addi DATA,DATA,0x10 clrldi BLOCKS,BLOCKS,60 C 'set the high-order 60 bits to zeros' @@ -261,9 +249,9 @@ IF_LE(` Ldone: C byte-reverse of each doubleword permuting on little-endian mode IF_LE(` - vperm D,D,D,LE_MASK + vperm R,R,R,LE_MASK ') - stxvd2x VSR(D),0,X C store digest 'D' + stxvd2x VSR(R),0,X C store digest 'R' mr r3, DATA blr -- GitLab