From e91f1eea7c2ed8f4796e72a5a5a37f7e161f65b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 21 Jan 2024 10:37:03 +0100 Subject: [PATCH] powerpc64: Use m4 to generate instruction sequences for aes. --- ChangeLog | 7 ++ powerpc64/machine.m4 | 16 ++++ powerpc64/p8/aes-decrypt-internal.asm | 114 ++++++-------------------- powerpc64/p8/aes-encrypt-internal.asm | 100 ++++++---------------- 4 files changed, 75 insertions(+), 162 deletions(-) diff --git a/ChangeLog b/ChangeLog index ab3c8690..e9c64ed8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2024-01-21 Niels Möller <nisse@lysator.liu.se> + + * powerpc64/machine.m4 (OPN_XXY, OPN_XXXY): New macros. + * powerpc64/p8/aes-encrypt-internal.asm: Use macros for repeated + instruction patterns. + * powerpc64/p8/aes-decrypt-internal.asm: Likewise. + 2023-12-27 Niels Möller <nisse@lysator.liu.se> * testsuite/gcm-test.c (test_main): Additional gcm test case, with diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 index 8f28f295..3a05e318 100644 --- a/powerpc64/machine.m4 +++ b/powerpc64/machine.m4 @@ -63,3 +63,19 @@ C INC_VR(VR, INC) define(`INC_VR',`ifelse(substr($1,0,1),`v', ``v'eval($2+substr($1,1,len($1)))', `eval($2+$1)')') + +C Apply op x, x, y, for each x. +C OPN_XXY(OP, Y, X1, X2, ...) +define(`OPN_XXY', +`$1 $3, $3, $2 +ifelse(eval($# > 3), 1, +`OPN_XXY($1, $2, shift(shift(shift($@))))dnl +')') + +C Apply op x, x, x, y, for each x. +C OPN_XXXY(OP, Y, X1, X2, ...) +define(`OPN_XXXY', +`$1 $3, $3, $3, $2 +ifelse(eval($# > 3), 1, +`OPN_XXXY($1, $2, shift(shift(shift($@))))dnl +')') diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm index b2c49c63..1c33f279 100644 --- a/powerpc64/p8/aes-decrypt-internal.asm +++ b/powerpc64/p8/aes-decrypt-internal.asm @@ -80,13 +80,13 @@ PROLOGUE(_nettle_aes_decrypt) cmpldi r5,0 beq L4x - std r25,-56(SP); - std r26,-48(SP); - std r27,-40(SP); - std r28,-32(SP); - std r29,-24(SP); - std r30,-16(SP); - std r31,-8(SP); + std r25,-56(SP) + std r26,-48(SP) + std r27,-40(SP) + std r28,-32(SP) + std r29,-24(SP) + std r30,-16(SP) + std r31,-8(SP) li r25,0x10 li r26,0x20 @@ -110,23 +110,9 @@ Lx8_loop: lxvd2x VSR(S6),r30,SRC lxvd2x VSR(S7),r31,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK - vperm S4,S4,S4,SWAP_MASK - vperm S5,S5,S5,SWAP_MASK - vperm S6,S6,S6,SWAP_MASK - vperm S7,S7,S7,SWAP_MASK') - - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K - vxor S4,S4,K - vxor S5,S5,K - vxor S6,S6,K - vxor S7,S7,K +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) mtctr ROUNDS li r10,0x10 @@ -134,44 +120,16 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK L8x_round_loop: lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vncipher S0,S0,ZERO - vncipher S1,S1,ZERO - vncipher S2,S2,ZERO - vncipher S3,S3,ZERO - vncipher S4,S4,ZERO - vncipher S5,S5,ZERO - vncipher S6,S6,ZERO - vncipher S7,S7,ZERO - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K - vxor S4,S4,K - vxor S5,S5,K - vxor S6,S6,K - vxor S7,S7,K + OPN_XXY(vncipher, ZERO, S0, S1, S2, S3, S4, S5, S6, S7) + OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) addi r10,r10,0x10 bdnz L8x_round_loop lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vncipherlast S0,S0,K - vncipherlast S1,S1,K - vncipherlast S2,S2,K - vncipherlast S3,S3,K - vncipherlast S4,S4,K - vncipherlast S5,S5,K - vncipherlast S6,S6,K - vncipherlast S7,S7,K + OPN_XXY(vncipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK - vperm S4,S4,S4,SWAP_MASK - vperm S5,S5,S5,SWAP_MASK - vperm S6,S6,S6,SWAP_MASK - vperm S7,S7,S7,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r25,DST @@ -187,13 +145,13 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK subic. r5,r5,1 bne Lx8_loop - ld r25,-56(SP); - ld r26,-48(SP); - ld r27,-40(SP); - ld r28,-32(SP); - ld r29,-24(SP); - ld r30,-16(SP); - ld r31,-8(SP); + ld r25,-56(SP) + ld r26,-48(SP) + ld r27,-40(SP) + ld r28,-32(SP) + ld r29,-24(SP) + ld r30,-16(SP) + ld r31,-8(SP) clrldi LENGTH,LENGTH,61 @@ -213,15 +171,9 @@ L4x: addi r9,r9,0x10 lxvd2x VSR(S3),r9,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K + OPN_XXY(vxor, K, S0, S1, S2, S3) mtctr ROUNDS li r10,0x10 @@ -229,28 +181,16 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK L4x_round_loop: lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vncipher S0,S0,ZERO - vncipher S1,S1,ZERO - vncipher S2,S2,ZERO - vncipher S3,S3,ZERO - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K + OPN_XXY(vncipher, ZERO, S0, S1, S2, S3) + OPN_XXY(vxor, K, S0, S1, S2, S3) addi r10,r10,0x10 bdnz L4x_round_loop lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vncipherlast S0,S0,K - vncipherlast S1,S1,K - vncipherlast S2,S2,K - vncipherlast S3,S3,K + OPN_XXY(vncipherlast, K, S0, S1, S2, S3) -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') stxvd2x VSR(S0),0,DST li r9,0x10 diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm index 9957eb1d..9ae8bc0e 100644 --- a/powerpc64/p8/aes-encrypt-internal.asm +++ b/powerpc64/p8/aes-encrypt-internal.asm @@ -72,13 +72,13 @@ PROLOGUE(_nettle_aes_encrypt) cmpldi r5,0 beq L4x - std r25,-56(SP); - std r26,-48(SP); - std r27,-40(SP); - std r28,-32(SP); - std r29,-24(SP); - std r30,-16(SP); - std r31,-8(SP); + std r25,-56(SP) + std r26,-48(SP) + std r27,-40(SP) + std r28,-32(SP) + std r29,-24(SP) + std r30,-16(SP) + std r31,-8(SP) li r25,0x10 li r26,0x20 @@ -102,23 +102,9 @@ Lx8_loop: lxvd2x VSR(S6),r30,SRC lxvd2x VSR(S7),r31,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK - vperm S4,S4,S4,SWAP_MASK - vperm S5,S5,S5,SWAP_MASK - vperm S6,S6,S6,SWAP_MASK - vperm S7,S7,S7,SWAP_MASK') - - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K - vxor S4,S4,K - vxor S5,S5,K - vxor S6,S6,K - vxor S7,S7,K +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) mtctr ROUNDS li r10,0x10 @@ -126,36 +112,15 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK L8x_round_loop: lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vcipher S0,S0,K - vcipher S1,S1,K - vcipher S2,S2,K - vcipher S3,S3,K - vcipher S4,S4,K - vcipher S5,S5,K - vcipher S6,S6,K - vcipher S7,S7,K + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) addi r10,r10,0x10 bdnz L8x_round_loop lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vcipherlast S0,S0,K - vcipherlast S1,S1,K - vcipherlast S2,S2,K - vcipherlast S3,S3,K - vcipherlast S4,S4,K - vcipherlast S5,S5,K - vcipherlast S6,S6,K - vcipherlast S7,S7,K + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK - vperm S4,S4,S4,SWAP_MASK - vperm S5,S5,S5,SWAP_MASK - vperm S6,S6,S6,SWAP_MASK - vperm S7,S7,S7,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r25,DST @@ -171,13 +136,13 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK subic. r5,r5,1 bne Lx8_loop - ld r25,-56(SP); - ld r26,-48(SP); - ld r27,-40(SP); - ld r28,-32(SP); - ld r29,-24(SP); - ld r30,-16(SP); - ld r31,-8(SP); + ld r25,-56(SP) + ld r26,-48(SP) + ld r27,-40(SP) + ld r28,-32(SP) + ld r29,-24(SP) + ld r30,-16(SP) + ld r31,-8(SP) clrldi LENGTH,LENGTH,61 @@ -197,15 +162,9 @@ L4x: addi r9,r9,0x10 lxvd2x VSR(S3),r9,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') - vxor S0,S0,K - vxor S1,S1,K - vxor S2,S2,K - vxor S3,S3,K + OPN_XXY(vxor, K, S0, S1, S2, S3) mtctr ROUNDS li r10,0x10 @@ -213,24 +172,15 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK L4x_round_loop: lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vcipher S0,S0,K - vcipher S1,S1,K - vcipher S2,S2,K - vcipher S3,S3,K + OPN_XXY(vcipher, K, S0, S1, S2, S3) addi r10,r10,0x10 bdnz L4x_round_loop lxvd2x VSR(K),r10,KEYS vperm K,K,K,SWAP_MASK - vcipherlast S0,S0,K - vcipherlast S1,S1,K - vcipherlast S2,S2,K - vcipherlast S3,S3,K + OPN_XXY(vcipherlast, K, S0, S1, S2, S3) -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK - vperm S2,S2,S2,SWAP_MASK - vperm S3,S3,S3,SWAP_MASK') +IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') stxvd2x VSR(S0),0,DST li r9,0x10 -- GitLab