From e91f1eea7c2ed8f4796e72a5a5a37f7e161f65b4 Mon Sep 17 00:00:00 2001
From: Niels Möller <nisse@lysator.liu.se>
Date: Sun, 21 Jan 2024 10:37:03 +0100
Subject: [PATCH] powerpc64: Use m4 to generate instruction sequences for aes.

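Use the new OPN_XXY and OPN_XXXY macros from powerpc64/machine.m4 to
generate the repeated vperm, vxor, vcipher and vncipher instruction
sequences in the p8 AES code, instead of writing out each instruction
by hand. Also drop stray trailing semicolons from the register save
and restore sequences. The generated instructions are unchanged.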
---
 ChangeLog                             |   7 ++
 powerpc64/machine.m4                  |  22 +++++
 powerpc64/p8/aes-decrypt-internal.asm | 114 ++++++--------------------
 powerpc64/p8/aes-encrypt-internal.asm | 100 ++++++----------------
 4 files changed, 81 insertions(+), 162 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ab3c8690..e9c64ed8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2024-01-21  Niels Möller  <nisse@lysator.liu.se>
+
+	* powerpc64/machine.m4 (OPN_XXY, OPN_XXXY): New macros.
+	* powerpc64/p8/aes-encrypt-internal.asm: Use macros for repeated
+	instruction patterns.
+	* powerpc64/p8/aes-decrypt-internal.asm: Likewise.
+
 2023-12-27  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/gcm-test.c (test_main): Additional gcm test case, with
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index 8f28f295..3a05e318 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -63,3 +63,25 @@ C INC_VR(VR, INC)
 define(`INC_VR',`ifelse(substr($1,0,1),`v',
 ``v'eval($2+substr($1,1,len($1)))',
 `eval($2+$1)')')
+
+C Apply op x, x, y, for each x.
+C OPN_XXY(OP, Y, X1, X2, ...)
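+C E.g., OPN_XXY(vxor, K, S0, S1) expands to
+C   vxor S0, S0, K
+C   vxor S1, S1, K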
+define(`OPN_XXY',
+`$1 $3, $3, $2
+ifelse(eval($# > 3), 1,
+`OPN_XXY($1, $2, shift(shift(shift($@))))dnl
+')')
+
+C Apply op x, x, x, y, for each x.
+C OPN_XXXY(OP, Y, X1, X2, ...)
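+C E.g., OPN_XXXY(vperm, SWAP_MASK, S0, S1) expands to
+C   vperm S0, S0, S0, SWAP_MASK
+C   vperm S1, S1, S1, SWAP_MASK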
+define(`OPN_XXXY',
+`$1 $3, $3, $3, $2
+ifelse(eval($# > 3), 1,
+`OPN_XXXY($1, $2, shift(shift(shift($@))))dnl
+')')
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index b2c49c63..1c33f279 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -80,13 +80,13 @@ PROLOGUE(_nettle_aes_decrypt)
  cmpldi r5,0
  beq L4x
 
- std r25,-56(SP);
- std r26,-48(SP);
- std r27,-40(SP);
- std r28,-32(SP);
- std r29,-24(SP);
- std r30,-16(SP);
- std r31,-8(SP);
+ std r25,-56(SP)
+ std r26,-48(SP)
+ std r27,-40(SP)
+ std r28,-32(SP)
+ std r29,-24(SP)
+ std r30,-16(SP)
+ std r31,-8(SP)
 
  li r25,0x10
  li r26,0x20
@@ -110,23 +110,9 @@ Lx8_loop:
  lxvd2x VSR(S6),r30,SRC
  lxvd2x VSR(S7),r31,SRC
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK
- vperm S4,S4,S4,SWAP_MASK
- vperm S5,S5,S5,SWAP_MASK
- vperm S6,S6,S6,SWAP_MASK
- vperm S7,S7,S7,SWAP_MASK')
-
- vxor S0,S0,K
- vxor S1,S1,K
- vxor S2,S2,K
- vxor S3,S3,K
- vxor S4,S4,K
- vxor S5,S5,K
- vxor S6,S6,K
- vxor S7,S7,K
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3, S4, S5, S6, S7)')
+
+ OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
 
  mtctr ROUNDS
  li r10,0x10
@@ -134,44 +120,16 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
 L8x_round_loop:
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vncipher S0,S0,ZERO
- vncipher S1,S1,ZERO
- vncipher S2,S2,ZERO
- vncipher S3,S3,ZERO
- vncipher S4,S4,ZERO
- vncipher S5,S5,ZERO
- vncipher S6,S6,ZERO
- vncipher S7,S7,ZERO
- vxor S0,S0,K
- vxor S1,S1,K
- vxor S2,S2,K
- vxor S3,S3,K
- vxor S4,S4,K
- vxor S5,S5,K
- vxor S6,S6,K
- vxor S7,S7,K
+ OPN_XXY(vncipher, ZERO, S0, S1, S2, S3, S4, S5, S6, S7)
+ OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
  addi r10,r10,0x10
  bdnz L8x_round_loop
 
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vncipherlast S0,S0,K
- vncipherlast S1,S1,K
- vncipherlast S2,S2,K
- vncipherlast S3,S3,K
- vncipherlast S4,S4,K
- vncipherlast S5,S5,K
- vncipherlast S6,S6,K
- vncipherlast S7,S7,K
+ OPN_XXY(vncipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK
- vperm S4,S4,S4,SWAP_MASK
- vperm S5,S5,S5,SWAP_MASK
- vperm S6,S6,S6,SWAP_MASK
- vperm S7,S7,S7,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3, S4, S5, S6, S7)')
 
  stxvd2x VSR(S0),0,DST
  stxvd2x VSR(S1),r25,DST
@@ -187,13 +145,13 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
  subic. r5,r5,1
  bne Lx8_loop
 
- ld r25,-56(SP);
- ld r26,-48(SP);
- ld r27,-40(SP);
- ld r28,-32(SP);
- ld r29,-24(SP);
- ld r30,-16(SP);
- ld r31,-8(SP);
+ ld r25,-56(SP)
+ ld r26,-48(SP)
+ ld r27,-40(SP)
+ ld r28,-32(SP)
+ ld r29,-24(SP)
+ ld r30,-16(SP)
+ ld r31,-8(SP)
 
  clrldi LENGTH,LENGTH,61
 
@@ -213,15 +171,9 @@ L4x:
  addi   r9,r9,0x10
  lxvd2x VSR(S3),r9,SRC
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3)')
 
- vxor S0,S0,K
- vxor S1,S1,K
- vxor S2,S2,K
- vxor S3,S3,K
+ OPN_XXY(vxor, K, S0, S1, S2, S3)
 
  mtctr ROUNDS
  li r10,0x10
@@ -229,28 +181,16 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
 L4x_round_loop:
  lxvd2x VSR(K),r10,KEYS
  vperm  K,K,K,SWAP_MASK
- vncipher S0,S0,ZERO
- vncipher S1,S1,ZERO
- vncipher S2,S2,ZERO
- vncipher S3,S3,ZERO
- vxor   S0,S0,K
- vxor  S1,S1,K
- vxor   S2,S2,K
- vxor   S3,S3,K
+ OPN_XXY(vncipher, ZERO, S0, S1, S2, S3)
+ OPN_XXY(vxor, K, S0, S1, S2, S3)
  addi   r10,r10,0x10
  bdnz  L4x_round_loop
 
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vncipherlast S0,S0,K
- vncipherlast S1,S1,K
- vncipherlast S2,S2,K
- vncipherlast S3,S3,K
+ OPN_XXY(vncipherlast, K, S0, S1, S2, S3)
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3)')
 
  stxvd2x VSR(S0),0,DST
  li  r9,0x10
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
index 9957eb1d..9ae8bc0e 100644
--- a/powerpc64/p8/aes-encrypt-internal.asm
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -72,13 +72,13 @@ PROLOGUE(_nettle_aes_encrypt)
  cmpldi r5,0
  beq L4x
 
- std r25,-56(SP);
- std r26,-48(SP);
- std r27,-40(SP);
- std r28,-32(SP);
- std r29,-24(SP);
- std r30,-16(SP);
- std r31,-8(SP);
+ std r25,-56(SP)
+ std r26,-48(SP)
+ std r27,-40(SP)
+ std r28,-32(SP)
+ std r29,-24(SP)
+ std r30,-16(SP)
+ std r31,-8(SP)
 
  li r25,0x10
  li r26,0x20
@@ -102,23 +102,9 @@ Lx8_loop:
  lxvd2x VSR(S6),r30,SRC
  lxvd2x VSR(S7),r31,SRC
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK
- vperm S4,S4,S4,SWAP_MASK
- vperm S5,S5,S5,SWAP_MASK
- vperm S6,S6,S6,SWAP_MASK
- vperm S7,S7,S7,SWAP_MASK')
-
- vxor S0,S0,K
- vxor S1,S1,K
- vxor S2,S2,K
- vxor S3,S3,K
- vxor S4,S4,K
- vxor S5,S5,K
- vxor S6,S6,K
- vxor S7,S7,K
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3, S4, S5, S6, S7)')
+
+ OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
 
  mtctr ROUNDS
  li r10,0x10
@@ -126,36 +112,15 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
 L8x_round_loop:
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vcipher S0,S0,K
- vcipher S1,S1,K
- vcipher S2,S2,K
- vcipher S3,S3,K
- vcipher S4,S4,K
- vcipher S5,S5,K
- vcipher S6,S6,K
- vcipher S7,S7,K
+ OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7)
  addi r10,r10,0x10
  bdnz L8x_round_loop
 
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vcipherlast S0,S0,K
- vcipherlast S1,S1,K
- vcipherlast S2,S2,K
- vcipherlast S3,S3,K
- vcipherlast S4,S4,K
- vcipherlast S5,S5,K
- vcipherlast S6,S6,K
- vcipherlast S7,S7,K
+ OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK
- vperm S4,S4,S4,SWAP_MASK
- vperm S5,S5,S5,SWAP_MASK
- vperm S6,S6,S6,SWAP_MASK
- vperm S7,S7,S7,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3, S4, S5, S6, S7)')
 
  stxvd2x VSR(S0),0,DST
  stxvd2x VSR(S1),r25,DST
@@ -171,13 +136,13 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
  subic. r5,r5,1
  bne Lx8_loop
 
- ld r25,-56(SP);
- ld r26,-48(SP);
- ld r27,-40(SP);
- ld r28,-32(SP);
- ld r29,-24(SP);
- ld r30,-16(SP);
- ld r31,-8(SP);
+ ld r25,-56(SP)
+ ld r26,-48(SP)
+ ld r27,-40(SP)
+ ld r28,-32(SP)
+ ld r29,-24(SP)
+ ld r30,-16(SP)
+ ld r31,-8(SP)
 
  clrldi LENGTH,LENGTH,61
 
@@ -197,15 +162,9 @@ L4x:
  addi   r9,r9,0x10
  lxvd2x VSR(S3),r9,SRC
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3)')
 
- vxor S0,S0,K
- vxor S1,S1,K
- vxor S2,S2,K
- vxor S3,S3,K
+ OPN_XXY(vxor, K, S0, S1, S2, S3)
 
  mtctr ROUNDS
  li r10,0x10
@@ -213,24 +172,15 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
 L4x_round_loop:
  lxvd2x VSR(K),r10,KEYS
  vperm  K,K,K,SWAP_MASK
- vcipher S0,S0,K
- vcipher S1,S1,K
- vcipher S2,S2,K
- vcipher S3,S3,K
+ OPN_XXY(vcipher, K, S0, S1, S2, S3)
  addi   r10,r10,0x10
  bdnz  L4x_round_loop
 
  lxvd2x VSR(K),r10,KEYS
  vperm   K,K,K,SWAP_MASK
- vcipherlast S0,S0,K
- vcipherlast S1,S1,K
- vcipherlast S2,S2,K
- vcipherlast S3,S3,K
+ OPN_XXY(vcipherlast, K, S0, S1, S2, S3)
 
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK
- vperm S2,S2,S2,SWAP_MASK
- vperm S3,S3,S3,SWAP_MASK')
+IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0, S1, S2, S3)')
 
  stxvd2x VSR(S0),0,DST
  li  r9,0x10
-- 
GitLab