From 2ddcc2262b8eb1bc40157bdaa8057c576d1c6dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 16 Oct 2005 14:24:13 +0200 Subject: [PATCH] * sparc/machine.m4 (AES_FINAL_ROUND): Better scheduling, by interleaving independent operations. Rev: src/nettle/sparc/aes-encrypt-internal.asm:1.10 Rev: src/nettle/sparc/machine.m4:1.11 --- sparc/aes-encrypt-internal.asm | 24 ++++++++++++++---------- sparc/machine.m4 | 24 ++++++++++-------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/sparc/aes-encrypt-internal.asm b/sparc/aes-encrypt-internal.asm index 09964b69..58beb4df 100644 --- a/sparc/aes-encrypt-internal.asm +++ b/sparc/aes-encrypt-internal.asm @@ -137,15 +137,19 @@ EPILOGUE(_nettle_aes_encrypt) C Some stats from adriana.lysator.liu.se (SS1000$, 85 MHz), for AES 128 -C A: nettle-1.13 C-code -C B: nettle-1.13 assembler -C C: New C-code -C D: New assembler, first correct version -C E: New assembler, with basic scheduling of AES_ROUND. +C 1: nettle-1.13 C-code +C 2: nettle-1.13 assembler +C 3: New C-code +C 4: New assembler, first correct version +C 5: New assembler, with basic scheduling of AES_ROUND. +C 6: New assembler, with loop invariants T0-T3. +C 7: New assembler, with basic scheduling also of AES_FINAL_ROUND. 
C MB/s cycles/block -C A 1.2 1107 -C B 2.3 572 -C C 2.1 627 -C D 1.8 722 -C E 2.6 496 +C 1 1.2 1107 +C 2 2.3 572 +C 3 2.1 627 +C 4 1.8 722 +C 5 2.6 496 +C 6 3.0 437 +C 7 3.1 415 diff --git a/sparc/machine.m4 b/sparc/machine.m4 index adc205e2..0817327a 100644 --- a/sparc/machine.m4 +++ b/sparc/machine.m4 @@ -66,29 +66,25 @@ define(<AES_FINAL_ROUND>, < ld [$7 + eval(4*$1)], TMP3 and $3, 0xff, TMP1 C 0 - ldub [T + TMP1], TMP1 C 0 - nop - xor TMP3, TMP1, TMP1 C 0 - stb TMP1, [$8 + eval(4*$1)] C 0 - srl $4, 8, TMP2 C 1 + ldub [T + TMP1], TMP1 C 0 and TMP2, 0xff, TMP2 C 1 + xor TMP3, TMP1, TMP1 C 0 ldub [T + TMP2], TMP2 C 1 - srl TMP3, 8, TMP3 C 1 - xor TMP3, TMP2, TMP2 C 1 - stb TMP2, [$8 + eval(4*$1 + 1)] C 1 - + stb TMP1, [$8 + eval(4*$1)] C 0 E0 srl $5, 16, TMP1 C 2 + srl TMP3, 8, TMP3 C 1 and TMP1, 0xff, TMP1 C 2 + xor TMP3, TMP2, TMP2 C 1 ldub [T + TMP1], TMP1 C 2 - srl TMP3, 8, TMP3 C 2 - xor TMP3, TMP1, TMP1 C 2 - stb TMP1, [$8 + eval(4*$1 + 2)] C 2 - + stb TMP2, [$8 + eval(4*$1 + 1)] C 1 E1 srl $6, 24, TMP2 C 3 + srl TMP3, 8, TMP3 C 2 ldub [T + TMP2], TMP2 C 3 + xor TMP3, TMP1, TMP1 C 2 srl TMP3, 8, TMP3 C 3 + stb TMP1, [$8 + eval(4*$1 + 2)] C 2 E2 xor TMP3, TMP2, TMP2 C 3 - stb TMP2, [$8 + eval(4*$1 + 3)] C 3 + stb TMP2, [$8 + eval(4*$1 + 3)] C 3 E3 >) -- GitLab