diff --git a/sparc/aes-encrypt-internal.asm b/sparc/aes-encrypt-internal.asm index 8e06e03a42a940a1e33583a57a7c9e9404601ec8..e5951ad19857d7b18ef9669c70029beb97d3c8b3 100644 --- a/sparc/aes-encrypt-internal.asm +++ b/sparc/aes-encrypt-internal.asm @@ -133,10 +133,11 @@ C A: nettle-1.13 C-code C B: nettle-1.13 assembler C C: New C-code C D: New assembler, first correct version - +C E: New assembler, with basic scheduling of AES_ROUND. + C MB/s cycles/block C A 1.2 1107 C B 2.3 572 C C 2.1 627 C D 1.8 722 - +C E 2.6 496 diff --git a/sparc/machine.m4 b/sparc/machine.m4 index 5f3731a078e1abd4dd4bb7785076c2fa8d7c51ed..eb9e334b40ef4ec927f5408f628865d89598823d 100644 --- a/sparc/machine.m4 +++ b/sparc/machine.m4 @@ -33,34 +33,27 @@ C FIXME: Needs better instruction scheduling, and perhaps more temporaries C Alternatively, we can use a single table and some rotations define(<AES_ROUND>, < and $3, 0xff, TMP1 C 0 - sll TMP1, 2, TMP1 C 0 - add TMP1, AES_TABLE0, TMP1 C 0 - ld [$2 + TMP1], $8 C 0 - srl $4, 6, TMP2 C 1 + sll TMP1, 2, TMP1 C 0 and TMP2, 0x3fc, TMP2 C 1 + add TMP1, AES_TABLE0, TMP1 C 0 add TMP2, AES_TABLE1, TMP2 C 1 - ld [$2 + TMP2], TMP2 C 1 - nop - xor $8, TMP2, $8 C 1 - + ld [$2 + TMP1], $8 C 0 E0 srl $5, 14, TMP1 C 2 + ld [$2 + TMP2], TMP2 C 1 and TMP1, 0x3fc, TMP1 C 2 - add TMP1, AES_TABLE2, TMP1 C 2 - ld [$2 + TMP1], TMP1 C 2 - nop - xor $8, TMP1, $8 C 2 - + xor $8, TMP2, $8 C 1 E1 srl $6, 22, TMP2 C 3 + add TMP1, AES_TABLE2, TMP1 C 2 and TMP2, 0x3fc, TMP2 C 3 + ld [$2 + TMP1], TMP1 C 2 add TMP2, AES_TABLE3, TMP2 C 3 + xor $8, TMP1, $8 C 2 E2 + ld [$7 + eval(4*$1)], TMP1 C 4 ld [$2 + TMP2], TMP2 C 3 - nop + xor $8, TMP1, $8 C 4 xor $8, TMP2, $8 C 3 - - ld [$7 + eval(4*$1)], TMP2 C 4 - nop - xor $8, TMP2, $8>)dnl C 4 +>)dnl C AES_FINAL_ROUND(i, T, a, b, c, d, key, dst) C Compute one word in the final round function. Output is converted to