diff --git a/sparc/aes.asm b/sparc/aes.asm index 6b90c0cc829bf6a2543d51efa0f76624333f0688..6e0f0b69c6206390e5ace66df7cf3c24236b4e2e 100644 --- a/sparc/aes.asm +++ b/sparc/aes.asm @@ -19,12 +19,13 @@ ! MA 02111-1307, USA. ! NOTE: Some of the %g registers are reserved for operating system etc -! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe. +! (see gcc/config/sparc.h). The only %g registers that seem safe to +! use are %g1-%g3. - ! Used registers: %l0,1,2,3,4,6,7 + ! Used registers: %l0,1,2,3,4,5,6,7 ! %i0,1,2,3,4 (%i6=%fp, %i7 = return) - ! %o0,1,2,,4,7 (%o6=%sp) - ! %g5 + ! %o0,1,2,3,4 (%o6=%sp) + ! .file "aes.asm" @@ -41,25 +42,26 @@ define(tmp, %l1) define(diff, %l2) define(nrounds, %l3) +! Further loop invariants +define(T0, %l4) +define(T1, %l5) +define(T2, %l6) +define(T3, %l7) + +! Temporaries +define(t0, %o0) +define(t1, %o1) +define(t2, %o2) + ! Loop variables -define(round, %l4) +define(round, %o3) define(key, %o4) -! Further loop invariants -define(T0, %l6) -define(T1, %l7) -define(T2, %g5) -define(T3, %o7) - C IDX1 cointains the permutation values * 4 + 2 define(IDX1, <T + AES_SIDX1 >) C IDX3 cointains the permutation values * 4 define(IDX3, <T + AES_SIDX3 >) -! Teporaries -define(t0, %o0) -define(t1, %o1) -define(t2, %o2) C AES_LOAD(i) C Get one word of input, XOR with first subkey, store in wtxt @@ -88,9 +90,6 @@ C Input in wtxt, output stored in tmp + i. C C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ] C the instruction is a part of. -C -C The code uses the register %o[j], aka tj, as the primary -C register for that sub-expression. True for j==1,3. define(<AES_ROUND>, < ld [IDX1+$1], t1 ! 1 ldub [wtxt+$1+3], t0 ! 0 @@ -182,7 +181,6 @@ _aes_crypt: add %fp, -24, wtxt add %fp, -40, tmp - ld [ctx + AES_NROUNDS], nrounds ! Compute xor, so that we can swap efficiently. 
xor wtxt, tmp, diff @@ -193,6 +191,7 @@ _aes_crypt: add T, AES_TABLE1, T1 add T, AES_TABLE2, T2 add T, AES_TABLE3, T3 + nop .Lblock_loop: C Read src, and add initial subkey @@ -204,7 +203,7 @@ _aes_crypt: sub nrounds, 1, round add ctx, 16, key - + nop .Lround_loop: AES_ROUND(0) ! i = 0 @@ -215,9 +214,9 @@ _aes_crypt: ! switch roles for tmp and wtxt xor wtxt, diff, wtxt xor tmp, diff, tmp - subcc round, 1, round bne .Lround_loop + add key, 16, key C Final round, and storage of the output @@ -228,16 +227,14 @@ _aes_crypt: AES_FINAL_ROUND(12) ! i = 3 addcc length, -16, length - bne .Lblock_loop add dst, 16, dst .Lend: ret restore -.LLFE1: -.LLfe1: - .size _aes_crypt,.LLfe1-_aes_crypt +.Leord: + .size _aes_crypt,.Leord-_aes_crypt ! Benchmarks on my slow sparcstation: ! Original C code @@ -343,3 +340,19 @@ _aes_crypt: ! aes256 (CBC encrypt): 15.59s, 0.641MB/s ! aes256 (CBC decrypt): 15.76s, 0.635MB/s + ! After unrolling loops, and other optimizations suggested by + ! Marcus: + ! aes128 (ECB encrypt): 6.40s, 1.562MB/s + ! aes128 (ECB decrypt): 8.17s, 1.224MB/s + ! aes128 (CBC encrypt): 13.11s, 0.763MB/s + ! aes128 (CBC decrypt): 10.05s, 0.995MB/s + ! + ! aes192 (ECB encrypt): 7.43s, 1.346MB/s + ! aes192 (ECB decrypt): 9.51s, 1.052MB/s + ! aes192 (CBC encrypt): 14.09s, 0.710MB/s + ! aes192 (CBC decrypt): 11.58s, 0.864MB/s + ! + ! aes256 (ECB encrypt): 8.57s, 1.167MB/s + ! aes256 (ECB decrypt): 11.13s, 0.898MB/s + ! aes256 (CBC encrypt): 15.30s, 0.654MB/s + ! aes256 (CBC decrypt): 12.93s, 0.773MB/s