Commit a2c38d31 authored by Niels Möller
Browse files

(_aes_crypt): Moved some registers around. We now use input
registers only for arguments, local registers for loop invariants,
output registers for temporaries and loop variables, and no global
registers at all.

Rev: src/nettle/sparc/aes.asm:1.123
parent 47c59e94
...@@ -19,12 +19,13 @@ ...@@ -19,12 +19,13 @@
! MA 02111-1307, USA. ! MA 02111-1307, USA.
! NOTE: Some of the %g registers are reserved for operating system etc ! NOTE: Some of the %g registers are reserved for operating system etc
! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe. ! (see gcc/config/sparc.h). The only %g registers that seem safe to
! use are %g1-%g3.
! Used registers: %l0,1,2,3,4,6,7 ! Used registers: %l0,1,2,3,4,5,6,7
! %i0,1,2,3,4 (%i6=%fp, %i7 = return) ! %i0,1,2,3,4 (%i6=%fp, %i7 = return)
! %o0,1,2,,4,7 (%o6=%sp) ! %o0,1,2,3,4 (%o6=%sp)
! %g5 !
.file "aes.asm" .file "aes.asm"
...@@ -41,25 +42,26 @@ define(tmp, %l1) ...@@ -41,25 +42,26 @@ define(tmp, %l1)
define(diff, %l2) define(diff, %l2)
define(nrounds, %l3) define(nrounds, %l3)
! Further loop invariants
define(T0, %l4)
define(T1, %l5)
define(T2, %l6)
define(T3, %l7)
! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
! Loop variables ! Loop variables
define(round, %l4) define(round, %o3)
define(key, %o4) define(key, %o4)
! Further loop invariants
define(T0, %l6)
define(T1, %l7)
define(T2, %g5)
define(T3, %o7)
C IDX1 contains the permutation values * 4 + 2 C IDX1 contains the permutation values * 4 + 2
define(IDX1, <T + AES_SIDX1 >) define(IDX1, <T + AES_SIDX1 >)
C IDX3 contains the permutation values * 4 C IDX3 contains the permutation values * 4
define(IDX3, <T + AES_SIDX3 >) define(IDX3, <T + AES_SIDX3 >)
! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
C AES_LOAD(i) C AES_LOAD(i)
C Get one word of input, XOR with first subkey, store in wtxt C Get one word of input, XOR with first subkey, store in wtxt
...@@ -88,9 +90,6 @@ C Input in wtxt, output stored in tmp + i. ...@@ -88,9 +90,6 @@ C Input in wtxt, output stored in tmp + i.
C C
C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ] C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
C the instruction is a part of. C the instruction is a part of.
C
C The code uses the register %o[j], aka tj, as the primary
C register for that sub-expression. True for j==1,3.
define(<AES_ROUND>, < define(<AES_ROUND>, <
ld [IDX1+$1], t1 ! 1 ld [IDX1+$1], t1 ! 1
ldub [wtxt+$1+3], t0 ! 0 ldub [wtxt+$1+3], t0 ! 0
...@@ -182,7 +181,6 @@ _aes_crypt: ...@@ -182,7 +181,6 @@ _aes_crypt:
add %fp, -24, wtxt add %fp, -24, wtxt
add %fp, -40, tmp add %fp, -40, tmp
ld [ctx + AES_NROUNDS], nrounds ld [ctx + AES_NROUNDS], nrounds
! Compute xor, so that we can swap efficiently. ! Compute xor, so that we can swap efficiently.
xor wtxt, tmp, diff xor wtxt, tmp, diff
...@@ -193,6 +191,7 @@ _aes_crypt: ...@@ -193,6 +191,7 @@ _aes_crypt:
add T, AES_TABLE1, T1 add T, AES_TABLE1, T1
add T, AES_TABLE2, T2 add T, AES_TABLE2, T2
add T, AES_TABLE3, T3 add T, AES_TABLE3, T3
nop
.Lblock_loop: .Lblock_loop:
C Read src, and add initial subkey C Read src, and add initial subkey
...@@ -204,7 +203,7 @@ _aes_crypt: ...@@ -204,7 +203,7 @@ _aes_crypt:
sub nrounds, 1, round sub nrounds, 1, round
add ctx, 16, key add ctx, 16, key
nop
.Lround_loop: .Lround_loop:
AES_ROUND(0) ! i = 0 AES_ROUND(0) ! i = 0
...@@ -215,9 +214,9 @@ _aes_crypt: ...@@ -215,9 +214,9 @@ _aes_crypt:
! switch roles for tmp and wtxt ! switch roles for tmp and wtxt
xor wtxt, diff, wtxt xor wtxt, diff, wtxt
xor tmp, diff, tmp xor tmp, diff, tmp
subcc round, 1, round subcc round, 1, round
bne .Lround_loop bne .Lround_loop
add key, 16, key add key, 16, key
C Final round, and storage of the output C Final round, and storage of the output
...@@ -228,16 +227,14 @@ _aes_crypt: ...@@ -228,16 +227,14 @@ _aes_crypt:
AES_FINAL_ROUND(12) ! i = 3 AES_FINAL_ROUND(12) ! i = 3
addcc length, -16, length addcc length, -16, length
bne .Lblock_loop bne .Lblock_loop
add dst, 16, dst add dst, 16, dst
.Lend: .Lend:
ret ret
restore restore
.LLFE1: .Leord:
.LLfe1: .size _aes_crypt,.Leord-_aes_crypt
.size _aes_crypt,.LLfe1-_aes_crypt
! Benchmarks on my slow sparcstation: ! Benchmarks on my slow sparcstation:
! Original C code ! Original C code
...@@ -343,3 +340,19 @@ _aes_crypt: ...@@ -343,3 +340,19 @@ _aes_crypt:
! aes256 (CBC encrypt): 15.59s, 0.641MB/s ! aes256 (CBC encrypt): 15.59s, 0.641MB/s
! aes256 (CBC decrypt): 15.76s, 0.635MB/s ! aes256 (CBC decrypt): 15.76s, 0.635MB/s
! After unrolling loops, and other optimizations suggested by
! Marcus:
! aes128 (ECB encrypt): 6.40s, 1.562MB/s
! aes128 (ECB decrypt): 8.17s, 1.224MB/s
! aes128 (CBC encrypt): 13.11s, 0.763MB/s
! aes128 (CBC decrypt): 10.05s, 0.995MB/s
!
! aes192 (ECB encrypt): 7.43s, 1.346MB/s
! aes192 (ECB decrypt): 9.51s, 1.052MB/s
! aes192 (CBC encrypt): 14.09s, 0.710MB/s
! aes192 (CBC decrypt): 11.58s, 0.864MB/s
!
! aes256 (ECB encrypt): 8.57s, 1.167MB/s
! aes256 (ECB decrypt): 11.13s, 0.898MB/s
! aes256 (CBC encrypt): 15.30s, 0.654MB/s
! aes256 (CBC decrypt): 12.93s, 0.773MB/s
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment