# Patch for sparc/aes.asm (routine _aes_crypt): replace the .Lfinal_loop loop
# with four straight-line copies of its body.
#
# What the hunks do, as far as this patch shows:
#   1. Adds a commented-out alternative definition of IDX1
#      ("C define(IDX1, <T + AES_SIDX1 >)"); the active definition (%i5)
#      is unchanged.  "C" starts a comment line in this file, as the
#      pre-existing line "C End of unrolled loop" shows.
#   2. Marks the start of the unrolled inner loop and spells the first
#      IDX1/IDX3 loads with an explicit +0 offset.
#   3. Rewords the comment that marks the end of the unrolled inner loop.
#   4. Inserts three new copies of the final-loop body (marked "C i = 0",
#      "C i = 1", "C i = 2") in front of the existing body, which becomes
#      the "C i = 3" copy.  The .Lfinal_loop label is turned into a
#      "C" comment line.
#   5. Turns the closing "bleu .Lfinal_loop" into a "C" comment line and
#      marks the end of the unrolled final loop.
#
# NOTE(review): the "C i = n" markers number the copies; register i itself
#   is advanced by "add i, 4, i" in every copy.  Presumably i is 0 on entry
#   to the first copy -- confirm against the code above hunk 4.
# NOTE(review): every copy still ends with "ld [IDX1+i], t1" (the former
#   delay-slot instruction) and the next copy starts with the same load, so
#   that load is executed twice in a row between copies.
# NOTE(review): "cmp i, 12" is kept in every copy although the bleu that
#   consumed its result is now commented out.
# NOTE(review): the context line "! Teporaries" contains a typo; it is left
#   as is because context lines must match the file being patched.
# NOTE(review): this patch was received with its line breaks and indentation
#   collapsed; both were reconstructed here, so whitespace in context lines
#   may not match the base file exactly.  The last hunk as received carries
#   one more lone "+" line than its header counts (-333,8 +478,11) seem to
#   allow -- verify with "git apply --check" before applying.
diff --git a/sparc/aes.asm b/sparc/aes.asm
index d72ec883f21870eb53e30447e8da57cc7a22c382..6731c25993b5614081a6b377e14c532f7ffe373a 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -58,6 +58,7 @@ define(T1, %l7)
 define(T2, %g5)
 define(T3, %o7)
 define(IDX1, %i5)
+C define(IDX1, <T + AES_SIDX1 >)
 define(IDX3, %o5)

 ! Teporaries
@@ -141,12 +142,14 @@ _aes_crypt:
 	! The code uses the register %o[j], aka tj, as the primary
 	! register for that sub-expression. True for j==1,3.

+	C Unrolled inner loop begins

+	C i = 0
-	ld	[IDX1], t1	! 1
+	ld	[IDX1+0], t1	! 1
 	add	wtxt, t1, t1	! 1
 	ldub	[t1+2], t1	! 1

-	ld	[IDX3], t3	! 3
+	ld	[IDX3+0], t3	! 3
 	sll	t1, 2, t1	! 1
 	ld	[wtxt], t0	! 0

@@ -273,7 +276,7 @@ _aes_crypt:
 	xor	t0, t1, t0
 	st	t0, [tmp+12]

-	C End of unrolled loop
+	C Unrolled inner loop ends

 	! switch roles for tmp and wtxt
 	xor	wtxt, diff, wtxt
@@ -290,9 +293,151 @@ define(i, round)

 	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
 	! the instruction is part of
+
+	C Unrolled final loop begins
 	! NOTE: First instruction duplicated in delay slot
+
+	C i = 0
+	ld	[IDX1+i], t1	! 1
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+
+	C bleu .Lfinal_loop
+	ld	[IDX1+i], t1	! 1
+
+	C i = 1
+	ld	[IDX1+i], t1	! 1
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+
+	C bleu .Lfinal_loop
+	ld	[IDX1+i], t1	! 1
+
+	C i = 2
+	ld	[IDX1+i], t1	! 1
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+
+	C bleu .Lfinal_loop
 	ld	[IDX1+i], t1	! 1
-.Lfinal_loop:
+
+	C i = 3
+	ld	[IDX1+i], t1	! 1
+C .Lfinal_loop:
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1	! 1
@@ -333,8 +478,11 @@ define(i, round)
 	stb	t0, [dst]
 	add	dst, 4, dst

-	bleu	.Lfinal_loop
+	C bleu .Lfinal_loop
 	ld	[IDX1+i], t1	! 1
+
+	C Unrolled final loop ends
+
+
 	addcc	length, -16, length
 	sub	ctx, src, %g2