	! Used registers:	%l0,1,2,3,4,5
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g2,3,4
include(`asm.m4')

	.file	"aes.asm"

	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Arguments
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)
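! The register assignments above correspond to a C prototype roughly of
! the form (a sketch; the exact declaration lives in the C sources):
!
!   void _aes_crypt(const struct aes_ctx *ctx, const struct aes_table *T,
!                   unsigned length, uint8_t *dst, const uint8_t *src);
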
! Loop invariants
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nround, %l3)

! Loop variables
define(round, %l4) ! Should perhaps be 16 * round
define(i, %l5)

! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
define(t3, %o3)
define(idx, %o4)

_aes_crypt:
! Why -136? Presumably the 96-byte minimum SPARC frame plus 40 bytes
! for the two 16-byte buffers at %fp-24 and %fp-40.
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend

	! wtxt
	add	%fp, -24, wtxt
	add	%fp, -40, tmp
	! Compute xor, so that we can swap efficiently.
	xor	wtxt, tmp, diff

	ld	[ctx + AES_NROUNDS], nround
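	! wtxt and tmp are the two 16-byte state buffers; diff is the xor
	! of the two pointers, so the buffers can later swap roles with two
	! xors and no third register. Roughly, in C terms (illustrative only):
	!
	!   uintptr_t diff = (uintptr_t) wtxt ^ (uintptr_t) tmp;
	!   /* swap: */
	!   wtxt = (uint32_t *) ((uintptr_t) wtxt ^ diff);
	!   tmp  = (uint32_t *) ((uintptr_t) tmp  ^ diff);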

.Lblock_loop:
	! Read src, and add initial subkey
	mov	-4, i
.Lsource_loop:
	add	i, 4, i

	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0

	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	cmp	i, 12
	bleu	.Lsource_loop
	st	%g2, [wtxt+i]
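	! What the loop above computes, roughly, in C terms (illustrative
	! names; the ld [ctx+i] suggests the round-0 subkey words sit at
	! the start of the context):
	!
	!   for (i = 0; i < 16; i += 4)
	!     {
	!       uint32_t w = src[i] | (src[i+1] << 8) | (src[i+2] << 16)
	!                    | ((uint32_t) src[i+3] << 24);  /* little-endian read */
	!       wtxt[i/4] = w ^ subkey0[i/4];                /* add round-0 key */
	!     }
	!
	! Note that the st in the delay slot of the bleu executes on every
	! iteration, including the last.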

	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	mov	1, round

	! 4*i:	i
	! This instruction is also copied to the delay slot of the branch
	! back to .Lround_loop below.
	mov	0, i
.Lround_loop:
	add	T, AES_SIDX3, idx
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of.
	!
	! The code uses the register %o[j], aka tj, as the primary
	! register for that sub-expression. True for j==1,3.
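	!
	! One pass over the four columns computes, roughly (a C sketch in
	! the notation of the comment above; IDX1..3 are the precomputed
	! index tables, round_key the 16-byte subkey for this round):
	!
	!   tmp[i] = T->table[0][ B0(wtxt[i]) ]
	!          ^ T->table[1][ B1(wtxt[IDX1(i)]) ]
	!          ^ T->table[2][ B2(wtxt[IDX2(i)]) ]
	!          ^ T->table[3][ B3(wtxt[IDX3(i)]) ]
	!          ^ round_key[i];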
	
	! AES_SIDX1
	ld	[idx-32], t1		! 1

	! AES_SIDX2
	ld	[idx-16], t2		! 2
	! wtxt[IDX1...]
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1

	! AES_SIDX3
	ld	[idx], t3		! 3
	sll	t1, 2, t1		! 1

	! wtxt[i]
	ld	[wtxt+i], t0		! 0

	! wtxt[IDX2...]
	lduh	[wtxt+t2], t2		! 2

	and	t0, 255, t0		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+t3], t3		! 3

	sll	t0, 2, t0		! 0
	add	t0, AES_TABLE0, t0	! 0
	ld	[T+t0], t0		! 0

	add	t1, AES_TABLE1, t1	! 1
	and	t2, 255, t2		! 2
	ld	[T+t1], t1		! 1
	sll	t2, 2, t2		! 2
	add	t2, AES_TABLE2, t2	! 2
	ld	[T+t2], t2		! 2
	sll	t3, 2, t3		! 3
	add	t3, AES_TABLE3, t3	! 3
	ld	[T+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2

	add	idx, 4, idx

	! Fetch roundkey
	sll	round, 4, t1
	add	t1, ctx, t1
	ld	[t1+i], t1

	xor	t0, t3, t0		! 0, 1, 2, 3

	xor	t0, t1, t0
	st	t0, [tmp+i]

	cmp	i, 8

	bleu	.Linner_loop
	add	i, 4, i
	
! 	sll	round, 4, %g2
! 	add	%g2, ctx, %o0
! 	mov	0, i
! 
! .Lroundkey_loop:
! 	sll	i, 2, %g2
! 	ld	[%o0], %o5
! 	add	i, 1, i
! 	ld	[tmp+%g2], %g3
! 	cmp	i, 3
! 	xor	%g3, %o5, %g3
! 	! st	%g3, [wtxt+%g2]
! 	st	%g3, [tmp+%g2]
! 	bleu	.Lroundkey_loop
! 	add	%o0, 4, %o0
! 
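! The commented-out key addition loop above seems to have been folded
! into .Linner_loop ("Fetch roundkey"); compare the note on unrolling
! key_addition32 in the benchmark comments at the end of this file.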
	! switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	xor	tmp, diff, tmp

	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i

	sll	round, 4, %g2

	! final round
	add	%g2, ctx, %o7
	mov	0, i
	add	T, 288, %g4
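	! Here %o7 presumably points at the last subkey (ctx + 16*nround),
	! and %g4 at index tables inside *T. Roughly, in C terms
	! (illustrative names; sbox is the byte substitution table at the
	! start of *T), each iteration of the final round computes:
	!
	!   uint32_t w =  sbox[ B0(wtxt[i]) ]
	!              | (sbox[ B1(wtxt[IDX1(i)]) ] << 8)
	!              | (sbox[ B2(wtxt[IDX2(i)]) ] << 16)
	!              | ((uint32_t) sbox[ B3(wtxt[IDX3(i)]) ] << 24);
	!   w ^= subkey[i];
	!   /* and stores w little-endian at dst + 4*i */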
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	i, 2, %i5
	sll	%g2, 2, %g2
	add	wtxt, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	i, 1, i
	ld	[wtxt+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[wtxt+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[wtxt+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	cmp	i, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4
	
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst
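	! Outer loop structure, roughly (the add dst, 16, dst sits in the
	! delay slot of the bne, so it executes whether or not the branch
	! is taken):
	!
	!   do {
	!       /* encrypt one 16-byte block from src to dst */
	!       src += 16; length -= 16; dst += 16;
	!     } while (length != 0);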
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32 and getting rid of some
	! sll x, 2, x instructions, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s