! -*- mode: asm; asm-comment-char: ?!; -*-
	! Used registers:	%l0,1,2,3,4,5,6,7
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5 (%o6=%sp)
	!			%g2,3,4,5,6
	!
	! Every name below is an m4 macro that expands to one register,
	! so the code can refer to values by role.
include(`asm.m4')
	
	.file	"aes.asm"
	
	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Arguments, in the incoming argument registers %i0..%i4
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nrounds, %l3)

! Loop variables
! The subkey pointer is kept in a windowed out register.  It must
! not be placed in %g7: that register is reserved for system
! software (the thread pointer on common SPARC systems), global
! registers are not brought back by the restore instruction, and
! the register list at the top of this file stops at %g6.
define(round, %l4)
define(i, %l5)
define(key, %o4)

! Further loop invariants
! NOTE(review): %g5 and %g6 are also listed as reserved for system
! software in the SPARC V8 ABI supplement; consider relocating
! them as well if a spare register can be found.
define(T0, %l6)
define(T1, %l7)
define(T2, %g5)
define(T3, %g6)
define(IDX1, %i5)
define(IDX3, %o5)

! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
define(t3, %o3)

_aes_crypt:
! _aes_crypt (ctx, T, length, dst, src)
!
! Arguments arrive in %i0..%i4 in the order above.  For every
! 16-byte block the code
!   1. reads four words from src one byte at a time (lowest address
!      becomes the least significant byte), xors each with the word
!      at the same offset from ctx, and stores them in the wtxt
!      buffer on the stack;
!   2. runs nrounds - 1 table-driven rounds, each writing its output
!      to the tmp buffer and then swapping the wtxt and tmp pointers;
!   3. runs a final round that uses byte lookups directly at T and
!      writes the result to dst one byte at a time.
! length is decremented by 16 per block and the loop ends when it
! reaches zero; a zero length returns immediately.
! NOTE(review): length is presumably required to be a multiple of
! 16, otherwise the bne at the bottom never sees zero - confirm
! against the callers.
!
! Why -136?  The two 16-byte buffers occupy %fp-40 .. %fp-9 (tmp)
! and %fp-24 .. %fp-9 (wtxt), i.e. 40 bytes below %fp are reserved.
! NOTE(review): presumably the remainder is the 92-byte minimum
! SPARC V8 stack frame, 92 + 40 = 132, rounded up to a multiple
! of 8 - confirm against the ABI supplement.
	save	%sp, -136, %sp
	cmp	length, 0
	be	.Lend
	! Delay slot: executed whether or not the branch is taken.
	add	%fp, -24, wtxt
	
	add	%fp, -40, tmp
	ld	[ctx + AES_NROUNDS], nrounds
	! Compute xor, so that we can swap efficiently:
	! wtxt ^ diff == tmp and tmp ^ diff == wtxt.
	xor	wtxt, tmp, diff
	! More loop invariants: base addresses of the four word tables
	! and of the two pre-shifted offset tables inside T.
	add	T, AES_TABLE0, T0
	
	add	T, AES_TABLE1, T1
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
	add	T, AES_SIDX1, IDX1

	add	T, AES_SIDX3, IDX3
	! Read src, and add initial subkey
	! Difference between ctx and src, so that [src+%g2] addresses
	! the subkey word matching the current source word.
	! NOTE: This instruction is duplicated in the delay slot
	! of the branch back to .Lblock_loop.
	sub	ctx, src, %g2

.Lblock_loop:
	! Difference between wtxt and src, so that [src+%g3] addresses
	! the wtxt word matching the current source word.
	sub	wtxt, src, %g3
	! For stop condition. Note that src is incremented in the
	! delay slot, so the loop body runs for src, src+4, src+8 and
	! src+12, and src ends up advanced by 16.
	add	src, 8, %g4
	
.Lsource_loop:
	! Assemble one word from four bytes, [src] being the least
	! significant one.
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1
	
	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1
	
	! Get subkey
	ld	[src+%g2], t2
	or	t3, t1, t3
	or	t3, t0, t3
	xor	t3, t2, t3
	
	cmp	src, %g4
	st	t3, [src+%g3]
	bleu	.Lsource_loop
	! Delay slot
	add	src, 4, src
	
	! The final round is handled separately, after .Lround_loop.
	sub	nrounds, 1, round
	! Subkey 0 was consumed above; continue with the next 16 bytes.
	add	ctx, 16, key
.Lround_loop:
	! 4*i, the byte offset of the word being produced
	mov	0, i
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXj(i)]) ]
	! the instruction is a part of. 
	!
	! The code uses the register %o[j], aka tj, as the primary 
	! register for that sub-expression. True for j==1,3.
	
	ld	[IDX1+i], t1		! 1
	
	! IDX2(j) = j XOR 2; on byte offsets that is XOR 8
	xor	i, 8, t2
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1
	ld	[IDX3+i], t3		! 3
	
	sll	t1, 2, t1		! 1
	ld	[wtxt+i], t0		! 0
	lduh	[wtxt+t2], t2		! 2
	and	t0, 255, t0		! 0
	
	ldub	[wtxt+t3], t3		! 3
	sll	t0, 2, t0		! 0
	ld	[T0+t0], t0		! 0
	and	t2, 255, t2		! 2
	
	ld	[T1+t1], t1		! 1
	sll	t2, 2, t2		! 2
	ld	[T2+t2], t2		! 2
	sll	t3, 2, t3		! 3
	
	ld	[T3+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2
	! Fetch roundkey
	ld	[key+i], t1
	
	xor	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp+i]
	! Compared before the increment in the delay slot, so the body
	! runs for offsets 0, 4, 8 and 12.
	cmp	i, 8
	
	bleu	.Linner_loop
	! Delay slot
	add	i, 4, i
	! switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	subcc	round, 1, round
	
	! add does not touch the condition codes set by subcc.
	add	key, 16, key
	nop
	bne	.Lround_loop
	! Delay slot: second half of the tmp/wtxt swap.
	xor	tmp, diff, tmp

	! final round
	! 4*i
	mov	0, i

.Lfinal_loop:
	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
	! the instruction is part of
	ld	[IDX1+i], t1 	! 1
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	add	wtxt, t1, t1	! 1
	ldub	[t1+2], t1	! 1

	ld	[wtxt+i], t0	! 0
	lduh	[wtxt+t2], t2	! 2
	and	t0, 255, t0	! 0
	ld	[IDX3 + i], t3	! 3
	
	and	t2, 255, t2	! 2
	ldub	[T+t1], t1	! 1
	ldub	[T+t0], t0	! 0
	sll	t1, 8, t1	! 1
	
	ldub	[wtxt+t3], t3	! 3
	or	t0, t1, t0	! 0, 1
	ldub	[T+t2], t2	! 2
	ldub	[T+t3], t3	! 3
	
	sll	t2, 16, t2	! 2
	or	t0, t2, t0	! 0, 1, 2
	ld	[key + i], t2
	sll	t3, 24, t3	! 3
	
	or	t0, t3, t0	! 0, 1, 2, 3
	xor	t0, t2, t0
	! Here the counter is bumped before the compare, hence the
	! bound of 12 rather than 8; the shifts and byte stores below
	! leave the condition codes alone.
	add	i, 4, i
	cmp	i, 12
	
	! Write the word to dst, least significant byte first.
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+1]
	
	stb	t3, [dst+3]
	stb	t2, [dst+2]
	stb	t0, [dst]
	
	bleu	.Lfinal_loop
	! Delay slot
	add	dst, 4, dst

	addcc	length, -16, length
	nop
	
	bne	.Lblock_loop
	! Delay slot: src has advanced by 16, so recompute ctx - src.
	sub	ctx, src, %g2

.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop. 
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s