aes.asm 5.97 KB
Newer Older
1
	! Used registers:	%l0,1,2,3,4,5
2
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
3
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
4
	!			%g2,3,4
5
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	.section	".text"
	.align 4
11
12
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	.proc	020
14

15
! Arguments
! Incoming argument registers of _aes_crypt, named by how the code uses them.
16
! Base address for the subkey words (16 bytes per round) and for the
! AES_NROUNDS field.
define(ctx, %i0)
17
! Base address of the lookup data: index tables and AES_TABLE0..3.
define(T, %i1)
18
! Bytes left to process; decremented by 16 after every block.
define(length, %i2)
19
! Output pointer, advanced by 16 after every block.
define(dst, %i3)
20
! Input pointer, advanced by 16 after every block.
define(src, %i4)
21

22
23
24
! Loop invariants
! wtxt and tmp point at two 16-byte work areas in the stack frame
! (%fp-24 and %fp-40).
define(wtxt, %l0)
define(tmp, %l1)
25
26
! diff holds wtxt xor tmp; nround is loaded from [ctx + AES_NROUNDS].
define(diff, %l2)
define(nround, %l3)
27
28

! Loop variables
29
30
define(round, %l4) ! Should perhaps be 16 * round
! Byte offset (0, 4, 8, 12) in the main loops, word index (0..3) in the
! round-key and final loops.
define(i, %l5)
31
32
33

! Temporaries
! t1..t3 carry sub-expressions 1..3 of the round computation; idx walks
! the index table.
define(t1, %o1)
34
define(t2, %o2)
35
define(t3, %o3)
36
define(idx, %o4)
37

38
! _aes_crypt -- run the table-driven AES rounds over a buffer, one
! 16-byte block at a time.
!
! In:	ctx	(%i0)	subkeys, 16 bytes per round starting at offset 0,
!			and the round count at offset AES_NROUNDS
!	T	(%i1)	lookup data: byte table at offset 0 (used by the
!			final round), index tables, and AES_TABLE0..3
!	length	(%i2)	number of bytes to process. Zero returns at once.
!			The block loop stops only when the count reaches
!			exactly zero, so callers must pass a multiple of 16.
!	dst	(%i3)	output, written one byte at a time
!	src	(%i4)	input, read one byte at a time
!
! Input and output words are little-endian in memory; the work areas
! hold them as native (big-endian) words.
!
! Locals:	wtxt = %fp-24	current state, 4 words
!		tmp  = %fp-40	output of the table lookups, 4 words
!
! Frame size: presumably the 92-byte minimum SPARC V8 frame rounded up
! to 96 for double-word alignment, plus 40 bytes for the locals above
! (96 + 40 = 136). TODO confirm against the ABI document.
!
! NOTE: every branch below has a delay slot; the instruction following
! a branch is executed whether or not the branch is taken.
_aes_crypt:
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend
	! Delay slot: harmless when the branch to .Lend is taken.
	add	%fp, -24, wtxt
	add	%fp, -40, tmp

	! Compute xor, so that we can swap efficiently.
	! (The swap itself is currently disabled, see the end of the
	! round loop, so diff is not read anywhere yet.)
	xor	wtxt, tmp, diff

	ld	[ctx + AES_NROUNDS], nround

.Lblock_loop:
	! Read src, and add initial subkey.
	! i is the byte offset of the current word: 0, 4, 8, 12.
	mov	-4, i
.Lsource_loop:
	add	i, 4, i

	! Assemble a little-endian word from four bytes.
	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3		! subkey word of round 0
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! blu, not bleu: i was already advanced at the top of the loop,
	! so at i == 12 the fourth and last word is complete. Branching
	! on "less or equal" would run a fifth pass that reads
	! src[16..19], beyond the current block.
	cmp	i, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+i]		! delay slot: store the word just built

	mov	1, round

	! i holds the byte offset (4 * word number) of the output word.
	! This instruction copied to the delay slot of the branch here.
	mov	0, i
.Lround_loop:
	add	T, AES_SIDX3, idx
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of.
	!
	! Sub-expressions 1, 2 and 3 live in t1, t2 and t3 (%o1..%o3);
	! sub-expression 0 uses %o5, and %g2 collects the result.
	!
	! The index entries loaded here are used directly as byte
	! offsets into wtxt. Byte j of a word is picked with a narrow
	! load: SPARC is big-endian, so byte offset 2 holds bits 8..15,
	! the low byte of the halfword at offset 0 holds bits 16..23,
	! and byte offset 0 holds bits 24..31.

	! AES_SIDX1
	ld	[idx-32], t1		! 1

	! AES_SIDX2
	ld	[idx-16], t2		! 2
	! wtxt[IDX1...]
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1

	! AES_SIDX3
	ld	[idx], t3		! 3
	sll	t1, 2, t1		! 1

	! wtxt[i]
	ld	[wtxt+i], %o5		! 0

	! wtxt[IDX2...]
	lduh	[wtxt+t2], t2		! 2

	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+t3], t3		! 3

	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	t1, AES_TABLE1, t1	! 1
	and	t2, 255, t2		! 2
	ld	[T+t1], t1		! 1
	sll	t2, 2, t2		! 2
	add	t2, AES_TABLE2, t2	! 2
	ld	[T+t2], t2		! 2
	sll	t3, 2, t3		! 3
	add	t3, AES_TABLE3, t3	! 3
	ld	[T+t3], t3		! 3
	xor	%g2, t1, %g2		! 0, 1
	xor	%g2, t2, %g2		! 0, 1, 2

	add	idx, 4, idx

	xor	%g2, t3, %g2		! 0, 1, 2, 3

	st	%g2, [tmp+i]

	! i is advanced in the delay slot, after the compare, so the
	! loop body runs for i = 0, 4, 8, 12.
	cmp	i, 8
	bleu	.Linner_loop
	add	i, 4, i

	! Add the subkey of this round: wtxt[k] = tmp[k] ^ subkey[k].
	! From here on i counts words (0..3), not bytes.
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, i

.Lroundkey_loop:
	sll	i, 2, %g2
	ld	[%o0], %o5
	add	i, 1, i
	ld	[tmp+%g2], %g3
	cmp	i, 3
	xor	%g3, %o5, %g3
	st	%g3, [wtxt+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0		! delay slot: next subkey word

	! switch roles for tmp and wtxt
	! (disabled; the loop above writes its result back to wtxt)
	! xor	wtxt, diff, wtxt
	! xor	tmp, diff, tmp

	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i			! delay slot: reset the byte offset

	! final round
	! round == nround here, so %o7 points at the last subkey.
	sll	round, 4, %g2
	add	%g2, ctx, %o7
	mov	0, i
	! NOTE(review): 288 looks like the hard-coded offset of an index
	! table inside T, used like AES_SIDX3 above but with entries that
	! still need the shift by 2 below -- verify against the table
	! layout in asm.m4.
	add	T, 288, %g4
.Lfinal_loop:
	! Each pass looks up four bytes in the byte table at offset 0
	! of T, packs them into a word, adds the subkey word and stores
	! the result to dst as a little-endian word. i counts words.
	ld	[%g4-32], %g2
	sll	i, 2, %i5		! %i5 = byte offset of this word
	sll	%g2, 2, %g2
	add	wtxt, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	i, 1, i
	ld	[wtxt+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[wtxt+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[wtxt+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! The condition codes set here are consumed by the bleu at the
	! bottom; nothing in between modifies them.
	cmp	i, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! delay slot: next index entry

	! Advance to the next block; leave when length reaches zero.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! delay slot
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s