aes.asm 5.89 KB
Newer Older
1
	! Used registers:	%l0,1,2,3,4,5
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g2,3,4
5
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	.section	".text"
	.align 4
11
12
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	.proc	020
14

15
! _aes_crypt
!
! Transforms 16-byte blocks read from src and writes the results to dst,
! using the round keys found in ctx and the lookup tables found in T.
!
!   ctx     word k of round r is loaded from ctx + 16*r + 4*k; the round
!           count is loaded from ctx + AES_NROUNDS.
!   T       base of the tables: a byte table at offset 0 (final round),
!           index tables addressed relative to AES_SIDX3 and to offset
!           288, and word tables at AES_TABLE0 .. AES_TABLE3.
!   length  byte count. Zero returns at once; otherwise 16 is subtracted
!           per block until the count is exactly zero.
!           NOTE(review): a count that is not a multiple of 16 never
!           reaches zero here; presumably callers pass whole blocks only.
!   dst     output, written one byte at a time.
!   src     input, read one byte at a time.
!
! The AES_* offsets are not defined in this file; presumably they come
! from asm.m4.

! Arguments
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
! wtxt and tmp each point at a 16-byte scratch area in the stack frame.
define(wtxt, %l0)
define(tmp, %l1)
! diff is computed once below; no live instruction reads it at present.
define(diff, %l2)
define(nround, %l3)

! Loop variables
define(round, %l4) ! Should perhaps be 16 * round
define(i, %l5)

_aes_crypt:
	! Frame size: 136 = 96 + 40. The scratch areas below use
	! %fp-40 .. %fp-9 out of 40 bytes of local space. The other 96
	! bytes are presumably the minimum SPARC V8 frame (register window
	! save area, struct-return slot, outgoing argument words, rounded
	! to 8) - TODO confirm against the ABI.
	save	%sp, -136, %sp

	! Nothing to do for an empty input. The add after the branch sits
	! in its delay slot and is harmless on the early-exit path.
	cmp	length, 0
	be	.Lend

	! wtxt
	add	%fp, -24, wtxt
	add	%fp, -40, tmp
	! Compute xor, so that we can swap efficiently.
	! (The swap itself is commented out further down.)
	xor	wtxt, tmp, diff

	ld	[ctx + AES_NROUNDS], nround

.Lblock_loop:
	! Read src, and add initial subkey
	! The byte offset starts at -4 because it is bumped at loop entry.
	mov	-4, i
.Lsource_loop:
	add	i, 4, i

	! Assemble one word from four source bytes, lowest address in the
	! least significant byte, then xor it with the subkey word.
	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! Loop while the offset is below 12. The store in the delay slot
	! also runs on the last pass, so offsets 0, 4, 8 and 12 are all
	! written. A bleu here would run a fifth pass that reads bytes
	! 16..19 past the current block and stores 16 bytes past the start
	! of the scratch area.
	cmp	i, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+i]

	! Earlier variant of the load loop, kept for reference:
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	! Round 0 was the key addition above; table rounds start at 1.
	mov	1, round

	! 4*i:	i
	! This instruction copied to the delay slot of the branch here. 
	mov	0, i
.Lround_loop:
	! %o2 walks the AES_SIDX3 index table; the entries read at -32 and
	! -16 from it are the ones the comments below call AES_SIDX1 and
	! AES_SIDX2.
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of. Uses the %o[j] as the primary 
	! register for that sub-expression. True for j==1.
	!
	! The index entries are used directly as byte offsets into the
	! scratch words (no shift), and each extracted byte is scaled by 4
	! to index a word table.

	! AES_SIDX1
	ld	[%o2-32], %o1		! 1

	! AES_SIDX2
	ld	[%o2-16], %o4		! 2
	! wtxt[IDX1...]
	add	wtxt, %o1, %o1		! 1
	ldub	[%o1+2], %o1		! 1

	! AES_SIDX3
	ld	[%o2], %g2		! 3
	sll	%o1, 2, %o1		! 1

	! wtxt[i]
	ld	[wtxt+i], %o5		! 0

	! wtxt[IDX2...]
	lduh	[wtxt+%o4], %g3		! 2

	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+%g2], %o4		! 3

	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o1, AES_TABLE1, %o1	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o1], %o1		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o1, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	! Step to the next entry of the index tables.
	add	%o2, 4, %o2		

! 	! Fetch roundkey
! 	sll	round, 4, %o5
! 	add	%o5, ctx, %o5
! 	ld	[%o5], %o5

	xor	%g2, %g3, %g2		! 0, 1, 2, 3

!	xor	%g2, %o5, %g2
	st	%g2, [tmp+i]

	! The offset is bumped in the delay slot, after the compare, so
	! comparing against 8 gives passes for offsets 0, 4, 8 and 12.
	cmp	i, 8
	bleu	.Linner_loop
	add	i, 4, i

	! %o0 = address of the subkey for this round (16 bytes per round).
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, i

.Lroundkey_loop:
	! Xor the four words of tmp with the subkey and store the result
	! to wtxt, ready for the next round. Here the counter is a word
	! number (0..3), not a byte offset.
	sll	i, 2, %g2
	ld	[%o0], %o5
	add	i, 1, i
	ld	[tmp+%g2], %g3
	cmp	i, 3
	xor	%g3, %o5, %g3
	st	%g3, [wtxt+%g2]
	! st	%g3, [tmp+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0

	! switch roles for tmp and wtxt
	! xor	wtxt, diff, wtxt
	! xor	tmp, diff, tmp

	! The delay slot resets the offset for the next pass of the inner
	! loop (and for the final round on fall-through).
	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i

	sll	round, 4, %g2

	! final round
	! %o7 = address of the last subkey. %g4 walks an index table at
	! offset 288 whose entries are word numbers (they are scaled by 4
	! below). NOTE(review): 288 is presumably the unshifted counterpart
	! of AES_SIDX3 - confirm against asm.m4 and use a named constant.
	add	%g2, ctx, %o7
	mov	0, i
	add	T, 288, %g4
.Lfinal_loop:
	! Each pass substitutes four bytes through the byte table at T,
	! assembles them into one word, xors in the subkey word and stores
	! the result to dst, least significant byte at the lowest address.
	ld	[%g4-32], %g2
	sll	i, 2, %i5
	sll	%g2, 2, %g2
	add	wtxt, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	i, 1, i
	ld	[wtxt+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[wtxt+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[wtxt+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! The condition codes set here are consumed by the bleu at the end
	! of the loop; nothing in between modifies them.
	cmp	i, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4

	! Advance to the next block. The dst update sits in the delay slot
	! and so also runs when the loop exits.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s