aes.asm 5.83 KB
Newer Older
1
	! Used registers:	%l0,1,2,3,4,5
2
3
4
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
5
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	.section	".text"
	.align 4
11
12
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	.proc	020
14

15
! Arguments
! Symbolic names for the incoming argument registers of _aes_crypt.
16
define(ctx, %i0) ! base for the subkey word loads and the AES_NROUNDS load
17
define(T, %i1) ! base address used for all index and table lookups
18
define(length, %i2) ! byte count, reduced by 16 after each block
19
define(dst, %i3) ! output pointer, advanced by 16 after each block
20
define(src, %i4) ! input pointer, advanced by 16 after each block
21

22
23
24
! Loop invariants
! Set up once before the block loop and not modified afterwards.
define(wtxt, %l0) ! set to %fp - 24, four-word working state
define(tmp, %l1) ! set to %fp - 40, four-word scratch output of a round
25
26
define(diff, %l2) ! xor of the two buffer addresses above
define(nround, %l3) ! loaded from the context at offset AES_NROUNDS
27
28

! Loop variables
29
30
define(round, %l4) ! Should perhaps be 16 * round
define(i, %l5) ! byte offset 0, 4, 8, 12 in most loops, a word count in the roundkey loop
31
	
32
! _aes_crypt
!
! Processes length bytes from src to dst, one 16-byte block at a
! time, using the subkeys reachable through ctx and the index and
! lookup tables reachable through T.
!
! Arguments (register aliases are defined earlier in this file):
!   ctx     subkey words, read at ctx + 16*round + 4*j for j = 0..3;
!           the number of rounds is loaded from ctx + AES_NROUNDS
!   T       base address of the index tables and lookup tables
!   length  byte count. Zero returns immediately. The block loop
!           ends only when the count reaches exactly zero, so the
!           count must be a multiple of 16.
!   dst     output pointer
!   src     input pointer, read one byte at a time
!
! Stack locals: wtxt at %fp-24 and tmp at %fp-40, 16 bytes each.
!
! Most loops below place useful work in the branch delay slot. The
! instruction following each branch executes whether or not the
! branch is taken, so instruction order must not be changed casually.
_aes_crypt:
! Why -136?
! NOTE(review): presumably the 92-byte minimum SPARC V8 frame rounded
! up to 96, plus 40 bytes covering the locals at %fp-40 .. %fp-9.
! Confirm against the ABI before changing.
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend

	! wtxt
	! Delay slot of the be above: also executed when returning
	! early, which is harmless.
	add	%fp, -24, wtxt
	add	%fp, -40, tmp
	! Compute xor, so that we can swap efficiently.
	! (The swap itself is currently disabled, see the end of the
	! round loop, so diff is computed but not used.)
	xor	wtxt, tmp, diff
	
	ld	[ctx + AES_NROUNDS], nround

.Lblock_loop:
	! Read src, and add initial subkey
	! Each pass assembles one word from four source bytes, lowest
	! address in the least significant byte, xors it with the
	! subkey word at the same offset in ctx, and stores it in wtxt.
	mov	-4, i
.Lsource_loop:
	add	i, 4, i
		
	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! The offset was already advanced at the top of the loop, so it
	! is 0, 4, 8 or 12 here. Branch back only while it is below 12.
	! The store sits in the delay slot and therefore still runs on
	! the last pass. The previous bleu was also taken at 12, giving
	! a fifth pass that read 4 bytes beyond the input block and
	! stored a word beyond the 16-byte wtxt buffer.
	cmp	i, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+i]

	! Disabled earlier draft of the source loop, kept for reference.
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	mov	1, round

	! 4*i:	i
	! This instruction copied to the delay slot of the branch here. 
	mov	0, i
.Lround_loop:
	! %o2 walks the third pre-shifted index table; the first and
	! second are addressed relative to it at -32 and -16.
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! The comments mark which T->table[0][ B0(wtxt[IDX0(j)]) ]
	! the instruction is a part of.
	! Each pass produces one output word in tmp from one byte of
	! four state words; the four lookups are interleaved.
	
	! AES_SIDX1
	ld	[%o2-32], %g3		! 1

	! AES_SIDX2
	ld	[%o2-16], %o4		! 2
	! wtxt[IDX1...]
	add	wtxt, %g3, %g3		! 1
	ldub	[%g3+2], %o0		! 1

	! AES_SIDX3
	ld	[%o2], %g2		! 3
	sll	%o0, 2, %o0		! 1
	
	! wtxt[i]
	ld	[wtxt+i], %o5		! 0
	
	! wtxt[IDX2...]
	lduh	[wtxt+%o4], %g3		! 2
	
	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+%g2], %o4		! 3
	
	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o0, AES_TABLE1, %o0	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o0], %o5		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o5, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	add	%o2, 4, %o2		

! 	! Fetch roundkey
! 	sll	round, 4, %o5
! 	add	%o5, ctx, %o5
! 	ld	[%o5], %o5
		
	xor	%g2, %g3, %g2		! 0, 1, 2, 3

!	xor	%g2, %o5, %g2
	st	%g2, [tmp+i]

	! The offset is advanced in the delay slot, after the compare,
	! so comparing against 8 with bleu gives four passes.
	cmp	i, 8

	bleu	.Linner_loop
	add	i, 4, i
	
	! Add the subkey for this round: %o0 = ctx + 16 * round.
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, i

.Lroundkey_loop:
	! Here the counter is a word index 0..3; %g2 is the matching
	! byte offset. Reads tmp, xors with the subkey word, writes the
	! result back into wtxt.
	sll	i, 2, %g2
	ld	[%o0], %o5
	add	i, 1, i
	ld	[tmp+%g2], %g3
	cmp	i, 3
	xor	%g3, %o5, %g3
	st	%g3, [wtxt+%g2]
	! st	%g3, [tmp+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0

	! switch roles for tmp and wtxt
	! xor	wtxt, diff, wtxt
	! xor	tmp, diff, tmp

	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i

	sll	round, 4, %g2
	
	! final round
	! %o7 points at the last subkey, %o1 counts output words,
	! %g1 is the state buffer. The index words read through %g4
	! are scaled by 4 here, unlike the pre-shifted ones used in the
	! main rounds, and the lookups are single bytes taken directly
	! at T plus the byte value.
	! NOTE(review): 288 is presumably the offset of the third
	! unshifted index table; confirm against asm.m4 and consider a
	! named constant.
	add	%g2, ctx, %o7
	mov	0, %o1
	mov	wtxt, %g1
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! Sets the condition codes for the bleu at the end of the loop;
	! nothing in between modifies them.
	cmp	%o1, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	! Store the word one byte at a time, least significant byte at
	! the lowest address.
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4
	
	! Advance to the next block; dst is bumped in the delay slot.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s