aes.asm 5.82 KB
Newer Older
1
	! AES block transform for SPARC. This file is run through m4
	! before it is assembled.
	!
	! Used registers:	%l0,1,2,3,4,5
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
include(`asm.m4')

	.file	"aes.asm"

	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Symbolic register names. m4 substitutes these words wherever they
! appear as separate tokens, comments included.

! Arguments
define(ctx, %i0)	! subkey words from offset 0, round count at AES_NROUNDS
define(T, %i1)		! base address of the lookup tables
define(length, %i2)	! bytes left to process, reduced by 16 per block
define(dst, %i3)	! output pointer, advanced by 16 per block
define(src, %i4)	! input pointer, advanced by 16 per block

! Loop invariants
define(wtxt, %l0)	! 16 byte working state at %fp-24
define(tmp, %l1)	! 16 byte scratch state at %fp-40
define(diff, %l2)	! wtxt xor tmp, only needed by the disabled swap
define(nround, %l3)	! round count loaded from the context

! Loop variables
define(round, %l4) ! Should perhaps be 16 * round
define(i, %l5)		! byte offset or word counter inside a block
31
	
32
! _aes_crypt
!
! Processes the input 16 bytes at a time: each block is read from
! the input pointer, combined with the subkeys stored in the context
! using the tables at T, and written to the output pointer.
!
! In:	%i0	context (subkey words from offset 0, round count at
!		AES_NROUNDS)
!	%i1	table base
!	%i2	byte count. Zero returns at once. The block loop only
!		stops when the count reaches exactly zero, so the count
!		is assumed to be a multiple of 16 -- TODO confirm that
!		the callers guarantee this.
!	%i3	output pointer
!	%i4	input pointer
! Out:	nothing is returned in a register; 16 output bytes are stored
!	per block.
! Stack:	two 16 byte buffers, at %fp-40 and at %fp-24.
_aes_crypt:
! Why -136?
! Presumably 96 bytes for the minimum frame plus 40 bytes of locals,
! of which only %fp-40 .. %fp-9 is used below -- TODO confirm.
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend

	! wtxt
	add	%fp, -24, wtxt		! delay slot of the branch, harmless
	add	%fp, -40, tmp
	! Compute xor, so that we can swap efficiently.
	! (The swap itself is currently commented out further down.)
	xor	wtxt, tmp, diff

.Lblock_loop:
	! Read src, and add initial subkey
	! Each pass assembles one word from four input bytes, lowest
	! address in the least significant byte, and xors it with the
	! subkey word at the same offset.
	mov	-4, i
.Lsource_loop:
	add	i, 4, i

	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! Offsets 0, 4, 8 branch back and offset 12 falls through, so
	! exactly four words are read. The old bleu also branched at
	! offset 12, which gave a fifth pass reading 4 bytes beyond
	! the input block and storing beyond the working state.
	cmp	i, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+i]		! delay slot, runs on every pass

	! Earlier version of the source loop, disabled:
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	ld	[ctx + AES_NROUNDS], nround
	mov	1, round

	! 4*i:	i
	! This instruction copied to the delay slot of the branch here. 
	mov	0, i
.Lround_loop:
	! %o2 walks the AES_SIDX3 row; the AES_SIDX1 and AES_SIDX2
	! entries are read at fixed distances of 32 and 16 bytes
	! before it.
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! The comments mark which T->table[0][ B0(wtxt[IDX0(j)]) ]
	! the instruction is a part of.

	! AES_SIDX1
	ld	[%o2-32], %g3		! 1

	! AES_SIDX2
	ld	[%o2-16], %o4		! 2
	! wtxt[IDX1...]
	add	wtxt, %g3, %g3		! 1
	ldub	[%g3+2], %o0		! 1

	! AES_SIDX3
	ld	[%o2], %g2		! 3
	sll	%o0, 2, %o0		! 1

	! wtxt[i]
	ld	[wtxt+i], %o5		! 0

	! wtxt[IDX2...]
	lduh	[wtxt+%o4], %g3		! 2

	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+%g2], %o4		! 3

	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o0, AES_TABLE1, %o0	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o0], %o5		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o5, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	add	%o2, 4, %o2

! 	! Fetch roundkey
! 	sll	round, 4, %o5
! 	add	%o5, ctx, %o5
! 	ld	[%o5], %o5

	xor	%g2, %g3, %g2		! 0, 1, 2, 3

!	xor	%g2, %o5, %g2
	st	%g2, [tmp+i]

	! Offsets 0, 4 and 8 branch back; the delay slot advances the
	! offset, so four words are produced.
	cmp	i, 8

	bleu	.Linner_loop
	add	i, 4, i			! delay slot

	! Add the subkey for this pass: %o0 = context + 16 * pass number.
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, i

.Lroundkey_loop:
	! Here the counter is a word number 0..3; %g2 is its byte offset.
	sll	i, 2, %g2
	ld	[%o0], %o5
	add	i, 1, i
	ld	[tmp+%g2], %g3
	cmp	i, 3
	xor	%g3, %o5, %g3
	st	%g3, [wtxt+%g2]
	! st	%g3, [tmp+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0		! delay slot

	! switch roles for tmp and wtxt
	! xor	wtxt, diff, wtxt
	! xor	tmp, diff, tmp

	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i			! delay slot

	sll	round, 4, %g2

	! final round
	! Uses single byte lookups at T+0..255 in place of the word
	! tables, and stores each result word to the output one byte
	! at a time, least significant byte at the lowest address.
	add	%g2, ctx, %o7		! %o7 walks the last subkey
	mov	0, %o1			! %o1 counts words 0..3
	mov	wtxt, %g1
	! NOTE(review): 288 looks like a table offset that should be a
	! named constant from asm.m4 -- confirm which one.
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! The condition codes set here are consumed by the bleu at the
	! end of the loop; nothing in between modifies them.
	cmp	%o1, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! delay slot

	! Next block: advance both pointers and count down the bytes.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! delay slot
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s