! SPARC assembly for _aes_crypt. The file is preprocessed with m4
! before it reaches the assembler.
! NOTE: m4 also scans comment text, so keep quote characters and
! macro names directly followed by an opening parenthesis out of
! comments.

	! Used registers:	%l0,1,2,3,4,5
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,2,3,4,5,7 (%o6=%sp)
	!			%g2,3,4
include(`asm.m4')
	
	.file	"aes.asm"
	
	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Arguments
! The five incoming argument registers, in order: key context,
! lookup tables, byte count, destination pointer, source pointer.
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
! Two 16-byte stack buffers, their xor, and the number of rounds.
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nround, %l3)

! Loop variables
define(round, %l4) ! Should perhaps be 16 * round
define(i, %l5)
	
! _aes_crypt
!
! Processes the input one 16-byte block at a time.
!
! Inputs (register names come from the m4 defines in this file):
!   ctx     subkey words, 16 bytes per round, starting at offset 0;
!           the round count is loaded from ctx + AES_NROUNDS
!   T       lookup tables, addressed through the AES_SIDX and
!           AES_TABLE offsets
!   length  number of bytes to process; it is decremented by 16 per
!           block and the loop ends only when it reaches exactly 0,
!           so it must be a multiple of 16 (0 returns at once)
!   dst     output pointer, written one byte at a time
!   src     input pointer, read one byte at a time
!
! Each 32-bit word is assembled from and stored to memory with the
! lowest-addressed byte in bits 0..7.
!
! Scratch registers: %i5, %o0, %o2-%o5, %o7, %g2-%g4.
_aes_crypt:
! Why -136?
! NOTE(review): presumably the 96-byte minimum SPARC frame plus 40
! bytes of locals; only %fp-40 .. %fp-9 are used below. TODO confirm.
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend

	! wtxt
	! This add sits in the delay slot of the branch above, so it also
	! runs on the early-exit path, where it is harmless.
	add	%fp, -24, wtxt
	add	%fp, -40, tmp
	! Compute xor, so that we can swap efficiently.
	! (Currently unused: the swap further down is commented out.)
	xor	wtxt, tmp, diff
	
	ld	[ctx + AES_NROUNDS], nround

.Lblock_loop:
	! Read src, and add initial subkey
	mov	-4, i
.Lsource_loop:
	add	i, 4, i			! byte offset of this word: 0, 4, 8, 12
		
	add	i, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+i], %o5
	sll	%o0, 8, %o0
	ld	[ctx+i], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! Continue only while the offset is below 12, so that exactly
	! four words are read. With bleu the branch was also taken at
	! offset 12, and a fifth pass read src bytes 16..19 and stored
	! one word past the end of the 16-byte state buffer.
	cmp	i, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+i]		! delay slot: store the word just built

	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	mov	1, round

	! 4*i:	i
	! This instruction copied to the delay slot of the branch here. 
	mov	0, i
.Lround_loop:
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! The comments mark which T->table[0][ B0(wtxt[IDX0(j)]) ]
	! the instruction is a part of.
	
	! AES_SIDX1
	ld	[%o2-32], %g3		! 1

	! AES_SIDX2
	ld	[%o2-16], %o4		! 2
	! wtxt[IDX1...]
	add	wtxt, %g3, %g3		! 1
	ldub	[%g3+2], %o0		! 1

	! AES_SIDX3
	ld	[%o2], %g2		! 3
	sll	%o0, 2, %o0		! 1
	
	! wtxt[i]
	ld	[wtxt+i], %o5		! 0
	
	! wtxt[IDX2...]
	lduh	[wtxt+%o4], %g3		! 2
	
	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[wtxt+%g2], %o4		! 3
	
	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o0, AES_TABLE1, %o0	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o0], %o5		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o5, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	add	%o2, 4, %o2		

! 	! Fetch roundkey
! 	sll	round, 4, %o5
! 	add	%o5, ctx, %o5
! 	ld	[%o5], %o5
		
	xor	%g2, %g3, %g2		! 0, 1, 2, 3

!	xor	%g2, %o5, %g2
	st	%g2, [tmp+i]

	! Four passes, at offsets 0, 4, 8 and 12: the offset is advanced
	! in the delay slot, after the comparison.
	cmp	i, 8

	bleu	.Linner_loop
	add	i, 4, i
	
	! %o0 = address of the subkey for this round (16 bytes per round)
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, i

	! Xor the four words in tmp with the subkey and write the result
	! back to wtxt. Here i counts words, not bytes.
.Lroundkey_loop:
	sll	i, 2, %g2
	ld	[%o0], %o5
	add	i, 1, i
	ld	[tmp+%g2], %g3
	cmp	i, 3
	xor	%g3, %o5, %g3
	st	%g3, [wtxt+%g2]
	! st	%g3, [tmp+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0

	! switch roles for tmp and wtxt
	! xor	wtxt, diff, wtxt
	! xor	tmp, diff, tmp

	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, i			! delay slot: reset the word offset

	sll	round, 4, %g2
	
	! final round
	add	%g2, ctx, %o7		! %o7 = address of the last subkey
	mov	0, i
	! NOTE(review): 288 is a hard-coded offset into the tables. The
	! loop reads words at %g4-32, %g4-16 and %g4 and shifts them
	! left by 2 itself, so these look like the unshifted versions
	! of the SIDX entries. TODO replace with a named constant.
	add	T, 288, %g4
	! Four passes, i = 0..3 counting words. Each pass picks four bytes
	! out of wtxt, maps each through the byte table at offset 0 of
	! the tables, packs them into one word, xors in the subkey word
	! and stores the result to dst byte by byte.
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	i, 2, %i5
	sll	%g2, 2, %g2
	add	wtxt, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	i, 1, i
	ld	[wtxt+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[wtxt+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[wtxt+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! The condition codes set here are consumed by the bleu at the
	! bottom of the loop; nothing in between modifies them.
	cmp	i, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4
	
	! Advance to the next block; stop when no bytes remain.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s