aes.asm 5.63 KB
Newer Older
1
	! Used registers:	%l0,1,2,3
2
3
4
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
5
! Shared m4 macros. AES_NROUNDS, AES_SIDX3 and AES_TABLE0..3 are used
! further down but are not set anywhere in this file, so they presumably
! come from here.
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	! Code goes in the text section, aligned to 4 bytes.
	.section	".text"
	.align 4
11
12
	! Export _aes_crypt and mark the symbol as a function.
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	.proc	020
14
14

15
! Arguments
16
! m4 aliases for the five incoming argument registers %i0-%i4.
define(ctx, %i0)
17
define(T, %i1)
18
define(length, %i2)
19
define(dst, %i3)
20
define(src, %i4)
21

22
23
24
25
26
27
28
29
30
! Loop invariants
! wtxt and tmp are pointed at two stack buffers (%fp-24 and %fp-40) on
! entry; nround is loaded from the context at the top of each block.
define(wtxt, %l0)
define(tmp, %l1)
define(nround, %l2)

! Loop variables
! The round counter starts at 1 and is scaled by 16 (sll by 4) each
! time it is used to address a subkey. The alias for %l4 is not
! referenced by any instruction in this file.
define(round, %l3) ! Should perhaps be 16 * round
define(i, %l4)
	
31
! _aes_crypt(ctx, T, length, dst, src)
!
! Transforms length bytes from src into dst, 16 bytes at a time.
! For every block:
!   1. four words are assembled from src, least significant byte first,
!      and xored with the first four subkey words at ctx;
!   2. rounds 1 .. nround-1 combine four word-table lookups per column
!      and xor in the subkey found at ctx + 16*round;
!   3. a final round looks up single bytes at the start of T (presumably
!      the S-box), xors in the last subkey and writes the result to dst,
!      least significant byte first.
!
! The block loop subtracts 16 from length and stops only at exactly
! zero, so length is presumably required to be a multiple of 16.
! A length of zero returns immediately.
!
! This is a non-leaf style routine with its own register window; no
! other function is called.
_aes_crypt:
! Why -136?
! Likely answer: 96 bytes is the minimum SPARC V8 frame (64 bytes of
! register window save area, one hidden struct-return word and six
! outgoing argument words, rounded up to a multiple of 8), plus 40 bytes
! of locals: 16 at %fp-40 (tmp), 16 at %fp-24 (wtxt) and 8 unused bytes
! above that.
	save	%sp, -136, %sp

	! Nothing to do for an empty input.
	cmp	length, 0
	be	.Lend

	! wtxt
	! (This add sits in the delay slot of the branch above; it is
	! harmless when the branch is taken.)
	add	%fp, -24, wtxt
	add	%fp, -40, tmp

.Lblock_loop:
	! Read src, and add initial subkey
	!
	! %o4 is the byte offset of the current word: 0, 4, 8, 12.
	! It is bumped at the top of the loop, hence the start value -4.
	mov	-4, %o4
.Lsource_loop:
	add	%o4, 4, %o4

	! Byte loads, so src needs no particular alignment.
	add	%o4, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%o4], %o5
	sll	%o0, 8, %o0
	ld	[ctx+%o4], %g3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! The store below is in the (non-annulled) delay slot, so it
	! also executes on the last, not-taken pass. The offset has
	! already been advanced when it is tested, so the bound has to
	! be 8 to get exactly four passes; a bound of 12 would run a
	! fifth pass that reads src+16..19 and writes wtxt+16.
	cmp	%o4, 8
	bleu	.Lsource_loop
	st	%g2, [wtxt+%o4]

	! Disabled alternative reader, kept for reference.
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	ld	[ctx + AES_NROUNDS], nround
	mov	1, round

	! mov	%g2, %o7

	! wtxt
	! %g4 is reloaded for every block because the final round
	! reuses it as a table pointer.
	mov	wtxt, %g4

	! 4*i:	%o3
	mov	0, %o3
.Lround_loop:
	! %o2 walks the pre-shifted offset rows; the row for lookup 1
	! is read at -32, the row for lookup 2 at -16 and the row for
	! lookup 3 at 0. It advances by one word per column.
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! The comments mark which T->table[0][ B0(wtxt[IDX0(j)]) ]
	! the instruction is a part of.

	! AES_SIDX1
	ld	[%o2-32], %g3		! 1

	! AES_SIDX2
	ld	[%o2-16], %o4		! 2
	! wtxt[IDX1...]
	add	%g4, %g3, %g3		! 1
	ldub	[%g3+2], %o0		! 1

	! AES_SIDX3
	ld	[%o2], %g2		! 3
	sll	%o0, 2, %o0		! 1

	! wtxt[i]
	ld	[%g4+%o3], %o5		! 0

	! wtxt[IDX2...]
	lduh	[%g4+%o4], %g3		! 2

	and	%o5, 255, %o5		! 0

	! wtxt[IDX3...]
	ldub	[%g4+%g2], %o4		! 3

	! Each extracted byte is scaled by 4 and used to fetch one word
	! from the corresponding table; the four words are xored.
	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o0, AES_TABLE1, %o0	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o0], %o5		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o5, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	add	%o2, 4, %o2

	xor	%g2, %g3, %g2		! 0, 1, 2, 3
	st	%g2, [tmp+%o3]

	! Here the offset is advanced in the delay slot, after the
	! test, so a bound of 8 gives the four columns 0, 4, 8, 12.
	cmp	%o3, 8

	bleu	.Linner_loop
	add	%o3, 4, %o3

	! Add the subkey for this round: for k = 0..3, the word at
	! wtxt + 4k becomes (word at tmp + 4k) xor (word at
	! ctx + 16*round + 4k).
	sll	round, 4, %g2
	add	%g2, ctx, %o0
	mov	0, %i5
	mov	wtxt, %o3
	mov	tmp, %o4
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%o0], %o5
	add	%i5, 1, %i5
	ld	[%o4+%g2], %g3
	cmp	%i5, 3
	xor	%g3, %o5, %g3
	st	%g3, [%o3+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0

	! Next round. The delay slot resets the column offset that was
	! overwritten above.
	add	round, 1, round
	cmp	round, nround
	blu	.Lround_loop
	mov	0, %o3

	sll	round, 4, %g2

	! final round
	!
	! %o7 = address of the last subkey, %o1 = column counter,
	! %g1 = wtxt. %g4 points 288 bytes into T and is used with the
	! same -32 / -16 / 0 row layout as above, but the entries are
	! scaled by 4 here, so these are presumably the unshifted
	! counterparts of the AES_SIDX rows.
	! TODO: replace 288 by a symbolic offset if asm.m4 has one.
	add	%g2, ctx, %o7
	mov	0, %o1
	mov	wtxt, %g1
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %o3
	add	%i5, dst, %o2
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	! Byte lookups at the very start of T, one per output byte.
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! Condition codes set here are consumed by the bleu further
	! down; no instruction in between modifies them.
	cmp	%o1, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	! Store the word one byte at a time, least significant first,
	! so dst needs no particular alignment.
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4

	! Advance to the next block; dst is bumped in the delay slot.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst
.Lend:
	ret
	restore

.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s