! aes.asm: SPARC assembly implementation of _aes_crypt.
	! Used registers:	%l0,1,2
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
include(`asm.m4')

	.file	"aes.asm"

	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Symbolic register names for the function arguments. The first four
! stay in their incoming registers; the fifth argument arrives in %i4
! and is copied into %o2 at function entry.
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %o2)

! Pointers to the two 16-byte work buffers kept in the stack frame:
! the current block state at %fp-24 and the per-round output at %fp-40.
define(wtxt, %l2)
define(tmp, %o1)
! _aes_crypt: table-driven AES processing, one 16-byte block per pass
! of .Lblock_loop.
!
! Arguments, as seen after the save:
!   %i0	ctx	subkey words start at offset 0, round count at AES_NROUNDS
!   %i1	T	base of the lookup tables
!   %i2	length	byte count, reduced by 16 per block
!   %i3	dst	output pointer, advanced by 16 per block
!   %i4		input pointer, copied to src
! NOTE(review): .Lblock_loop only stops when the byte count reaches
! exactly zero, so callers presumably pass zero or a multiple of 16.
!
! Stack locals: wtxt, 4 words at %fp-24; tmp, 4 words at %fp-40.
! Memory is accessed big-endian, so the byte at offset 3 of a word
! holds bits 0..7 and the byte at offset 0 holds bits 24..31.
_aes_crypt:
! Frame size: 40 bytes of locals below %fp on top of 96 bytes, which is
! presumably the minimum frame of the 32-bit SPARC ABI -- TODO confirm.
	save	%sp, -136, %sp

! Why this moving around of the input parameters?
	!mov	%i3, dst
	cmp	length, 0
	be	.Lend
	mov	%i4, src		! delay slot: runs on both paths
	! wtxt
	add	%fp, -24, %l1
	mov	%l1, wtxt
.Lblock_loop:
	! Read src, and add initial subkey
	! %o4 is the byte offset of the current word; it starts at -4
	! because it is advanced at the top of the loop.
	mov	-4, %o4
.Lsource_loop:
	add	%o4, 4, %o4

	! Assemble a little-endian word from four input bytes.
	add	%o4, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%o4], %o5
	sll	%o0, 8, %o0
	ld	[ctx+%o4], %g3		! matching word of the first subkey
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! %o4 has already been advanced for this pass, so go round
	! again only while it is at most 8. Comparing against 12 would
	! run a fifth pass that reads 4 bytes past the 16-byte block.
	cmp	%o4, 8
	bleu	.Lsource_loop
	st	%g2, [wtxt+%o4]		! delay slot: store this pass's word

	! Disabled alternative for the word read, kept for reference:
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	!
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	!
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	!
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	!
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	!
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	!
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	! Round setup: %g1 = current round, starting at 1;
	! %o7 = round count; %l0 = tmp buffer; %g4 = wtxt buffer.
	ld	[ctx + AES_NROUNDS], %g2
	mov	1, %g1

	add	%fp, -40, tmp
	mov	%g2, %o7
	mov	tmp, %l0

	! wtxt
	mov	%l1, %g4

	! 4*i:	%o3
	mov	0, %o3
.Lround_loop:
	! %i4 walks the AES_SIDX3 row; the AES_SIDX1 and AES_SIDX2
	! entries for the same column are read 32 and 16 bytes below it.
	! The entries are used directly as byte offsets into wtxt.
	add	T, AES_SIDX3, %i4
.Linner_loop:
	! AES_SIDX1
	ld	[%i4-32], %g3

	! AES_SIDX2
	ld	[%i4-16], %o4
	! wtxt[IDX1...]: byte at offset 2 of that word, bits 8..15
	add	%g4, %g3, %g3
	ldub	[%g3+2], %o0

	! AES_SIDX3
	ld	[%i4], %g2
	sll	%o0, 2, %o0		! scale to a word offset

	! wtxt[j]: low byte selected by the and below
	ld	[%g4+%o3], %o5

	! wtxt[IDX2...]: upper halfword, reduced to bits 16..23 below
	lduh	[%g4+%o4], %g3

	and	%o5, 255, %o5

	! wtxt[IDX3...]: byte at offset 0, bits 24..31
	ldub	[%g4+%g2], %o4

	! Four table lookups, one per byte, xored together.
	sll	%o5, 2, %o5
	add	%o5, AES_TABLE0, %o5
	ld	[T+%o5], %g2

	add	%o0, AES_TABLE1, %o0
	and	%g3, 255, %g3
	ld	[T+%o0], %o5
	sll	%g3, 2, %g3
	add	%g3, AES_TABLE2, %g3
	ld	[T+%g3], %o0
	sll	%o4, 2, %o4
	add	%o4, AES_TABLE3, %o4
	ld	[T+%o4], %g3
	xor	%g2, %o5, %g2
	xor	%g2, %o0, %g2

	add	%i4, 4, %i4		! next column of the offset rows

	xor	%g2, %g3, %g2
	st	%g2, [%l0+%o3]		! tmp[j] = combined table output

	! %o3 is compared before it is advanced: taken for 0, 4 and 8,
	! giving four passes in total.
	cmp	%o3, 8

	bleu	.Linner_loop
	add	%o3, 4, %o3		! delay slot

	! Mix in the subkey of this round: wtxt[k] = tmp[k] ^ key[k],
	! with the subkey located at ctx + 16 * round.
	sll	%g1, 4, %g2
	add	%g2, ctx, %o0
	mov	0, %i5
	mov	%l1, %o3
	mov	tmp, %o4
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%o0], %o5
	add	%i5, 1, %i5
	ld	[%o4+%g2], %g3
	cmp	%i5, 3			! flags consumed by the bleu below
	xor	%g3, %o5, %g3
	st	%g3, [%o3+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0		! delay slot: next subkey word

	add	%g1, 1, %g1
	cmp	%g1, %o7
	blu	.Lround_loop
	mov	0, %o3			! delay slot: reset 4*i

	sll	%g1, 4, %g2

	! final round
	! %o7 = subkey of the last round, %o1 = word counter (this
	! reuses the tmp register), %g1 = wtxt buffer.
	add	%g2, ctx, %o7
	mov	0, %o1
	mov	%l1, %g1
	! NOTE(review): 288 is presumably the offset of the third row of
	! unshifted offsets inside the table structure; the entries read
	! here are scaled by 4 before use. TODO: use a symbolic constant.
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5		! %i5 = byte offset of output word
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %o3		! bits 8..15 of the selected word
	add	%i5, dst, %i4		! %i4 = address of output word
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %o4
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%o4, 255, %o4
	! Byte-wide lookups in the table that starts at offset 0.
	ldub	[T+%o3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %o3
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	cmp	%o1, 3			! flags consumed by the bleu below
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2		! word of the last subkey
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	! Write the result word to dst in little-endian byte order.
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%i4+1]
	stb	%o5, [%i4+3]
	stb	%o0, [%i4+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! delay slot: next column

	! Advance to the next block.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! delay slot
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s