aes.asm 5.27 KB
Newer Older
1
2
3
4
	! Used registers:	%l0,1,2
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
5
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	! Everything below is assembled into the text section, with the
	! location counter aligned to 4.
	.section	".text"
	.align 4
11
12
	! Make _aes_crypt visible to the linker and tag the symbol as a
	! function.
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	! NOTE(review): procedure marker; the meaning of the 020 operand is
	! not evident from this file -- confirm against the assembler manual.
	.proc	020
14

15
! Symbolic m4 names for registers used by _aes_crypt below.
! %i0..%i4 are the first five incoming argument registers once the
! function has executed its save instruction:
!   ctx    = %i0  base for the subkey words and the AES_NROUNDS field
!   T      = %i1  base for all table lookups
!   length = %i2  remaining byte count, reduced by 16 per block
!   dst    = %i3  output pointer, advanced by 16 per block
!   src    = %i4  input pointer, advanced by 16 per block
define(ctx, %i0)
16
define(T, %i1)
17
define(length, %i2)
18
define(dst, %i3)
19
define(src, %i4)
20
21

! Pointers to the two scratch areas in the stack frame: the function
! sets %l2 to %fp-24 and %o1 to %fp-40.
define(wtxt, %l2)
22
define(tmp, %o1)
23
! _aes_crypt -- table-driven block transform, 16 bytes per pass of
! .Lblock_loop.
!
! Register arguments, as seen after the save below:
!   %i0	context; subkey words start at offset 0 and the round count
!	is the word at offset AES_NROUNDS. Read with word loads.
!   %i1	table area; read with word loads at AES_TABLE0..3 and at the
!	tables around AES_SIDX3 and offset 288, and with byte loads
!	relative to offset 0.
!   %i2	number of bytes to process. The block loop stops only when
!	this count reaches exactly zero, so it has to be a multiple
!	of 16. A count of zero returns at once.
!   %i3	output pointer, stored to one byte at a time.
!   %i4	input pointer, loaded from one byte at a time.
!
! Stack scratch, four words each:
!   %fp-24	state of the block being processed (%l1, %l2)
!   %fp-40	output of the round being computed (%o1, %l0)
!
! Per block:
!   1. .Lsource_loop	assemble four little-endian words from the
!			input, xor with subkey words 0..3, store as state
!   2. .Lround_loop	for round = 1 .. nrounds-1: .Linner_loop fills
!			the round output from four table lookups per
!			word, .Lroundkey_loop xors in the subkey of the
!			round and writes the result back to the state
!   3. .Lfinal_loop	byte-table lookups, xor with the last subkey,
!			store little-endian to the output
!
! No calls are made, so %o7 is free and is used as a scratch register.
_aes_crypt:
! Why -136? The frame has to hold the register window save area at
! %sp as well as the 40 bytes of scratch addressed below %fp; how the
! remainder was chosen is not evident from this file.
	save	%sp, -136, %sp

	cmp	length, 0
	be	.Lend

	! State pointer. This add sits in the delay slot of the branch
	! above, so it also runs on the early exit, where it is harmless.
	add	%fp, -24, %l1
	mov	%l1, wtxt
.Lblock_loop:
	! Read src, and add initial subkey
	! %o4 = byte offset of the current word: 0, 4, 8, 12
	mov	-4, %o4
.Lsource_loop:
	add	%o4, 4, %o4

	add	%o4, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%o4], %o5
	sll	%o0, 8, %o0
	ld	[ctx+%o4], %g3		! subkey word for this column
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2		! %g2 = little-endian input word
	xor	%g2, %g3, %g2

	! The offset was already advanced at the top of the loop, so the
	! test has to be a strict one: blu, not bleu. With bleu a fifth
	! pass ran with offset 16, loading four bytes beyond the block
	! and storing a stray word at %fp-8.
	cmp	%o4, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+%o4]		! delay slot: store the state word

	! Disabled alternative for the loop above, kept for reference.
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%o3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%o4]
	! 
	! cmp	%o3, 8
	! bleu	.Lsource_loop
	! add	%o3, 4, %o3

	! Set up the main rounds:
	!   %g1 = round number, starting at 1
	!   %o7 = number of rounds
	!   %l0 = round output area, %g4 = state area
	ld	[ctx + AES_NROUNDS], %g2
	mov	1, %g1

	add	%fp, -40, tmp
	mov	%g2, %o7
	mov	tmp, %l0

	! wtxt
	mov	%l1, %g4

	! 4*i:	%o3
	mov	0, %o3
.Lround_loop:
	! %o2 walks the third pre-shifted table; the first and second
	! are read at -32 and -16 from the same pointer.
	add	T, AES_SIDX3, %o2
.Linner_loop:
	! AES_SIDX1
	ld	[%o2-32], %g3

	! AES_SIDX2
	ld	[%o2-16], %o4
	! wtxt[IDX1...]
	! Byte at offset 2 of the stored word, which is bits 8..15 on
	! this big-endian machine.
	add	%g4, %g3, %g3
	ldub	[%g3+2], %o0

	! AES_SIDX3
	ld	[%o2], %g2
	sll	%o0, 2, %o0

	! wtxt[j]
	ld	[%g4+%o3], %o5

	! wtxt[IDX2...]
	! Upper halfword; the and below keeps bits 16..23 of the word.
	lduh	[%g4+%o4], %g3

	and	%o5, 255, %o5		! bits 0..7 of wtxt[j]

	! wtxt[IDX3...]
	! Byte at offset 0, which is bits 24..31.
	ldub	[%g4+%g2], %o4

	! Four word lookups, one per byte, each scaled by 4 and offset
	! by the start of its table; the results are xored together.
	sll	%o5, 2, %o5
	add	%o5, AES_TABLE0, %o5
	ld	[T+%o5], %g2

	add	%o0, AES_TABLE1, %o0
	and	%g3, 255, %g3
	ld	[T+%o0], %o5
	sll	%g3, 2, %g3
	add	%g3, AES_TABLE2, %g3
	ld	[T+%g3], %o0
	sll	%o4, 2, %o4
	add	%o4, AES_TABLE3, %o4
	ld	[T+%o4], %g3
	xor	%g2, %o5, %g2
	xor	%g2, %o0, %g2

	add	%o2, 4, %o2		! next entry in the offset tables

	xor	%g2, %g3, %g2
	st	%g2, [%l0+%o3]		! round output word j

	! %o3 is advanced in the delay slot, after the compare, so this
	! runs for offsets 0, 4, 8 and 12.
	cmp	%o3, 8

	bleu	.Linner_loop
	add	%o3, 4, %o3

	! Add the subkey of this round, found at ctx + 16*round, and
	! copy the round output back into the state.
	!   %o0 = subkey pointer, %i5 = word counter
	!   %o3 = state area, %o4 = round output area
	sll	%g1, 4, %g2
	add	%g2, ctx, %o0
	mov	0, %i5
	mov	%l1, %o3
	mov	tmp, %o4
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%o0], %o5
	add	%i5, 1, %i5
	ld	[%o4+%g2], %g3
	cmp	%i5, 3			! counter already advanced: 4 passes
	xor	%g3, %o5, %g3
	st	%g3, [%o3+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0		! delay slot: next subkey word
	add	%g1, 1, %g1
	cmp	%g1, %o7
	blu	.Lround_loop
	mov	0, %o3			! delay slot: reset 4*i

	sll	%g1, 4, %g2

	! final round
	!   %o7 = pointer to the last subkey, at ctx + 16*nrounds
	!   %o1 = word counter, reusing the register of the tmp pointer
	!   %g1 = state area
	!   %g4 = T + 288, which reads as the third table of unshifted
	!         indices, since entries are scaled by 4 at run time
	!         NOTE(review): confirm 288 against the table layout
	!         and replace it with a symbolic constant if one exists.
	add	%g2, ctx, %o7
	mov	0, %o1
	mov	%l1, %g1
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5		! %i5 = 4*j
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %o3		! bits 8..15 of the first indexed word
	add	%i5, dst, %o2		! %o2 = address of output word j
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %o4
	and	%g2, 255, %g2		! bits 0..7 of state word j
	ld	[%g4], %o5
	and	%o4, 255, %o4		! bits 16..23 of the second indexed word
	ldub	[T+%o3], %o0		! byte-table lookups from here on
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %o3		! bits 24..31 of the third indexed word
	or	%g3, %o0, %g3
	ldub	[T+%o4], %g2
	! Sets the condition codes for the bleu at the bottom; nothing
	! in between modifies them.
	cmp	%o1, 3
	ldub	[T+%o3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2		! last subkey, word j
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	! Store the result word least significant byte first.
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! delay slot: next entry in the tables

	! Advance to the next block; addcc sets the flags for bne.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! delay slot
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s