! SPARC assembly source for _aes_crypt.  The file is run through m4
! before it is assembled.
!
	! Used registers:	%l0,1,2
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
include(`asm.m4')
	
	.file	"aes.asm"
	
	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Symbolic register names.
! The first two are incoming argument registers and are used in place.
define(ctx, %i0)
define(T, %i1)
! The next three are loaded from %i2, %i3 and %i4 on entry.
define(length, %o4)
define(dst, %o3)
define(src, %o2)

! Pointers to the two scratch areas kept in the stack frame.
define(wtxt, %l2)
define(tmp, %o1)
! _aes_crypt: table-driven AES transform of a run of 16-byte blocks.
!
! Incoming arguments, as read by the code below:
!	%i0	context pointer: subkey words start at offset 0 and the
!		round count is the word at offset AES_NROUNDS
!	%i1	table pointer: byte table at offset 0 (final round),
!		word tables at AES_TABLE0..3, position tables ending
!		at AES_SIDX3 (main rounds) and at offset 288 (final round)
!	%i2	byte count
!	%i3	destination pointer
!	%i4	source pointer
!
! The byte count must be a multiple of 16: the block loop only stops
! when the count reaches exactly zero.  A zero count returns at once.
!
! Stack scratch areas:
!	%fp-24	state words for the current block
!	%fp-40	output of the table lookups for one round
!
! Nothing is returned in a register; the result is written through
! the destination pointer.
_aes_crypt:
! Why -136?
! NOTE(review): 136 = 96 + 40.  Presumably the 96-byte minimum frame
! plus 40 bytes for the scratch areas reaching down to %fp-40 --
! confirm against the ABI document.
	save	%sp, -136, %sp

! Why this moving around of the input parameters?
	mov	%i2, length
	! mov	%i1, T
	mov	%i3, dst
	cmp	length, 0
	be	.Lend
	mov	%i4, src		! (delay slot)
	! wtxt
	add	%fp, -24, %l1
	mov	%l1, wtxt

.Lblock_loop:
	! Read src, and add initial subkey
	! %i2 is the byte offset of the current word: 0, 4, 8, 12.
	mov	-4, %i2
.Lsource_loop:
	add	%i2, 4, %i2

	! Assemble four source bytes into one word, the byte at the
	! lowest address ending up in bits 0-7.
	add	%i2, src, %o5
	ldub	[%o5+3], %g2

	ldub	[%o5+2], %g3
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%i2], %o5
	sll	%o0, 8, %o0
	ld	[ctx+%i2], %g3		! subkey word 0..3
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! The offset is advanced at the top of the loop, so continue only
	! while it is still below 12.  Branching on "less or equal" here
	! ran a fifth pass that read four bytes beyond the source block.
	cmp	%i2, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+%i2]		! (delay slot)

	! Commented-out draft of a different source loop, kept as found.
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%i3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%i2]
	! 
	! cmp	%i3, 8
	! bleu	.Lsource_loop
	! add	%i3, 4, %i3

	! Main rounds.  %g1 counts rounds starting from 1, %o7 holds the
	! round count loaded from the context.
	ld	[ctx + AES_NROUNDS], %g2
	mov	1, %g1

	add	%fp, -40, tmp
	mov	%g2, %o7
	mov	tmp, %l0

	! wtxt
	mov	%l1, %g4

	! 4*i:	%i3
	mov	0, %i3
.Lround_loop:
	! %i4 walks the third position table; the loads at -32 and -16
	! reach the first and second ones.
	add	T, AES_SIDX3, %i4
.Linner_loop:
	! AES_SIDX1
	ld	[%i4-32], %g3

	! AES_SIDX2
	ld	[%i4-16], %i2
	! wtxt[IDX1...]
	add	%g4, %g3, %g3
	ldub	[%g3+2], %o0

	! AES_SIDX3
	ld	[%i4], %g2
	sll	%o0, 2, %o0

	! wtxt[j]
	ld	[%g4+%i3], %o5

	! wtxt[IDX2...]
	lduh	[%g4+%i2], %g3

	and	%o5, 255, %o5

	! wtxt[IDX3...]
	ldub	[%g4+%g2], %i2

	! Four word-table lookups, one per state byte, xored together.
	sll	%o5, 2, %o5
	add	%o5, AES_TABLE0, %o5
	ld	[T+%o5], %g2

	add	%o0, AES_TABLE1, %o0
	and	%g3, 255, %g3
	ld	[T+%o0], %o5
	sll	%g3, 2, %g3
	add	%g3, AES_TABLE2, %g3
	ld	[T+%g3], %o0
	sll	%i2, 2, %i2
	add	%i2, AES_TABLE3, %i2
	ld	[T+%i2], %g3
	xor	%g2, %o5, %g2
	xor	%g2, %o0, %g2

	add	%i4, 4, %i4

	xor	%g2, %g3, %g2
	st	%g2, [%l0+%i3]

	! %i3 is tested before it is advanced: passes for 0, 4, 8, 12.
	cmp	%i3, 8

	bleu	.Linner_loop
	add	%i3, 4, %i3		! (delay slot)

	! Xor the subkey for this round, found at ctx + 16 * round,
	! into the lookup results and store them back as the new state.
	sll	%g1, 4, %g2
	add	%g2, ctx, %o0
	mov	0, %i5
	mov	%l1, %i3
	mov	tmp, %i2
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%o0], %o5
	add	%i5, 1, %i5
	ld	[%i2+%g2], %g3
	cmp	%i5, 3
	xor	%g3, %o5, %g3
	st	%g3, [%i3+%g2]
	bleu	.Lroundkey_loop
	add	%o0, 4, %o0		! (delay slot)

	add	%g1, 1, %g1
	cmp	%g1, %o7
	blu	.Lround_loop
	mov	0, %i3			! (delay slot)

	sll	%g1, 4, %g2

	! final round
	! %o7 now points at the last subkey.  %o1 is reused as the word
	! counter, which is safe because the scratch pointer kept there
	! is set up again for every block.
	add	%g2, ctx, %o7
	mov	0, %o1
	mov	%l1, %g1
	! NOTE(review): 288 is a hard-coded table offset; the entries read
	! through it are scaled by 4 below, unlike the AES_SIDX ones used
	! in the main rounds.  Confirm it matches the layout in asm.m4.
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %i3
	add	%i5, dst, %i4
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %i2
	and	%g2, 255, %g2
	ld	[%g4], %o5
	and	%i2, 255, %i2
	! Byte-table lookups at offset 0 of the table pointer, merged
	! into one word: bits 0-7, 8-15, 16-23, 24-31.
	ldub	[T+%i3], %o0
	sll	%o5, 2, %o5
	ldub	[T+%g2], %g3
	sll	%o0, 8, %o0
	ldub	[%g1+%o5], %i3
	or	%g3, %o0, %g3
	ldub	[T+%i2], %g2
	! The flags set here are consumed by the bleu at the bottom;
	! nothing in between modifies the condition codes.
	cmp	%o1, 3
	ldub	[T+%i3], %o5
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
	xor	%g3, %g2, %g3
	! Store the word one byte at a time, bits 0-7 at the lowest address.
	srl	%g3, 24, %o5
	srl	%g3, 16, %o0
	srl	%g3, 8, %g2
	stb	%g2, [%i4+1]
	stb	%o5, [%i4+3]
	stb	%o0, [%i4+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! (delay slot)

	! Advance to the next 16-byte block.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! (delay slot)
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s