! aes.asm: SPARC assembly source, run through m4 before assembly.
! Symbolic offsets used further down (AES_NROUNDS, AES_TABLE0..3,
! AES_SIDX3) are not defined in this file; presumably they come from
! asm.m4 (TODO confirm).
include(`asm.m4')

	.file	"aes.asm"

	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Register aliases used by _aes_crypt.  The five aliases below hold the
! copies of the incoming arguments; the function prologue fills them
! from %i0..%i4.
!   %o5  context pointer (round keys, round count)
!   %o0  lookup-table base
!   %o4  remaining byte count
!   %o3  output pointer
!   %o2  input pointer
define(ctx, %o5)
define(T, %o0)
define(length, %o4)
define(dst, %o3)
define(src, %o2)

! %l2 points at the 16-byte working state kept on the stack.
define(wtxt, %l2)
! %o1 points at the 16-byte scratch state during the main rounds; the
! final round reuses %o1 as a plain word counter.
define(tmp, %o1)
! _aes_crypt
!
! Table-driven AES processing of a buffer, one 16-byte block per pass
! of .Lblock_loop.
!
! Incoming arguments, as read by the prologue:
!   %i0  context: round keys start at offset 0 (16 bytes per round),
!        the round count is loaded from offset AES_NROUNDS
!   %i1  lookup-table base, addressed with the AES_TABLEn and AES_SIDXn
!        offsets; the final round also reads a byte table at offset 0
!   %i2  number of bytes to process
!   %i3  output pointer
!   %i4  input pointer
!
! The byte count must be a multiple of 16: a zero count returns at
! once, otherwise the block loop only stops when the count reaches
! exactly zero.
!
! Stack locals: working state at %fp-24..%fp-9 (%l1 and %l2 both point
! at it) and scratch state at %fp-40..%fp-25.  The rest of the 136-byte
! frame is presumably the ABI register save and argument area plus
! padding (TODO confirm against the SPARC ABI).
!
! Every branch below is followed by a delay-slot instruction that is
! executed whether or not the branch is taken.
_aes_crypt:
	save	%sp, -136, %sp

	! Copy the arguments out of %i0..%i4, since all of %i0..%i5 are
	! used as scratch registers in the loops below.
	mov	%i2, length
	mov	%i0, ctx
	mov	%i1, T
	mov	%i3, dst
	cmp	length, 0
	be	.Lend
	mov	%i4, src		! delay slot

	! Address of the working state
	add	%fp, -24, %l1
	mov	%l1, wtxt

.Lblock_loop:
	! Read one input block as four little-endian words and xor in
	! the initial subkey.  %i2 is the byte offset of the current
	! word: 0, 4, 8, 12.
	mov	-4, %i2
.Lsource_loop:
	add	%i2, 4, %i2

	add	%i2, src, %i0
	ldub	[%i0+3], %g2

	ldub	[%i0+2], %g3
	sll	%g2, 24, %g2
	ldub	[%i0+1], %i1
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%i2], %i0
	sll	%i1, 8, %i1
	ld	[ctx+%i2], %g3		! subkey word for this offset
	or	%g2, %i1, %g2
	or	%g2, %i0, %g2
	xor	%g2, %g3, %g2

	! The offset is advanced at the top of the loop, so go round
	! again only while it is still below 12.  The earlier bleu also
	! branched at offset 12, giving a fifth pass that read input
	! bytes 16..19 (beyond the block) and stored a stray word just
	! past the working state.
	cmp	%i2, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+%i2]		! delay slot: store the word

	! Disabled draft of an alternative load sequence, kept as found.
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ld	[ctx+%i3], %g3
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
	! st	%g2, [wtxt+%i2]
	! 
	! cmp	%i3, 8
	! bleu	.Lsource_loop
	! add	%i3, 4, %i3

	! Round setup: %o7 = round count, %g1 = current round (from 1).
	! %o7 is free for this because the function makes no calls.
	ld	[ctx + AES_NROUNDS], %g2
	mov	1, %g1

	! Scratch state; this is set again for every block because the
	! final round overwrites %o1.
	add	%fp, -40, tmp
	mov	%g2, %o7
	mov	tmp, %l0

	! Working state
	mov	%l1, %g4

	! %i3 = 4*i, byte offset of the word being computed
	mov	0, %i3
.Lround_loop:
	! %i4 walks the third pre-shifted index table; the first and
	! second tables are read at -32 and -16 from it.
	add	T, AES_SIDX3, %i4
.Linner_loop:
	! Pre-shifted index, table 1
	ld	[%i4-32], %g3

	! Pre-shifted index, table 2
	ld	[%i4-16], %i2
	! Byte at offset 2 of the state word selected by index 1
	add	%g4, %g3, %g3
	ldub	[%g3+2], %i1

	! Pre-shifted index, table 3
	ld	[%i4], %g2
	sll	%i1, 2, %i1		! scale byte to a word offset

	! State word i; its low byte is used
	ld	[%g4+%i3], %i0

	! Upper halfword of the state word selected by index 2; its low
	! byte is used
	lduh	[%g4+%i2], %g3

	and	%i0, 255, %i0

	! First byte of the state word selected by index 3
	ldub	[%g4+%g2], %i2

	sll	%i0, 2, %i0
	add	%i0, AES_TABLE0, %i0
	ld	[T+%i0], %g2		! table 0 entry

	add	%i1, AES_TABLE1, %i1
	and	%g3, 255, %g3
	ld	[T+%i1], %i0		! table 1 entry
	sll	%g3, 2, %g3
	add	%g3, AES_TABLE2, %g3
	ld	[T+%g3], %i1		! table 2 entry
	sll	%i2, 2, %i2
	add	%i2, AES_TABLE3, %i2
	ld	[T+%i2], %g3		! table 3 entry
	xor	%g2, %i0, %g2
	xor	%g2, %i1, %g2

	add	%i4, 4, %i4		! next entry in the index tables

	xor	%g2, %g3, %g2
	st	%g2, [%l0+%i3]		! scratch word i

	! Offsets 0, 4, 8 branch back; offset 12 falls through.
	cmp	%i3, 8

	bleu	.Linner_loop
	add	%i3, 4, %i3		! delay slot

	! Xor the scratch state with the key of the current round,
	! located 16 * round bytes into the context, and write the
	! result back to the working state.
	sll	%g1, 4, %g2
	add	%g2, ctx, %i1
	mov	0, %i5			! word counter
	mov	%l1, %i3		! destination: working state
	mov	tmp, %i2		! source: scratch state
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%i1], %i0
	add	%i5, 1, %i5
	ld	[%i2+%g2], %g3
	cmp	%i5, 3
	xor	%g3, %i0, %g3
	st	%g3, [%i3+%g2]
	bleu	.Lroundkey_loop
	add	%i1, 4, %i1		! delay slot: next key word
	add	%g1, 1, %g1
	cmp	%g1, %o7
	blu	.Lround_loop
	mov	0, %i3			! delay slot: restart at word 0

	sll	%g1, 4, %g2

	! Final round: byte-table lookups only, then the last round key.
	add	%g2, ctx, %o7		! %o7 = key of the last round
	mov	0, %o1			! %o1 now counts output words
	mov	%l1, %g1		! working state
	! NOTE(review): 288 is a hard-coded table offset.  The entries
	! read at -32, -16 and 0 from it are shifted left by 2 before
	! use, so these look like unshifted counterparts of the index
	! tables used above; consider a named constant.
	add	T, 288, %g4
.Lfinal_loop:
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5		! %i5 = byte offset of this word
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %i3
	add	%i5, dst, %i4		! %i4 = output address of the word
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %i2
	and	%g2, 255, %g2
	ld	[%g4], %i0
	and	%i2, 255, %i2
	ldub	[T+%i3], %i1		! byte-table lookup, bits 8..15
	sll	%i0, 2, %i0
	ldub	[T+%g2], %g3		! byte-table lookup, bits 0..7
	sll	%i1, 8, %i1
	ldub	[%g1+%i0], %i3
	or	%g3, %i1, %g3
	ldub	[T+%i2], %g2		! byte-table lookup, bits 16..23
	! Loop test for the bleu below; nothing in between changes the
	! condition codes.
	cmp	%o1, 3
	ldub	[T+%i3], %i0		! byte-table lookup, bits 24..31
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2		! round key word
	sll	%i0, 24, %i0
	or	%g3, %i0, %g3
	xor	%g3, %g2, %g3
	! Store the word to the output in little-endian byte order.
	srl	%g3, 24, %i0
	srl	%g3, 16, %i1
	srl	%g3, 8, %g2
	stb	%g2, [%i4+1]
	stb	%i0, [%i4+3]
	stb	%i1, [%i4+2]
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! delay slot

	! Advance to the next block; stop when the count reaches zero.
	add	src, 16, src
	addcc	length, -16, length
	bne	.Lblock_loop
	add	dst, 16, dst		! delay slot
.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s