aes.asm 5.47 KB
Newer Older
1
	! Used registers:	%l0,1
2
3
4
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,4
5
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
6
	
7
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
8
	
9
10
	.section	".text"
	.align 4
11
12
	.global _aes_crypt
	.type	_aes_crypt,#function
13
	.proc	020
14

15
! Register aliases for the five incoming arguments (%i0-%i4).
! ctx:    base address; subkey words and the AES_NROUNDS field are
!         loaded relative to it.
define(ctx, %i0)
16
! T:      base address used for all table and index-table loads.
define(T, %i1)
17
! length: byte count; decreased by 16 after each processed block.
define(length, %i2)
18
! dst:    output pointer; written one byte at a time.
define(dst, %i3)
19
! src:    input pointer; read one byte at a time.
define(src, %i4)
20

21
! wtxt:   pointer to the working block on the stack (set to %fp-24).
define(wtxt, %l1)
22
! tmp:    pointer to a second scratch block on the stack (set to %fp-40).
define(tmp, %l0)
23
! _aes_crypt: entry point. Arguments arrive in the registers aliased
! as ctx, T, length, dst and src by the defines earlier in this file.
_aes_crypt:
24
! Why -136?
! NOTE(review): the stack locals used below are tmp at %fp-40 and
! wtxt at %fp-24, 16 bytes each, so they fit in the top 40 bytes of
! the frame; the remaining 96 bytes are presumably the minimum SPARC
! stack frame. Confirm against the ABI.
25
	save	%sp, -136, %sp
26

27
28
! Why this moving around of the input parameters?
	cmp	length, 0
29
	be	.Lend		! nothing to do when the length is zero
30

31
	! wtxt
	! The add below sits in the delay slot of the branch above, so
	! it is executed whether or not the branch is taken.
32
	add	%fp, -24, wtxt
33

34
35
.Lblock_loop:
	! Read src, and add initial subkey
	!
	! Reads one 16-byte block from src as four little-endian 32-bit
	! words, xors each with the subkey word at the same offset from
	! ctx, and stores the result in the working block at wtxt.
	! %o4 = byte offset of the current word: 0, 4, 8, 12.
	mov	-4, %o4
.Lsource_loop:
	add	%o4, 4, %o4		! advance to the next word

	add	%o4, src, %o5		! %o5 = address of the current word
	ldub	[%o5+3], %g2		! byte 3, most significant

	ldub	[%o5+2], %g3		! byte 2
	sll	%g2, 24, %g2
	ldub	[%o5+1], %o0		! byte 1
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%o4], %o5		! byte 0, least significant
	sll	%o0, 8, %o0
	ld	[ctx+%o4], %g3		! subkey word at the same offset
	or	%g2, %o0, %g2
	or	%g2, %o5, %g2
	xor	%g2, %g3, %g2

	! Branch back while the offset is below 12, that is after the
	! passes for offsets 0, 4 and 8. The store in the delay slot
	! runs on every pass, including the last one at offset 12.
	! The branch must be blu and not bleu: because the offset is
	! incremented at the top of the loop, branching when it equals
	! 12 would start a fifth pass that reads src[16..19] and stores
	! beyond the four words of the working block.
	cmp	%o4, 12
	blu	.Lsource_loop
	st	%g2, [wtxt+%o4]		! delay slot: store the word
58

59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
	! ! Read a little-endian word
	! ldub	[src+3], %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+2], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+1], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
	! ldub	[src+0], %g3
	! or	%g3, %g2, %g2
	! sll	%g2, 8, %g2
	! 
75
	! ld	[ctx+%o3], %g3
76
77
78
	! xor	%g3, %g2, %g2
	! 
	! add	src, 4, src
79
	! st	%g2, [wtxt+%o4]
80
	! 
81
	! cmp	%o3, 8
82
	! bleu	.Lsource_loop
83
	! add	%o3, 4, %o3
84

85
	! Set up the main round loop.
	! %g2 = value of the AES_NROUNDS field of ctx; copied to %o7
	! below, where it is the upper bound for the round counter.
	ld	[ctx + AES_NROUNDS], %g2
86
	mov	1, %g1		! %g1 = round counter, starts at 1
87
88

	add	%fp, -40, tmp	! tmp = scratch block written by the inner loop
89
	mov	%g2, %o7
90
91

	! wtxt
92
	mov	wtxt, %g4	! %g4 = base of the working block for the inner loop
93

94
95
	! 4*i:	%o3
	mov	0, %o3
96
.Lround_loop:
97
	! %o2 walks the index table at offset AES_SIDX3 from T, one
	! 4-byte entry per inner pass. The entries marked AES_SIDX1 and
	! AES_SIDX2 below are read at fixed offsets -32 and -16 from it.
	add	T, AES_SIDX3, %o2
98
.Linner_loop:
99
100
101
	! One pass computes one word of the next state: four bytes are
	! picked out of the working block, each selects a word from one
	! of the tables at AES_TABLE0 .. AES_TABLE3, and the xor of the
	! four words is stored at tmp + %o3.
	!
	! The comments mark which T->table[0][ B0(wtxt[IDX0(j)]) ]
	! the instruction is a part of.
	
102
	! AES_SIDX1
103
	ld	[%o2-32], %g3		! 1
104

105
	! AES_SIDX2
106
	ld	[%o2-16], %o4		! 2
107
	! wtxt[IDX1...]
108
109
	add	%g4, %g3, %g3		! 1
	ldub	[%g3+2], %o0		! 1
110

111
	! AES_SIDX3
112
113
	ld	[%o2], %g2		! 3
	sll	%o0, 2, %o0		! 1
114
	
115
116
	! wtxt[i]
	ld	[%g4+%o3], %o5		! 0
117
118
	
	! wtxt[IDX2...]
119
	lduh	[%g4+%o4], %g3		! 2
120
	
121
	and	%o5, 255, %o5		! 0
122
123

	! wtxt[IDX3...]
124
	ldub	[%g4+%g2], %o4		! 3
125
	
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
	! Each selected byte is scaled by 4 and offset by its table base
	! to form the address of a 32-bit table entry relative to T.
	sll	%o5, 2, %o5		! 0
	add	%o5, AES_TABLE0, %o5	! 0
	ld	[T+%o5], %g2		! 0

	add	%o0, AES_TABLE1, %o0	! 1
	and	%g3, 255, %g3		! 2
	ld	[T+%o0], %o5		! 1
	sll	%g3, 2, %g3		! 2
	add	%g3, AES_TABLE2, %g3	! 2
	ld	[T+%g3], %o0		! 2
	sll	%o4, 2, %o4		! 3
	add	%o4, AES_TABLE3, %o4	! 3
	ld	[T+%o4], %g3		! 3
	xor	%g2, %o5, %g2		! 0, 1
	xor	%g2, %o0, %g2		! 0, 1, 2

	add	%o2, 4, %o2		! next index table entry
143
	
144
145
	xor	%g2, %g3, %g2		! 0, 1, 2, 3
	st	%g2, [tmp+%o3]
146

147
	! The compare sees %o3 before the increment in the delay slot,
	! so the body runs for %o3 = 0, 4, 8 and 12.
	cmp	%o3, 8
148

149
	bleu	.Linner_loop
150
	add	%o3, 4, %o3
151
	
152
	! Add the subkey for this round: each of the four words in the
	! scratch block at tmp is xored with the matching subkey word
	! and written back to the working block at wtxt.
	! %o0 = ctx + 16 * round counter, the address of the subkey.
	sll	%g1, 4, %g2
153
	add	%g2, ctx, %o0
154
	mov	0, %i5		! %i5 = word counter
155
	mov	wtxt, %o3
156
	mov	tmp, %o4
157
.Lroundkey_loop:
158
	sll	%i5, 2, %g2	! %g2 = byte offset of the current word
159
	ld	[%o0], %o5	! subkey word
160
	add	%i5, 1, %i5
161
	ld	[%o4+%g2], %g3
162
	! %i5 was already incremented, so the branch below is taken
	! after words 0, 1 and 2 and falls through after word 3.
	cmp	%i5, 3
163
	xor	%g3, %o5, %g3
164
	st	%g3, [%o3+%g2]
165
	bleu	.Lroundkey_loop
166
	add	%o0, 4, %o0	! delay slot: next subkey word
167
168
	! Next round: repeat while the round counter is below %o7.
	add	%g1, 1, %g1
	cmp	%g1, %o7
169
	blu	.Lround_loop
170
	mov	0, %o3		! delay slot: reset the inner loop offset
171

172
	! %g2 = 16 * round counter, the offset of the last subkey in ctx.
	sll	%g1, 4, %g2
173
174
	
	! final round
	!
	! For each of the four output words: pick four bytes out of the
	! working block, replace each by a byte loaded from T at that
	! byte value as offset, combine them into a word, xor with the
	! subkey word at %o7 and store the result to dst least
	! significant byte first.
	! NOTE(review): the byte table at offset 0 of T is presumably the
	! S-box, and T+288 presumably an index table holding unshifted
	! indices, since they are scaled by 4 here. Confirm against the
	! definitions in asm.m4.
175
	add	%g2, ctx, %o7	! %o7 = address of the current subkey word
176
	mov	0, %o1		! %o1 = output word counter
177
	mov	wtxt, %g1	! %g1 = base of the working block
178
	add	T, 288, %g4	! %g4 walks the index table, 4 bytes per pass
179
.Lfinal_loop:
180
181
182
183
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5	! %i5 = byte offset of the output word
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
184
	ldub	[%g2+2], %o3
185
	add	%i5, dst, %o2	! %o2 = address of the output word
186
187
188
189
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
190
	lduh	[%g1+%g3], %o4
191
	and	%g2, 255, %g2
192
	ld	[%g4], %o5
193
	and	%o4, 255, %o4
194
	ldub	[T+%o3], %o0
195
	sll	%o5, 2, %o5
196
	ldub	[T+%g2], %g3
197
	sll	%o0, 8, %o0
198
	ldub	[%g1+%o5], %o3
199
	or	%g3, %o0, %g3
200
	ldub	[T+%o4], %g2
201
	! The condition codes set here are consumed by the bleu at the
	! end of the loop; none of the instructions in between modify
	! them. %o1 was already incremented, so the loop runs four times.
	cmp	%o1, 3
202
	ldub	[T+%o3], %o5
203
204
205
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2	! subkey word
206
207
	sll	%o5, 24, %o5
	or	%g3, %o5, %g3
208
	xor	%g3, %g2, %g3
209
	srl	%g3, 24, %o5
210
	srl	%g3, 16, %o0
211
	srl	%g3, 8, %g2
212
213
214
	! Store the word to dst byte by byte, lowest byte at the
	! lowest address.
	stb	%g2, [%o2+1]
	stb	%o5, [%o2+3]
	stb	%o0, [%o2+2]
215
	stb	%g3, [dst+%i5]
216
	add	%o7, 4, %o7	! next subkey word
217
	bleu	.Lfinal_loop
218
	add	%g4, 4, %g4	! delay slot: next index table entry
219
	
220
	! Advance to the next 16-byte block and loop until the remaining
	! length reaches exactly zero.
	! NOTE(review): the loop only ends when the length hits zero, so
	! the caller presumably guarantees a multiple of 16. Confirm
	! against the callers.
	add	src, 16, src
221
	addcc	length, -16, length
222
	bne	.Lblock_loop
223
	add	dst, 16, dst	! delay slot: executed on every pass
224
.Lend:
225
226
	! Return to the caller; the restore in the delay slot switches
	! back to the register window of the caller.
	ret
	restore
227
228
229
230
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s