aes.asm 8.62 KB
Newer Older
1
! -*- mode: asm; asm-comment-char: ?!; -*-  
Niels Möller's avatar
Niels Möller committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
! nettle, low-level cryptographics library
! 
! Copyright (C) 2002 Niels Möller
!  
! The nettle library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! 
! The nettle library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
! License for more details.
! 
! You should have received a copy of the GNU Lesser General Public License
! along with the nettle library; see the file COPYING.LIB.  If not, write to
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! MA 02111-1307, USA.

21
22
! NOTE: Some of the %g registers are reserved for operating system etc
! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe.
Niels Möller's avatar
Niels Möller committed
23
	
24
	! Used registers:	%l0,1,2,3,4,5,6,7
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3
	!
	! %sp is only used as the frame base.  It is never written
	! after the initial save, so the register window save area it
	! points at stays valid for the whole function.
	
	.file	"aes.asm"
	
	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! _aes_crypt
!
! Processes the input 16 bytes at a time.  For every block:
!   1. Four little-endian words are read from the input, xored with
!      the first 16 bytes found at the context pointer, and stored in
!      the wtxt buffer.
!   2. A table round is run (number of rounds - 1) times.  Each round
!      reads wtxt, writes the other buffer, then the two buffers
!      trade places.
!   3. A final round using byte lookups at offset 0 of the table
!      pointer writes four little-endian words to the output.
!
! The byte count must be a multiple of 16: it is decremented by 16 per
! block and the loop only stops when it reaches exactly zero.  A count
! of zero returns immediately.
! NOTE(review): the routine addresses the sbox directly off the table
! pointer and the first subkey directly off the context pointer, so
! both are presumed to live at offset 0 of their structures; confirm
! against the structure layout that provides the AES_ offsets.

! Arguments
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! State buffers.  wtxt holds the current state and tmp receives the
! output of a round.  diff is the xor of the two addresses, so xoring
! a pointer with it switches that pointer to the other buffer.
define(wtxt, %l3)
define(tmp, %l1)
define(diff, %l2)

! Loop variables
define(round, %l4)
define(i, %l5)
define(key, %o4)

! Further loop invariants
define(T0, %l6)
define(T1, %l7)
define(T2, %l0)
define(T3, %o7)
define(IDX1, %i5)
define(IDX3, %o5)

! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
define(t3, %o3)

! Stack frame.  The 64 bytes at the stack pointer are reserved for
! spilling the register window and may be written by a trap at any
! time.  The 32 bytes above them are kept free as well, which covers
! the 28 bytes the v8 ABI reserves there (structure return pointer
! and six argument words) plus padding.  The two 16 byte state
! buffers therefore sit at the top of a 128 byte frame.
define(<FRAME_SIZE>, 128)
define(<FRAME_WTXT>, 96)
define(<FRAME_TMP>, 112)

_aes_crypt:
	save	%sp, -FRAME_SIZE, %sp
	cmp	length, 0
	be	.Lend
	! Delay slot, executed whether or not the branch is taken.
	add	%sp, FRAME_TMP, tmp

	add	%sp, FRAME_WTXT, wtxt

	! Compute xor, so that we can swap efficiently.
	xor	wtxt, tmp, diff

	! More loop invariants
	add	T, AES_TABLE0, T0
	add	T, AES_TABLE1, T1
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
	add	T, AES_SIDX1, IDX1
	add	T, AES_SIDX3, IDX3

	! Read src, and add initial subkey
	! NOTE: The next two instructions are duplicated at the end of
	! the block loop, one before the branch and one in its delay
	! slot.
	! Difference between ctx and src.
	sub	ctx, src, %g2
	! Difference between wtxt and src
	sub	wtxt, src, %g3
.Lblock_loop:
	! Fetch the number of rounds.  It is reloaded for every block
	! instead of being cached, which frees a register for wtxt.
	ld	[ctx + AES_NROUNDS], round
	! For stop condition. Note that src is incremented in the
	! delay slot
	add	src, 8, %g1
	
.Lsource_loop:
	! Assemble one little-endian word from four input bytes.
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1
	
	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1
	
	! Get subkey
	ld	[src+%g2], t2
	or	t3, t1, t3
	or	t3, t0, t3
	xor	t3, t2, t3
	
	! Taken for the first three words; src ends up advanced by 16.
	cmp	src, %g1
	st	t3, [src+%g3]
	bleu	.Lsource_loop
	add	src, 4, src
	
	sub	round, 1, round
	add	ctx, 16, key
.Lround_loop:
	! 4*i
	! NOTE: Instruction duplicated in delay slot
	mov	0, i
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of. 
	!
	! The code uses the register %o[j], aka tj, as the primary 
	! register for that sub-expression. True for j==1,3.
	
	ld	[IDX1+i], t1		! 1
	
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1
	ld	[IDX3+i], t3		! 3
	
	sll	t1, 2, t1		! 1
	ld	[wtxt+i], t0		! 0
	lduh	[wtxt+t2], t2		! 2
	and	t0, 255, t0		! 0
	
	ldub	[wtxt+t3], t3		! 3
	sll	t0, 2, t0		! 0
	ld	[T0+t0], t0		! 0
	and	t2, 255, t2		! 2
	
	ld	[T1+t1], t1		! 1
	sll	t2, 2, t2		! 2
	ld	[T2+t2], t2		! 2
	sll	t3, 2, t3		! 3
	
	ld	[T3+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2
	! Fetch roundkey
	ld	[key+i], t1
	
	xor	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp+i]
	cmp	i, 8
	
	bleu	.Linner_loop
	add	i, 4, i
	! switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	xor	tmp, diff, tmp

	subcc	round, 1, round
	add	key, 16, key
	! Branch past the mov at the top of the round loop; the same
	! mov is executed in the delay slot instead.
	bne	.Linner_loop
	mov	0, i

	! final round

	! Use round as the loop variable, as it is already zero
undefine(<i>)
define(i, round)

	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
	! the instruction is part of
	! NOTE: First instruction duplicated in delay slot
	ld	[IDX1+i], t1 	! 1
.Lfinal_loop:
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	add	wtxt, t1, t1	! 1
	ldub	[t1+2], t1	! 1

	ld	[wtxt+i], t0	! 0
	lduh	[wtxt+t2], t2	! 2
	and	t0, 255, t0	! 0
	ld	[IDX3 + i], t3	! 3
	
	and	t2, 255, t2	! 2
	ldub	[T+t1], t1	! 1
	ldub	[T+t0], t0	! 0
	sll	t1, 8, t1	! 1
	
	ldub	[wtxt+t3], t3	! 3
	or	t0, t1, t0	! 0, 1
	ldub	[T+t2], t2	! 2
	ldub	[T+t3], t3	! 3
	
	sll	t2, 16, t2	! 2
	or	t0, t2, t0	! 0, 1, 2
	ld	[key + i], t2
	sll	t3, 24, t3	! 3
	
	or	t0, t3, t0	! 0, 1, 2, 3
	xor	t0, t2, t0
	add	i, 4, i
	! Sets the condition codes for the bleu below; none of the
	! instructions in between modify them.
	cmp	i, 12
	
	! Store the result word to dst in little-endian byte order.
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+1]
	
	stb	t3, [dst+3]
	stb	t2, [dst+2]
	stb	t0, [dst]
	add	dst, 4, dst
	
	bleu	.Lfinal_loop
	ld	[IDX1+i], t1 	! 1
	addcc	length, -16, length
	! Recompute the two differences for the next block.  wtxt may
	! by now point at either of the two buffers.
	sub	ctx, src, %g2
	
	bne	.Lblock_loop
	sub	wtxt, src, %g3

.Lend:
	ret
	restore
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache-effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop. 
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s