! -*- mode: asm; asm-comment-char: ?!; -*-  
! nettle, low-level cryptographics library
! 
! Copyright (C) 2002 Niels Möller
!  
! The nettle library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! 
! The nettle library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
! License for more details.
! 
! You should have received a copy of the GNU Lesser General Public License
! along with the nettle library; see the file COPYING.LIB.  If not, write to
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! MA 02111-1307, USA.

! NOTE: Some of the %g registers are reserved for operating system etc
! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe.
	
	! Used registers:	%l0,1,2,3,4,5,6,7
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3,5
	
	! Assembler bookkeeping for the one routine in this file: source
	! name, code section, alignment, and the symbol attributes that
	! export _aes_crypt and mark it as a function.
	.file	"aes.asm"
	
	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Arguments (incoming registers %i0-%i4)
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
! wtxt and tmp point at the two 16-byte work buffers in the stack
! frame; diff is their xor, which lets the code exchange the two
! pointers with one xor each.
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nrounds, %l3)

! Loop variables
! round counts the remaining table rounds, i is a byte offset into
! the current 16-byte state, key points at the current subkey.
define(round, %l4)
define(i, %l5)
define(key, %o4)

! Further loop invariants
! Base addresses of the four round tables and of the two index
! tables, computed once from T.
define(T0, %l6)
define(T1, %l7)
! NOTE(review): T2 lives in %g5, outside the %g1-%g3 range that the
! note at the top of this file recommends -- confirm this is safe on
! the target systems.
define(T2, %g5)
define(T3, %o7)
define(IDX1, %i5)
define(IDX3, %o5)

! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
define(t3, %o3)

C The stack frame looks like
C
C %fp -   4: OS-dependent link field
C %fp -   8: OS-dependent link field
C %fp -  24: wtxt, uint32_t[4] (initial assignment; wtxt and tmp
C            swap roles after every round)
C %fp -  40: tmp, uint32_t[4]  (initial assignment)
C %fp - 136: OS register save area. 
define(<FRAME_SIZE>, 136)

! _aes_crypt: run the AES rounds over length bytes, 16 bytes per block.
!
! Input is read bytewise from src and output is written bytewise to
! dst. Subkeys are loaded from ctx (first subkey at offset 0, the
! following ones at 16-byte steps) and the round count is loaded from
! ctx + AES_NROUNDS. T is the table block: a byte table at offset 0
! used by the final round, plus the AES_TABLE0..3 round tables and
! the AES_SIDX1 and AES_SIDX3 index tables.
!
! A zero length returns immediately. The block loop only stops when
! length reaches exactly zero after subtracting 16, so a nonzero
! length is expected to be a multiple of 16.
!
! Branches on this machine have a delay slot: the instruction that
! follows a branch is executed before the branch takes effect. Several
! loops below rely on that, so the instruction order must be kept.
_aes_crypt:
	save	%sp, -FRAME_SIZE, %sp
	cmp	length, 0
	be	.Lend
	! Delay slot, executed whether or not the branch is taken:
	! wtxt starts out at %fp - 24.
	add	%fp, -24, wtxt
	
	! tmp starts out at %fp - 40.
	add	%fp, -40, tmp
	ld	[ctx + AES_NROUNDS], nrounds
	! Compute xor, so that we can swap efficiently.
	xor	wtxt, tmp, diff

	! More loop invariants
	add	T, AES_TABLE0, T0
	
	add	T, AES_TABLE1, T1
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
	add	T, AES_SIDX1, IDX1
	
	add	T, AES_SIDX3, IDX3

	! Read src, and add initial subkey
	! Difference between ctx and src.
	! NOTE: These instructions are duplicated in the delay slot,
	! and the instruction before the branch, at the bottom of
	! the block loop.
	sub	ctx, src, %g2
	! Difference between wtxt and src
	sub	wtxt, src, %g3

.Lblock_loop:
	! For stop condition. Note that src is incremented in the
	! delay slot, after the compare, so the loop body runs for
	! the word offsets 0, 4, 8 and 12 of the block.
	add	src, 8, %g1
	
.Lsource_loop:
	! Assemble one word from four input bytes, with the byte at
	! src as the least significant one.
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1
	
	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1
	
	! Get subkey. src + %g2 addresses the word of ctx at the same
	! offset as the current input word.
	ld	[src+%g2], t2
	or	t3, t1, t3
	or	t3, t0, t3
	xor	t3, t2, t3
	
	cmp	src, %g1
	! src + %g3 addresses the matching word of wtxt.
	st	t3, [src+%g3]
	bleu	.Lsource_loop
	add	src, 4, src		! delay slot
	
	! The table rounds below run nrounds - 1 times; the final
	! round is handled separately. key skips the first subkey,
	! which was added in the source loop.
	sub	nrounds, 1, round
	add	ctx, 16, key
	nop
	! i is the byte offset of the current word, 4 times its number.
	! NOTE: Instruction duplicated in delay slot
	mov	0, i

.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of. 
	!
	! The code uses the register %o[j], aka tj, as the primary 
	! register for that sub-expression. True for j==1,3.
	
	ld	[IDX1+i], t1		! 1
	
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1
	ld	[IDX3+i], t3		! 3
	
	sll	t1, 2, t1		! 1
	ld	[wtxt+i], t0		! 0
	lduh	[wtxt+t2], t2		! 2
	and	t0, 255, t0		! 0
	
	ldub	[wtxt+t3], t3		! 3
	sll	t0, 2, t0		! 0
	ld	[T0+t0], t0		! 0
	and	t2, 255, t2		! 2
	
	ld	[T1+t1], t1		! 1
	sll	t2, 2, t2		! 2
	ld	[T2+t2], t2		! 2
	sll	t3, 2, t3		! 3
	
	ld	[T3+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2
	! Fetch roundkey
	ld	[key+i], t1
	
	xor	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp+i]
	! Repeat for i = 4, 8 and 12; the increment is in the delay slot.
	cmp	i, 8
	
	bleu	.Linner_loop
	add	i, 4, i			! delay slot

	! switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	xor	tmp, diff, tmp

	subcc	round, 1, round
	add	key, 16, key
	bne	.Linner_loop
	mov	0, i			! delay slot: restart at word 0

	! final round
	! Use round as the loop variable, as it is already zero
undefine(<i>)
define(i, round)

	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
	! the instruction is part of
	! NOTE: First instruction duplicated in delay slot
	ld	[IDX1+i], t1 	! 1
.Lfinal_loop:
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	add	wtxt, t1, t1	! 1
	ldub	[t1+2], t1	! 1

	ld	[wtxt+i], t0	! 0
	lduh	[wtxt+t2], t2	! 2
	and	t0, 255, t0	! 0
	ld	[IDX3 + i], t3	! 3
	
	and	t2, 255, t2	! 2
	! The final round indexes the byte table at offset 0 of T
	! directly, so the byte values are used without scaling.
	ldub	[T+t1], t1	! 1
	ldub	[T+t0], t0	! 0
	sll	t1, 8, t1	! 1
	
	ldub	[wtxt+t3], t3	! 3
	or	t0, t1, t0	! 0, 1
	ldub	[T+t2], t2	! 2
	ldub	[T+t3], t3	! 3
	
	sll	t2, 16, t2	! 2
	or	t0, t2, t0	! 0, 1, 2
	ld	[key + i], t2
	sll	t3, 24, t3	! 3
	
	or	t0, t3, t0	! 0, 1, 2, 3
	xor	t0, t2, t0
	! i is advanced before the compare here, so the branch below
	! repeats the loop while the next offset is at most 12. The
	! shifts and byte stores in between leave the condition codes
	! untouched.
	add	i, 4, i
	cmp	i, 12
	
	! Write the result word bytewise, least significant byte first.
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+1]
	
	stb	t3, [dst+3]
	stb	t2, [dst+2]
	stb	t0, [dst]
	add	dst, 4, dst
	
	bleu	.Lfinal_loop
	ld	[IDX1+i], t1 	! 1 (delay slot)

	! Next block. src was already advanced by 16 in the source
	! loop, so the two differences are recomputed against the new
	! src and the current wtxt.
	addcc	length, -16, length
	sub	ctx, src, %g2
	
	bne	.Lblock_loop
	sub	wtxt, src, %g3		! delay slot

.Lend:
	add	%sp, FRAME_SIZE, %fp
	ret
	restore				! delay slot of ret
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache-effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop. 
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s