! -*- mode: asm; asm-comment-char: ?!; -*-
! nettle, low-level cryptographics library
! 
! Copyright (C) 2002 Niels Möller
!  
! The nettle library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! 
! The nettle library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
! License for more details.
! 
! You should have received a copy of the GNU Lesser General Public License
! along with the nettle library; see the file COPYING.LIB.  If not, write to
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! MA 02111-1307, USA.

! Register usage. The function runs in its own register window
! (opened by save), so the local, in and out registers below are
! private to it.
!
!	%l0-%l7		wtxt, tmp, diff, nrounds, round, i, T0, T1
!	%i0-%i5		ctx, T, length, dst, src, IDX1
!			(%i6 = %fp, %i7 = return address)
!	%o0-%o5		t0, t1, t2, t3, key, IDX3   (%o6 = %sp)
!	%g2-%g4		scratch, named directly in the code
!	%g5, %g6	T2, T3
!
! NOTE(review): the SPARC ABI also lists %g5 and %g6 as reserved
! for system software; T2 and T3 are left there for now. Confirm
! against the target ABI before relying on this.

	.file	"aes.asm"

	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! Arguments, in the order they are passed
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nrounds, %l3)

! Loop variables
define(round, %l4)
define(i, %l5)
! The subkey pointer lives in an out register, not in %g7: under
! the SPARC TLS ABI %g7 is the thread pointer, and library code
! must not overwrite it. %o4 is not used for anything else here.
define(key, %o4)

! Further loop invariants
define(T0, %l6)
define(T1, %l7)
define(T2, %g5)
define(T3, %g6)
define(IDX1, %i5)
define(IDX3, %o5)

! Temporaries
define(t0, %o0)
define(t1, %o1)
define(t2, %o2)
define(t3, %o3)

! _aes_crypt
!
! Processes length bytes from src into dst, one 16-byte block at a
! time, using the subkeys stored at the start of ctx and the lookup
! tables reached through T. Arguments arrive through the ctx, T,
! length, dst and src register aliases.
!
! Per block:
!   1. .Lsource_loop builds four words from the source bytes, xors
!      them with the first subkey and stores them in wtxt.
!   2. .Linner_loop, run nrounds - 1 times, computes each new word
!      as the xor of four table lookups and a round-key word, and
!      writes it to tmp; wtxt and tmp then trade places.
!   3. .Lfinal_loop does byte-wide lookups in the table at offset 0
!      of T, xors in the last subkey and writes the bytes to dst.
!
! Only whole blocks are processed: a trailing partial block is
! ignored.
_aes_crypt:
	! Frame size: 136 = 96 + 40. The 40 bytes just below %fp
	! hold the two 16-byte state buffers (initially tmp at
	! %fp-40 and wtxt at %fp-24).
	! NOTE(review): the other 96 bytes are presumably the
	! minimum SPARC V8 frame (64-byte register save area, one
	! struct-return word, six argument words, rounded up to a
	! multiple of 8) -- confirm against the ABI supplement.
	save	%sp, -136, %sp

	! Leave at once unless at least one whole block is present.
	! The compare is unsigned, since length is a byte count.
	cmp	length, 16
	blu	.Lend
	! Delay slot: wtxt = first state buffer
	add	%fp, -24, wtxt

	add	%fp, -40, tmp		! second state buffer
	ld	[ctx + AES_NROUNDS], nrounds

	! Compute xor, so that we can swap efficiently: xoring
	! either buffer pointer with diff yields the other one.
	xor	wtxt, tmp, diff

	! More loop invariants: bases of the four round tables...
	add	T, AES_TABLE0, T0
	add	T, AES_TABLE1, T1
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
	! ...and of the two index tables.
	add	T, AES_SIDX1, IDX1
	add	T, AES_SIDX3, IDX3

	! Read src, and add initial subkey.
	! %g2 is the difference between ctx and src, %g3 the
	! difference between wtxt and src, so that [src+%g2] is the
	! subkey word and [src+%g3] the state word matching [src].
	! NOTE: These two instructions are duplicated at the bottom
	! of the block loop, one before the branch and one in its
	! delay slot, because both values depend on src.
	sub	ctx, src, %g2
	sub	wtxt, src, %g3

.Lblock_loop:
	! For the stop condition. Note that src is incremented in
	! the delay slot, after the compare, so the loop runs for
	! src + 0, 4, 8 and 12.
	add	src, 8, %g4

.Lsource_loop:
	! Gather four bytes into one word, with src[0] in the low
	! byte and src[3] in the high byte.
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1

	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1

	! Get subkey
	ld	[src+%g2], t2
	or	t3, t1, t3
	or	t3, t0, t3
	xor	t3, t2, t3

	cmp	src, %g4
	st	t3, [src+%g3]
	bleu	.Lsource_loop
	add	src, 4, src		! delay slot

	! round counts the nrounds - 1 table-driven rounds. The
	! subkey pointer starts at the second 16-byte subkey.
	sub	nrounds, 1, round
	add	ctx, 16, key

.Lround_loop:
	! Byte offset of the current word, 4 times the word number.
	! NOTE: Instruction duplicated in the delay slot of the
	! branch that closes the round, which therefore jumps
	! straight to .Linner_loop.
	mov	0, i
.Linner_loop:
	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
	! the instruction is a part of.
	!
	! The code uses the register %o[j], aka tj, as the primary
	! register for that sub-expression. True for j==1,3.

	ld	[IDX1+i], t1		! 1

	! IDX2(j) = j XOR 2, which is XOR 8 on the byte offset
	xor	i, 8, t2
	add	wtxt, t1, t1		! 1
	ldub	[t1+2], t1		! 1
	ld	[IDX3+i], t3		! 3

	sll	t1, 2, t1		! 1
	ld	[wtxt+i], t0		! 0
	lduh	[wtxt+t2], t2		! 2
	and	t0, 255, t0		! 0

	ldub	[wtxt+t3], t3		! 3
	sll	t0, 2, t0		! 0
	ld	[T0+t0], t0		! 0
	and	t2, 255, t2		! 2

	ld	[T1+t1], t1		! 1
	sll	t2, 2, t2		! 2
	ld	[T2+t2], t2		! 2
	sll	t3, 2, t3		! 3

	ld	[T3+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2
	! Fetch roundkey
	ld	[key+i], t1

	xor	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp+i]
	cmp	i, 8

	bleu	.Linner_loop
	add	i, 4, i			! delay slot

	! Switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	xor	tmp, diff, tmp

	subcc	round, 1, round
	add	key, 16, key
	bne	.Linner_loop
	mov	0, i			! delay slot, see .Lround_loop

	! Final round
	! Use round as the loop variable, as it is already zero.
	! The alias for the word offset is rebound to that register.
undefine(<i>)
define(i, round)

	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
	! the instruction is part of.
	! NOTE: First instruction duplicated in delay slot. On the
	! last pass that delay-slot load reads one word past the
	! four entries used by the loop; its result is not used.
	ld	[IDX1+i], t1 	! 1
.Lfinal_loop:
	! IDX2(j) = j XOR 2
	xor	i, 8, t2
	! ld	[idx-16], t2	! 2
	add	wtxt, t1, t1	! 1
	ldub	[t1+2], t1	! 1

	ld	[wtxt+i], t0	! 0
	lduh	[wtxt+t2], t2	! 2
	and	t0, 255, t0	! 0
	ld	[IDX3 + i], t3	! 3

	and	t2, 255, t2	! 2
	ldub	[T+t1], t1	! 1
	ldub	[T+t0], t0	! 0
	sll	t1, 8, t1	! 1

	ldub	[wtxt+t3], t3	! 3
	or	t0, t1, t0	! 0, 1
	ldub	[T+t2], t2	! 2
	ldub	[T+t3], t3	! 3

	sll	t2, 16, t2	! 2
	or	t0, t2, t0	! 0, 1, 2
	ld	[key + i], t2
	sll	t3, 24, t3	! 3

	or	t0, t3, t0	! 0, 1, 2, 3
	xor	t0, t2, t0
	add	i, 4, i
	! Sets the condition codes for the branch below; none of
	! the instructions in between modifies them.
	cmp	i, 12

	! Write the word to dst, low byte first.
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+1]

	stb	t3, [dst+3]
	stb	t2, [dst+2]
	stb	t0, [dst]
	add	dst, 4, dst

	bleu	.Lfinal_loop
	ld	[IDX1+i], t1 	! 1

	! One block done. Go on only while a whole block remains;
	! the unsigned compare lets a trailing partial block end the
	! loop instead of wrapping the byte count around.
	sub	length, 16, length
	sub	ctx, src, %g2		! refresh offsets for new src
	cmp	length, 16
	bgeu	.Lblock_loop
	sub	wtxt, src, %g3		! delay slot

.Lend:
	ret
	restore				! delay slot
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache-effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop.
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s