! -*- mode: asm; asm-comment-char: ?!; -*-  
! nettle, low-level cryptographics library
! 
! Copyright (C) 2002 Niels Möller
!  
! The nettle library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! 
! The nettle library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
! License for more details.
! 
! You should have received a copy of the GNU Lesser General Public License
! along with the nettle library; see the file COPYING.LIB.  If not, write to
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! MA 02111-1307, USA.

! NOTE: Some of the %g registers are reserved for operating system etc
! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe.

	! Used registers:	%l0,1,2,3,4,5,6,7
	!			%i0,1,2,3,4 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,5,7 (%o6=%sp)
	!			%g1,2,3

	.file	"aes.asm"

! Arguments
define(<ctx>, <%i0>)
define(<T>, <%i1>)
define(<length>, <%i2>)
define(<dst>, <%i3>)
define(<src>, <%i4>)

! Loop invariants
define(<wtxt>, <%l0>)
define(<tmp>, <%l1>)
define(<diff>, <%l2>)
define(<nrounds>, <%l3>)

! Loop variables
define(<round>, <%l4>)
C i is reserved here but not referenced by the code below.
define(<i>, <%l5>)
define(<key>, <%o4>)

! Further loop invariants
define(<T0>, <%l6>)
define(<T1>, <%l7>)
C T2 is kept in %o5 instead of a global register. The note at the top
C of this file says only %g1-%g3 are safe to use, and %o5 is not used
C for anything else in this file.
define(<T2>, <%o5>)
define(<T3>, <%o7>)

C IDX1 contains the permutation values * 4 + 2
define(<IDX1>, <T + AES_SIDX1 >)
C IDX3 contains the permutation values * 4
define(<IDX3>, <T + AES_SIDX3 >)

! Temporaries
define(<t0>, <%o0>)
define(<t1>, <%o1>)
define(<t2>, <%o2>)
define(<t3>, <%o3>)

C AES_ROUND(i)
C Compute one word in the round function.
C The argument is the byte offset of the word; the callers in this
C file pass 0, 4, 8 and 12.
C Input in wtxt, output stored in tmp + i.
C
C The trailing numbers mark which j in T->table[j][ Bj(wtxt[IDXj(i)]) ]
C the instruction is a part of.
C
C Register use: t0 handles j == 0 and accumulates the result,
C t1 handles j == 1 and then j == 2, t2 handles j == 3.
C t1 is reused at the end for the round key word.
define(<AES_ROUND>, <
	ld	[IDX1+$1], t1		! 1
	ldub	[wtxt+$1+3], t0		! 0
	ldub	[wtxt+t1], t1		! 1
	sll	t0, 2, t0		! 0

	ld	[T0+t0], t0		! 0
	sll	t1, 2, t1		! 1
	ld	[T1+t1], t1		! 1
	ld	[IDX3+$1], t2		! 3

	xor	t0, t1, t0		! 0, 1
	! IDX2(j) = j XOR 2
	ldub	[wtxt+eval($1 ^ 8)+1], t1	! 2
	ldub	[wtxt+t2], t2		! 3
	sll	t1, 2, t1		! 2

	ld	[T2+t1], t1		! 2
	sll	t2, 2, t2		! 3
	ld	[T3+t2], t2		! 3
	xor	t0, t1, t0		! 0, 1, 2

	! Fetch roundkey
	ld	[key + $1], t1
	xor	t0, t2, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp + $1]>)dnl

C AES_FINAL_ROUND(i)
C Compute one word in the final round function.
C The argument is the byte offset of the word; the callers in this
C file pass 0, 4, 8 and 12.
C Input in wtxt, output converted to an octet string and stored at dst.
C
C The trailing numbers mark which byte position j of the output word
C (0 is the least significant byte) the instruction is a part of.
C Register tj holds the byte for position j. Each byte is substituted
C with a byte load from T + value (presumably the S-box stored at the
C start of the table structure; verify against the table layout).
C t2 is reused for the round key word, and t1-t3 are reused for the
C shifted copies of the result, which is stored least significant
C byte first.
define(<AES_FINAL_ROUND>, <
	ld	[IDX1+$1], t1		! 1
	ldub	[wtxt+$1+3], t0		! 0
	ldub	[wtxt+t1], t1		! 1
	ldub	[T+t0], t0		! 0
	ldub	[T+t1], t1		! 1
	ld	[IDX3 + $1], t3		! 3
	sll	t1, 8, t1		! 1
	or	t0, t1, t0		! 0, 1

	! IDX2(j) = j XOR 2
	ldub	[wtxt+eval($1 ^ 8)+1], t2	! 2

	ldub	[wtxt+t3], t3		! 3
	ldub	[T+t2], t2		! 2
	ldub	[T+t3], t3		! 3

	sll	t2, 16, t2		! 2
	or	t0, t2, t0		! 0, 1, 2
	sll	t3, 24, t3		! 3
	! Fetch roundkey
	ld	[key + $1], t2

	or	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t2, t0

	! Split the word into octets
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+$1+1]

	stb	t3, [dst+$1+3]
	stb	t2, [dst+$1+2]
	stb	t0, [dst+$1]>)dnl

C The stack frame looks like
C
C %fp -   4: OS-dependent link field
C %fp -   8: OS-dependent link field
C %fp -  24: wtxt, uint32_t[4]
C %fp -  40: tmp, uint32_t[4]
C %fp - 136: OS register save area. 
C
C The wtxt and tmp pointers are set up this way on entry. They swap
C buffers after every round, so the assignment above only describes
C the initial state.
define(<FRAME_SIZE>, 136)

	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020
	
160
_aes_crypt:
161
	save	%sp, -FRAME_SIZE, %sp
162
	cmp	length, 0
163
	be	.Lend
164 165 166 167
	! wtxt
	add	%fp, -24, wtxt
	
	add	%fp, -40, tmp
168

169
	ld	[ctx + AES_NROUNDS], nrounds
170 171
	! Compute xor, so that we can swap efficiently.
	xor	wtxt, tmp, diff
172
	! The loop variable will be multiplied by 16.
173 174
	! More loop invariants
	add	T, AES_TABLE0, T0
Niels Möller's avatar
Niels Möller committed
175
	
176
	add	T, AES_TABLE1, T1
177 178
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
Niels Möller's avatar
Niels Möller committed
179
	
180
	! Read src, and add initial subkey
181
	! Difference between ctx and src.
Niels Möller's avatar
Niels Möller committed
182
	! NOTE: These instructions are duplicated in the delay slot,
183
	! and the instruction before the branch.
184 185 186
	sub	ctx, src, %g2
	! Difference between wtxt and src
	sub	wtxt, src, %g3
187
.Lblock_loop:
188 189
	! For stop condition. Note that src is incremented in the
	! delay slot
190
	add	src, 8, %g1
191
	
192
.Lsource_loop:
193 194 195 196
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1
197
	
198 199 200 201
	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1
202
	
203
	! Get subkey
204
	ld	[src+%g2], t2
205 206
	or	t3, t1, t3
	or	t3, t0, t3
207
	xor	t3, t2, t3
208
	
209
	cmp	src, %g1
210
	st	t3, [src+%g3]
211
	bleu	.Lsource_loop
212
	add	src, 4, src
213
	
214
	sub	nrounds, 1, round
215
	add	ctx, 16, key
Niels Möller's avatar
Niels Möller committed
216

217
.Lround_loop:
218

Niels Möller's avatar
Niels Möller committed
219 220 221 222
	AES_ROUND(0)	! i = 0
	AES_ROUND(4)	! i = 1
	AES_ROUND(8)	! i = 2
	AES_ROUND(12)	! i = 3
223
			
224
	! switch roles for tmp and wtxt
225
	xor	wtxt, diff, wtxt
226
	xor	tmp, diff, tmp
227

228
	subcc	round, 1, round
229
	bne	.Lround_loop
Niels Möller's avatar
Niels Möller committed
230
	add	key, 16, key
231

232
	C Final round, and storage of the output
233

Niels Möller's avatar
Niels Möller committed
234
	AES_FINAL_ROUND(0)	! i = 0
Niels Möller's avatar
Niels Möller committed
235 236 237
	AES_FINAL_ROUND(4)	! i = 1
	AES_FINAL_ROUND(8)	! i = 2
	AES_FINAL_ROUND(12)	! i = 3
238 239
	add	dst, 16, dst
		
240
	addcc	length, -16, length
241
	sub	ctx, src, %g2
242
	
243
	bne	.Lblock_loop
244
	sub	wtxt, src, %g3
245

246
.Lend:
247 248
	ret
	restore
249 250 251 252
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache-effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop. 
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s