	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s
Niels Möller's avatar
Niels Möller committed
37
	
38
include(`asm.m4')
Niels Möller's avatar
Niels Möller committed
39
	
40
	.file	"aes.asm"
Niels Möller's avatar
Niels Möller committed
41
	
42
43
	.section	".text"
	.align 4
44
45
	.global _aes_crypt
	.type	_aes_crypt,#function
46
	.proc	020
47
48
49
50
51
52

define(ctx, %o5)
define(T, %o0)
define(length, %o4)
define(dst, %o3)
define(src, %o2)
53
54

define(wtxt, %l2)
55
define(tmp, %o1)
56
_aes_crypt:
57
! Why -136?
58
	save	%sp, -136, %sp
59

60
61
62
63
64
65
! Why this moving around of the input parameters?
	mov	%i2, length
	mov	%i0, ctx
	mov	%i1, T
	mov	%i3, dst
	cmp	length, 0
66
	be	.Lend
67
	mov	%i4, src
68
	! wtxt
69
	add	%fp, -24, %l1
70
71
72
	mov	%l1, wtxt
.Lblock_loop:
	! Read src, and add initial subkey
73
	mov	0, %i3
74
.Lsource_loop:
75
	sll	%i3, 2, %i2
76
	add	%i2, src, %i0
77
78
79
80
81
82
83
	ldub	[%i0+3], %g2
	add	%i3, 1, %i3
	ldub	[%i0+2], %g3
	sll	%g2, 24, %g2
	ldub	[%i0+1], %i1
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
84
	ldub	[src+%i2], %i0
85
	sll	%i1, 8, %i1
86
	ld	[ctx+%i2], %g3
87
88
89
90
	or	%g2, %i1, %g2
	or	%g2, %i0, %g2
	xor	%g2, %g3, %g2
	cmp	%i3, 3
91
92
93
94
95
	bleu	.Lsource_loop
	st	%g2, [wtxt+%i2]

	! FIXME: We can safely assume that nrounds > 1 
	ld	[ctx + AES_NROUNDS], %g2
96
	mov	1, %g1
97
98
99
	! cmp	%g1, %g2
	! bgeu,a	.Lfinal_round
	! sll	%g1, 4, %g2
100
101

	add	%fp, -40, tmp
102
	mov	%g2, %o7
103
104
105
	mov	tmp, %l0

	! wtxt
106
	mov	%l1, %g4
107
108
109

	! round:	%i5
	! 4*round:	%i3
110
	mov	0, %i5
111
.Lround_loop:
112
	add	T, AES_IDX3, %i4
113
.Linner_loop:
114
	! AES_IDX1
115
116
117
	ld	[%i4-32], %g3
	sll	%i5, 2, %i3
	sll	%g3, 2, %g3
118
	! AES_IDX2
119
	ld	[%i4-16], %i2
120
	! wtxt[IDX1...]
121
122
123
124
125
126
127
128
129
130
131
	add	%g4, %g3, %g3
	ldub	[%g3+2], %i1
	sll	%i2, 2, %i2
	ld	[%i4], %g2
	sll	%i1, 2, %i1
	ld	[%g4+%i3], %i0
	sll	%g2, 2, %g2
	lduh	[%g4+%i2], %g3
	and	%i0, 255, %i0
	ldub	[%g4+%g2], %i2
	sll	%i0, 2, %i0
132
	add	%i0, AES_TABLE0, %i0
133
	ld	[T+%i0], %g2
134
135

	add	%i1, AES_TABLE1, %i1
136
	and	%g3, 255, %g3
137
	ld	[T+%i1], %i0
138
	sll	%g3, 2, %g3
139
	add	%g3, AES_TABLE2, %g3
140
	ld	[T+%g3], %i1
141
	sll	%i2, 2, %i2
142
	add	%i2, AES_TABLE3, %i2
143
	ld	[T+%i2], %g3
144
145
146
147
148
149
	xor	%g2, %i0, %g2
	xor	%g2, %i1, %g2
	add	%i5, 1, %i5
	xor	%g2, %g3, %g2
	st	%g2, [%l0+%i3]
	cmp	%i5, 3
150
	bleu	.Linner_loop
151
	add	%i4, 4, %i4
152
	
153
	sll	%g1, 4, %g2
154
	add	%g2, ctx, %i1
155
156
	mov	0, %i5
	mov	%l1, %i3
157
	mov	tmp, %i2
158
.Lroundkey_loop:
159
160
161
162
163
164
165
	sll	%i5, 2, %g2
	ld	[%i1], %i0
	add	%i5, 1, %i5
	ld	[%i2+%g2], %g3
	cmp	%i5, 3
	xor	%g3, %i0, %g3
	st	%g3, [%i3+%g2]
166
	bleu	.Lroundkey_loop
167
168
169
	add	%i1, 4, %i1
	add	%g1, 1, %g1
	cmp	%g1, %o7
170
	blu	.Lround_loop
171
172
	mov	0, %i5
	sll	%g1, 4, %g2
173
.Lfinal_round:
174
	add	%g2, ctx, %o7
175
176
	mov	0, %o1
	mov	%l1, %g1
177
	add	T, 288, %g4
178
.Lfinal_loop:
179
180
181
182
183
	ld	[%g4-32], %g2
	sll	%o1, 2, %i5
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %i3
184
	add	%i5, dst, %i4
185
186
187
188
189
190
191
192
	ld	[%g4-16], %g3
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %i2
	and	%g2, 255, %g2
	ld	[%g4], %i0
	and	%i2, 255, %i2
193
	ldub	[T+%i3], %i1
194
	sll	%i0, 2, %i0
195
	ldub	[T+%g2], %g3
196
197
198
	sll	%i1, 8, %i1
	ldub	[%g1+%i0], %i3
	or	%g3, %i1, %g3
199
	ldub	[T+%i2], %g2
200
	cmp	%o1, 3
201
	ldub	[T+%i3], %i0
202
203
204
205
206
207
208
209
210
211
212
213
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2
	sll	%i0, 24, %i0
	or	%g3, %i0, %g3
	xor	%g3, %g2, %g3
	srl	%g3, 24, %i0
	srl	%g3, 16, %i1
	srl	%g3, 8, %g2
	stb	%g2, [%i4+1]
	stb	%i0, [%i4+3]
	stb	%i1, [%i4+2]
214
	stb	%g3, [dst+%i5]
215
	add	%o7, 4, %o7
216
	bleu	.Lfinal_loop
217
	add	%g4, 4, %g4
218
	
219
220
	add	dst, 16, dst
	addcc	length, -16, length
221
	bne	.Lblock_loop
222
	add	src, 16, src
223
.Lend:
224
225
	ret
	restore
226
227
228
229
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt