	! Benchmarks on my slow sparcstation:
	! Original C code
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s
include(`asm.m4')

	.file	"aes.asm"

	.section	".text"
	.align 4
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020

! m4 register aliases for the values _aes_crypt keeps live across the
! whole function.  The C arguments arrive in %i0-%i4 (after the
! register-window save) and are copied into these registers; see the
! mov instructions at the top of _aes_crypt.
define(ctx, %o5)	! AES context: round subkeys, then AES_NROUNDS
define(T, %o0)		! lookup-table struct: tables at AES_TABLE0..3,
			! index vectors at AES_IDX1..3 — layout defined
			! in asm.m4 / the C headers, not visible here
define(length, %o4)	! remaining byte count; consumed 16 at a time,
			! so presumably a multiple of 16 — the loop at
			! .Lblock_loop only tests for exactly zero
define(dst, %o3)	! destination (ciphertext/plaintext out) pointer
define(src, %o2)	! source (plaintext/ciphertext in) pointer

define(wtxt, %l2)	! "working text": 16-byte state buffer on the stack
define(tmp, %o1)	! second 16-byte stack buffer for inner-round output
! _aes_crypt(ctx, T, length, dst, src)
!
! In (before save):  %o0 = ctx, %o1 = T, %o2 = length,
!                    %o3 = dst, %o4 = src  (seen as %i0-%i4 below)
!
! Encrypts (or decrypts — the direction is decided by which tables T
! points at) `length' bytes from src to dst in 16-byte AES blocks.
!
! NOTE(review): SPARC delayed control transfer — the instruction after
! every branch below is a delay slot and executes with the branch
! (except the annulled `bgeu,a', whose slot runs only when taken).
_aes_crypt:
	!#PROLOGUE# 0
! Why -136?
	save	%sp, -136, %sp		! new register window + stack frame
	!#PROLOGUE# 1
! Why this moving around of the input parameters?
	! Copy the incoming arguments into the registers aliased by the
	! define()s above.
	mov	%i2, length
	mov	%i0, ctx
	mov	%i1, T
	mov	%i3, dst
	cmp	length, 0
	be	.Lend			! nothing to do for length == 0
	mov	%i4, src		! (delay slot, harmless either way)
	! wtxt?
	add	%fp, -24, %l1		! %l1 = 16-byte state buffer (= wtxt)
	mov	%l1, wtxt
.Lblock_loop:
	! Read src, and add initial subkey
	! Each state word is assembled from four byte loads as
	! src[4i+3]<<24 | src[4i+2]<<16 | src[4i+1]<<8 | src[4i]
	! (i.e. the little-endian value of the block bytes), then
	! xored with subkey word i from ctx.
	mov	0, %i3			! %i3 = word index i, 0..3
.Lsource_loop:
	sll	%i3, 2, %i2		! %i2 = byte offset 4*i
	add	%i2, src, %i0
	ldub	[%i0+3], %g2
	add	%i3, 1, %i3
	ldub	[%i0+2], %g3
	sll	%g2, 24, %g2
	ldub	[%i0+1], %i1
	sll	%g3, 16, %g3
	or	%g2, %g3, %g2
	ldub	[src+%i2], %i0
	sll	%i1, 8, %i1
	ld	[ctx+%i2], %g3		! subkey word i (round 0 key)
	or	%g2, %i1, %g2
	or	%g2, %i0, %g2
	xor	%g2, %g3, %g2		! src word ^ subkey
	cmp	%i3, 3
	bleu	.Lsource_loop
	st	%g2, [wtxt+%i2]		! (delay slot) store state word

	! FIXME: We can safely assume that nrounds > 1 
	ld	[ctx + AES_NROUNDS], %g2
	mov	1, %g1			! %g1 = round counter, starting at 1
	cmp	%g1, %g2
	bgeu,a	.Lfinal_round		! nrounds <= 1: straight to final round
	sll	%g1, 4, %g2		! (annulled slot: only runs if taken)

	add	%fp, -40, tmp		! tmp = second 16-byte scratch buffer
	mov	%g2, %o7		! %o7 = nrounds
	mov	tmp, %l0

	! wtxt
	mov	%l1, %g4		! %g4 = wtxt base, used by inner loop

	! round:	%i5
	! 4*round:	%i3
	mov	0, %i5
.Lround_loop:
	! One middle round.  For each column j: pick four state bytes
	! (byte lanes selected via the big-endian load offsets below),
	! run each through one of the four tables at T, xor the four
	! words together into tmp[j].
	add	T, AES_IDX3, %i4	! %i4 walks idx3[]; idx1/idx2 sit at
					! fixed -32/-16 offsets from it
.Linner_loop:
	! AES_IDX1
	ld	[%i4-32], %g3		! idx1[j]
	sll	%i5, 2, %i3		! %i3 = 4*j
	sll	%g3, 2, %g3
	! AES_IDX2
	ld	[%i4-16], %i2		! idx2[j]
	! wtxt[IDX1...]
	add	%g4, %g3, %g3
	ldub	[%g3+2], %i1		! bits 8-15 of wtxt[idx1[j]] (big-endian)
	sll	%i2, 2, %i2
	ld	[%i4], %g2		! idx3[j]
	sll	%i1, 2, %i1
	ld	[%g4+%i3], %i0		! wtxt[j]
	sll	%g2, 2, %g2
	lduh	[%g4+%i2], %g3		! top halfword of wtxt[idx2[j]]
	and	%i0, 255, %i0		! bits 0-7 of wtxt[j]
	ldub	[%g4+%g2], %i2		! bits 24-31 of wtxt[idx3[j]]
	sll	%i0, 2, %i0
	add	%i0, AES_TABLE0, %i0
	ld	[T+%i0], %g2		! table0 lookup
	add	%i1, AES_TABLE1, %i1
	and	%g3, 255, %g3		! isolate bits 16-23 of wtxt[idx2[j]]
	ld	[T+%i1], %i0		! table1 lookup
	sll	%g3, 2, %g3
	add	%g3, AES_TABLE2, %g3
	ld	[T+%g3], %i1		! table2 lookup
	sll	%i2, 2, %i2
	add	%i2, AES_TABLE3, %i2
	ld	[T+%i2], %g3		! table3 lookup
	xor	%g2, %i0, %g2
	xor	%g2, %i1, %g2
	add	%i5, 1, %i5		! next column
	xor	%g2, %g3, %g2		! t0 ^ t1 ^ t2 ^ t3
	st	%g2, [%l0+%i3]		! tmp[j] = mixed word
	cmp	%i5, 3
	bleu	.Linner_loop
	add	%i4, 4, %i4		! (delay slot) advance idx pointer
	
	! Key addition: wtxt[j] = tmp[j] ^ subkey word of this round.
	sll	%g1, 4, %g2		! 16*round = this round's subkey offset
	add	%g2, ctx, %i1		! %i1 = &subkeys[4*round]
	mov	0, %i5
	mov	%l1, %i3		! %i3 = wtxt
	mov	tmp, %i2
.Lroundkey_loop:
	sll	%i5, 2, %g2
	ld	[%i1], %i0		! subkey word
	add	%i5, 1, %i5
	ld	[%i2+%g2], %g3		! tmp word
	cmp	%i5, 3
	xor	%g3, %i0, %g3
	st	%g3, [%i3+%g2]		! result back into wtxt
	bleu	.Lroundkey_loop
	add	%i1, 4, %i1		! (delay slot) next subkey word
	add	%g1, 1, %g1		! round++
	cmp	%g1, %o7
	blu	.Lround_loop		! loop while round < nrounds
	mov	0, %i5			! (delay slot) reset column counter
	sll	%g1, 4, %g2		! offset of the final subkey
.Lfinal_round:
	! Final round: plain byte substitution (no column mixing),
	! xor with the last subkey, store result bytes to dst.
	add	%g2, ctx, %o7		! %o7 = &final subkey
	mov	0, %o1			! %o1 = column counter j
	mov	%l1, %g1		! %g1 = wtxt
	add	T, 288, %g4		! NOTE(review): 288 is presumably the
					! same offset as AES_IDX3 above — the
					! round-loop idx walk is repeated here
					! with a hard-coded constant; confirm
					! against asm.m4 / the table struct
.Lfinal_loop:
	ld	[%g4-32], %g2		! idx1[j]
	sll	%o1, 2, %i5		! %i5 = 4*j
	sll	%g2, 2, %g2
	add	%g1, %g2, %g2
	ldub	[%g2+2], %i3		! bits 8-15 of wtxt[idx1[j]]
	add	%i5, dst, %i4		! %i4 = &dst[4*j]
	ld	[%g4-16], %g3		! idx2[j]
	add	%o1, 1, %o1
	ld	[%g1+%i5], %g2		! wtxt[j]
	sll	%g3, 2, %g3
	lduh	[%g1+%g3], %i2		! top halfword of wtxt[idx2[j]]
	and	%g2, 255, %g2		! bits 0-7 of wtxt[j]
	ld	[%g4], %i0		! idx3[j]
	and	%i2, 255, %i2		! bits 16-23 of wtxt[idx2[j]]
	ldub	[T+%i3], %i1		! byte substitution — byte table at
					! offset 0 of T, presumably the S-box
	sll	%i0, 2, %i0
	ldub	[T+%g2], %g3		! substituted byte 0
	sll	%i1, 8, %i1
	ldub	[%g1+%i0], %i3		! bits 24-31 of wtxt[idx3[j]]
	or	%g3, %i1, %g3
	ldub	[T+%i2], %g2		! substituted byte 2
	cmp	%o1, 3
	ldub	[T+%i3], %i0		! substituted byte 3
	sll	%g2, 16, %g2
	or	%g3, %g2, %g3
	ld	[%o7], %g2		! final subkey word
	sll	%i0, 24, %i0
	or	%g3, %i0, %g3		! assembled output word
	xor	%g3, %g2, %g3		! ^ final subkey
	srl	%g3, 24, %i0
	srl	%g3, 16, %i1
	srl	%g3, 8, %g2
	stb	%g2, [%i4+1]		! store byte-by-byte, low byte first
	stb	%i0, [%i4+3]		! (mirrors the byte order read at
	stb	%i1, [%i4+2]		! .Lsource_loop)
	stb	%g3, [dst+%i5]
	add	%o7, 4, %o7		! next subkey word
	bleu	.Lfinal_loop
	add	%g4, 4, %g4		! (delay slot) advance idx pointer
	
	! Advance to the next 16-byte block.
	add	dst, 16, dst
	addcc	length, -16, length
	bne	.Lblock_loop
	add	src, 16, src		! (delay slot)
.Lend:
	ret
	restore				! (delay slot) pop register window
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt