aes.asm 12.9 KB
Newer Older
1
! -*- mode: asm; asm-comment-char: ?!; -*-  
Niels Möller's avatar
Niels Möller committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
! nettle, low-level cryptographics library
! 
! Copyright (C) 2002 Niels Möller
!  
! The nettle library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! 
! The nettle library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
! License for more details.
! 
! You should have received a copy of the GNU Lesser General Public License
! along with the nettle library; see the file COPYING.LIB.  If not, write to
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! MA 02111-1307, USA.

21
22
! NOTE: Some of the %g registers are reserved for operating system etc
! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe.
	
	! Used registers:	%l0,1,2,3,4,5,6,7
	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
	!			%o0,1,2,3,4,7 (%o6=%sp)
	!			%g1,2,3
	
	.file	"aes.asm"
	
! Arguments
define(ctx, %i0)
define(T, %i1)
define(length, %i2)
define(dst, %i3)
define(src, %i4)

! Loop invariants
define(wtxt, %l0)
define(tmp, %l1)
define(diff, %l2)
define(nrounds, %l3)

! Loop variables
define(round, %l4)
define(i, %l5)
define(key, %o4)

! Further loop invariants
define(T0, %l6)
define(T1, %l7)
! The third table pointer is kept in %i5 rather than %g5: %g5 is one
! of the globals reserved for the system (see the NOTE above), while
! %i5 is the sixth incoming-argument register, which this
! five-argument routine does not otherwise use and which the caller
! does not expect to be preserved across the call.
define(T2, %i5)
define(T3, %o7)
54
55

C IDX1 contains the permutation values * 4 + 2
56
define(IDX1, <T + AES_SIDX1 >)
57
C IDX3 contains the permutation values * 4
58
define(IDX3, <T + AES_SIDX3 >)
59

60
! Temporaries
61
define(t0, %o0)
62
define(t1, %o1)
63
define(t2, %o2)
64
65
define(t3, %o3)

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
dnl AES_ROUND(i)
dnl Compute one word in the round function.
dnl Input in wtxt, output stored in tmp + i.
dnl
dnl The argument i is the byte offset of the output word; the callers
dnl pass 0, 4, 8 and 12.  Four bytes are picked from the state in wtxt:
dnl   0: the low byte of the word at wtxt + i
dnl   1: the byte at the offset loaded from IDX1 + i
dnl   2: the low byte of the halfword at wtxt + (i XOR 8)
dnl   3: the byte at the offset loaded from IDX3 + i
dnl Each byte is multiplied by four and used to fetch a word from
dnl T0, T1, T2 and T3 respectively.  The four table words and the
dnl round key word at key + i are xored together and the result is
dnl stored at tmp + i.
dnl Clobbers t0, t1, t2 and t3; wtxt, tmp and key are left unchanged.
define(<AES_ROUND>, <
	ld	[IDX1+$1], t1		! 1
	
	ldub	[wtxt+t1], t1		! 1
	ld	[IDX3+$1], t3		! 3
	
	sll	t1, 2, t1		! 1
	ld	[wtxt+$1], t0		! 0
	! IDX2(j) = j XOR 2
	lduh	[wtxt+eval($1 ^ 8)], t2		! 2
	and	t0, 255, t0		! 0
	
	ldub	[wtxt+t3], t3		! 3
	sll	t0, 2, t0		! 0
	ld	[T0+t0], t0		! 0
	and	t2, 255, t2		! 2
	
	ld	[T1+t1], t1		! 1
	sll	t2, 2, t2		! 2
	ld	[T2+t2], t2		! 2
	sll	t3, 2, t3		! 3
	
	ld	[T3+t3], t3		! 3
	xor	t0, t1, t0		! 0, 1
	xor	t0, t2, t0		! 0, 1, 2
	! Fetch roundkey
	ld	[key + $1], t1
	
	xor	t0, t3, t0		! 0, 1, 2, 3
	xor	t0, t1, t0
	st	t0, [tmp + $1]>)
	
	
102
103
104
105
106
107
108
109
C The stack frame looks like
C
C %fp -   4: OS-dependent link field
C %fp -   8: OS-dependent link field
C %fp -  24: wtxt on entry, uint32_t[4] (wtxt and tmp swap after every round)
C %fp -  40: tmp on entry, uint32_t[4]
C %fp - 136: OS register save area. 
define(<FRAME_SIZE>, 136)
110

111
112
113
114
115
116
	.section	".text"
	.align 16
	.global _aes_crypt
	.type	_aes_crypt,#function
	.proc	020
	
117
	! _aes_crypt: entry point.  The arguments arrive in %i0-%i4 and
	! are named ctx, T, length, dst and src by the definitions at
	! the top of the file.  Returns at once when length is zero.
	! NOTE(review): the prototype is not visible in this file;
	! confirm the argument order against the caller.
_aes_crypt:
118
	save	%sp, -FRAME_SIZE, %sp
119
	cmp	length, 0
120
	be	.Lend
121
122
123
124
	! wtxt and tmp: two 16-byte buffers in the stack frame.  The
	! first add fills the delay slot of the branch above, so it is
	! also executed when the branch is taken, where it is harmless.
	add	%fp, -24, wtxt
	
	add	%fp, -40, tmp
125

126
	ld	[ctx + AES_NROUNDS], nrounds
127
128
	! Compute xor, so that we can swap efficiently: xoring either
	! buffer pointer with diff yields the other one.
	xor	wtxt, tmp, diff
129
	! The loop variable will be multiplied by 16.
	! NOTE(review): no instruction in this routine matches the
	! remark above; presumably left over from an earlier version.
130
131
	! More loop invariants: base addresses of the four word tables
	add	T, AES_TABLE0, T0
Niels Möller's avatar
Niels Möller committed
132
	
133
	add	T, AES_TABLE1, T1
134
135
	add	T, AES_TABLE2, T2
	add	T, AES_TABLE3, T3
Niels Möller's avatar
Niels Möller committed
136
	
137
	! Read src, and add initial subkey
138
	! Difference between ctx and src.
Niels Möller's avatar
Niels Möller committed
139
	! NOTE: These two sub instructions are repeated at the bottom of
140
	! the block loop, one before the branch back and one in its delay slot.
141
142
143
	sub	ctx, src, %g2
	! Difference between wtxt and src
	sub	wtxt, src, %g3
144
.Lblock_loop:
145
146
	! For stop condition. Note that src is incremented in the
	! delay slot
	! %g1 is the address of the third input word.  The loop below
	! branches back while src is at or below it (unsigned compare),
	! and src advances by 4 in the delay slot whether or not the
	! branch is taken, so the body runs for offsets 0, 4, 8 and 12
	! and src ends up 16 bytes past the start of the block.
147
	add	src, 8, %g1
148
	
149
.Lsource_loop:
150
151
152
153
	! Assemble one 32-bit word from four input bytes, with the byte
	! at the lowest address in the least significant position.
	ldub	[src+3], t3
	ldub	[src+2], t2
	sll	t3, 24, t3
	ldub	[src+1], t1
154
	
155
156
157
158
	sll	t2, 16, t2
	or	t3, t2, t3
	ldub	[src], t0
	sll	t1, 8, t1
159
	
160
	! Get subkey
	! %g2 holds ctx minus the block start, so src + %g2 steps
	! through the first four words of ctx in step with src.
161
	ld	[src+%g2], t2
162
163
	or	t3, t1, t3
	or	t3, t0, t3
164
	xor	t3, t2, t3
165
	
166
	cmp	src, %g1
167
	! %g3 holds wtxt minus the block start: store into the state buffer.
	st	t3, [src+%g3]
168
	bleu	.Lsource_loop
169
	add	src, 4, src
170
	
171
	! round counts the table-driven rounds, nrounds - 1 of them.
	sub	nrounds, 1, round
172
	! key points at the second 16-byte subkey; the first was added above.
	add	ctx, 16, key
Niels Möller's avatar
Niels Möller committed
173

174
.Lround_loop:
	! One table-driven round.  Each of the four macro calls below
	! reads the current state from wtxt and writes one 32-bit word
	! of the next state to tmp; the argument is the byte offset of
	! that word.  The instructions come from the AES_ROUND macro
	! defined earlier in this file, which is the only copy of them.
	!
	! The comments in the macro mark which j in
	! T->table[j][ Bj(wtxt[IDXi(i)]) ] the instruction is a part of.
	!
	! The code uses the register %o[j], aka tj, as the primary
	! register for that sub-expression. True for j==1,3.

	C Unrolled inner loop begins

	C i = 0
	AES_ROUND(0)

	C i = 1
	AES_ROUND(4)

	C i = 2
	AES_ROUND(8)

	C i = 3
	AES_ROUND(12)

	C Unrolled inner loop ends

	! switch roles for tmp and wtxt
	xor	wtxt, diff, wtxt
	xor	tmp, diff, tmp

	! Loop until the counter reaches zero.  The add sits in the
	! branch delay slot, so the subkey pointer advances by 16 on
	! every pass, including the last one, and is left pointing at
	! the subkey used by the final round.
	subcc	round, 1, round
	bne	.Lround_loop
	add	key, 16, key
324

Niels Möller's avatar
Niels Möller committed
325
	! Final round

	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
	! the instruction is part of.

dnl AES_FINAL_ROUND(i)
dnl Compute one output word of the final round and write it to dst.
dnl The argument i is the byte offset of the word (0, 4, 8 or 12).
dnl The four state bytes are selected from wtxt exactly as in
dnl AES_ROUND, but each one is used as a byte offset from T for a
dnl single-byte load.  The four bytes are combined with byte 0 in the
dnl least significant position, the subkey word at key + i is xored
dnl in, and the result is stored to dst one byte at a time, least
dnl significant byte first.  dst is then advanced by 4, which is why
dnl the stores never use i.
dnl Clobbers t0, t1, t2 and t3.
define(<AES_FINAL_ROUND>, <
	ld	[IDX1+$1], t1 	! 1
	ldub	[wtxt+t1], t1	! 1

	ld	[wtxt+$1], t0	! 0
	! IDX2(j) = j XOR 2
	lduh	[wtxt+eval($1 ^ 8)], t2	! 2
	and	t0, 255, t0	! 0
	ld	[IDX3 + $1], t3	! 3
	
	and	t2, 255, t2	! 2
	ldub	[T+t1], t1	! 1
	ldub	[T+t0], t0	! 0
	sll	t1, 8, t1	! 1
	
	ldub	[wtxt+t3], t3	! 3
	or	t0, t1, t0	! 0, 1
	ldub	[T+t2], t2	! 2
	ldub	[T+t3], t3	! 3
	
	sll	t2, 16, t2	! 2
	or	t0, t2, t0	! 0, 1, 2
	ld	[key + $1], t2
	sll	t3, 24, t3	! 3
	
	or	t0, t3, t0	! 0, 1, 2, 3
	xor	t0, t2, t0
	
	srl	t0, 24, t3
	srl	t0, 16, t2
	srl	t0, 8, t1
	stb	t1, [dst+1]
	
	stb	t3, [dst+3]
	stb	t2, [dst+2]
	stb	t0, [dst]
	add	dst, 4, dst>)

	C Unrolled final loop begins

	C i = 0
	AES_FINAL_ROUND(0)

	C i = 1
	AES_FINAL_ROUND(4)

	C i = 2
	AES_FINAL_ROUND(8)

	C i = 3
	AES_FINAL_ROUND(12)

	C Unrolled final loop ends
	
489
	! One 16-byte block done.  The loop exits only when the
	! remaining byte count reaches exactly zero, so the count passed
	! in is assumed to be a multiple of 16.
	addcc	length, -16, length
490
	! Recompute the two pointer differences used by the source loop;
	! src already points at the next block here.
	sub	ctx, src, %g2
491
	
492
	bne	.Lblock_loop
493
	! Delay slot: executed whether or not the branch is taken.
	sub	wtxt, src, %g3
494

495
.Lend:
496
497
	! Return to the caller; the restore in the delay slot undoes
	! the save done at entry.
	ret
	restore
498
499
500
501
.LLFE1:
.LLfe1:
	.size	_aes_crypt,.LLfe1-_aes_crypt

502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
	! Benchmarks on my slow sparcstation:	
	! Original C code	
	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
	! 
	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
	! 
	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
	! aes256 (CBC decrypt): 23.22s, 0.431MB/s

	! After unrolling key_addition32, and getting rid of
	! some sll x, 2, x, encryption speed is 0.760 MB/s.

	! Next, the C code was optimized to use larger tables and
	! no rotates. New timings:
	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
	! 
	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
	! 
	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
	! aes256 (CBC decrypt): 17.47s, 0.572MB/s

	! After optimization using pre-shifted indices
	! (AES_SIDX[1-3]): 
	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
	! 
	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
	! 
	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
	! aes256 (CBC decrypt): 16.46s, 0.608MB/s
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587

	! After implementing double buffering
	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
	! 
	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
	! 
	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
	
	! After reordering aes-encrypt.c and aes-decrypt.c
	! (the order probably causes strange cache-effects):
	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
	! 
	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
	! 
	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
	! aes256 (CBC decrypt): 17.87s, 0.560MB/s
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605

	! After further optimizations of the initial and final loops,
	! source_loop and final_loop. 
	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
	! 
	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
	! 
	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
	! aes256 (CBC decrypt): 15.76s, 0.635MB/s