Skip to content
Snippets Groups Projects
Select Git revision
  • fea9ee6ee458e589b23ef5b62b572ceddc063960
  • master default protected
  • hpke
  • ppc-chacha-4core
  • delete-internal-name-mangling
  • master-updates
  • ppc-gcm
  • ppc-chacha-2core
  • refactor-ecc-mod
  • ppc-chacha-core
  • use-mpn_cnd-functions
  • optimize-ecc-invert
  • default-m4-quote-char
  • power-asm-wip
  • test-fat
  • chacha-3core-neon
  • x86_64-salsa20-2core
  • salsa20-2core-neon
  • bcrypt
  • arm-salsa20-chacha-vsra
  • test-shlib-dir
  • nettle_3.6_release_20200429
  • nettle_3.6rc3
  • nettle_3.6rc2
  • nettle_3.6rc1
  • nettle_3.5.1_release_20190627
  • nettle_3.5_release_20190626
  • nettle_3.5rc1
  • nettle_3.4.1_release_20181204
  • nettle_3.4.1rc1
  • nettle_3.4_release_20171119
  • nettle_3.4rc2
  • nettle_3.4rc1
  • nettle_3.3_release_20161001
  • nettle_3.2_release_20160128
  • nettle_3.1.1_release_20150424
  • nettle_3.1_release_20150407
  • nettle_3.1rc3
  • nettle_3.1rc2
  • nettle_3.1rc1
  • nettle_3.0_release_20140607
41 results

aes.asm

Blame
  • Forked from Nettle / nettle
    4132 commits behind the upstream repository.
    Niels Möller's avatar
    Niels Möller authored
    Rev: src/nettle/sparc/aes.asm:1.115
    fea9ee6e
    History
    aes.asm 10.77 KiB
    ! -*- mode: asm; asm-comment-char: ?!; -*-  
    ! nettle, low-level cryptographics library
    ! 
    ! Copyright (C) 2002 Niels Möller
    !  
    ! The nettle library is free software; you can redistribute it and/or modify
    ! it under the terms of the GNU Lesser General Public License as published by
    ! the Free Software Foundation; either version 2.1 of the License, or (at your
    ! option) any later version.
    ! 
    ! The nettle library is distributed in the hope that it will be useful, but
    ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    ! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
    ! License for more details.
    ! 
    ! You should have received a copy of the GNU Lesser General Public License
    ! along with the nettle library; see the file COPYING.LIB.  If not, write to
    ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
    ! MA 02111-1307, USA.
    
    ! NOTE: Some of the %g registers are reserved for operating system etc
    ! (see gcc/config/sparc.h). We should use only %g1-%g3 to be safe.
    	
    	! Used registers:	%l0,1,2,3,4,5,6,7
    	!			%i0,1,2,3,4 (%i6=%fp, %i7 = return)
    	!			%o0,1,2,3,4,7 (%o6=%sp)
    	!			%g1,2,3,5
    	
    	.file	"aes.asm"
    	
    ! Arguments
    C These five names are the incoming argument registers of _aes_crypt.
    C ctx is the base from which the round count and the subkeys are
    C read, T is the base of the lookup tables, length is the octet count
    C that is decremented by 16 per block, dst is the output pointer and
    C src is the input pointer.
    define(ctx, %i0)
    define(T, %i1)
    define(length, %i2)
    define(dst, %i3)
    define(src, %i4)
    
    ! Loop invariants
    C wtxt and tmp point at two 16-byte buffers in the stack frame.
    C diff holds wtxt XOR tmp, so XORing either pointer with diff turns
    C it into the other one. nrounds is loaded from ctx + AES_NROUNDS.
    define(wtxt, %l0)
    define(tmp, %l1)
    define(diff, %l2)
    define(nrounds, %l3)
    
    ! Loop variables
    C round counts the remaining iterations of the round loop and key
    C points at the subkey of the current round. No instruction visible
    C in this file uses i as an operand; the name only occurs in comments.
    define(round, %l4)
    define(i, %l5)
    define(key, %o4)
    
    ! Further loop invariants
    C T0-T3 are set to T + AES_TABLE0 through T + AES_TABLE3 on entry.
    C NOTE(review): T2 is placed in %g5, although the note at the top of
    C the file recommends staying within %g1-%g3. Confirm that %g5 is
    C free to use on every supported target.
    define(T0, %l6)
    define(T1, %l7)
    define(T2, %g5)
    define(T3, %o7)
    
    C IDX1 contains the permutation values * 4 + 2
    define(IDX1, <T + AES_SIDX1 >)
    C IDX3 contains the permutation values * 4
    define(IDX3, <T + AES_SIDX3 >)
    C Both are address expressions rather than registers. The round
    C macros append an offset to them, load a word, and use that word
    C as an octet offset from wtxt.
    
    ! Temporaries
    C Scratch registers for AES_LOAD, AES_ROUND and AES_FINAL_ROUND.
    define(t0, %o0)
    define(t1, %o1)
    define(t2, %o2)
    define(t3, %o3)
    
    C AES_LOAD(i)
    C Get one word of input, XOR with first subkey, store in wtxt
    C
    C The argument is the octet offset of the word inside the block. It
    C is used both as the offset of the subkey word from ctx and as the
    C offset of the destination word from wtxt. The callers in this file
    C pass 0, 4, 8 and 12.
    C
    C The four input octets are fetched one at a time with ldub and are
    C combined with the octet at the lowest address as the least
    C significant one.
    C
    C Side effects: src is advanced by 4, and t0-t3 are overwritten.
    define(<AES_LOAD>, <
    	ldub	[src+3], t3		! octet 3, highest address
    	ldub	[src+2], t2		! octet 2
    	sll	t3, 24, t3		! octet 3 to bits 24-31
    	ldub	[src+1], t1		! octet 1
    	
    	sll	t2, 16, t2		! octet 2 to bits 16-23
    	or	t3, t2, t3
    	ldub	[src+0], t0		! octet 0 stays in bits 0-7
    	sll	t1, 8, t1		! octet 1 to bits 8-15
    	
    	! Get subkey
    	ld	[ctx + $1], t2
    	or	t3, t1, t3
    	or	t3, t0, t3
    	xor	t3, t2, t3
    	
    	st	t3, [wtxt+$1]
    	add	src, 4, src
    	
    	C Disabled older variant, kept for reference only. It addresses
    	C the input at a fixed offset from src, reads the subkey through
    	C key, and does not advance src.
    	C ldub	[src + $1], t0
    	C ldub	[src + $1 + 1], t1
    	C sll	t1, 8, t1
    	C or	t0, t1, t0
    	C ldub	[src + $1 + 2], t2
    	C sll	t2, 16, t2
    	C or	t0, t2, t0
    	C ldub	[src + $1 + 3], t1
    	C sll	t1, 24, t1
    	C or	t0, t1, t0
    	C ld	[key + $1], t2
    	C xor	t0, t2, t0
    	C st	t0, [wtxt + $1]
    	>)dnl
    
    C AES_ROUND(i)
    C Compute one word in the round function.
    C Input in wtxt, output stored in tmp + i.
    C
    C The argument is the octet offset of the output word; the callers
    C in this file pass 0, 4, 8 and 12. The result is the XOR of four
    C 32-bit table entries, one from each of T0-T3, and the round key
    C word at key + i. Each table index is a single octet fetched from
    C wtxt and scaled by 4 with sll before the word load.
    C
    C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
    C the instruction is a part of.
    C
    C Register use: terms 0 and 1 are built in t0 and t1. Term 2 reuses
    C t1 once term 1 has been folded into t0, and term 3 is built in t2.
    C t3 is not touched.
    define(<AES_ROUND>, <
    	ld	[IDX1+$1], t1		! 1
    	ldub	[wtxt+$1+3], t0		! 0
    	ldub	[wtxt+t1], t1		! 1
    	sll	t0, 2, t0		! 0
    	
    	ld	[T0+t0], t0		! 0
    	sll	t1, 2, t1		! 1
    	ld	[T1+t1], t1		! 1
    	ld	[IDX3+$1], t2		! 3
    	
    	xor	t0, t1, t0		! 0, 1
    	! IDX2(j) = j XOR 2, which for octet offsets is offset XOR 8
    	ldub	[wtxt+eval($1 ^ 8)+1], t1	! 2
    	ldub	[wtxt+t2], t2		! 3
    	sll	t1, 2, t1		! 2
    	
    	ld	[T2+t1], t1		! 2
    	sll	t2, 2, t2		! 3
    	ld	[T3+t2], t2		! 3
    	xor	t0, t1, t0		! 0, 1, 2
    	
    	! Fetch roundkey
    	ld	[key + $1], t1
    	xor	t0, t2, t0		! 0, 1, 2, 3
    	xor	t0, t1, t0
    	st	t0, [tmp + $1]>)dnl
    
    C AES_FINAL_ROUND(i)
    C Compute one word in the final round function.
    C Input in wtxt, output converted to an octet string and stored at dst.
    C
    C The argument is the octet offset of the output word; the callers
    C in this file pass 0, 4, 8 and 12. Unlike AES_ROUND, every lookup
    C is a single-octet ldub from the table that starts at T itself, and
    C the four results are shifted into place and ORed together before
    C the round key word at key + i is XORed in.
    C
    C The result is stored one octet at a time, least significant octet
    C at the lowest address. dst itself is not advanced here.
    C t0-t2 are overwritten.
    C
    C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
    C the instruction is a part of.
    define(<AES_FINAL_ROUND>, <
    	ld	[IDX1+$1], t1		! 1
    	ldub	[wtxt+$1+3], t0		! 0
    	ldub	[wtxt+t1], t1		! 1
    	ldub	[T+t0], t0		! 0
    	
    	ldub	[T+t1], t1		! 1
    	ld	[IDX3 + $1], t2		! 3
    	sll	t1, 8, t1		! 1
    	or	t0, t1, t0		! 0, 1
    	
    	! IDX2(j) = j XOR 2, which for octet offsets is offset XOR 8
    	ldub	[wtxt+eval($1 ^ 8)+1], t1	! 2
    	ldub	[wtxt+t2], t2		! 3
    	ldub	[T+t1], t1		! 2
    	ldub	[T+t2], t2		! 3
    	
    	sll	t1, 16, t1		! 2
    	or	t0, t1, t0		! 0, 1, 2
    	sll	t2, 24, t2		! 3
    	ld	[key + $1], t1
    	
    	or	t0, t2, t0		! 0, 1, 2, 3
    	xor	t0, t1, t0
    	srl	t0, 24, t1
    	stb	t1, [dst+$1+3]
    	
    	srl	t0, 16, t1
    	stb	t1, [dst+$1+2]
    	srl	t0, 8, t1
    	stb	t1, [dst+$1+1]
    	
    	stb	t0, [dst+$1]>)dnl
    	
    C The stack frame looks like
    C
    C %fp -   4: OS-dependent link field
    C %fp -   8: OS-dependent link field
    C %fp -  24: wtxt on entry, uint32_t[4]
    C %fp -  40: tmp on entry, uint32_t[4]
    C %fp - 136: OS register save area. 
    C
    C _aes_crypt exchanges the wtxt and tmp pointers after every round,
    C so the two buffers alternate between the two roles.
    define(<FRAME_SIZE>, 136)
    
    	.section	".text"
    	.align 16
    	.global _aes_crypt
    	.type	_aes_crypt,#function
    	.proc	020
    	
    C _aes_crypt
    C
    C Processes length octets, 16 at a time, reading from src and
    C writing to dst. The arguments arrive in the registers named by
    C the defines at the top of the file: ctx, T, length, dst, src.
    C
    C For each block the four input words are loaded and XORed with the
    C subkey at ctx, then nrounds - 1 table rounds are run with subkeys
    C starting at ctx + 16, and finally AES_FINAL_ROUND applies the
    C last subkey and writes the 16 result octets to dst.
    C
    C Requirements visible from the loop conditions:
    C   - length must be a multiple of 16. Zero returns at once; any
    C     other value is decremented by 16 per block and the loop only
    C     stops when it is exactly zero.
    C   - nrounds must be at least 2, since the round loop tests its
    C     counter only after the first pass.
    C
    C wtxt and tmp are exchanged after every round by XORing both with
    C diff. AES_LOAD always stores through wtxt and the rounds always
    C read through wtxt, so it does not matter which of the two stack
    C buffers wtxt designates when the next block starts.
    C
    C The only global register written here is the one T2 is defined as.
    _aes_crypt:
    	save	%sp, -FRAME_SIZE, %sp
    	cmp	length, 0
    	be	.Lend
    	C Delay slot: executed whether or not the branch is taken.
    	add	%fp, -24, wtxt
    	
    	add	%fp, -40, tmp
    
    	ld	[ctx + AES_NROUNDS], nrounds
    	! Compute xor, so that we can swap efficiently.
    	xor	wtxt, tmp, diff
    	! More loop invariants
    	add	T, AES_TABLE0, T0
    	add	T, AES_TABLE1, T1
    	add	T, AES_TABLE2, T2
    	add	T, AES_TABLE3, T3
    
    .Lblock_loop:
    	C Read one block and add the initial subkey.
    	C Each AES_LOAD advances src by 4.
    	AES_LOAD(0)
    	AES_LOAD(4)
    	AES_LOAD(8)
    	AES_LOAD(12)
    
    	sub	nrounds, 1, round
    	add	ctx, 16, key
    
    .Lround_loop:
    
    	AES_ROUND(0)	! i = 0
    	AES_ROUND(4)	! i = 1
    	AES_ROUND(8)	! i = 2
    	AES_ROUND(12)	! i = 3
    			
    	! switch roles for tmp and wtxt
    	xor	wtxt, diff, wtxt
    	xor	tmp, diff, tmp
    
    	subcc	round, 1, round
    	bne	.Lround_loop
    	C Delay slot: step to the next subkey, also on loop exit, so
    	C that the final round finds its subkey at key.
    	add	key, 16, key
    
    	C Final round, and storage of the output
    
    	AES_FINAL_ROUND(0)	! i = 0
    	AES_FINAL_ROUND(4)	! i = 1
    	AES_FINAL_ROUND(8)	! i = 2
    	AES_FINAL_ROUND(12)	! i = 3
    		
    	addcc	length, -16, length
    	bne	.Lblock_loop
    	C Delay slot: advance the output pointer. add leaves the
    	C condition codes alone, so it can follow the addcc.
    	add	dst, 16, dst
    
    .Lend:
    	ret
    	restore
    .LLFE1:
    .LLfe1:
    	.size	_aes_crypt,.LLfe1-_aes_crypt
    
    	! Benchmarks on my slow sparcstation:	
    	! Original C code	
    	! aes128 (ECB encrypt): 14.36s, 0.696MB/s
    	! aes128 (ECB decrypt): 17.19s, 0.582MB/s
    	! aes128 (CBC encrypt): 16.08s, 0.622MB/s
    	! aes128 (CBC decrypt): 18.79s, 0.532MB/s
    	! 
    	! aes192 (ECB encrypt): 16.85s, 0.593MB/s
    	! aes192 (ECB decrypt): 19.64s, 0.509MB/s
    	! aes192 (CBC encrypt): 18.43s, 0.543MB/s
    	! aes192 (CBC decrypt): 20.76s, 0.482MB/s
    	! 
    	! aes256 (ECB encrypt): 19.12s, 0.523MB/s
    	! aes256 (ECB decrypt): 22.57s, 0.443MB/s
    	! aes256 (CBC encrypt): 20.92s, 0.478MB/s
    	! aes256 (CBC decrypt): 23.22s, 0.431MB/s
    
    	! After unrolling key_addition32, and getting rid of
    	! some sll x, 2, x, encryption speed is 0.760 MB/s.
    
    	! Next, the C code was optimized to use larger tables and
    	! no rotates. New timings:
    	! aes128 (ECB encrypt): 13.10s, 0.763MB/s
    	! aes128 (ECB decrypt): 11.51s, 0.869MB/s
    	! aes128 (CBC encrypt): 15.15s, 0.660MB/s
    	! aes128 (CBC decrypt): 13.10s, 0.763MB/s
    	! 
    	! aes192 (ECB encrypt): 15.68s, 0.638MB/s
    	! aes192 (ECB decrypt): 13.59s, 0.736MB/s
    	! aes192 (CBC encrypt): 17.65s, 0.567MB/s
    	! aes192 (CBC decrypt): 15.31s, 0.653MB/s
    	! 
    	! aes256 (ECB encrypt): 17.95s, 0.557MB/s
    	! aes256 (ECB decrypt): 15.90s, 0.629MB/s
    	! aes256 (CBC encrypt): 20.16s, 0.496MB/s
    	! aes256 (CBC decrypt): 17.47s, 0.572MB/s
    
    	! After optimization using pre-shifted indices
    	! (AES_SIDX[1-3]): 
    	! aes128 (ECB encrypt): 12.46s, 0.803MB/s
    	! aes128 (ECB decrypt): 10.74s, 0.931MB/s
    	! aes128 (CBC encrypt): 17.74s, 0.564MB/s
    	! aes128 (CBC decrypt): 12.43s, 0.805MB/s
    	! 
    	! aes192 (ECB encrypt): 14.59s, 0.685MB/s
    	! aes192 (ECB decrypt): 12.76s, 0.784MB/s
    	! aes192 (CBC encrypt): 19.97s, 0.501MB/s
    	! aes192 (CBC decrypt): 14.46s, 0.692MB/s
    	! 
    	! aes256 (ECB encrypt): 17.00s, 0.588MB/s
    	! aes256 (ECB decrypt): 14.81s, 0.675MB/s
    	! aes256 (CBC encrypt): 22.65s, 0.442MB/s
    	! aes256 (CBC decrypt): 16.46s, 0.608MB/s
    
    	! After implementing double buffering
    	! aes128 (ECB encrypt): 12.59s, 0.794MB/s
    	! aes128 (ECB decrypt): 10.56s, 0.947MB/s
    	! aes128 (CBC encrypt): 17.91s, 0.558MB/s
    	! aes128 (CBC decrypt): 12.30s, 0.813MB/s
    	! 
    	! aes192 (ECB encrypt): 15.03s, 0.665MB/s
    	! aes192 (ECB decrypt): 12.56s, 0.796MB/s
    	! aes192 (CBC encrypt): 20.30s, 0.493MB/s
    	! aes192 (CBC decrypt): 14.26s, 0.701MB/s
    	! 
    	! aes256 (ECB encrypt): 17.30s, 0.578MB/s
    	! aes256 (ECB decrypt): 14.51s, 0.689MB/s
    	! aes256 (CBC encrypt): 22.75s, 0.440MB/s
    	! aes256 (CBC decrypt): 16.35s, 0.612MB/s
    	
    	! After reordering aes-encrypt.c and aes-decrypt.c
    	! (the order probably causes strange cache-effects):
    	! aes128 (ECB encrypt): 9.21s, 1.086MB/s
    	! aes128 (ECB decrypt): 11.13s, 0.898MB/s
    	! aes128 (CBC encrypt): 14.12s, 0.708MB/s
    	! aes128 (CBC decrypt): 13.77s, 0.726MB/s
    	! 
    	! aes192 (ECB encrypt): 10.86s, 0.921MB/s
    	! aes192 (ECB decrypt): 13.17s, 0.759MB/s
    	! aes192 (CBC encrypt): 15.74s, 0.635MB/s
    	! aes192 (CBC decrypt): 15.91s, 0.629MB/s
    	! 
    	! aes256 (ECB encrypt): 12.71s, 0.787MB/s
    	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
    	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
    	! aes256 (CBC decrypt): 17.87s, 0.560MB/s
    
    	! After further optimizations of the initial and final loops,
    	! source_loop and final_loop. 
    	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
    	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
    	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
    	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
    	! 
    	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
    	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
    	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
    	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
    	! 
    	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
    	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
    	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
    	! aes256 (CBC decrypt): 15.76s, 0.635MB/s