diff --git a/ChangeLog b/ChangeLog index d9166612c0ffb01d92b032eba8017312dd7fe5c5..a1243072651406ec0cab5388c920ca90630bed6b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,31 @@ for indexing, and reducing number of used callee-save registers. * powerpc64/p8/aes-decrypt-internal.asm: Likewise. +2024-01-27 Niels Möller <nisse@lysator.liu.se> + + * aes-invert-internal.c (_nettle_aes_invert): Don't reorder the subkeys. + * aes-decrypt-internal.c (_nettle_aes_decrypt): Updated to process + subkeys starting from the end, and let subkeys pointer point at + the subkey for the first decrypt round, located at the end of the + array. + * aes128-decrypt.c (nettle_aes128_decrypt): Updated accordingly. + * aes192-decrypt.c (nettle_aes192_decrypt): Likewise. + * aes256-decrypt.c (nettle_aes256_decrypt): Likewise. + * arm/aes.m4 (AES_LOAD_INCR): New macro, specifying desired + increment of key pointer. + * arm/aes-decrypt-internal.asm: Updated for new conventions. + * arm/v6/aes-decrypt-internal.asm: Likewise. + * arm64/crypto/aes128-decrypt.asm: Likewise. + * arm64/crypto/aes192-decrypt.asm: Likewise. + * arm64/crypto/aes256-decrypt.asm: Likewise. + * powerpc64/p8/aes-decrypt-internal.asm: Likewise. + * sparc64/aes-decrypt-internal.asm: Likewise. + * x86/aes-decrypt-internal.asm: Likewise. + * x86_64/aes-decrypt-internal.asm: Likewise. + * x86_64/aesni/aes128-decrypt.asm: Likewise. + * x86_64/aesni/aes192-decrypt.asm: Likewise. + * x86_64/aesni/aes256-decrypt.asm: Likewise. + 2024-01-26 Niels Möller <nisse@lysator.liu.se> Delete all sparc32 assembly. 
diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c index 9e8cf34a100f773e94c4c4e35062feb3c4f00a55..47747ab868dc00dd0ce0cd1f3004fd101bfa3a82 100644 --- a/aes-decrypt-internal.c +++ b/aes-decrypt-internal.c @@ -60,6 +60,7 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys, { uint32_t w0, w1, w2, w3; /* working ciphertext */ uint32_t t0, t1, t2, t3; + const uint32_t *p; unsigned i; /* Get clear text, using little-endian byte order. @@ -70,12 +71,12 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys, w2 = LE_READ_UINT32(src + 8) ^ keys[2]; w3 = LE_READ_UINT32(src + 12) ^ keys[3]; - for (i = 1; i < rounds; i++) + for (i = 1, p = keys - 4; i < rounds; i++, p -= 4) { - t0 = AES_ROUND(T, w0, w3, w2, w1, keys[4*i]); - t1 = AES_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]); - t2 = AES_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]); - t3 = AES_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]); + t0 = AES_ROUND(T, w0, w3, w2, w1, p[0]); + t1 = AES_ROUND(T, w1, w0, w3, w2, p[1]); + t2 = AES_ROUND(T, w2, w1, w0, w3, p[2]); + t3 = AES_ROUND(T, w3, w2, w1, w0, p[3]); /* We could unroll the loop twice, to avoid these assignments. 
If all eight variables fit in registers, @@ -88,10 +89,10 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys, /* Final round */ - t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[4*i]); - t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]); - t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]); - t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]); + t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, p[0]); + t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, p[1]); + t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, p[2]); + t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, p[3]); LE_WRITE_UINT32(dst, t0); LE_WRITE_UINT32(dst + 4, t1); diff --git a/aes-internal.h b/aes-internal.h index 64cf7be56ed1d27ecc414154866770b977593e99..442b045b279121612cf3b2a17177ecf852f2a1e5 100644 --- a/aes-internal.h +++ b/aes-internal.h @@ -66,6 +66,8 @@ _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src); +/* The keys pointer points at the subkeys for the first decrypt round, + located at the end of the array. */ void _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys, const struct aes_table *T, diff --git a/aes-invert-internal.c b/aes-invert-internal.c index a2faefa4d0262f7c2ed3581f7f2b8889f0d37796..7364616cb84edaea7eacf7ac1a1f09000307d44c 100644 --- a/aes-invert-internal.c +++ b/aes-invert-internal.c @@ -111,9 +111,9 @@ static const uint32_t mtable[0x100] = 0xbe805d9f,0xb58d5491,0xa89a4f83,0xa397468d, }; -#define MIX_COLUMN(T, key) do { \ +#define MIX_COLUMN(T, out, in) do { \ uint32_t _k, _nk, _t; \ - _k = (key); \ + _k = (in); \ _nk = T[_k & 0xff]; \ _k >>= 8; \ _t = T[_k & 0xff]; \ @@ -124,7 +124,7 @@ static const uint32_t mtable[0x100] = _k >>= 8; \ _t = T[_k & 0xff]; \ _nk ^= ROTL32(24, _t); \ - (key) = _nk; \ + (out) = _nk; \ } while(0) @@ -136,29 +136,13 @@ _nettle_aes_invert(unsigned rounds, uint32_t *dst, const uint32_t *src) { unsigned i; - /* Reverse the order of subkeys, in groups of 4. 
*/ - /* FIXME: Instead of reordering the subkeys, change the access order - of aes_decrypt, since it's a separate function anyway? */ - if (src == dst) - { - unsigned j, k; + /* Transform all subkeys but the first and last. */ + for (i = 4; i < 4 * rounds; i++) + MIX_COLUMN (mtable, dst[i], src[i]); - for (i = 0, j = rounds * 4; - i < j; - i += 4, j -= 4) - for (k = 0; k<4; k++) - SWAP(dst[i+k], dst[j+k]); - } - else + if (src != dst) { - unsigned k; - - for (i = 0; i <= rounds * 4; i += 4) - for (k = 0; k < 4; k++) - dst[i+k] = src[rounds * 4 - i + k]; + dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; + dst[i] = src[i]; dst[i+1] = src[i+1]; dst[i+2] = src[i+2]; dst[i+3] = src[i+3]; } - - /* Transform all subkeys but the first and last. */ - for (i = 4; i < 4 * rounds; i++) - MIX_COLUMN (mtable, dst[i]); } diff --git a/aes128-decrypt.c b/aes128-decrypt.c index 436438cc38cd36961ae01572bb629ea82a3a9762..96d6bc6ed57c64c1bd90b80266ce1e0f1f461ff6 100644 --- a/aes128-decrypt.c +++ b/aes128-decrypt.c @@ -54,6 +54,6 @@ nettle_aes128_decrypt(const struct aes128_ctx *ctx, const uint8_t *src) { assert(!(length % AES_BLOCK_SIZE) ); - _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table, - length, dst, src); + _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys + 4*_AES128_ROUNDS, + &_nettle_aes_decrypt_table, length, dst, src); } diff --git a/aes192-decrypt.c b/aes192-decrypt.c index 7746c76e574c09072c90836623e8c42d2581c82f..bf8702e4c679b0129210ad84efd3a0716dfa3562 100644 --- a/aes192-decrypt.c +++ b/aes192-decrypt.c @@ -54,6 +54,6 @@ nettle_aes192_decrypt(const struct aes192_ctx *ctx, const uint8_t *src) { assert(!(length % AES_BLOCK_SIZE) ); - _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table, - length, dst, src); + _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys + 4 * _AES192_ROUNDS, + &_nettle_aes_decrypt_table, length, dst, src); } diff --git a/aes256-decrypt.c b/aes256-decrypt.c index 
89411c10ee3ecad7688271f4996b5c268b2a4826..ae050797abacdbc1b5afe6346956be12f8c76a36 100644 --- a/aes256-decrypt.c +++ b/aes256-decrypt.c @@ -54,6 +54,6 @@ nettle_aes256_decrypt(const struct aes256_ctx *ctx, const uint8_t *src) { assert(!(length % AES_BLOCK_SIZE) ); - _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table, - length, dst, src); + _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys + 4 * _AES256_ROUNDS, + &_nettle_aes_decrypt_table, length, dst, src); } diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm index 1b04ed9a0a61aeb0c745e86fc39a24935c91c2d3..ed0d66f8cb87cf57f181549a9e5c5f4ca4730a6d 100644 --- a/arm/aes-decrypt-internal.asm +++ b/arm/aes-decrypt-internal.asm @@ -111,11 +111,12 @@ define(`AES_DECRYPT_ROUND', ` and T0, MASK, $1, ror #22 ldr T0, [TABLE, T0] - ldm $9!, {$1,$2,$3,$4} + ldm $9, {$1,$2,$3,$4} eor $8, $8, T0 sub TABLE, TABLE, #3072 eor $5, $5, $1 eor $6, $6, $2 + sub $9, $9, #16 eor $7, $7, $3 eor $8, $8, $4 ') @@ -142,7 +143,7 @@ PROLOGUE(_nettle_aes_decrypt) AES_LOAD(X0,KEY,W0) AES_LOAD(X0,KEY,W1) AES_LOAD(X0,KEY,W2) - AES_LOAD(X0,KEY,W3) + AES_LOAD_INCR(X0,KEY,W3, -28) str X0, FRAME_SRC diff --git a/arm/aes.m4 b/arm/aes.m4 index 95382de28122fcdf977e0e5ed90ee2031de38296..59706ba9e6784ee50dc2fc744d9b336a847098c6 100644 --- a/arm/aes.m4 +++ b/arm/aes.m4 @@ -1,6 +1,6 @@ C Loads one word, and adds it to the subkey. Uses T0 -C AES_LOAD(SRC, KEY, REG) -define(`AES_LOAD', ` +C AES_LOAD(SRC, KEY, REG, INCR) +define(`AES_LOAD_INCR', ` ldrb $3, [$1], #+1 ldrb T0, [$1], #+1 orr $3, T0, lsl #8 @@ -8,9 +8,13 @@ define(`AES_LOAD', ` orr $3, T0, lsl #16 ldrb T0, [$1], #+1 orr $3, T0, lsl #24 - ldr T0, [$2], #+4 + ldr T0, [$2], #$4 eor $3, T0 ') +C Loads one word, and adds it to the subkey. Uses T0 +C AES_LOAD(SRC, KEY, REG) +define(`AES_LOAD', `AES_LOAD_INCR($1, $2, $3, +4)') + C Stores one word. Destroys input. 
C AES_STORE(DST, X) define(`AES_STORE', ` diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm index e8c6e91ad67304dcd8d8cc097c6ce028252b101b..10e0373ed5d32745e8d9cce979ac278b629b8909 100644 --- a/arm/v6/aes-decrypt-internal.asm +++ b/arm/v6/aes-decrypt-internal.asm @@ -114,11 +114,12 @@ define(`AES_DECRYPT_ROUND', ` uxtb T0, $1, ror #24 ldr T0, [TABLE, T0, lsl #2] - ldm $9!, {$1,$2,$3,$4} + ldm $9, {$1,$2,$3,$4} eor $8, $8, T0 sub TABLE, TABLE, #3072 eor $5, $5, $1 eor $6, $6, $2 + sub $9, $9, #16 eor $7, $7, $3 eor $8, $8, $4 ') @@ -148,7 +149,7 @@ PROLOGUE(_nettle_aes_decrypt) AES_LOAD(SRC,KEY,W0) AES_LOAD(SRC,KEY,W1) AES_LOAD(SRC,KEY,W2) - AES_LOAD(SRC,KEY,W3) + AES_LOAD_INCR(SRC,KEY,W3, -28) str SRC, FRAME_SRC diff --git a/arm64/crypto/aes128-decrypt.asm b/arm64/crypto/aes128-decrypt.asm index aadfc480509b6aadec171808e5317c2ffbe7428e..70f2a64ece0f403b40b405fe37dca27c5bc49b8d 100644 --- a/arm64/crypto/aes128-decrypt.asm +++ b/arm64/crypto/aes128-decrypt.asm @@ -73,16 +73,16 @@ PROLOGUE(nettle_aes128_decrypt) L4B_loop: ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64 - AESD_ROUND_4B(S0,S1,S2,S3,K0) - AESD_ROUND_4B(S0,S1,S2,S3,K1) - AESD_ROUND_4B(S0,S1,S2,S3,K2) - AESD_ROUND_4B(S0,S1,S2,S3,K3) - AESD_ROUND_4B(S0,S1,S2,S3,K4) - AESD_ROUND_4B(S0,S1,S2,S3,K5) - AESD_ROUND_4B(S0,S1,S2,S3,K6) - AESD_ROUND_4B(S0,S1,S2,S3,K7) + AESD_ROUND_4B(S0,S1,S2,S3,K10) + AESD_ROUND_4B(S0,S1,S2,S3,K9) AESD_ROUND_4B(S0,S1,S2,S3,K8) - AESD_LAST_ROUND_4B(S0,S1,S2,S3,K9,K10) + AESD_ROUND_4B(S0,S1,S2,S3,K7) + AESD_ROUND_4B(S0,S1,S2,S3,K6) + AESD_ROUND_4B(S0,S1,S2,S3,K5) + AESD_ROUND_4B(S0,S1,S2,S3,K4) + AESD_ROUND_4B(S0,S1,S2,S3,K3) + AESD_ROUND_4B(S0,S1,S2,S3,K2) + AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0) st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64 @@ -97,16 +97,16 @@ L1B: L1B_loop: ld1 {S0.16b},[SRC],#16 - AESD_ROUND_1B(S0,K0) - AESD_ROUND_1B(S0,K1) - AESD_ROUND_1B(S0,K2) - AESD_ROUND_1B(S0,K3) - AESD_ROUND_1B(S0,K4) - AESD_ROUND_1B(S0,K5) - AESD_ROUND_1B(S0,K6) - 
AESD_ROUND_1B(S0,K7) + AESD_ROUND_1B(S0,K10) + AESD_ROUND_1B(S0,K9) AESD_ROUND_1B(S0,K8) - AESD_LAST_ROUND_1B(S0,K9,K10) + AESD_ROUND_1B(S0,K7) + AESD_ROUND_1B(S0,K6) + AESD_ROUND_1B(S0,K5) + AESD_ROUND_1B(S0,K4) + AESD_ROUND_1B(S0,K3) + AESD_ROUND_1B(S0,K2) + AESD_LAST_ROUND_1B(S0,K1,K0) st1 {S0.16b},[DST],#16 diff --git a/arm64/crypto/aes192-decrypt.asm b/arm64/crypto/aes192-decrypt.asm index 769edd15dbe65a363d9eba0e64aef659b57f1de0..b8eb30fd58776b321cd26ebd5d96466dd001d8e9 100644 --- a/arm64/crypto/aes192-decrypt.asm +++ b/arm64/crypto/aes192-decrypt.asm @@ -76,18 +76,18 @@ PROLOGUE(nettle_aes192_decrypt) L4B_loop: ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64 - AESD_ROUND_4B(S0,S1,S2,S3,K0) - AESD_ROUND_4B(S0,S1,S2,S3,K1) - AESD_ROUND_4B(S0,S1,S2,S3,K2) - AESD_ROUND_4B(S0,S1,S2,S3,K3) - AESD_ROUND_4B(S0,S1,S2,S3,K4) - AESD_ROUND_4B(S0,S1,S2,S3,K5) - AESD_ROUND_4B(S0,S1,S2,S3,K6) - AESD_ROUND_4B(S0,S1,S2,S3,K7) - AESD_ROUND_4B(S0,S1,S2,S3,K8) - AESD_ROUND_4B(S0,S1,S2,S3,K9) + AESD_ROUND_4B(S0,S1,S2,S3,K12) + AESD_ROUND_4B(S0,S1,S2,S3,K11) AESD_ROUND_4B(S0,S1,S2,S3,K10) - AESD_LAST_ROUND_4B(S0,S1,S2,S3,K11,K12) + AESD_ROUND_4B(S0,S1,S2,S3,K9) + AESD_ROUND_4B(S0,S1,S2,S3,K8) + AESD_ROUND_4B(S0,S1,S2,S3,K7) + AESD_ROUND_4B(S0,S1,S2,S3,K6) + AESD_ROUND_4B(S0,S1,S2,S3,K5) + AESD_ROUND_4B(S0,S1,S2,S3,K4) + AESD_ROUND_4B(S0,S1,S2,S3,K3) + AESD_ROUND_4B(S0,S1,S2,S3,K2) + AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0) st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64 @@ -102,18 +102,18 @@ L1B: L1B_loop: ld1 {S0.16b},[SRC],#16 - AESD_ROUND_1B(S0,K0) - AESD_ROUND_1B(S0,K1) - AESD_ROUND_1B(S0,K2) - AESD_ROUND_1B(S0,K3) - AESD_ROUND_1B(S0,K4) - AESD_ROUND_1B(S0,K5) - AESD_ROUND_1B(S0,K6) - AESD_ROUND_1B(S0,K7) - AESD_ROUND_1B(S0,K8) - AESD_ROUND_1B(S0,K9) + AESD_ROUND_1B(S0,K12) + AESD_ROUND_1B(S0,K11) AESD_ROUND_1B(S0,K10) - AESD_LAST_ROUND_1B(S0,K11,K12) + AESD_ROUND_1B(S0,K9) + AESD_ROUND_1B(S0,K8) + AESD_ROUND_1B(S0,K7) + AESD_ROUND_1B(S0,K6) + AESD_ROUND_1B(S0,K5) + AESD_ROUND_1B(S0,K4) 
+ AESD_ROUND_1B(S0,K3) + AESD_ROUND_1B(S0,K2) + AESD_LAST_ROUND_1B(S0,K1,K0) st1 {S0.16b},[DST],#16 diff --git a/arm64/crypto/aes256-decrypt.asm b/arm64/crypto/aes256-decrypt.asm index d787a74a40e7f2092612d6c63fc64d50d1e7c4f0..dc1633157bccf5701fc66efa18568e4461cb4de5 100644 --- a/arm64/crypto/aes256-decrypt.asm +++ b/arm64/crypto/aes256-decrypt.asm @@ -78,20 +78,20 @@ PROLOGUE(nettle_aes256_decrypt) L4B_loop: ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64 - AESD_ROUND_4B(S0,S1,S2,S3,K0) - AESD_ROUND_4B(S0,S1,S2,S3,K1) - AESD_ROUND_4B(S0,S1,S2,S3,K2) - AESD_ROUND_4B(S0,S1,S2,S3,K3) - AESD_ROUND_4B(S0,S1,S2,S3,K4) - AESD_ROUND_4B(S0,S1,S2,S3,K5) - AESD_ROUND_4B(S0,S1,S2,S3,K6) - AESD_ROUND_4B(S0,S1,S2,S3,K7) - AESD_ROUND_4B(S0,S1,S2,S3,K8) - AESD_ROUND_4B(S0,S1,S2,S3,K9) - AESD_ROUND_4B(S0,S1,S2,S3,K10) - AESD_ROUND_4B(S0,S1,S2,S3,K11) + AESD_ROUND_4B(S0,S1,S2,S3,K14) + AESD_ROUND_4B(S0,S1,S2,S3,K13) AESD_ROUND_4B(S0,S1,S2,S3,K12) - AESD_LAST_ROUND_4B(S0,S1,S2,S3,K13,K14) + AESD_ROUND_4B(S0,S1,S2,S3,K11) + AESD_ROUND_4B(S0,S1,S2,S3,K10) + AESD_ROUND_4B(S0,S1,S2,S3,K9) + AESD_ROUND_4B(S0,S1,S2,S3,K8) + AESD_ROUND_4B(S0,S1,S2,S3,K7) + AESD_ROUND_4B(S0,S1,S2,S3,K6) + AESD_ROUND_4B(S0,S1,S2,S3,K5) + AESD_ROUND_4B(S0,S1,S2,S3,K4) + AESD_ROUND_4B(S0,S1,S2,S3,K3) + AESD_ROUND_4B(S0,S1,S2,S3,K2) + AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0) st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64 @@ -106,20 +106,20 @@ L1B: L1B_loop: ld1 {S0.16b},[SRC],#16 - AESD_ROUND_1B(S0,K0) - AESD_ROUND_1B(S0,K1) - AESD_ROUND_1B(S0,K2) - AESD_ROUND_1B(S0,K3) - AESD_ROUND_1B(S0,K4) - AESD_ROUND_1B(S0,K5) - AESD_ROUND_1B(S0,K6) - AESD_ROUND_1B(S0,K7) - AESD_ROUND_1B(S0,K8) - AESD_ROUND_1B(S0,K9) - AESD_ROUND_1B(S0,K10) - AESD_ROUND_1B(S0,K11) + AESD_ROUND_1B(S0,K14) + AESD_ROUND_1B(S0,K13) AESD_ROUND_1B(S0,K12) - AESD_LAST_ROUND_1B(S0,K13,K14) + AESD_ROUND_1B(S0,K11) + AESD_ROUND_1B(S0,K10) + AESD_ROUND_1B(S0,K9) + AESD_ROUND_1B(S0,K8) + AESD_ROUND_1B(S0,K7) + AESD_ROUND_1B(S0,K6) + AESD_ROUND_1B(S0,K5) + 
AESD_ROUND_1B(S0,K4) + AESD_ROUND_1B(S0,K3) + AESD_ROUND_1B(S0,K2) + AESD_LAST_ROUND_1B(S0,K1,K0) st1 {S0.16b},[DST],#16 diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm index 6336e5b78c01265e0b925100c5bee65f6a981678..d2a07a3824fc7005664975ff92c02595f891c9cb 100644 --- a/powerpc64/p8/aes-decrypt-internal.asm +++ b/powerpc64/p8/aes-decrypt-internal.asm @@ -115,7 +115,7 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) mtctr ROUNDS - li r9,0x10 + li r9,-0x10 .align 5 L8x_round_loop: @@ -123,7 +123,7 @@ L8x_round_loop: vperm K,K,K,SWAP_MASK OPN_XXY(vncipher, ZERO, S0, S1, S2, S3, S4, S5, S6, S7) OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) - addi r9,r9,0x10 + subi r9,r9,0x10 bdnz L8x_round_loop lxvd2x VSR(K),r9,KEYS @@ -172,14 +172,14 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') OPN_XXY(vxor, K, S0, S1, S2, S3) mtctr ROUNDS - li r9,0x10 + li r9,-0x10 .align 5 L4x_round_loop: lxvd2x VSR(K),r9,KEYS vperm K,K,K,SWAP_MASK OPN_XXY(vncipher, ZERO, S0, S1, S2, S3) OPN_XXY(vxor, K, S0, S1, S2, S3) - addi r9,r9,0x10 + subi r9,r9,0x10 bdnz L4x_round_loop lxvd2x VSR(K),r9,KEYS @@ -216,7 +216,7 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK vxor S1,S1,K mtctr ROUNDS - li r9,0x10 + li r9,-0x10 .align 5 L2x_round_loop: lxvd2x VSR(K),r9,KEYS @@ -225,7 +225,7 @@ L2x_round_loop: vncipher S1,S1,ZERO vxor S0,S0,K vxor S1,S1,K - addi r9,r9,0x10 + subi r9,r9,0x10 bdnz L2x_round_loop lxvd2x VSR(K),r9,KEYS @@ -258,14 +258,14 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK') vxor S0,S0,K mtctr ROUNDS - li r9,0x10 + li r9,-0x10 .align 5 L1x_round_loop: lxvd2x VSR(K),r9,KEYS vperm K,K,K,SWAP_MASK vncipher S0,S0,ZERO vxor S0,S0,K - addi r9,r9,0x10 + subi r9,r9,0x10 bdnz L1x_round_loop lxvd2x VSR(K),r9,KEYS diff --git a/sparc64/aes-decrypt-internal.asm b/sparc64/aes-decrypt-internal.asm index a43ced3646dbc54f34660672ffa8f0de64ece29f..8918de1baf84a9f700d76ca8ff8c508651e34a76 100644 --- 
a/sparc64/aes-decrypt-internal.asm +++ b/sparc64/aes-decrypt-internal.asm @@ -103,33 +103,32 @@ PROLOGUE(_nettle_aes_decrypt) mov ROUNDS, COUNT add SRC, 16, SRC - add KEY, 16, KEY + sub KEY, 32, KEY .Lround_loop: C The AES_ROUND macro uses T0,... T3 C Transform W -> X - AES_ROUND(0, W0, W3, W2, W1, KEY, X0) - AES_ROUND(1, W1, W0, W3, W2, KEY, X1) - AES_ROUND(2, W2, W1, W0, W3, KEY, X2) - AES_ROUND(3, W3, W2, W1, W0, KEY, X3) + AES_ROUND(4, W0, W3, W2, W1, KEY, X0) + AES_ROUND(5, W1, W0, W3, W2, KEY, X1) + AES_ROUND(6, W2, W1, W0, W3, KEY, X2) + AES_ROUND(7, W3, W2, W1, W0, KEY, X3) C Transform X -> W - AES_ROUND(4, X0, X3, X2, X1, KEY, W0) - AES_ROUND(5, X1, X0, X3, X2, KEY, W1) - AES_ROUND(6, X2, X1, X0, X3, KEY, W2) - AES_ROUND(7, X3, X2, X1, X0, KEY, W3) + AES_ROUND(0, X0, X3, X2, X1, KEY, W0) + AES_ROUND(1, X1, X0, X3, X2, KEY, W1) + AES_ROUND(2, X2, X1, X0, X3, KEY, W2) + AES_ROUND(3, X3, X2, X1, X0, KEY, W3) subcc COUNT, 1, COUNT bne .Lround_loop - add KEY, 32, KEY + sub KEY, 32, KEY C Penultimate round - AES_ROUND(0, W0, W3, W2, W1, KEY, X0) - AES_ROUND(1, W1, W0, W3, W2, KEY, X1) - AES_ROUND(2, W2, W1, W0, W3, KEY, X2) - AES_ROUND(3, W3, W2, W1, W0, KEY, X3) + AES_ROUND(4, W0, W3, W2, W1, KEY, X0) + AES_ROUND(5, W1, W0, W3, W2, KEY, X1) + AES_ROUND(6, W2, W1, W0, W3, KEY, X2) + AES_ROUND(7, W3, W2, W1, W0, KEY, X3) - add KEY, 16, KEY C Final round AES_FINAL_ROUND(0, T, X0, X3, X2, X1, KEY, DST) AES_FINAL_ROUND(1, T, X1, X0, X3, X2, KEY, DST) diff --git a/x86/aes-decrypt-internal.asm b/x86/aes-decrypt-internal.asm index 730ce2ed88c78aa32f6417f01e28e831065329eb..d8d980da8b4880a56e1c022ba38770fa93bfdf37 100644 --- a/x86/aes-decrypt-internal.asm +++ b/x86/aes-decrypt-internal.asm @@ -90,7 +90,7 @@ PROLOGUE(_nettle_aes_decrypt) jz .Lend shrl $4, PARAM_LENGTH - subl $1, PARAM_ROUNDS + decl PARAM_ROUNDS .Lblock_loop: movl PARAM_KEYS, KEY C address of subkeys @@ -103,7 +103,7 @@ PROLOGUE(_nettle_aes_decrypt) C Loop counter on stack movl TMP, FRAME_COUNT - addl 
$16,KEY C point to next key + subl $16,KEY C point to next key movl KEY,FRAME_KEY ALIGN(16) .Lround_loop: @@ -128,7 +128,7 @@ PROLOGUE(_nettle_aes_decrypt) xorl 4(KEY),SB xorl 8(KEY),SC xorl 12(KEY),SD - addl $16,FRAME_KEY C point to next key + subl $16,FRAME_KEY C point to next key decl FRAME_COUNT jnz .Lround_loop diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm index d3bedc25ac557d163e84b4bb5afa6607d5aea84c..afa44f3568d72bba9038d536d3d3f2c4075dfcce 100644 --- a/x86_64/aes-decrypt-internal.asm +++ b/x86_64/aes-decrypt-internal.asm @@ -83,8 +83,8 @@ PROLOGUE(_nettle_aes_decrypt) push %r15 subl $1, XREG(ROUNDS) - push ROUNDS C Rounds at (%rsp) - + push ROUNDS C Rounds stored at (%rsp) + mov PARAM_TABLE, TABLE mov PARAM_LENGTH, LENGTH shr $4, LENGTH @@ -96,7 +96,8 @@ PROLOGUE(_nettle_aes_decrypt) movl (%rsp), XREG(ROUNDS) - add $16, KEY C point to next key + sub $16, KEY C point to next key + ALIGN(16) .Lround_loop: AES_ROUND(TABLE, SA,SD,SC,SB, TA, TMP) @@ -113,7 +114,7 @@ PROLOGUE(_nettle_aes_decrypt) xorl 8(KEY),SC xorl 12(KEY),SD - add $16, KEY C point to next key + sub $16, KEY C point to next key decl XREG(ROUNDS) jnz .Lround_loop diff --git a/x86_64/aesni/aes128-decrypt.asm b/x86_64/aesni/aes128-decrypt.asm index 79111e470fcc092495cf344547b791486985d371..b200989403f80596bd68a5a588ace74f0722115b 100644 --- a/x86_64/aesni/aes128-decrypt.asm +++ b/x86_64/aesni/aes128-decrypt.asm @@ -64,17 +64,17 @@ PROLOGUE(nettle_aes128_decrypt) test LENGTH, LENGTH jz .Lend - movups (CTX), KEY0 - movups 16(CTX), KEY1 - movups 32(CTX), KEY2 - movups 48(CTX), KEY3 - movups 64(CTX), KEY4 + movups 160(CTX), KEY0 + movups 144(CTX), KEY1 + movups 128(CTX), KEY2 + movups 112(CTX), KEY3 + movups 96(CTX), KEY4 movups 80(CTX), KEY5 - movups 96(CTX), KEY6 - movups 112(CTX), KEY7 - movups 128(CTX), KEY8 - movups 144(CTX), KEY9 - movups 160(CTX), KEY10 + movups 64(CTX), KEY6 + movups 48(CTX), KEY7 + movups 32(CTX), KEY8 + movups 16(CTX), KEY9 + movups (CTX), 
KEY10 shr LENGTH jnc .Lblock_loop diff --git a/x86_64/aesni/aes192-decrypt.asm b/x86_64/aesni/aes192-decrypt.asm index 399f89b6ebe11d5473161eaecb22fa3d7951836b..24c178278876ccae8ab0a684651824cfd5340a53 100644 --- a/x86_64/aesni/aes192-decrypt.asm +++ b/x86_64/aesni/aes192-decrypt.asm @@ -66,19 +66,19 @@ PROLOGUE(nettle_aes192_decrypt) test LENGTH, LENGTH jz .Lend - movups (CTX), KEY0 - movups 16(CTX), KEY1 - movups 32(CTX), KEY2 - movups 48(CTX), KEY3 - movups 64(CTX), KEY4 - movups 80(CTX), KEY5 + movups 192(CTX), KEY0 + movups 176(CTX), KEY1 + movups 160(CTX), KEY2 + movups 144(CTX), KEY3 + movups 128(CTX), KEY4 + movups 112(CTX), KEY5 movups 96(CTX), KEY6 - movups 112(CTX), KEY7 - movups 128(CTX), KEY8 - movups 144(CTX), KEY9 - movups 160(CTX), KEY10 - movups 176(CTX), KEY11 - movups 192(CTX), KEY12 + movups 80(CTX), KEY7 + movups 64(CTX), KEY8 + movups 48(CTX), KEY9 + movups 32(CTX), KEY10 + movups 16(CTX), KEY11 + movups (CTX), KEY12 shr LENGTH jnc .Lblock_loop diff --git a/x86_64/aesni/aes256-decrypt.asm b/x86_64/aesni/aes256-decrypt.asm index 0fc5ad2a705f191a35982c5d9d3dda3328fee4f2..247655a33eec578954adb2772f3fe5659449d2c1 100644 --- a/x86_64/aesni/aes256-decrypt.asm +++ b/x86_64/aesni/aes256-decrypt.asm @@ -67,20 +67,20 @@ PROLOGUE(nettle_aes256_decrypt) test LENGTH, LENGTH jz .Lend - movups (CTX), KEY0_7 - movups 16(CTX), KEY1 - movups 32(CTX), KEY2 - movups 48(CTX), KEY3 - movups 64(CTX), KEY4 - movups 80(CTX), KEY5 - movups 96(CTX), KEY6 - movups 128(CTX), KEY8 - movups 144(CTX), KEY9 - movups 160(CTX), KEY10 - movups 176(CTX), KEY11 - movups 192(CTX), KEY12 - movups 208(CTX), KEY13 - movups 224(CTX), KEY14 + movups 224(CTX), KEY0_7 + movups 208(CTX), KEY1 + movups 192(CTX), KEY2 + movups 176(CTX), KEY3 + movups 160(CTX), KEY4 + movups 144(CTX), KEY5 + movups 128(CTX), KEY6 + movups 96(CTX), KEY8 + movups 80(CTX), KEY9 + movups 64(CTX), KEY10 + movups 48(CTX), KEY11 + movups 32(CTX), KEY12 + movups 16(CTX), KEY13 + movups (CTX), KEY14 shr LENGTH jnc 
.Lblock_loop @@ -95,7 +95,7 @@ PROLOGUE(nettle_aes256_decrypt) aesdec KEY5, X aesdec KEY6, X aesdec KEY0_7, X - movups (CTX), KEY0_7 + movups 224(CTX), KEY0_7 aesdec KEY8, X aesdec KEY9, X aesdec KEY10, X @@ -130,7 +130,7 @@ PROLOGUE(nettle_aes256_decrypt) aesdec KEY6, Y aesdec KEY0_7, X aesdec KEY0_7, Y - movups (CTX), KEY0_7 + movups 224(CTX), KEY0_7 aesdec KEY8, X aesdec KEY8, Y aesdec KEY9, X