diff --git a/ChangeLog b/ChangeLog index d24b28bfa096008620c664c5828b5472601b147e..b1186f930c68995842d26d63983c33c827456d48 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,14 @@ (salsa20_set_key): Rearranged slightly, to avoid unnecessary byte-to-word conversions. + (LE_SWAP32): Renamed macro from... + (U32TO32_LITTLE): ... old name. + (U32TO8_LITTLE): Deleted macro. + (salsa20_wordtobyte): Renamed function to... + (salsa20_hash): ... new name. Changed output argument from byte + array to word array. Use memxor3, which brings a considerable + performance gain. + * nettle-internal.c (salsa20_set_key_hack): Updated salsa20_set_iv call. * testsuite/salsa20-test.c (test_salsa20): Deleted iv_length diff --git a/salsa20.c b/salsa20.c index c1584fbe2e6abbe1b15da92307a3ebef64449313..84e70f1e156ee875e1b414aaa12d3f8d382e23b1 100644 --- a/salsa20.c +++ b/salsa20.c @@ -34,29 +34,29 @@ #endif #include <assert.h> +#include <string.h> #include "salsa20.h" #include "macros.h" +#include "memxor.h" -#define SWAP32(v) \ +#ifdef WORDS_BIGENDIAN +#define LE_SWAP32(v) ((ROTL32(8, v) & 0x00FF00FFUL) | \ (ROTL32(24, v) & 0xFF00FF00UL)) - -#ifdef WORDS_BIGENDIAN -#define U32TO32_LITTLE(v) SWAP32(v) #else -#define U32TO32_LITTLE(v) (v) +#define LE_SWAP32(v) (v) #endif -#define U32TO8_LITTLE(p, v) (((uint32_t*)(p))[0] = U32TO32_LITTLE(v)) - -static void salsa20_wordtobyte(uint8_t output[SALSA20_BLOCK_SIZE],const uint32_t input[_SALSA20_INPUT_LENGTH]) +static void +salsa20_hash(uint32_t *output, const uint32_t *input) { uint32_t x[_SALSA20_INPUT_LENGTH]; int i; - for (i = 0;i < _SALSA20_INPUT_LENGTH;++i) x[i] = input[i]; + memcpy (x, input, sizeof (x)); + for (i = 20;i > 0;i -= 2) { x[ 4] ^= ROTL32( 7, x[ 0] + x[12]); x[ 8] ^= ROTL32( 9, x[ 4] + x[ 0]); @@ -91,8 +91,14 @@ static void salsa20_wordtobyte(uint8_t output[SALSA20_BLOCK_SIZE],const uint32_t x[14] ^= ROTL32(13, x[13] + x[12]); x[15] ^= ROTL32(18, x[14] + x[13]); } - for (i = 0;i < _SALSA20_INPUT_LENGTH;++i) x[i] = x[i] + input[i]; - for (i = 0;i < _SALSA20_INPUT_LENGTH;++i) U32TO8_LITTLE(output + 4 * i,x[i]); + for (i = 0;i < _SALSA20_INPUT_LENGTH;++i) + { + uint32_t t = x[i] + input[i]; + /* NOTE: We return a word array of byte-swapped values, rather + than using a byte array and LE_WRITE_UINT32, to avoid having + to care about unaligned bytes. */ + output[i] = LE_SWAP32 (t); + } } void @@ -149,24 +155,27 @@ salsa20_crypt(struct salsa20_ctx *ctx, uint8_t *c, const uint8_t *m) { - uint8_t output[SALSA20_BLOCK_SIZE]; - unsigned i; - - if (!length) return; - for (;;) { - salsa20_wordtobyte(output,ctx->input); - ctx->input[8]++; - if (!ctx->input[8]) { - ctx->input[9]++; + uint32_t output[_SALSA20_INPUT_LENGTH]; + + if (!length) + return; + + for (;;) + { + salsa20_hash(output,ctx->input); + ctx->input[9] += (++ctx->input[8] == 0); + /* stopping at 2^70 length per nonce is user's responsibility */ - } - if (length <= SALSA20_BLOCK_SIZE) { - for (i = 0;i < length;++i) c[i] = m[i] ^ output[i]; - return; - } - for (i = 0;i < SALSA20_BLOCK_SIZE;++i) c[i] = m[i] ^ output[i]; - length -= SALSA20_BLOCK_SIZE; - c += SALSA20_BLOCK_SIZE; - m += SALSA20_BLOCK_SIZE; + + if (length <= SALSA20_BLOCK_SIZE) + { + memxor3 (c, m, (uint8_t *) output, length); + return; + } + memxor3 (c, m, (uint8_t *) output, SALSA20_BLOCK_SIZE); + + length -= SALSA20_BLOCK_SIZE; + c += SALSA20_BLOCK_SIZE; + m += SALSA20_BLOCK_SIZE; } }