diff --git a/ChangeLog b/ChangeLog
index 7ceffa1549c4441a0cd71027d1aa265dacc6516e..d24b28bfa096008620c664c5828b5472601b147e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,8 @@
 	size allowed.
 	(U8TO32_LITTLE): Deleted macro. Use LE_READ_UINT32 instead, which
 	avoids unaligned reads.
+	(salsa20_set_key): Rearranged slightly, to avoid unnecessary
+	byte-to-word conversions.
 
 	* nettle-internal.c (salsa20_set_key_hack): Updated salsa20_set_iv
 	call.
diff --git a/salsa20.c b/salsa20.c
index c93d47332714c487ffbec451e685bbcb339388c4..c1584fbe2e6abbe1b15da92307a3ebef64449313 100644
--- a/salsa20.c
+++ b/salsa20.c
@@ -95,15 +95,20 @@ static void salsa20_wordtobyte(uint8_t output[SALSA20_BLOCK_SIZE],const uint32_t
   for (i = 0;i < _SALSA20_INPUT_LENGTH;++i) U32TO8_LITTLE(output + 4 * i,x[i]);
 }
 
-static const char sigma[_SALSA20_INPUT_LENGTH] = "expand 32-byte k";
-static const char tau[_SALSA20_INPUT_LENGTH] = "expand 16-byte k";
-
 void
 salsa20_set_key(struct salsa20_ctx *ctx,
 		unsigned length, const uint8_t *key)
 {
-  const char *constants;
-
+  static const uint32_t sigma[4] = {
+    /* "expand 32-byte k" */
+    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+  };
+  static const uint32_t tau[4] = {
+    /* "expand 16-byte k" */
+    0x61707865, 0x3120646e, 0x79622d36, 0x6b206574
+  };
+  const uint32_t *constants;
+  
   assert (length == SALSA20_MIN_KEY_SIZE || length == SALSA20_MAX_KEY_SIZE);
 
   ctx->input[1] = LE_READ_UINT32(key + 0);
@@ -111,19 +116,22 @@ salsa20_set_key(struct salsa20_ctx *ctx,
   ctx->input[3] = LE_READ_UINT32(key + 8);
   ctx->input[4] = LE_READ_UINT32(key + 12);
   if (length == SALSA20_MAX_KEY_SIZE) { /* recommended */
-    key += 16;
+    ctx->input[11] = LE_READ_UINT32(key + 16);
+    ctx->input[12] = LE_READ_UINT32(key + 20);
+    ctx->input[13] = LE_READ_UINT32(key + 24);
+    ctx->input[14] = LE_READ_UINT32(key + 28);
     constants = sigma;
   } else { /* kbits == 128 */
+    ctx->input[11] = ctx->input[1];
+    ctx->input[12] = ctx->input[2];
+    ctx->input[13] = ctx->input[3];
+    ctx->input[14] = ctx->input[4];
     constants = tau;
   }
-  ctx->input[11] = LE_READ_UINT32(key + 0);
-  ctx->input[12] = LE_READ_UINT32(key + 4);
-  ctx->input[13] = LE_READ_UINT32(key + 8);
-  ctx->input[14] = LE_READ_UINT32(key + 12);
-  ctx->input[0]  = LE_READ_UINT32(constants + 0);
-  ctx->input[5]  = LE_READ_UINT32(constants + 4);
-  ctx->input[10] = LE_READ_UINT32(constants + 8);
-  ctx->input[15] = LE_READ_UINT32(constants + 12);
+  ctx->input[0]  = constants[0];
+  ctx->input[5]  = constants[1];
+  ctx->input[10] = constants[2];
+  ctx->input[15] = constants[3];
 }
 
 void