diff --git a/ChangeLog b/ChangeLog
index bedbad0a47ce7808c40c355b7723afff47e45b29..e5b830948ce453a92bb1a28a5954203b9dc0d436 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2012-03-31  Niels Möller  <nisse@lysator.liu.se>
 
+	* salsa20.c (ROTL32): Deleted macro; use the one from macros.h
+	instead, with reversed argument order (rotation count first).
+	(ROTATE, XOR, PLUS, PLUSONE): Deleted macros; use ROTL32 and
+	built-in operators directly.
+
 	Unification of rotation macros.
 	* macros.h (ROTL32): New macro, to replace (almost) all other
 	rotation macros.
diff --git a/salsa20.c b/salsa20.c
index f81b755be88fb802aa81299ad0b39c0267bee76c..af452439081d29d58291048b345bf9b3b645a50b 100644
--- a/salsa20.c
+++ b/salsa20.c
@@ -37,11 +37,11 @@
 
 #include "salsa20.h"
 
-#define ROTL32(x,n) ((((x))<<(n)) | (((x))>>(32-(n))))
+#include "macros.h"
 
 #define SWAP32(v)				\
-  ((ROTL32(v,  8) & 0x00FF00FFUL) |		\
-   (ROTL32(v, 24) & 0xFF00FF00UL))
+  ((ROTL32(8,  v) & 0x00FF00FFUL) |		\
+   (ROTL32(24, v) & 0xFF00FF00UL))
 
 #ifdef WORDS_BIGENDIAN
 #define U32TO32_LITTLE(v) SWAP32(v)
@@ -52,17 +52,6 @@
 #define U8TO32_LITTLE(p) U32TO32_LITTLE(((uint32_t*)(p))[0])
 #define U32TO8_LITTLE(p, v) (((uint32_t*)(p))[0] = U32TO32_LITTLE(v))
 
-/*
-salsa20-ref.c version 20051118
-D. J. Bernstein
-Public domain.
-*/
-
-#define ROTATE(v,c) (ROTL32(v,c))
-#define XOR(v,w) ((v) ^ (w))
-#define PLUS(v,w) ((v) + (w))
-#define PLUSONE(v) (PLUS((v),1))
-
 static void salsa20_wordtobyte(uint8_t output[64],const uint32_t input[16])
 {
   uint32_t x[16];
@@ -70,40 +59,40 @@ static void salsa20_wordtobyte(uint8_t output[64],const uint32_t input[16])
 
   for (i = 0;i < 16;++i) x[i] = input[i];
   for (i = 20;i > 0;i -= 2) {
-    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
-    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
-    x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
-    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
-    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
-    x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
-    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
-    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
-    x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
-    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
-    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
-    x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
-    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
-    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
-    x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
-    x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));
-    x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
-    x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
-    x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
-    x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
-    x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
-    x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
-    x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
-    x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
-    x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
-    x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
-    x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
-    x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
-    x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
-    x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
-    x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
-    x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));
+    x[ 4] ^= ROTL32( 7, x[ 0] + x[12]);
+    x[ 8] ^= ROTL32( 9, x[ 4] + x[ 0]);
+    x[12] ^= ROTL32(13, x[ 8] + x[ 4]);
+    x[ 0] ^= ROTL32(18, x[12] + x[ 8]);
+    x[ 9] ^= ROTL32( 7, x[ 5] + x[ 1]);
+    x[13] ^= ROTL32( 9, x[ 9] + x[ 5]);
+    x[ 1] ^= ROTL32(13, x[13] + x[ 9]);
+    x[ 5] ^= ROTL32(18, x[ 1] + x[13]);
+    x[14] ^= ROTL32( 7, x[10] + x[ 6]);
+    x[ 2] ^= ROTL32( 9, x[14] + x[10]);
+    x[ 6] ^= ROTL32(13, x[ 2] + x[14]);
+    x[10] ^= ROTL32(18, x[ 6] + x[ 2]);
+    x[ 3] ^= ROTL32( 7, x[15] + x[11]);
+    x[ 7] ^= ROTL32( 9, x[ 3] + x[15]);
+    x[11] ^= ROTL32(13, x[ 7] + x[ 3]);
+    x[15] ^= ROTL32(18, x[11] + x[ 7]);
+    x[ 1] ^= ROTL32( 7, x[ 0] + x[ 3]);
+    x[ 2] ^= ROTL32( 9, x[ 1] + x[ 0]);
+    x[ 3] ^= ROTL32(13, x[ 2] + x[ 1]);
+    x[ 0] ^= ROTL32(18, x[ 3] + x[ 2]);
+    x[ 6] ^= ROTL32( 7, x[ 5] + x[ 4]);
+    x[ 7] ^= ROTL32( 9, x[ 6] + x[ 5]);
+    x[ 4] ^= ROTL32(13, x[ 7] + x[ 6]);
+    x[ 5] ^= ROTL32(18, x[ 4] + x[ 7]);
+    x[11] ^= ROTL32( 7, x[10] + x[ 9]);
+    x[ 8] ^= ROTL32( 9, x[11] + x[10]);
+    x[ 9] ^= ROTL32(13, x[ 8] + x[11]);
+    x[10] ^= ROTL32(18, x[ 9] + x[ 8]);
+    x[12] ^= ROTL32( 7, x[15] + x[14]);
+    x[13] ^= ROTL32( 9, x[12] + x[15]);
+    x[14] ^= ROTL32(13, x[13] + x[12]);
+    x[15] ^= ROTL32(18, x[14] + x[13]);
   }
-  for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]);
+  for (i = 0;i < 16;++i) x[i] = x[i] + input[i];
   for (i = 0;i < 16;++i) U32TO8_LITTLE(output + 4 * i,x[i]);
 }
 
@@ -161,9 +150,9 @@ salsa20_crypt(struct salsa20_ctx *ctx,
   if (!length) return;
   for (;;) {
     salsa20_wordtobyte(output,ctx->input);
-    ctx->input[8] = PLUSONE(ctx->input[8]);
+    ctx->input[8]++;
     if (!ctx->input[8]) {
-      ctx->input[9] = PLUSONE(ctx->input[9]);
+      ctx->input[9]++;
       /* stopping at 2^70 length per nonce is user's responsibility */
     }
     if (length <= 64) {