diff --git a/ChangeLog b/ChangeLog
index 93dea76261fdbbdcfc19bd009575d40eb95ed563..7c6a3d7f155b1e54e4a4451e161c17f2d02ab3f8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2012-10-27  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/salsa20-crypt.asm: Include x86_64/salsa20.m4.
+	Make all exits go via .Lend and W64_EXIT.
+
+	* x86_64/salsa20.m4: New file, extracted from
+	x86_64/salsa20-crypt.asm.
+
 2012-10-26  Niels Möller  <nisse@lysator.liu.se>
 
 	* configure.ac (LIBNETTLE_LINK, LIBHOGWEED_LIBS): Add $(CFLAGS) on
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
index 2e059a70a43cc8042716403433cba8bb185cc0ba..5d11980475d72f0350f591235300d471221be1bb 100644
--- a/x86_64/salsa20-crypt.asm
+++ b/x86_64/salsa20-crypt.asm
@@ -34,6 +34,8 @@ define(<M0110>, <%xmm7>)
 define(<M0011>, <%xmm8>)
 define(<COUNT>, <%rax>)
 
+include_src(<x86_64/salsa20.m4>)
+
 C Possible improvements:
 C 
 C Do two blocks (or more) at a time in parallel, to avoid limitations
@@ -43,52 +45,7 @@ C Avoid redoing the permutation of the input for each block (all but
 C the two counter words are constant). Could also keep the input in
 C registers.
 
-C QROUND(x0, x1, x2, x3)
-define(<QROUND>, <
-	movaps	$4, T0		C 0
-	paddd	$1, T0		C 1
-	movaps	T0, T1		C 2
-	pslld	<$>7, T0	C 2
-	psrld	<$>25, T1	C 3
-	pxor	T0, $2		C 3
-	pxor	T1, $2		C 4
-
-	movaps	$1, T0		C 0
-	paddd	$2, T0		C 5
-	movaps	T0, T1		C 6
-	pslld	<$>9, T0	C 6
-	psrld	<$>23, T1	C 7
-	pxor	T0, $3		C 7
-	pxor	T1, $3		C 8
-
-	movaps	$2, T0		C 0
-	paddd	$3, T0		C 9
-	movaps	T0, T1		C 10
-	pslld	<$>13, T0	C 10
-	psrld	<$>19, T1	C 11
-	pxor	T0, $4		C 11
-	pxor	T1, $4		C 12
-
-	movaps	$3, T0		C 0
-	paddd	$4, T0		C 13
-	movaps	T0, T1		C 14
-	pslld	<$>18, T0	C 14
-	psrld	<$>14, T1	C 15
-	pxor	T0, $1		C 15
-	pxor	T1, $1		C 16
->)
-
-C SWAP(x0, x1, mask)
-C Swaps bits in x0 and x1, with bits selected by the mask
-define(<SWAP>, <
-	movaps	$1, T0
-	pxor	$2, $1
-	pand	$3, $1
-	pxor	$1, $2
-	pxor	T0, $1
->)
-
-	.file "salsa20.asm"
+	.file "salsa20-crypt.asm"
 	
 	C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
 	C		uint8_t *dst, const uint8_t *src)
@@ -115,18 +72,18 @@ PROLOGUE(nettle_salsa20_crypt)
 
 	C On input, each xmm register is one row. We start with
 	C
-	C	 0  1  2  3
-	C	 4  5  6  7
-	C	 8  9 10 11
-	C	12 13 14 15
+	C	 0  1  2  3	C K K K
+	C	 4  5  6  7	K C I I
+	C	 8  9 10 11	B B C K
+	C	12 13 14 15	K K K C
 	C
 	C Diagrams are in little-endian order, with least significant word to
 	C the left. We rotate the columns, to get instead
 	C
-	C	 0  5 10 15
-	C	 4  9 14  3
-	C	 8 13  2  7
-	C	12  1  6 11
+	C	 0  5 10 15	C C C C
+	C	 4  9 14  3	K B K K
+	C	 8 13  2  7	B K K I
+	C	12  1  6 11	K K I K
 	C 
 	C The original rows are now diagonals.
 	SWAP(X0, X1, M0101)
@@ -267,10 +224,10 @@ PROLOGUE(nettle_salsa20_crypt)
 	shr	$16, XREG(T64)
 .Llt2:
 	test	$1, LENGTH
-	jz	.Lpartial_done
+	jz	.Lend
 	xor	(SRC, POS), LREG(T64)
 	mov	LREG(T64), (DST, POS)
-.Lpartial_done:
-	ret
+
+	jmp	.Lend
 
 EPILOGUE(nettle_salsa20_crypt)
diff --git a/x86_64/salsa20.m4 b/x86_64/salsa20.m4
new file mode 100644
index 0000000000000000000000000000000000000000..708c48969257a19a9482f86393b89fbbd194a9a9
--- /dev/null
+++ b/x86_64/salsa20.m4
@@ -0,0 +1,45 @@
+C Needs T0 and T1: both are used as scratch and overwritten by the macros below.
+C QROUND(x0, x1, x2, x3), per 32-bit lane: x1 ^= rotl(x3+x0, 7); x2 ^= rotl(x0+x1, 9); x3 ^= rotl(x1+x2, 13); x0 ^= rotl(x2+x3, 18)
+define(<QROUND>, <
+	movaps	$4, T0		C 0
+	paddd	$1, T0		C 1
+	movaps	T0, T1		C 2
+	pslld	<$>7, T0	C 2
+	psrld	<$>25, T1	C 3
+	pxor	T0, $2		C 3
+	pxor	T1, $2		C 4
+
+	movaps	$1, T0		C 0
+	paddd	$2, T0		C 5
+	movaps	T0, T1		C 6
+	pslld	<$>9, T0	C 6
+	psrld	<$>23, T1	C 7
+	pxor	T0, $3		C 7
+	pxor	T1, $3		C 8
+
+	movaps	$2, T0		C 0
+	paddd	$3, T0		C 9
+	movaps	T0, T1		C 10
+	pslld	<$>13, T0	C 10
+	psrld	<$>19, T1	C 11
+	pxor	T0, $4		C 11
+	pxor	T1, $4		C 12
+
+	movaps	$3, T0		C 0
+	paddd	$4, T0		C 13
+	movaps	T0, T1		C 14
+	pslld	<$>18, T0	C 14
+	psrld	<$>14, T1	C 15
+	pxor	T0, $1		C 15
+	pxor	T1, $1		C 16
+>)
+
+C SWAP(x0, x1, mask): exchanges the bits of x0 and x1 at positions where mask is set.
+C Bits where mask is clear are left unchanged; mask is only read. Overwrites T0.
+define(<SWAP>, <
+	movaps	$1, T0
+	pxor	$2, $1
+	pand	$3, $1
+	pxor	$1, $2
+	pxor	T0, $1
+>)