diff --git a/ChangeLog b/ChangeLog
index bb299f1c26d836e9cd61e47ff51840c22fa38fce..816aa79a039825559392c76525849a6583bd3cc1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,50 @@
+2015-01-11  Niels Möller  <nisse@lysator.liu.se>
+
+	Merged memxor-reorg changes, starting at 2014-10-23.
+
+2015-01-10  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/memxor.asm (memxor3): Moved to new file.
+	* arm/memxor3.asm: New file.
+
+2014-11-24  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/memxor3.asm (memxor3): New file, code moved from old
+	memxor.asm.
+	* x86_64/memxor.asm (memxor): Rewritten, no longer jumps into
+	memxor3.
+
+	* configure.ac (asm_replace_list): Added memxor.asm and
+	memxor3.asm.
+
+2014-10-23  Niels Möller  <nisse@lysator.liu.se>
+
+	* configure.ac (IF_ASM): New substituted variable.
+	* testsuite/Makefile.in (VALGRIND): Allow partial loads only when
+	build includes assembly files.
+
+	* memxor-internal.h (READ_PARTIAL): New macro.
+	* memxor.c (memxor_different_alignment): Avoid out-of-bounds
+	reads, corresponding to valgrind's --partial-loads-ok. Use
+	READ_PARTIAL.
+	* memxor3.c: Analogous changes for unaligned operations.
+
+	* configure.ac (asm_replace_list): Deleted memxor.asm, now
+	incompatible with the memxor/memxor3 split.
+
+	* memxor3.c: New file, split off from memxor.c.
+	* memxor-internal.h: New file, declarations shared by memxor.c and
+	memxor3.c.
+	* memxor.c: memxor3 functions moved out of this file.
+	* Makefile.in (nettle_SOURCES): Added memxor3.c.
+	(DISTFILES): Added memxor-internal.h.
+
+	* memxor.c (memxor_common_alignment, memxor_different_alignment)
+	(memxor): Change loop order, iterate from the end.
+	(memxor3_common_alignment): Unroll twice.
+	(word_t): On x86_64, unconditionally define as uint64_t, to get 64
+	bits also on MS Windows. Replaced all uses of SIZEOF_LONG.
+
 2014-12-12  Niels Möller  <nisse@lysator.liu.se>
 
 	* cbc.h (CBC_ENCRYPT, CBC_DECRYPT): Make type-checking hack
diff --git a/Makefile.in b/Makefile.in
index 10a58b0b269fc16e0be54be0756dea58fa46e8e7..0d3c424622b1f0b413816c3c01f9fa0155da4d2b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -109,7 +109,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \
 		 knuth-lfib.c \
 		 md2.c md2-meta.c md4.c md4-meta.c \
 		 md5.c md5-compress.c md5-compat.c md5-meta.c \
-		 memxor.c \
+		 memxor.c memxor3.c \
 		 nettle-meta-aeads.c nettle-meta-armors.c \
 		 nettle-meta-ciphers.c nettle-meta-hashes.c \
 		 pbkdf2.c pbkdf2-hmac-sha1.c pbkdf2-hmac-sha256.c \
@@ -216,7 +216,7 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \
 	$(des_headers) descore.README \
 	aes-internal.h camellia-internal.h serpent-internal.h \
 	cast128_sboxes.h desinfo.h desCode.h \
-	nettle-internal.h nettle-write.h \
+	memxor-internal.h nettle-internal.h nettle-write.h \
 	gmp-glue.h ecc-internal.h \
 	mini-gmp.h mini-gmp.c asm.m4 \
 	nettle.texinfo nettle.info nettle.html nettle.pdf sha-example.c
diff --git a/arm/memxor.asm b/arm/memxor.asm
index fd0f633095670399d7c274c27d0ceb3c19516e53..a50e91bc4a463dc71bddd58df490bc083b959681 100644
--- a/arm/memxor.asm
+++ b/arm/memxor.asm
@@ -227,274 +227,3 @@ PROLOGUE(nettle_memxor)
 	b	.Lmemxor_bytes
 	
 EPILOGUE(nettle_memxor)
-
-define(<DST>, <r0>)
-define(<AP>, <r1>)
-define(<BP>, <r2>)
-define(<N>, <r3>)
-undefine(<CNT>)
-undefine(<TNC>)
-
-C Temporaries r4-r7
-define(<ACNT>, <r8>)
-define(<ATNC>, <r10>)
-define(<BCNT>, <r11>)
-define(<BTNC>, <r12>)
-
-	C memxor3(void *dst, const void *a, const void *b, size_t n)
-	.align 2
-PROLOGUE(nettle_memxor3)
-	cmp	N, #0
-	beq	.Lmemxor3_ret
-
-	push	{r4,r5,r6,r7,r8,r10,r11}
-	cmp	N, #7
-
-	add	AP, N
-	add	BP, N
-	add	DST, N
-
-	bcs	.Lmemxor3_large
-
-	C Simple byte loop
-.Lmemxor3_bytes:
-	ldrb	r4, [AP, #-1]!
-	ldrb	r5, [BP, #-1]!
-	eor	r4, r5
-	strb	r4, [DST, #-1]!
-	subs	N, #1
-	bne	.Lmemxor3_bytes
-
-.Lmemxor3_done:
-	pop	{r4,r5,r6,r7,r8,r10,r11}
-.Lmemxor3_ret:
-	bx	lr
-
-.Lmemxor3_align_loop:
-	ldrb	r4, [AP, #-1]!
-	ldrb	r5, [BP, #-1]!
-	eor	r5, r4
-	strb	r5, [DST, #-1]!
-	sub	N, #1
-
-.Lmemxor3_large:
-	tst	DST, #3
-	bne	.Lmemxor3_align_loop
-
-	C We have at least 4 bytes left to do here.
-	sub	N, #4
-	ands	ACNT, AP, #3
-	lsl	ACNT, #3
-	beq	.Lmemxor3_a_aligned
-
-	ands	BCNT, BP, #3
-	lsl	BCNT, #3
-	bne	.Lmemxor3_uu
-
-	C Swap
-	mov	r4, AP
-	mov	AP, BP
-	mov	BP, r4
-
-.Lmemxor3_au:
-	C NOTE: We have the relevant shift count in ACNT, not BCNT
-
-	C AP is aligned, BP is not
-	C           v original SRC
-	C +-------+------+
-	C |SRC-4  |SRC   |
-	C +---+---+------+
-	C     |DST-4  |
-	C     +-------+
-	C
-	C With little-endian, we need to do
-	C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
-	rsb	ATNC, ACNT, #32
-	bic	BP, #3
-
-	ldr	r4, [BP]
-
-	tst	N, #4
-	itet	eq
-	moveq	r5, r4
-	subne	N, #4
-	beq	.Lmemxor3_au_odd
-
-.Lmemxor3_au_loop:
-	ldr	r5, [BP, #-4]!
-	ldr	r6, [AP, #-4]!
-	eor	r6, r6, r4, lsl ATNC
-	eor	r6, r6, r5, lsr ACNT
-	str	r6, [DST, #-4]!
-.Lmemxor3_au_odd:
-	ldr	r4, [BP, #-4]!
-	ldr	r6, [AP, #-4]!
-	eor	r6, r6, r5, lsl ATNC
-	eor	r6, r6, r4, lsr ACNT
-	str	r6, [DST, #-4]!
-	subs	N, #8
-	bcs	.Lmemxor3_au_loop
-	adds	N, #8
-	beq	.Lmemxor3_done
-
-	C Leftover bytes in r4, low end
-	ldr	r5, [AP, #-4]
-	eor	r4, r5, r4, lsl ATNC
-
-.Lmemxor3_au_leftover:
-	C Store a byte at a time
-	ror	r4, #24
-	strb	r4, [DST, #-1]!
-	subs	N, #1
-	beq	.Lmemxor3_done
-	subs	ACNT, #8
-	sub	AP, #1
-	bne	.Lmemxor3_au_leftover
-	b	.Lmemxor3_bytes
-
-.Lmemxor3_a_aligned:
-	ands	ACNT, BP, #3
-	lsl	ACNT, #3
-	bne	.Lmemxor3_au ;
-
-	C a, b and dst all have the same alignment.
-	subs	N, #8
-	bcc	.Lmemxor3_aligned_word_end
-
-	C This loop runs at 8 cycles per iteration. It has been
-	C observed running at only 7 cycles, for this speed, the loop
-	C started at offset 0x2ac in the object file.
-
-	C FIXME: consider software pipelining, similarly to the memxor
-	C loop.
-	
-.Lmemxor3_aligned_word_loop:
-	ldmdb	AP!, {r4,r5,r6}
-	ldmdb	BP!, {r7,r8,r10}
-	subs	N, #12
-	eor	r4, r7
-	eor	r5, r8
-	eor	r6, r10
-	stmdb	DST!, {r4, r5,r6}
-	bcs	.Lmemxor3_aligned_word_loop
-
-.Lmemxor3_aligned_word_end:
-	C We have 0-11 bytes left to do, and N holds number of bytes -12.
-	adds	N, #4
-	bcc	.Lmemxor3_aligned_lt_8
-	C Do 8 bytes more, leftover is in N
-	ldmdb	AP!, {r4, r5}
-	ldmdb	BP!, {r6, r7}
-	eor	r4, r6
-	eor	r5, r7
-	stmdb	DST!, {r4,r5}
-	beq	.Lmemxor3_done
-	b	.Lmemxor3_bytes
-
-.Lmemxor3_aligned_lt_8:
-	adds	N, #4
-	bcc	.Lmemxor3_aligned_lt_4
-
-	ldr	r4, [AP,#-4]!
-	ldr	r5, [BP,#-4]!
-	eor	r4, r5
-	str	r4, [DST,#-4]!
-	beq	.Lmemxor3_done
-	b	.Lmemxor3_bytes
-
-.Lmemxor3_aligned_lt_4:
-	adds	N, #4	
-	beq	.Lmemxor3_done
-	b	.Lmemxor3_bytes
-
-.Lmemxor3_uu:
-
-	cmp	ACNT, BCNT
-	bic	AP, #3
-	bic	BP, #3
-	rsb	ATNC, ACNT, #32
-
-	bne	.Lmemxor3_uud
-
-	C AP and BP are unaligned in the same way
-
-	ldr	r4, [AP]
-	ldr	r6, [BP]
-	eor	r4, r6
-
-	tst	N, #4
-	itet	eq
-	moveq	r5, r4
-	subne	N, #4
-	beq	.Lmemxor3_uu_odd
-
-.Lmemxor3_uu_loop:
-	ldr	r5, [AP, #-4]!
-	ldr	r6, [BP, #-4]!
-	eor	r5, r6
-	lsl	r4, ATNC
-	eor	r4, r4, r5, lsr ACNT
-	str	r4, [DST, #-4]!
-.Lmemxor3_uu_odd:
-	ldr	r4, [AP, #-4]!
-	ldr	r6, [BP, #-4]!
-	eor	r4, r6
-	lsl	r5, ATNC
-	eor	r5, r5, r4, lsr ACNT
-	str	r5, [DST, #-4]!
-	subs	N, #8
-	bcs	.Lmemxor3_uu_loop
-	adds	N, #8
-	beq	.Lmemxor3_done
-
-	C Leftover bytes in a4, low end
-	ror	r4, ACNT
-.Lmemxor3_uu_leftover:
-	ror	r4, #24
-	strb	r4, [DST, #-1]!
-	subs	N, #1
-	beq	.Lmemxor3_done
-	subs	ACNT, #8
-	bne	.Lmemxor3_uu_leftover
-	b	.Lmemxor3_bytes
-
-.Lmemxor3_uud:
-	C Both AP and BP unaligned, and in different ways
-	rsb	BTNC, BCNT, #32
-
-	ldr	r4, [AP]
-	ldr	r6, [BP]
-
-	tst	N, #4
-	ittet	eq
-	moveq	r5, r4
-	moveq	r7, r6
-	subne	N, #4
-	beq	.Lmemxor3_uud_odd
-
-.Lmemxor3_uud_loop:
-	ldr	r5, [AP, #-4]!
-	ldr	r7, [BP, #-4]!
-	lsl	r4, ATNC
-	eor	r4, r4, r6, lsl BTNC
-	eor	r4, r4, r5, lsr ACNT
-	eor	r4, r4, r7, lsr BCNT
-	str	r4, [DST, #-4]!
-.Lmemxor3_uud_odd:
-	ldr	r4, [AP, #-4]!
-	ldr	r6, [BP, #-4]!
-	lsl	r5, ATNC
-	eor	r5, r5, r7, lsl BTNC
-	eor	r5, r5, r4, lsr ACNT
-	eor	r5, r5, r6, lsr BCNT
-	str	r5, [DST, #-4]!
-	subs	N, #8
-	bcs	.Lmemxor3_uud_loop
-	adds	N, #8
-	beq	.Lmemxor3_done
-
-	C FIXME: More clever left-over handling? For now, just adjust pointers.
-	add	AP, AP,	ACNT, lsr #3
-	add	BP, BP, BCNT, lsr #3
-	b	.Lmemxor3_bytes
-EPILOGUE(nettle_memxor3)
diff --git a/arm/memxor3.asm b/arm/memxor3.asm
new file mode 100644
index 0000000000000000000000000000000000000000..139fd2082278d3b7cb82739530706aabd3cdea44
--- /dev/null
+++ b/arm/memxor3.asm
@@ -0,0 +1,315 @@
+C arm/memxor3.asm
+
+ifelse(<
+   Copyright (C) 2013, 2015 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Possible speedups:
+C
+C The ldm instruction can load two registers per cycle, if the
+C address is two-word aligned, or three registers in two cycles,
+C regardless of alignment.
+
+C Register usage:
+
+define(<DST>, <r0>)
+define(<AP>, <r1>)
+define(<BP>, <r2>)
+define(<N>, <r3>)
+
+C Temporaries r4-r7
+define(<ACNT>, <r8>)
+define(<ATNC>, <r10>)
+define(<BCNT>, <r11>)
+define(<BTNC>, <r12>)
+
+	.syntax unified
+
+	.file "memxor3.asm"
+
+	.text
+	.arm
+
+	C memxor3(void *dst, const void *a, const void *b, size_t n)
+	.align 2
+PROLOGUE(nettle_memxor3)
+	cmp	N, #0
+	beq	.Lmemxor3_ret
+
+	push	{r4,r5,r6,r7,r8,r10,r11}
+	cmp	N, #7
+
+	add	AP, N
+	add	BP, N
+	add	DST, N
+
+	bcs	.Lmemxor3_large
+
+	C Simple byte loop
+.Lmemxor3_bytes:
+	ldrb	r4, [AP, #-1]!
+	ldrb	r5, [BP, #-1]!
+	eor	r4, r5
+	strb	r4, [DST, #-1]!
+	subs	N, #1
+	bne	.Lmemxor3_bytes
+
+.Lmemxor3_done:
+	pop	{r4,r5,r6,r7,r8,r10,r11}
+.Lmemxor3_ret:
+	bx	lr
+
+.Lmemxor3_align_loop:
+	ldrb	r4, [AP, #-1]!
+	ldrb	r5, [BP, #-1]!
+	eor	r5, r4
+	strb	r5, [DST, #-1]!
+	sub	N, #1
+
+.Lmemxor3_large:
+	tst	DST, #3
+	bne	.Lmemxor3_align_loop
+
+	C We have at least 4 bytes left to do here.
+	sub	N, #4
+	ands	ACNT, AP, #3
+	lsl	ACNT, #3
+	beq	.Lmemxor3_a_aligned
+
+	ands	BCNT, BP, #3
+	lsl	BCNT, #3
+	bne	.Lmemxor3_uu
+
+	C Swap
+	mov	r4, AP
+	mov	AP, BP
+	mov	BP, r4
+
+.Lmemxor3_au:
+	C NOTE: We have the relevant shift count in ACNT, not BCNT
+
+	C AP is aligned, BP is not
+	C           v original SRC
+	C +-------+------+
+	C |SRC-4  |SRC   |
+	C +---+---+------+
+	C     |DST-4  |
+	C     +-------+
+	C
+	C With little-endian, we need to do
+	C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
+	rsb	ATNC, ACNT, #32
+	bic	BP, #3
+
+	ldr	r4, [BP]
+
+	tst	N, #4
+	itet	eq
+	moveq	r5, r4
+	subne	N, #4
+	beq	.Lmemxor3_au_odd
+
+.Lmemxor3_au_loop:
+	ldr	r5, [BP, #-4]!
+	ldr	r6, [AP, #-4]!
+	eor	r6, r6, r4, lsl ATNC
+	eor	r6, r6, r5, lsr ACNT
+	str	r6, [DST, #-4]!
+.Lmemxor3_au_odd:
+	ldr	r4, [BP, #-4]!
+	ldr	r6, [AP, #-4]!
+	eor	r6, r6, r5, lsl ATNC
+	eor	r6, r6, r4, lsr ACNT
+	str	r6, [DST, #-4]!
+	subs	N, #8
+	bcs	.Lmemxor3_au_loop
+	adds	N, #8
+	beq	.Lmemxor3_done
+
+	C Leftover bytes in r4, low end
+	ldr	r5, [AP, #-4]
+	eor	r4, r5, r4, lsl ATNC
+
+.Lmemxor3_au_leftover:
+	C Store a byte at a time
+	ror	r4, #24
+	strb	r4, [DST, #-1]!
+	subs	N, #1
+	beq	.Lmemxor3_done
+	subs	ACNT, #8
+	sub	AP, #1
+	bne	.Lmemxor3_au_leftover
+	b	.Lmemxor3_bytes
+
+.Lmemxor3_a_aligned:
+	ands	ACNT, BP, #3
+	lsl	ACNT, #3
+	bne	.Lmemxor3_au
+
+	C a, b and dst all have the same alignment.
+	subs	N, #8
+	bcc	.Lmemxor3_aligned_word_end
+
+	C This loop runs at 8 cycles per iteration. It has been
+	C observed running at only 7 cycles; at that speed, the loop
+	C started at offset 0x2ac in the object file.
+
+	C FIXME: consider software pipelining, similarly to the memxor
+	C loop.
+
+.Lmemxor3_aligned_word_loop:
+	ldmdb	AP!, {r4,r5,r6}
+	ldmdb	BP!, {r7,r8,r10}
+	subs	N, #12
+	eor	r4, r7
+	eor	r5, r8
+	eor	r6, r10
+	stmdb	DST!, {r4, r5,r6}
+	bcs	.Lmemxor3_aligned_word_loop
+
+.Lmemxor3_aligned_word_end:
+	C We have 0-11 bytes left to do, and N holds the byte count minus 12.
+	adds	N, #4
+	bcc	.Lmemxor3_aligned_lt_8
+	C Do 8 bytes more, leftover is in N
+	ldmdb	AP!, {r4, r5}
+	ldmdb	BP!, {r6, r7}
+	eor	r4, r6
+	eor	r5, r7
+	stmdb	DST!, {r4,r5}
+	beq	.Lmemxor3_done
+	b	.Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_8:
+	adds	N, #4
+	bcc	.Lmemxor3_aligned_lt_4
+
+	ldr	r4, [AP,#-4]!
+	ldr	r5, [BP,#-4]!
+	eor	r4, r5
+	str	r4, [DST,#-4]!
+	beq	.Lmemxor3_done
+	b	.Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+	adds	N, #4
+	beq	.Lmemxor3_done
+	b	.Lmemxor3_bytes
+
+.Lmemxor3_uu:
+
+	cmp	ACNT, BCNT
+	bic	AP, #3
+	bic	BP, #3
+	rsb	ATNC, ACNT, #32
+
+	bne	.Lmemxor3_uud
+
+	C AP and BP are unaligned in the same way
+
+	ldr	r4, [AP]
+	ldr	r6, [BP]
+	eor	r4, r6
+
+	tst	N, #4
+	itet	eq
+	moveq	r5, r4
+	subne	N, #4
+	beq	.Lmemxor3_uu_odd
+
+.Lmemxor3_uu_loop:
+	ldr	r5, [AP, #-4]!
+	ldr	r6, [BP, #-4]!
+	eor	r5, r6
+	lsl	r4, ATNC
+	eor	r4, r4, r5, lsr ACNT
+	str	r4, [DST, #-4]!
+.Lmemxor3_uu_odd:
+	ldr	r4, [AP, #-4]!
+	ldr	r6, [BP, #-4]!
+	eor	r4, r6
+	lsl	r5, ATNC
+	eor	r5, r5, r4, lsr ACNT
+	str	r5, [DST, #-4]!
+	subs	N, #8
+	bcs	.Lmemxor3_uu_loop
+	adds	N, #8
+	beq	.Lmemxor3_done
+
+	C Leftover bytes in r4, low end
+	ror	r4, ACNT
+.Lmemxor3_uu_leftover:
+	ror	r4, #24
+	strb	r4, [DST, #-1]!
+	subs	N, #1
+	beq	.Lmemxor3_done
+	subs	ACNT, #8
+	bne	.Lmemxor3_uu_leftover
+	b	.Lmemxor3_bytes
+
+.Lmemxor3_uud:
+	C Both AP and BP unaligned, and in different ways
+	rsb	BTNC, BCNT, #32
+
+	ldr	r4, [AP]
+	ldr	r6, [BP]
+
+	tst	N, #4
+	ittet	eq
+	moveq	r5, r4
+	moveq	r7, r6
+	subne	N, #4
+	beq	.Lmemxor3_uud_odd
+
+.Lmemxor3_uud_loop:
+	ldr	r5, [AP, #-4]!
+	ldr	r7, [BP, #-4]!
+	lsl	r4, ATNC
+	eor	r4, r4, r6, lsl BTNC
+	eor	r4, r4, r5, lsr ACNT
+	eor	r4, r4, r7, lsr BCNT
+	str	r4, [DST, #-4]!
+.Lmemxor3_uud_odd:
+	ldr	r4, [AP, #-4]!
+	ldr	r6, [BP, #-4]!
+	lsl	r5, ATNC
+	eor	r5, r5, r7, lsl BTNC
+	eor	r5, r5, r4, lsr ACNT
+	eor	r5, r5, r6, lsr BCNT
+	str	r5, [DST, #-4]!
+	subs	N, #8
+	bcs	.Lmemxor3_uud_loop
+	adds	N, #8
+	beq	.Lmemxor3_done
+
+	C FIXME: More clever left-over handling? For now, just adjust pointers.
+	add	AP, AP,	ACNT, lsr #3
+	add	BP, BP, BCNT, lsr #3
+	b	.Lmemxor3_bytes
+EPILOGUE(nettle_memxor3)
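
Note on the shifted loops above: the au/uu/uud cases rebuild each
unaligned source word from two aligned loads, shifted by ACNT bits and
by the complementary count ATNC and then combined. This is the same
identity the MERGE macro in memxor-internal.h expresses in C. A minimal
standalone sketch of it, assuming a 64-bit little-endian host (buffer
contents and the offset are illustrative only):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint64_t word_t;

    /* Little-endian variant of MERGE from memxor-internal.h. */
    #define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))

    int
    main (void)
    {
      unsigned char buf[16];
      unsigned i, offset = 3;      /* operand starts 3 bytes into a word */
      int shl = 8 * offset;        /* plays the role of ACNT */
      int shr = 8 * (8 - offset);  /* plays the role of ATNC */
      word_t w0, w1, merged, direct;

      for (i = 0; i < sizeof (buf); i++)
        buf[i] = i;

      memcpy (&w0, buf, 8);        /* aligned word below the operand */
      memcpy (&w1, buf + 8, 8);    /* aligned word above it */
      merged = MERGE (w0, shl, w1, shr);  /* unaligned word at buf + 3 */
      memcpy (&direct, buf + offset, 8);
      printf ("%d\n", merged == direct);  /* prints 1 */
      return 0;
    }

The ARM code above does the same with 32-bit words and lsr/lsl,
iterating from the high end so that overlapping operands stay safe.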
diff --git a/configure.ac b/configure.ac
index bb33962cca1004143ccd18c49c859e0eb5e4e80e..76beb3160853abde4dfaafcb90fcc35a558c6d2f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -272,7 +272,8 @@ fi
 # to a new object file).
 asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		arcfour-crypt.asm camellia-crypt-internal.asm \
-		md5-compress.asm memxor.asm poly1305-internal.asm \
+		md5-compress.asm memxor.asm memxor3.asm \
+		poly1305-internal.asm \
 		chacha-core-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
@@ -356,6 +357,12 @@ fi
 AC_SUBST([OPT_ASM_NETTLE_SOURCES])
 AC_SUBST([OPT_ASM_HOGWEED_SOURCES])
 AC_SUBST([ASM_RODATA])
+if test "x$enable_assembler" = xyes ; then
+  IF_ASM=''
+else
+  IF_ASM='#'
+fi
+AC_SUBST([IF_ASM])
 
 AH_VERBATIM([HAVE_NATIVE],
 [/* Define to 1 each of the following for which a native (ie. CPU specific)
diff --git a/memxor-internal.h b/memxor-internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5e99073b594d50f7763d8aea8977c7c5dc201
--- /dev/null
+++ b/memxor-internal.h
@@ -0,0 +1,73 @@
+/* memxor-internal.h
+
+   Copyright (C) 2010, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_MEMXOR_INTERNAL_H_INCLUDED
+#define NETTLE_MEMXOR_INTERNAL_H_INCLUDED
+
+#include "nettle-types.h"
+
+/* The word_t type is intended to match the native word size. */
+#if defined(__x86_64__) || defined(__arch64__)
+/* Including on MS Windows, where unsigned long is only 32 bits. */
+typedef uint64_t word_t;
+#else
+typedef unsigned long int word_t;
+#endif
+
+#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t))
+
+#ifndef WORDS_BIGENDIAN
+#define MERGE(w0, sh_1, w1, sh_2) \
+  (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+#else
+#define MERGE(w0, sh_1, w1, sh_2) \
+  (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+#endif
+
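+/* Read exactly n bytes, 1 <= n < sizeof(word_t), from p into the low
+   end of the result, in native byte order. Never reads outside
+   p[0..n-1]. */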
+#ifndef WORDS_BIGENDIAN
+#define READ_PARTIAL(r,p,n) do {			\
+    word_t _rp_x;					\
+    unsigned _rp_i;					\
+    for (_rp_i = (n), _rp_x = (p)[--_rp_i]; _rp_i > 0;)	\
+      _rp_x = (_rp_x << CHAR_BIT) | (p)[--_rp_i];	\
+    (r) = _rp_x;					\
+  } while (0)
+#else
+#define READ_PARTIAL(r,p,n) do {			\
+    word_t _rp_x;						\
+    unsigned _rp_i;						\
+    for (_rp_x = (p)[0], _rp_i = 1; _rp_i < (n); _rp_i++)	\
+      _rp_x = (_rp_x << CHAR_BIT) | (p)[_rp_i];			\
+    (r) = _rp_x;						\
+  } while (0)
+#endif
+
+#endif /* NETTLE_MEMXOR_INTERNAL_H_INCLUDED */
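
For illustration, this is how READ_PARTIAL behaves on a little-endian
host. The macro is duplicated below only to keep the sketch
self-contained, and the buffer is hypothetical:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t word_t;

    /* Little-endian READ_PARTIAL, copied from memxor-internal.h. */
    #define READ_PARTIAL(r,p,n) do {                            \
        word_t _rp_x;                                           \
        unsigned _rp_i;                                         \
        for (_rp_i = (n), _rp_x = (p)[--_rp_i]; _rp_i > 0;)     \
          _rp_x = (_rp_x << CHAR_BIT) | (p)[--_rp_i];           \
        (r) = _rp_x;                                            \
      } while (0)

    int
    main (void)
    {
      const unsigned char buf[4] = { 0x11, 0x22, 0x33, 0x44 };
      word_t w;
      /* Dereferences buf[0..2] only; buf[3] is never touched. */
      READ_PARTIAL (w, buf, 3);
      printf ("%llx\n", (unsigned long long) w);  /* prints 332211 */
      return 0;
    }

This is what lets memxor.c and memxor3.c handle the partial words at
either end of an operand without reading out of bounds.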
diff --git a/memxor.c b/memxor.c
index 99f46f37b7962b4c62e620df64bac24e5eb59b33..36306ac87e155e1b1d06e2b7a40a13bbd7421be5 100644
--- a/memxor.c
+++ b/memxor.c
@@ -1,6 +1,6 @@
 /* memxor.c
 
-   Copyright (C) 2010 Niels Möller
+   Copyright (C) 2010, 2014 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -37,26 +37,11 @@
 # include "config.h"
 #endif
 
+#include <assert.h>
 #include <limits.h>
 
 #include "memxor.h"
-
-/* For uintptr_t */
-#include "nettle-types.h"
-
-typedef unsigned long int word_t;
-
-#if SIZEOF_LONG & (SIZEOF_LONG - 1)
-#error Word size must be a power of two
-#endif
-
-#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t))
-
-#ifndef WORDS_BIGENDIAN
-#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
-#else
-#define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
-#endif
+#include "memxor-internal.h"
 
 #define WORD_T_THRESH 16
 
@@ -70,13 +55,14 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n)
 
   if (n & 1)
     {
-      *dst++ ^= *src++;
       n--;
+      dst[n] ^= src[n];
     }
-  for (; n >= 2; dst += 2, src += 2, n -= 2)
+  while (n >= 2)
     {
-      dst[0] ^= src[0];
-      dst[1] ^= src[1];
+      n -= 2;
+      dst[n+1] ^= src[n+1];
+      dst[n] ^= src[n];
     }
 }
 
@@ -84,35 +70,52 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n)
    words, not bytes. Assumes we can read complete words at the start
    and end of the src operand. */
 static void
-memxor_different_alignment (word_t *dst, const char *src, size_t n)
+memxor_different_alignment (word_t *dst, const unsigned char *src, size_t n)
 {
-  size_t i;
   int shl, shr;
   const word_t *src_word;
   unsigned offset = ALIGN_OFFSET (src);
   word_t s0, s1;
 
+  assert (n > 0);
   shl = CHAR_BIT * offset;
   shr = CHAR_BIT * (sizeof(word_t) - offset);
 
-  src_word = (const word_t *) ((uintptr_t) src & -SIZEOF_LONG);
+  src_word = (const word_t *) ((uintptr_t) src & -sizeof(word_t));
+
+  /* Read top offset bytes, in native byte order. */
+  READ_PARTIAL (s0, (unsigned char *) &src_word[n], offset);
+#ifdef WORDS_BIGENDIAN
+  s0 <<= shr; /* FIXME: Eliminate this shift? */
+#endif
 
-  /* FIXME: Unroll four times, like memcmp? */
-  i = n & 1;
-  s0 = src_word[i];
-  if (i)
+  /* Do n-1 regular iterations */
+  if (n & 1)
+    s1 = s0;
+  else
     {
-      s1 = src_word[0];
-      dst[0] ^= MERGE (s1, shl, s0, shr);
+      n--;
+      s1 = src_word[n];
+      dst[n] ^= MERGE (s1, shl, s0, shr);
     }
 
-  for (; i < n; i += 2)
+  assert (n & 1);
+  while (n > 2)
     {
-      s1 = src_word[i+1];
-      dst[i] ^= MERGE(s0, shl, s1, shr);
-      s0 = src_word[i+2];
-      dst[i+1] ^= MERGE(s1, shl, s0, shr);
+      n -= 2;
+      s0 = src_word[n+1];
+      dst[n+1] ^= MERGE(s0, shl, s1, shr);
+      s1 = src_word[n]; /* FIXME: Overread on last iteration */
+      dst[n] ^= MERGE(s1, shl, s0, shr);
     }
+  assert (n == 1);
+  /* Read low wordsize - offset bytes */
+  READ_PARTIAL (s0, src, sizeof(word_t) - offset);
+#ifndef WORDS_BIGENDIAN
+  s0 <<= shl; /* FIXME: eliminate shift? */
+#endif /* !WORDS_BIGENDIAN */
+
+  dst[0] ^= MERGE(s0, shl, s1, shr);
 }
 
 /* Performance, Intel SU1400 (x86_64): 0.25 cycles/byte aligned, 0.45
@@ -123,216 +126,36 @@ memxor_different_alignment (word_t *dst, const char *src, size_t n)
 void *
 memxor(void *dst_in, const void *src_in, size_t n)
 {
-  char *dst = dst_in;
-  const char *src = src_in;
-
-  if (n >= WORD_T_THRESH)
-    {
-      /* There are at least some bytes to compare.  No need to test
-	 for N == 0 in this alignment loop.  */
-      while (ALIGN_OFFSET (dst))
-	{
-	  *dst++ ^= *src++;
-	  n--;
-	}
-      if (ALIGN_OFFSET (src))
-	memxor_different_alignment ((word_t *) dst, src, n / sizeof(word_t));
-      else
-	memxor_common_alignment ((word_t *) dst, (const word_t *) src, n / sizeof(word_t));
-
-      dst += n & -SIZEOF_LONG;
-      src += n & -SIZEOF_LONG;
-      n = n & (SIZEOF_LONG - 1);
-    }
-  for (; n > 0; n--)
-    *dst++ ^= *src++;
-
-  return dst_in;
-}
-
-
-/* XOR word-aligned areas. n is the number of words, not bytes. */
-static void
-memxor3_common_alignment (word_t *dst,
-			  const word_t *a, const word_t *b, size_t n)
-{
-  /* FIXME: Require n > 0? */
-  while (n-- > 0)
-    dst[n] = a[n] ^ b[n];
-}
-
-static void
-memxor3_different_alignment_b (word_t *dst,
-			       const word_t *a, const char *b, unsigned offset, size_t n)
-{
-  int shl, shr;
-  const word_t *b_word;
-
-  word_t s0, s1;
-
-  shl = CHAR_BIT * offset;
-  shr = CHAR_BIT * (sizeof(word_t) - offset);
-
-  b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
-  if (n & 1)
-    {
-      n--;
-      s1 = b_word[n];
-      s0 = b_word[n+1];
-      dst[n] = a[n] ^ MERGE (s1, shl, s0, shr);
-    }
-  else
-    s1 = b_word[n];
-  
-  while (n > 0)
-    {
-      n -= 2;
-      s0 = b_word[n+1]; 
-      dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr);
-      s1 = b_word[n];
-      dst[n] = a[n] ^ MERGE(s1, shl, s0, shr);
-    }
-}
-
-static void
-memxor3_different_alignment_ab (word_t *dst,
-				const char *a, const char *b,
-				unsigned offset, size_t n)
-{
-  int shl, shr;
-  const word_t *a_word;
-  const word_t *b_word;
-  
-  word_t s0, s1;
-
-  shl = CHAR_BIT * offset;
-  shr = CHAR_BIT * (sizeof(word_t) - offset);
-
-  a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
-  b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
-  if (n & 1)
-    {
-      n--;
-      s1 = a_word[n] ^ b_word[n];
-      s0 = a_word[n+1] ^ b_word[n+1];
-      dst[n] = MERGE (s1, shl, s0, shr);
-    }
-  else    
-    s1 = a_word[n] ^ b_word[n];
-  
-  while (n > 0)
-    {
-      n -= 2;
-      s0 = a_word[n+1] ^ b_word[n+1]; 
-      dst[n+1] = MERGE(s0, shl, s1, shr);
-      s1 = a_word[n] ^ b_word[n];
-      dst[n] = MERGE(s1, shl, s0, shr);
-    }
-}
-
-static void
-memxor3_different_alignment_all (word_t *dst,
-				 const char *a, const char *b,
-				 unsigned a_offset, unsigned b_offset,
-				 size_t n)
-{
-  int al, ar, bl, br;
-  const word_t *a_word;
-  const word_t *b_word;
-  
-  word_t a0, a1, b0, b1;
-
-  al = CHAR_BIT * a_offset;
-  ar = CHAR_BIT * (sizeof(word_t) - a_offset);
-  bl = CHAR_BIT * b_offset;
-  br = CHAR_BIT * (sizeof(word_t) - b_offset);
-
-  a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
-  b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
-  if (n & 1)
-    {
-      n--;
-      a1 = a_word[n]; a0 = a_word[n+1];
-      b1 = b_word[n]; b0 = b_word[n+1];
-      
-      dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br);
-    }
-  else    
-    {
-      a1 = a_word[n];
-      b1 = b_word[n];
-    }
-  
-  while (n > 0)
-    {
-      n -= 2;
-      a0 = a_word[n+1]; b0 = b_word[n+1]; 
-      dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
-      a1 = a_word[n]; b1 = b_word[n];
-      dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br);
-    }
-}
-
-/* Current implementation processes data in descending order, to
-   support overlapping operation with one of the sources overlapping
-   the start of the destination area. This feature is used only
-   internally by cbc decrypt, and it is not advertised or documented
-   to nettle users. */
-void *
-memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n)
-{
-  char *dst = dst_in;
-  const char *a = a_in;
-  const char *b = b_in;
+  unsigned char *dst = dst_in;
+  const unsigned char *src = src_in;
 
   if (n >= WORD_T_THRESH)
     {
       unsigned i;
-      unsigned a_offset;
-      unsigned b_offset;
+      unsigned offset;
       size_t nwords;
-
+      /* There are at least some bytes to process.  No need to test
+	 for N == 0 in this alignment loop.  */
       for (i = ALIGN_OFFSET(dst + n); i > 0; i--)
 	{
 	  n--;
-	  dst[n] = a[n] ^ b[n];
+	  dst[n] ^= src[n];
 	}
-
-      a_offset = ALIGN_OFFSET(a + n);
-      b_offset = ALIGN_OFFSET(b + n);
-
+      offset = ALIGN_OFFSET(src + n);
       nwords = n / sizeof (word_t);
       n %= sizeof (word_t);
 
-      if (a_offset == b_offset)
-	{
-	  if (!a_offset)
-	    memxor3_common_alignment((word_t *) (dst + n),
-				     (const word_t *) (a + n),
-				     (const word_t *) (b + n), nwords);
-	  else
-	    memxor3_different_alignment_ab((word_t *) (dst + n),
-					   a + n, b + n, a_offset,
-					   nwords);
-	}
-      else if (!a_offset)
-	memxor3_different_alignment_b((word_t *) (dst + n),
-				      (const word_t *) (a + n), b + n,
-				      b_offset, nwords);
-      else if (!b_offset)
-	memxor3_different_alignment_b((word_t *) (dst + n),
-				      (const word_t *) (b + n), a + n,
-				      a_offset, nwords);
+      if (offset)
+	memxor_different_alignment ((word_t *) (dst+n), src+n, nwords);
       else
-	memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n,
-					a_offset, b_offset, nwords);
-					
+	memxor_common_alignment ((word_t *) (dst+n),
+				 (const word_t *) (src+n), nwords);
+    }
+  while (n > 0)
+    {
+      n--;
+      dst[n] ^= src[n];
     }
-  while (n-- > 0)
-    dst[n] = a[n] ^ b[n];
 
   return dst;
 }
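
The rewrite does not change the public interface: memxor XORs src into
dst in place and returns dst. A minimal usage sketch, assuming it is
compiled inside the nettle tree (memxor.h maps memxor to
nettle_memxor):

    #include <stdio.h>
    #include "memxor.h"

    int
    main (void)
    {
      unsigned char pad[4]  = { 0xde, 0xad, 0xbe, 0xef };
      unsigned char data[4] = { 0x01, 0x02, 0x03, 0x04 };

      memxor (data, pad, sizeof (data));   /* data[i] ^= pad[i] */
      memxor (data, pad, sizeof (data));   /* XOR twice restores data */

      printf ("%02x %02x %02x %02x\n",
              data[0], data[1], data[2], data[3]);  /* 01 02 03 04 */
      return 0;
    }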
diff --git a/memxor3.c b/memxor3.c
new file mode 100644
index 0000000000000000000000000000000000000000..fe208bf12e8448178eb0b0e21994f41fa72fcd4c
--- /dev/null
+++ b/memxor3.c
@@ -0,0 +1,292 @@
+/* memxor3.c
+
+   Copyright (C) 2010, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+/* Implementation inspired by memcmp in glibc, contributed to the FSF
+   by Torbjorn Granlund.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <limits.h>
+
+#include "memxor.h"
+#include "memxor-internal.h"
+
+#define WORD_T_THRESH 16
+
+/* XOR word-aligned areas. n is the number of words, not bytes. */
+static void
+memxor3_common_alignment (word_t *dst,
+			  const word_t *a, const word_t *b, size_t n)
+{
+  /* FIXME: Require n > 0? */
+  if (n & 1)
+    {
+      n--;
+      dst[n] = a[n] ^ b[n];
+    }
+  while (n > 0)
+    {
+      n -= 2;
+      dst[n+1] = a[n+1] ^ b[n+1];
+      dst[n] = a[n] ^ b[n];
+    }
+}
+
+static void
+memxor3_different_alignment_b (word_t *dst,
+			       const word_t *a, const unsigned char *b,
+			       unsigned offset, size_t n)
+{
+  int shl, shr;
+  const word_t *b_word;
+
+  word_t s0, s1;
+
+  assert (n > 0);
+
+  shl = CHAR_BIT * offset;
+  shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+  b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+  /* Read top offset bytes, in native byte order. */
+  READ_PARTIAL (s0, (unsigned char *) &b_word[n], offset);
+#ifdef WORDS_BIGENDIAN
+  s0 <<= shr;
+#endif
+
+  if (n & 1)
+    s1 = s0;
+  else
+    {
+      n--;
+      s1 = b_word[n];
+      dst[n] = a[n] ^ MERGE (s1, shl, s0, shr);
+    }
+
+  while (n > 2)
+    {
+      n -= 2;
+      s0 = b_word[n+1];
+      dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr);
+      s1 = b_word[n];
+      dst[n] = a[n] ^ MERGE(s1, shl, s0, shr);
+    }
+  assert (n == 1);
+  /* Read low wordsize - offset bytes */
+  READ_PARTIAL (s0, b, sizeof(word_t) - offset);
+#ifndef WORDS_BIGENDIAN
+  s0 <<= shl;
+#endif /* !WORDS_BIGENDIAN */
+
+  dst[0] = a[0] ^ MERGE(s0, shl, s1, shr);
+}
+
+static void
+memxor3_different_alignment_ab (word_t *dst,
+				const unsigned char *a, const unsigned char *b,
+				unsigned offset, size_t n)
+{
+  int shl, shr;
+  const word_t *a_word;
+  const word_t *b_word;
+
+  word_t s0, s1, t;
+
+  assert (n > 0);
+
+  shl = CHAR_BIT * offset;
+  shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+  a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t));
+  b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+  /* Read top offset bytes, in native byte order. */
+  READ_PARTIAL (s0, (unsigned char *) &a_word[n], offset);
+  READ_PARTIAL (t,  (unsigned char *) &b_word[n], offset);
+  s0 ^= t;
+#ifdef WORDS_BIGENDIAN
+  s0 <<= shr;
+#endif
+
+  if (n & 1)
+    s1 = s0;
+  else
+    {
+      n--;
+      s1 = a_word[n] ^ b_word[n];
+      dst[n] = MERGE (s1, shl, s0, shr);
+    }
+
+  while (n > 2)
+    {
+      n -= 2;
+      s0 = a_word[n+1] ^ b_word[n+1];
+      dst[n+1] = MERGE(s0, shl, s1, shr);
+      s1 = a_word[n] ^ b_word[n];
+      dst[n] = MERGE(s1, shl, s0, shr);
+    }
+  assert (n == 1);
+  /* Read low wordsize - offset bytes */
+  READ_PARTIAL (s0, a, sizeof(word_t) - offset);
+  READ_PARTIAL (t,  b, sizeof(word_t) - offset);
+  s0 ^= t;
+#ifndef WORDS_BIGENDIAN
+  s0 <<= shl;
+#endif /* !WORDS_BIGENDIAN */
+
+  dst[0] = MERGE(s0, shl, s1, shr);
+}
+
+static void
+memxor3_different_alignment_all (word_t *dst,
+				 const unsigned char *a, const unsigned char *b,
+				 unsigned a_offset, unsigned b_offset,
+				 size_t n)
+{
+  int al, ar, bl, br;
+  const word_t *a_word;
+  const word_t *b_word;
+
+  word_t a0, a1, b0, b1;
+
+  al = CHAR_BIT * a_offset;
+  ar = CHAR_BIT * (sizeof(word_t) - a_offset);
+  bl = CHAR_BIT * b_offset;
+  br = CHAR_BIT * (sizeof(word_t) - b_offset);
+
+  a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t));
+  b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+  /* Read top offset bytes, in native byte order. */
+  READ_PARTIAL (a0, (unsigned char *) &a_word[n], a_offset);
+  READ_PARTIAL (b0, (unsigned char *) &b_word[n], b_offset);
+#ifdef WORDS_BIGENDIAN
+  a0 <<= ar;
+  b0 <<= br;
+#endif
+
+  if (n & 1)
+    {
+      a1 = a0; b1 = b0;
+    }
+  else
+    {
+      n--;
+      a1 = a_word[n];
+      b1 = b_word[n];
+
+      dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br);
+    }
+  while (n > 2)
+    {
+      n -= 2;
+      a0 = a_word[n+1]; b0 = b_word[n+1];
+      dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
+      a1 = a_word[n]; b1 = b_word[n];
+      dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br);
+    }
+  assert (n == 1);
+  /* Read low wordsize - offset bytes */
+  READ_PARTIAL (a0, a, sizeof(word_t) - a_offset);
+  READ_PARTIAL (b0, b, sizeof(word_t) - b_offset);
+#ifndef WORDS_BIGENDIAN
+  a0 <<= al;
+  b0 <<= bl;
+#endif /* !WORDS_BIGENDIAN */
+
+  dst[0] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
+}
+
+/* Current implementation processes data in descending order, to
+   support overlapping operation with one of the sources overlapping
+   the start of the destination area. This feature is used only
+   internally by cbc decrypt, and it is not advertised or documented
+   to nettle users. */
+void *
+memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n)
+{
+  unsigned char *dst = dst_in;
+  const unsigned char *a = a_in;
+  const unsigned char *b = b_in;
+
+  if (n >= WORD_T_THRESH)
+    {
+      unsigned i;
+      unsigned a_offset;
+      unsigned b_offset;
+      size_t nwords;
+
+      for (i = ALIGN_OFFSET(dst + n); i > 0; i--)
+	{
+	  n--;
+	  dst[n] = a[n] ^ b[n];
+	}
+
+      a_offset = ALIGN_OFFSET(a + n);
+      b_offset = ALIGN_OFFSET(b + n);
+
+      nwords = n / sizeof (word_t);
+      n %= sizeof (word_t);
+
+      if (a_offset == b_offset)
+	{
+	  if (!a_offset)
+	    memxor3_common_alignment((word_t *) (dst + n),
+				     (const word_t *) (a + n),
+				     (const word_t *) (b + n), nwords);
+	  else
+	    memxor3_different_alignment_ab((word_t *) (dst + n),
+					   a + n, b + n, a_offset,
+					   nwords);
+	}
+      else if (!a_offset)
+	memxor3_different_alignment_b((word_t *) (dst + n),
+				      (const word_t *) (a + n), b + n,
+				      b_offset, nwords);
+      else if (!b_offset)
+	memxor3_different_alignment_b((word_t *) (dst + n),
+				      (const word_t *) (b + n), a + n,
+				      a_offset, nwords);
+      else
+	memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n,
+					a_offset, b_offset, nwords);
+
+    }
+  while (n-- > 0)
+    dst[n] = a[n] ^ b[n];
+
+  return dst;
+}
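
The descending order described in the comment above can be exercised
directly: when one source overlaps the start of the destination, every
overlapped byte is read before it is overwritten. A small sketch with
illustrative buffers (not Nettle's cbc.c; assumes the nettle tree for
memxor.h):

    #include <stdio.h>
    #include "memxor.h"

    int
    main (void)
    {
      unsigned char buf[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
      unsigned char a[6]    = { 1, 1, 1, 1, 1, 1 };
      unsigned i;

      /* dst = buf + 4 overlaps source b = buf at the start of the
         destination area; processing from the end reads each
         overlapped byte before the store to it happens. */
      memxor3 (buf + 4, a, buf, 6);

      for (i = 0; i < 10; i++)
        printf ("%d ", buf[i]);   /* 0 1 2 3 1 0 3 2 5 4 */
      printf ("\n");
      return 0;
    }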
diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in
index 96c0bc80aee9b24c239fbc1cb5c2714b6f1fd049..6bc1907c79fbea3c94eabb82f1044116842d3326 100644
--- a/testsuite/Makefile.in
+++ b/testsuite/Makefile.in
@@ -114,7 +114,7 @@ $(TARGETS) $(EXTRA_TARGETS): testutils.$(OBJEXT) ../nettle-internal.$(OBJEXT) \
 
 # --partial-loads-ok=yes is needed for memxor's handling of unaligned
 # data.
-VALGRIND = valgrind --error-exitcode=1 --leak-check=full --partial-loads-ok=yes --show-reachable=yes
+VALGRIND = valgrind --error-exitcode=1 --leak-check=full --show-reachable=yes @IF_ASM@ --partial-loads-ok=yes
 
 # The PATH update is for locating dlls on w*ndows.
 check: $(TS_ALL)
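
With IF_ASM expanding to '#' in C-only builds, everything from @IF_ASM@
on becomes a make comment, so --partial-loads-ok=yes reaches valgrind
only when assembly is compiled in. The C code now uses READ_PARTIAL at
operand edges; only the assembly still issues the full aligned word
loads valgrind objects to. A standalone C sketch of that load pattern
(hypothetical buffer, 64-bit host assumed):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t word_t;

    int
    main (void)
    {
      /* Word-aligned backing store, so the rounded-down load below
         stays inside the object. */
      union { word_t w[2]; unsigned char c[16]; } buf = { { 0, 0 } };
      const unsigned char *p = buf.c + 3;   /* unaligned operand */

      /* Round the address down to a word boundary and load a full
         word. The load covers three bytes before the operand. It
         cannot cross a page boundary, but when an operand sits at the
         edge of a heap block, valgrind flags the bytes outside it
         unless --partial-loads-ok=yes is given. */
      const word_t *pw = (const word_t *) ((uintptr_t) p & -sizeof (word_t));
      word_t v = pw[0];

      printf ("%llx\n", (unsigned long long) v);
      return 0;
    }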
diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index e14e31a13f4617bc202bb66239703caedee4f47d..69d6cfe382a12e8a0d2ace1d1019a9db44d23f24 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -1,7 +1,7 @@
 C x86_64/memxor.asm
 
 ifelse(<
-   Copyright (C) 2010, Niels Möller
+   Copyright (C) 2010, 2014, Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -32,9 +32,8 @@ ifelse(<
 
 C Register usage:
 define(<DST>, <%rax>) C Originally in %rdi
-define(<AP>, <%rsi>)
-define(<BP>, <%rdx>)
-define(<N>, <%r10>)
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
 define(<TMP>, <%r8>)
 define(<TMP2>, <%r9>)
 define(<CNT>, <%rdi>)
@@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>)
 
 PROLOGUE(nettle_memxor)
 	W64_ENTRY(3, 0)
-	mov	%rdx, %r10
-	mov	%rdi, %rdx
-	jmp 	.Lmemxor3_entry
-EPILOGUE(nettle_memxor)
 
-	C memxor3(void *dst, const void *a, const void *b, size_t n)
-	C 	          %rdi              %rsi              %rdx      %rcx
-	ALIGN(16)
-	
-PROLOGUE(nettle_memxor3)
-	W64_ENTRY(4, 0)
-	C %cl needed for shift count, so move away N
-	mov	%rcx, N
-.Lmemxor3_entry:
 	test	N, N
 	C Get number of unaligned bytes at the end
 	C %rdi is used as CNT, %rax as DST and as return value
@@ -87,9 +73,8 @@ PROLOGUE(nettle_memxor3)
 .Lalign_loop:
 	
 	sub	$1, N
-	movb	(AP, N), LREG(TMP)
-	xorb	(BP, N), LREG(TMP)
-	movb	LREG(TMP), (DST, N)
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
 	sub	$1, CNT
 	jnz	.Lalign_loop
 
@@ -98,83 +83,7 @@ ifelse(USE_SSE2, yes, <
 	cmp	$16, N
 	jnc	.Lsse2_case
 >)
-	C Check for the case that AP and BP have the same alignment,
-	C but different from DST.
-	mov	AP, TMP
-	sub	BP, TMP
-	test	$7, TMP
-	jnz	.Lno_shift_case
-	mov	AP, %rcx
-	sub	DST, %rcx
-	and	$7, %rcx
-	jz	.Lno_shift_case
-	sub	%rcx, AP
-	sub	%rcx, BP
-	shl	$3, %rcx
-
-	C Unrolling, with aligned values alternating in S0 and S1
-	test	$8, N
-	jnz	.Lshift_odd
-	mov	(AP, N), S1
-	xor	(BP, N), S1
-	jmp	.Lshift_next
-
-.Lshift_odd:
-	mov	-8(AP, N), S1
-	mov	(AP, N), S0
-	xor	-8(BP, N), S1
-	xor	(BP, N), S0
-	mov	S1, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S0
-	neg	%cl
-	
-	or	S0, TMP
-	mov	TMP, -8(DST, N)
-	sub	$8, N
-	jz	.Ldone
-	jmp 	.Lshift_next
-
-	ALIGN(16)
 
-.Lshift_loop:
-	mov	8(AP, N), S0
-	xor	8(BP, N), S0
-	mov	S0, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S1
-	neg	%cl
-	or	S1, TMP
-	mov	TMP, 8(DST, N)
-
-	mov	(AP, N), S1
-	xor	(BP, N), S1
-	mov	S1, TMP
-	shr	%cl, TMP
-	neg	%cl
-	shl	%cl, S0
-	neg 	%cl
-	or	S0, TMP
-	mov	TMP, (DST, N)
-.Lshift_next:
-	sub	$16, N
-	C FIXME: Handle the case N == 16 specially,
-	C like in the non-shifted case? 
-C 	ja	.Lshift_loop
-C 	jz	.Ldone
-	jnc	.Lshift_loop
-
-	add	$15, N
-	jnc	.Ldone
-
-	shr	$3, %rcx
-	add	%rcx, AP
-	add	%rcx, BP
-	jmp	.Lfinal_loop
-	
-.Lno_shift_case:
 	C Next destination word is -8(DST, N)
 	C Setup for unrolling
 	test	$8, N
@@ -183,21 +92,18 @@ C 	jz	.Ldone
 	sub	$8, N
 	jz	.Lone_word
 
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 	
 	jmp	.Lword_next
 
 	ALIGN(16)
 
 .Lword_loop:
-	mov	8(AP, N), TMP
-	mov	(AP, N), TMP2
-	xor	8(BP, N), TMP
-	xor	(BP, N), TMP2
-	mov	TMP, 8(DST, N)
-	mov	TMP2, (DST, N)
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
 
 .Lword_next:
 	sub	$16, N
@@ -205,33 +111,28 @@ C 	jz	.Ldone
 	jnz	.Lfinal
 
 	C Final operation is word aligned
-	mov	8(AP, N), TMP
-	xor	8(BP, N), TMP
-	mov	TMP, 8(DST, N)
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
 	
 .Lone_word:
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+	W64_EXIT(3, 0)
 	ret
 
 .Lfinal:
 	add	$15, N
 
 .Lfinal_loop:
-	movb	(AP, N), LREG(TMP)
-	xorb	(BP, N), LREG(TMP)
-	movb	LREG(TMP), (DST, N)
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
 .Lfinal_next:
 	sub	$1, N
 	jnc	.Lfinal_loop
 
 .Ldone:
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+	W64_EXIT(3, 0)
 	ret
 
 ifelse(USE_SSE2, yes, <
@@ -241,15 +142,14 @@ ifelse(USE_SSE2, yes, <
 	test	$8, TMP
 	jz	.Lsse2_next
 	sub	$8, N
-	mov	(AP, N), TMP
-	xor	(BP, N), TMP
-	mov	TMP, (DST, N)
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
 	jmp	.Lsse2_next
 
 	ALIGN(16)
 .Lsse2_loop:
-	movdqu	(AP, N), %xmm0
-	movdqu	(BP, N), %xmm1
+	movdqu	(SRC, N), %xmm0
+	movdqa	(DST, N), %xmm1
 	pxor	%xmm0, %xmm1
 	movdqa	%xmm1, (DST, N)
 .Lsse2_next:
@@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, <
 	jnz	.Lfinal		
 
 	C Final operation is aligned
-	movdqu	(AP), %xmm0
-	movdqu	(BP), %xmm1
+	movdqu	(SRC), %xmm0
+	movdqa	(DST), %xmm1
 	pxor	%xmm0, %xmm1
 	movdqa	%xmm1, (DST)
-	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-	W64_EXIT(4, 0)
+
+	W64_EXIT(3, 0)
 	ret
 >)	
-	
 
-EPILOGUE(nettle_memxor3)
+EPILOGUE(nettle_memxor)
diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm
new file mode 100644
index 0000000000000000000000000000000000000000..8ff3e79cf14ef942f78438faac5263b1d9a48d67
--- /dev/null
+++ b/x86_64/memxor3.asm
@@ -0,0 +1,263 @@
+C x86_64/memxor3.asm
+
+ifelse(<
+   Copyright (C) 2010, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT 
+
+define(<USE_SSE2>, <no>)
+
+	.file "memxor3.asm"
+
+	.text
+
+	C memxor3(void *dst, const void *a, const void *b, size_t n)
+	C 	          %rdi              %rsi              %rdx      %rcx
+	ALIGN(16)
+	
+PROLOGUE(nettle_memxor3)
+	W64_ENTRY(4, 0)
+	C %cl needed for shift count, so move away N
+	mov	%rcx, N
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add 	N, CNT
+	and	$7, CNT
+	
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+	
+	sub	$1, N
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
+	C Check for the case that AP and BP have the same alignment,
+	C but different from DST.
+	mov	AP, TMP
+	sub	BP, TMP
+	test	$7, TMP
+	jnz	.Lno_shift_case
+	mov	AP, %rcx
+	sub	DST, %rcx
+	and	$7, %rcx
+	jz	.Lno_shift_case
+	sub	%rcx, AP
+	sub	%rcx, BP
+	shl	$3, %rcx
+
+	C Unrolling, with aligned values alternating in S0 and S1
+	test	$8, N
+	jnz	.Lshift_odd
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	jmp	.Lshift_next
+
+.Lshift_odd:
+	mov	-8(AP, N), S1
+	mov	(AP, N), S0
+	xor	-8(BP, N), S1
+	xor	(BP, N), S0
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+	
+	or	S0, TMP
+	mov	TMP, -8(DST, N)
+	sub	$8, N
+	jz	.Ldone
+	jmp 	.Lshift_next
+
+	ALIGN(16)
+
+.Lshift_loop:
+	mov	8(AP, N), S0
+	xor	8(BP, N), S0
+	mov	S0, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S1
+	neg	%cl
+	or	S1, TMP
+	mov	TMP, 8(DST, N)
+
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg 	%cl
+	or	S0, TMP
+	mov	TMP, (DST, N)
+.Lshift_next:
+	sub	$16, N
+	C FIXME: Handle the case N == 16 specially,
+	C like in the non-shifted case? 
+C 	ja	.Lshift_loop
+C 	jz	.Ldone
+	jnc	.Lshift_loop
+
+	add	$15, N
+	jnc	.Ldone
+
+	shr	$3, %rcx
+	add	%rcx, AP
+	add	%rcx, BP
+	jmp	.Lfinal_loop
+	
+.Lno_shift_case:
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	
+	jmp	.Lword_next
+
+	ALIGN(16)
+
+.Lword_loop:
+	mov	8(AP, N), TMP
+	mov	(AP, N), TMP2
+	xor	8(BP, N), TMP
+	xor	(BP, N), TMP2
+	mov	TMP, 8(DST, N)
+	mov	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(AP, N), TMP
+	xor	8(BP, N), TMP
+	mov	TMP, 8(DST, N)
+	
+.Lone_word:
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	W64_EXIT(4, 0)
+	ret
+
+.Lfinal:
+	add	$15, N
+
+.Lfinal_loop:
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	W64_EXIT(4, 0)
+	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(16)
+.Lsse2_loop:
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+	
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal		
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	W64_EXIT(4, 0)
+	ret
+>)
+
+EPILOGUE(nettle_memxor3)
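
A note on the shift loops in this file: rather than keeping both shift
counts, the code negates %cl in place between the shr and shl steps.
x86-64 shifts mask their count to 6 bits, so a shift by -cl equals a
shift by 64-cl. A C model of that identity (hypothetical values;
64-bit words assumed):

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t x = 0x0123456789abcdefULL;
      unsigned cl = 24;   /* 8 * byte misalignment, 0 < cl < 64 */

      /* What shr %cl / neg %cl / shl %cl computes: */
      uint64_t straight = (x >> cl) | (x << ((64 - cl) & 63));
      /* neg relies on the hardware masking the count, since
         (-cl) & 63 == (64 - cl) & 63 for 0 < cl < 64. */
      uint64_t via_neg  = (x >> cl) | (x << ((0u - cl) & 63));

      printf ("%d\n", straight == via_neg);   /* prints 1 */
      return 0;
    }

In the loops above the two shifted values are different words (S0 and
S1), but the count identity is the same.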