diff --git a/ChangeLog b/ChangeLog
index a7f6c11d56ecc565b3b709b1ef1df32998608c97..00620bf671d158d253155c232ba19c5a6022e61d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,132 @@
+2014-01-20  Niels Möller  <nisse@lysator.liu.se>
+
+	Merged poly1305-changes (starting at 2013-11-08).
+	* x86_64/poly1305-internal.asm: Update to new interface.
+	poly1305_digest much simplified.
+
+	* poly1305.h (struct poly1305_ctx): Moved block and index
+	fields...
+	(struct poly1305_aes_ctx): ... to here.
+	* asm.m4: Delete also from the assembly definition of struct
+	poly1305_ctx.
+
+	* poly1305-internal.c (poly1305_digest): Don't do final padding
+	here, leave that to caller. Add digest to the provided nonce s,
+	and deleted length and dst arguments. Also reset h0-h4 to zero
+	when done.
+	(_poly1305_block): Renamed, from...
+	(poly1305_block): ...old name.
+
+	* poly1305-aes.c (poly1305_aes_update): New function.
+	(poly1305_aes_digest): Update for poly1305_digest changes, do
+	final padding here.
+
+	* poly1305.c (poly1305_update): Deleted file and function. Moved
+	to poly1305-aes.c.
+	* Makefile.in (nettle_SOURCES): Deleted poly1305.c.
+
+2014-01-17  Niels Möller  <nisse@lysator.liu.se>
+
+	* poly1305-internal.c (poly1305_block): Additional argument with
+	the high bit.
+	(poly1305_block_internal): Deleted function, code moved into the
+	poly1305_block.
+	(poly1305_digest): Simplified padding code, call poly1305_block
+	with high bit 0.
+	* poly1305.h (poly1305_block): Update prototype.
+	* poly1305.c (poly1305_update): Call poly1305_block with high bit 1.
+	* x86_64/poly1305-internal.asm (poly1305_block): Handle new
+	argument.
+
+	* poly1305.h (struct poly1305_ctx): Moved nonce field from here...
+	(struct poly1305_aes_ctx): ... to here.
+	* poly1305-aes.c (poly1305_aes_set_nonce, poly1305_aes_digest):
+	Updated for above.
+	* poly1305.c (poly1305_set_nonce): Deleted function.
+	* asm.m4: Delete nonce also from the assembly definition of struct
+	poly1305_ctx.
+
+2014-01-16  Niels Möller  <nisse@lysator.liu.se>
+
+	* poly1305-aes.c: Include poly1305.c. Rewrite functions without
+	using the POLY1305_* macros.
+
+	* Makefile.in (HEADERS): Deleted poly1305-aes.h.
+
+	* poly1305.h (POLY1305_CTX, POLY1305_SET_KEY, POLY1305_SET_NONCE)
+	(POLY1305_DIGEST): Deleted macros. Only implemented variant is
+	poly1305-aes.
+	(POLY1305_DIGEST_SIZE, POLY1305_BLOCK_SIZE, POLY1305_KEY_SIZE):
+	New constants.
+	(POLY1305_AES_KEY_SIZE, POLY1305_AES_DIGEST_SIZE): Moved here,
+	from poly1305-aes.h.
+	(struct poly1305_aes_ctx): Likewise.
+	(poly1305_aes_set_key, poly1305_aes_set_nonce)
+	(poly1305_aes_update, poly1305_aes_digest): Likewise.
+	* poly1305-aes.h: Deleted file, declarations moved to poly1305.h.
+	Update all users.
+
+	* poly1305-internal.c (s2, s3, s4): Fixed macros.
+
+	* poly1305-aes.h (struct poly1305_aes_ctx): Replace struct aes_ctx
+	by struct aes128_ctx.
+	* poly1305-aes.c (poly1305_aes_set_key, poly1305_aes_digest):
+	Update to use aes128_* functions.
+	* poly1305.h (POLY1305_SET_KEY): Drop key size argument when
+	calling set_key.
+
+2013-12-19  Niels Möller  <nisse@lysator.liu.se>
+
+	* poly1305-aes.h (poly1305_aes_update): Define as an alias for
+	poly1305_update, using preprocessor and a type cast.
+
+	* poly1305-aes.c (poly1305_aes_update): Deleted function.
+
+	* poly1305.h (poly1305_update): Declare.
+	(_POLY1305_BLOCK, POLY1305_UPDATE): Deleted macros.
+
+	* poly1305.c (poly1305_update): New function.
+
+2013-11-21  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/poly1305-internal.asm: New file. Almost a factor of two
+	speedup.
+
+	* configure.ac (asm_replace_list): Added poly1305-internal.asm.
+
+	* asm.m4: Define struct ffsets for 64-bit poly1305_ctx.
+
+	* poly1305.h (POLY1305_DIGEST): Pass the encrypted nonce as an
+	additional argument to poly1305_digest.
+	(struct poly1305_ctx): Introduce unions, to support either 26-bit
+	or 64-bit implementation.
+
+	* poly1305-internal.c (poly1305_digest): Added s argument.
+
+	* poly1305.c (poly1305_set_s): Deleted function.
+
+2013-11-12  Niels Möller  <nisse@lysator.liu.se>
+
+	* poly1305-internal.c: New file, for poly1305 functions depending
+	on the internal mod (2^130 - 5) representation.
+	(poly1305_block_internal): New helper function.
+	(poly1305_block, poly1305_digest): Use it.
+
+2013-11-08  Nikos Mavrogiannopoulos  <nmav@gnutls.org>
+
+	* poly1305.h: New file.
+	* poly1305.c: New file.
+	* poly1305-aes.h: New file.
+	* poly1305-aes.c: New file.
+	* Makefile.in (nettle_SOURCES): Added poly1305-aes.c and poly1305.c.
+	(HEADERS): Added poly1305-aes.h and poly1305.h.
+
+	* testsuite/poly1305-test.c: New file.
+	* testsuite/Makefile.in (TS_NETTLE_SOURCES): Added poly1305-test.c.
+
+	* examples/nettle-benchmark.c (time_poly1305_aes): New function.
+	(main): Benchmark poly1305.
+
 2014-01-20  Niels Möller  <nisse@lysator.liu.se>
 
 	* Makefile.in (nettle_SOURCES): Added salsa20-set-nonce.c,
diff --git a/Makefile.in b/Makefile.in
index db2f2ba68a90f9796202245f377fcd6040313e0c..6a3d58d96168692838e8edf70ae7f8d730dbb658 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -105,6 +105,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \
 		 serpent-set-key.c serpent-encrypt.c serpent-decrypt.c \
 		 serpent-meta.c \
 		 twofish.c twofish-meta.c \
+		 poly1305-aes.c poly1305-internal.c \
 		 umac-nh.c umac-nh-n.c umac-l2.c umac-l3.c \
 		 umac-poly64.c umac-poly128.c umac-set-key.c \
 		 umac32.c umac64.c umac96.c umac128.c \
@@ -164,7 +165,7 @@ HEADERS = aes.h arcfour.h arctwo.h asn1.h bignum.h blowfish.h \
 	  pgp.h pkcs1.h realloc.h ripemd160.h rsa.h rsa-compat.h \
 	  salsa20.h sexp.h \
 	  serpent.h sha.h sha1.h sha2.h sha3.h twofish.h \
-	  umac.h yarrow.h
+	  umac.h yarrow.h poly1305.h
 
 INSTALL_HEADERS = $(HEADERS) nettle-stdint.h
 
diff --git a/asm.m4 b/asm.m4
index 200b13618ada20853c3f6868aa309286c0d3c483..55da2bfbc307189c14280f131509fc748345f92f 100644
--- a/asm.m4
+++ b/asm.m4
@@ -76,4 +76,14 @@ STRUCTURE(AES)
   STRUCT(TABLE2, AES_TABLE_SIZE)
   STRUCT(TABLE3, AES_TABLE_SIZE)
 
+C For 64-bit implementation
+STRUCTURE(P1305)
+  STRUCT(R0, 8)
+  STRUCT(R1, 8)
+  STRUCT(S1, 8)
+  STRUCT(PAD, 12)
+  STRUCT(H2, 4)
+  STRUCT(H0, 8)
+  STRUCT(H1, 8)
+
 divert
diff --git a/configure.ac b/configure.ac
index d78bcb0ec3c1227e98236bf17bad24b2b7211324..d54e91d33866b1dad9ba2f672fcc75521780456c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -262,7 +262,7 @@ fi
 # to a new object file).
 asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		arcfour-crypt.asm camellia-crypt-internal.asm \
-		md5-compress.asm memxor.asm \
+		md5-compress.asm memxor.asm poly1305-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
 		sha1-compress.asm sha256-compress.asm sha512-compress.asm \
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index 6a8aa6e2a1fe4237a63b233f9e6bab8bb422ba48..c139e1a8930b61263c56f60d47ec9288ccafe38a 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -56,6 +56,7 @@
 #include "sha3.h"
 #include "twofish.h"
 #include "umac.h"
+#include "poly1305.h"
 
 #include "nettle-meta.h"
 #include "nettle-internal.h"
@@ -398,6 +399,23 @@ time_umac(void)
 	  time_function(bench_hash, &info));
 }
 
+static void
+time_poly1305_aes(void)
+{
+  static uint8_t data[BENCH_BLOCK];
+  struct bench_hash_info info;
+  struct poly1305_aes_ctx ctx;
+  uint8_t key[32];
+
+  poly1305_aes_set_key (&ctx, key);
+  info.ctx = &ctx;
+  info.update = (nettle_hash_update_func *) poly1305_aes_update;
+  info.data = data;
+
+  display("poly1305-aes", "update", 1024,
+	  time_function(bench_hash, &info));
+}
+
 static void
 time_gcm(void)
 {
@@ -718,6 +736,9 @@ main(int argc, char **argv)
   if (!alg || strstr ("umac", alg))
     time_umac();
 
+  if (!alg || strstr ("poly1305-aes", alg))
+    time_poly1305_aes();
+
   for (i = 0; ciphers[i]; i++)
     if (!alg || strstr(ciphers[i]->name, alg))
       time_cipher(ciphers[i]);
diff --git a/poly1305-aes.c b/poly1305-aes.c
new file mode 100644
index 0000000000000000000000000000000000000000..e4a6f748f25ce378bac10c9f98c2732bedcfa06a
--- /dev/null
+++ b/poly1305-aes.c
@@ -0,0 +1,78 @@
+/* nettle, low-level cryptographics library
+ *
+ * Copyright (C) 2013 Nikos Mavrogiannopoulos
+ * Copyright (C) 2014 Niels Möller
+ *
+ * The nettle library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * The nettle library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the nettle library; see the file COPYING.LIB.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02111-1301, USA.
+ */
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "poly1305.h"
+#include "macros.h"
+
+void
+poly1305_aes_set_key (struct poly1305_aes_ctx *ctx, const uint8_t * key)
+{
+  aes128_set_encrypt_key(&ctx->aes, (key));
+  poly1305_set_key(&ctx->pctx, (key+16));
+  ctx->index = 0;
+}
+
+void
+poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
+			const uint8_t * nonce)
+{
+  memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE);
+}
+
+#define COMPRESS(ctx, data) _poly1305_block(&(ctx)->pctx, (data), 1)
+
+void
+poly1305_aes_update (struct poly1305_aes_ctx *ctx, size_t length, const uint8_t *data)
+{
+  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+}
+
+void
+poly1305_aes_digest (struct poly1305_aes_ctx *ctx,
+		     size_t length, uint8_t *digest)
+{
+  uint8_t s[POLY1305_BLOCK_SIZE];
+  /* final bytes */
+  if (ctx->index > 0)
+    {
+      assert (ctx->index < POLY1305_BLOCK_SIZE);
+
+      ctx->block[ctx->index] = 1;
+      memset (ctx->block + ctx->index + 1,
+	      0, POLY1305_BLOCK_SIZE - 1 - ctx->index);
+
+      _poly1305_block (&ctx->pctx, ctx->block, 0);
+    }
+  aes128_encrypt(&ctx->aes, POLY1305_BLOCK_SIZE, s, ctx->nonce);
+  
+  poly1305_digest (&ctx->pctx, s);
+  memcpy (digest, s, length);
+
+  INCREMENT (16, ctx->nonce);
+  ctx->index = 0;
+}
diff --git a/poly1305-internal.c b/poly1305-internal.c
new file mode 100644
index 0000000000000000000000000000000000000000..b33a3c9dd2f8e93edda801ab27af994fcadcc808
--- /dev/null
+++ b/poly1305-internal.c
@@ -0,0 +1,170 @@
+/* nettle, low-level cryptographics library
+ *
+ * Placed by the author under public domain or the MIT license.
+ * (see https://github.com/floodyberry/poly1305-donna )
+ * Modified for nettle by Nikos Mavrogiannopoulos and Niels Möller.
+ *
+ * Copyright: 2012-2013 Andrew M. (floodyberry)
+ * Copyright: 2013 Nikos Mavrogiannopoulos
+ * Copyright: 2013 Niels Möller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "poly1305.h"
+
+#include "macros.h"
+
+#define mul32x32_64(a,b) ((uint64_t)(a) * (b))
+
+#define r0 r.r32[0]
+#define r1 r.r32[1]
+#define r2 r.r32[2]
+#define r3 r.r32[3]
+#define r4 r.r32[4]
+#define s1 r.r32[5]
+#define s2 s32[0]
+#define s3 s32[1]
+#define s4 s32[2]
+
+#define h0 h.h32[0]
+#define h1 h.h32[1]
+#define h2 h.h32[2]
+#define h3 h.h32[3]
+#define h4 hh
+
+void
+poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
+{
+  uint32_t t0,t1,t2,t3;
+
+  t0 = LE_READ_UINT32(key);
+  t1 = LE_READ_UINT32(key+4);
+  t2 = LE_READ_UINT32(key+8);
+  t3 = LE_READ_UINT32(key+12);
+
+  ctx->r0 = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6;
+  ctx->r1 = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12;
+  ctx->r2 = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18;
+  ctx->r3 = t2 & 0x3f03fff; t3 >>= 8;
+  ctx->r4 = t3 & 0x00fffff;
+
+  ctx->s1 = ctx->r1 * 5;
+  ctx->s2 = ctx->r2 * 5;
+  ctx->s3 = ctx->r3 * 5;
+  ctx->s4 = ctx->r4 * 5;
+
+  ctx->h0 = 0;
+  ctx->h1 = 0;
+  ctx->h2 = 0;
+  ctx->h3 = 0;
+  ctx->h4 = 0;
+}
+
+void
+_poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned t4)
+{
+  uint32_t t0,t1,t2,t3;
+  uint32_t b;
+  uint64_t t[5];
+  uint64_t c;
+
+  t0 = LE_READ_UINT32(m);
+  t1 = LE_READ_UINT32(m+4);
+  t2 = LE_READ_UINT32(m+8);
+  t3 = LE_READ_UINT32(m+12);
+
+  ctx->h0 += t0 & 0x3ffffff;
+  ctx->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
+  ctx->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
+  ctx->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
+  ctx->h4 += (t3 >> 8) | ((uint32_t) t4 << 24);
+
+  /* poly1305_donna_mul: */
+  t[0]  = mul32x32_64(ctx->h0,ctx->r0) + mul32x32_64(ctx->h1,ctx->s4) + mul32x32_64(ctx->h2,ctx->s3) + mul32x32_64(ctx->h3,ctx->s2) + mul32x32_64(ctx->h4,ctx->s1);
+  t[1]  = mul32x32_64(ctx->h0,ctx->r1) + mul32x32_64(ctx->h1,ctx->r0) + mul32x32_64(ctx->h2,ctx->s4) + mul32x32_64(ctx->h3,ctx->s3) + mul32x32_64(ctx->h4,ctx->s2);
+  t[2]  = mul32x32_64(ctx->h0,ctx->r2) + mul32x32_64(ctx->h1,ctx->r1) + mul32x32_64(ctx->h2,ctx->r0) + mul32x32_64(ctx->h3,ctx->s4) + mul32x32_64(ctx->h4,ctx->s3);
+  t[3]  = mul32x32_64(ctx->h0,ctx->r3) + mul32x32_64(ctx->h1,ctx->r2) + mul32x32_64(ctx->h2,ctx->r1) + mul32x32_64(ctx->h3,ctx->r0) + mul32x32_64(ctx->h4,ctx->s4);
+  t[4]  = mul32x32_64(ctx->h0,ctx->r4) + mul32x32_64(ctx->h1,ctx->r3) + mul32x32_64(ctx->h2,ctx->r2) + mul32x32_64(ctx->h3,ctx->r1) + mul32x32_64(ctx->h4,ctx->r0);
+
+  ctx->h0 = (uint32_t)t[0] & 0x3ffffff; c =           (t[0] >> 26);
+  t[1] += c;      ctx->h1 = (uint32_t)t[1] & 0x3ffffff; b = (uint32_t)(t[1] >> 26);
+  t[2] += b;      ctx->h2 = (uint32_t)t[2] & 0x3ffffff; b = (uint32_t)(t[2] >> 26);
+  t[3] += b;      ctx->h3 = (uint32_t)t[3] & 0x3ffffff; b = (uint32_t)(t[3] >> 26);
+  t[4] += b;      ctx->h4 = (uint32_t)t[4] & 0x3ffffff; b = (uint32_t)(t[4] >> 26);
+  ctx->h0 += b * 5;
+}
+
+/* Adds digest to the nonce */
+void
+poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s)
+{
+  uint32_t b, nb;
+  uint64_t f0,f1,f2,f3;
+  uint32_t g0,g1,g2,g3,g4;
+
+  b = ctx->h0 >> 26; ctx->h0 = ctx->h0 & 0x3ffffff;
+  ctx->h1 +=     b; b = ctx->h1 >> 26; ctx->h1 = ctx->h1 & 0x3ffffff;
+  ctx->h2 +=     b; b = ctx->h2 >> 26; ctx->h2 = ctx->h2 & 0x3ffffff;
+  ctx->h3 +=     b; b = ctx->h3 >> 26; ctx->h3 = ctx->h3 & 0x3ffffff;
+  ctx->h4 +=     b; b = ctx->h4 >> 26; ctx->h4 = ctx->h4 & 0x3ffffff;
+  ctx->h0 += b * 5; b = ctx->h0 >> 26; ctx->h0 = ctx->h0 & 0x3ffffff;
+  ctx->h1 +=     b;
+
+  g0 = ctx->h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff;
+  g1 = ctx->h1 + b; b = g1 >> 26; g1 &= 0x3ffffff;
+  g2 = ctx->h2 + b; b = g2 >> 26; g2 &= 0x3ffffff;
+  g3 = ctx->h3 + b; b = g3 >> 26; g3 &= 0x3ffffff;
+  g4 = ctx->h4 + b - (1 << 26);
+
+  b = (g4 >> 31) - 1;
+  nb = ~b;
+  ctx->h0 = (ctx->h0 & nb) | (g0 & b);
+  ctx->h1 = (ctx->h1 & nb) | (g1 & b);
+  ctx->h2 = (ctx->h2 & nb) | (g2 & b);
+  ctx->h3 = (ctx->h3 & nb) | (g3 & b);
+  ctx->h4 = (ctx->h4 & nb) | (g4 & b);
+
+  f0 = ((ctx->h0      ) | (ctx->h1 << 26)) + (uint64_t)LE_READ_UINT32(s);
+  f1 = ((ctx->h1 >>  6) | (ctx->h2 << 20)) + (uint64_t)LE_READ_UINT32(s+4);
+  f2 = ((ctx->h2 >> 12) | (ctx->h3 << 14)) + (uint64_t)LE_READ_UINT32(s+8);
+  f3 = ((ctx->h3 >> 18) | (ctx->h4 <<  8)) + (uint64_t)LE_READ_UINT32(s+12);
+
+  LE_WRITE_UINT32(s, f0);
+  f1 += (f0 >> 32);
+  LE_WRITE_UINT32(s+4, f1);
+  f2 += (f1 >> 32);
+  LE_WRITE_UINT32(s+8, f2);
+  f3 += (f2 >> 32);
+  LE_WRITE_UINT32(s+12, f3);
+
+  ctx->h0 = 0;
+  ctx->h1 = 0;
+  ctx->h2 = 0;
+  ctx->h3 = 0;
+  ctx->h4 = 0;
+}
diff --git a/poly1305.h b/poly1305.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ba4c7cb0b7fe73f7540c0ab5c125db69f23789b
--- /dev/null
+++ b/poly1305.h
@@ -0,0 +1,120 @@
+/* poly1305.h
+ *
+ * Poly1305 message authentication code.
+ */
+
+/* nettle, low-level cryptographics library
+ *
+ * Copyright (C) 2013 Nikos Mavrogiannopoulos
+ * Copyright (C) 2013, 2014 Niels Möller
+ *
+ * The nettle library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * The nettle library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the nettle library; see the file COPYING.LIB.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02111-1301, USA.
+ */
+
+#ifndef NETTLE_POLY1305_H_INCLUDED
+#define NETTLE_POLY1305_H_INCLUDED
+
+#include "aes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Name mangling */
+#define poly1305_set_key nettle_poly1305_set_key
+#define poly1305_digest nettle_poly1305_digest
+#define _poly1305_block _nettle_poly1305_block
+
+#define poly1305_aes_set_key nettle_poly1305_aes_set_key
+#define poly1305_aes_set_nonce nettle_poly1305_aes_set_nonce
+#define poly1305_aes_update nettle_poly1305_aes_update
+#define poly1305_aes_digest nettle_poly1305_aes_digest
+
+/* Low level functions/macros for the poly1305 construction. */
+
+#define POLY1305_DIGEST_SIZE 16
+#define POLY1305_BLOCK_SIZE 16
+#define POLY1305_KEY_SIZE 16
+
+struct poly1305_ctx {
+  /* Key, 128-bit value and some cached multiples. */
+  union
+  {
+    uint32_t r32[6];
+    uint64_t r64[3];
+  } r;
+  uint32_t s32[3];
+  /* State, represented as words of 26, 32 or 64 bits, depending on
+     implementation. */
+  /* High bits first, to maintain alignment. */
+  uint32_t hh;
+  union
+  {
+    uint32_t h32[4];
+    uint64_t h64[2];
+  } h;
+};
+
+/* Low-level internal interface. */
+void poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[POLY1305_KEY_SIZE]);
+/* Extracts digest, and adds it to s, the encrypted nonce. */
+void poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s);
+/* Internal function. Process one block. */
+void _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[POLY1305_BLOCK_SIZE],
+		      unsigned high);
+
+/* poly1305-aes */
+
+#define POLY1305_AES_KEY_SIZE 32
+#define POLY1305_AES_DIGEST_SIZE 16
+#define POLY1305_AES_NONCE_SIZE 16
+
+struct poly1305_aes_ctx
+{
+  /* Keep aes context last, to make it possible to use a general
+     poly1305_update if other variants are added. */
+  struct poly1305_ctx pctx;
+  uint8_t block[POLY1305_BLOCK_SIZE];
+  unsigned index;
+  uint8_t nonce[POLY1305_BLOCK_SIZE];
+  struct aes128_ctx aes;
+};
+
+/* Also initialize the nonce to zero. */
+void
+poly1305_aes_set_key (struct poly1305_aes_ctx *ctx, const uint8_t *key);
+
+/* Optional, if not used, messages get incrementing nonces starting
+   from zero. */
+void
+poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
+		        const uint8_t *nonce);
+
+/* Update is not aes-specific, but since this is the only implemented
+   variant, we need no more general poly1305_update. */
+void
+poly1305_aes_update (struct poly1305_aes_ctx *ctx, size_t length, const uint8_t *data);
+
+/* Also increments the nonce */
+void
+poly1305_aes_digest (struct poly1305_aes_ctx *ctx,
+	       	     size_t length, uint8_t *digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NETTLE_POLY1305_H_INCLUDED */
diff --git a/testsuite/.test-rules.make b/testsuite/.test-rules.make
index 5549fc09929541c5cccca4493cd8f825dddd50a5..2900468b5df15a0c2a7e5bc35a0f30ca9cad7633 100644
--- a/testsuite/.test-rules.make
+++ b/testsuite/.test-rules.make
@@ -106,6 +106,9 @@ gcm-test$(EXEEXT): gcm-test.$(OBJEXT)
 eax-test$(EXEEXT): eax-test.$(OBJEXT)
 	$(LINK) eax-test.$(OBJEXT) $(TEST_OBJS) -o eax-test$(EXEEXT)
 
+poly1305-test$(EXEEXT): poly1305-test.$(OBJEXT)
+	$(LINK) poly1305-test.$(OBJEXT) $(TEST_OBJS) -o poly1305-test$(EXEEXT)
+
 hmac-test$(EXEEXT): hmac-test.$(OBJEXT)
 	$(LINK) hmac-test.$(OBJEXT) $(TEST_OBJS) -o hmac-test$(EXEEXT)
 
diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in
index d63fd1abe7daf656cb896011591a3b213a3292b6..ea85ef95929c0ff9780ef787f3997691f8378f05 100644
--- a/testsuite/Makefile.in
+++ b/testsuite/Makefile.in
@@ -26,6 +26,7 @@ TS_NETTLE_SOURCES = aes-test.c arcfour-test.c arctwo-test.c \
 		    serpent-test.c twofish-test.c \
 		    knuth-lfib-test.c \
 		    cbc-test.c ctr-test.c gcm-test.c eax-test.c \
+		    poly1305-test.c \
 		    hmac-test.c umac-test.c \
 		    meta-hash-test.c meta-cipher-test.c meta-armor-test.c \
 		    buffer-test.c yarrow-test.c pbkdf2-test.c
diff --git a/testsuite/poly1305-test.c b/testsuite/poly1305-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee70b3c51b84667cec3baa9bcfd4b8ff33ae228f
--- /dev/null
+++ b/testsuite/poly1305-test.c
@@ -0,0 +1,86 @@
+#include "testutils.h"
+#include "poly1305.h"
+
+static void
+update (void *ctx, nettle_hash_update_func *f,
+	const struct tstring *msg,
+	unsigned length)
+{
+  for (; length > msg->length; length -= msg->length)
+    f(ctx, msg->length, msg->data);
+  f(ctx, length, msg->data);
+}
+
+static void
+check_digest (const char *name, void *ctx, nettle_hash_digest_func *f,
+	      const struct tstring *msg, unsigned length,
+	      unsigned tag_length, const uint8_t *ref)
+{
+  uint8_t tag[16];
+  f(ctx, tag_length, tag);
+  if (memcmp (tag, ref, tag_length) != 0)
+    {
+      printf ("%s failed\n", name);
+      printf ("msg: "); print_hex (msg->length, msg->data);
+      printf ("length: %u\n", length);
+      printf ("tag: "); print_hex (tag_length, tag);
+      printf ("ref: "); print_hex (tag_length, ref);
+      abort ();
+    }
+
+}
+
+static void
+test_poly1305 (const struct tstring *key,
+	   const struct tstring *nonce,
+	   const struct tstring *msg,
+	   unsigned length,
+	   const struct tstring *ref)
+{
+  struct poly1305_aes_ctx ctx;
+
+  ASSERT (key->length == POLY1305_AES_KEY_SIZE);
+  ASSERT (ref->length == POLY1305_AES_DIGEST_SIZE);
+
+  poly1305_aes_set_key (&ctx, key->data);
+  poly1305_aes_set_nonce (&ctx, nonce->data);
+
+  update(&ctx, (nettle_hash_update_func *) poly1305_aes_update, msg, length);
+
+  check_digest ("poly1305-aes", &ctx, (nettle_hash_digest_func *) poly1305_aes_digest,
+		msg, length, 16, ref->data);
+}
+
+void
+test_main(void)
+{
+  /* From Bernstein's paper. */
+  test_poly1305
+   (SHEX("75deaa25c09f208e1dc4ce6b5cad3fbfa0f3080000f46400d0c7e9076c834403"),
+    SHEX("61ee09218d29b0aaed7e154a2c5509cc"),
+    SHEX(""), 0,
+    SHEX("dd3fab2251f11ac759f0887129cc2ee7"));
+
+  test_poly1305
+   (SHEX("ec074c835580741701425b623235add6851fc40c3467ac0be05cc20404f3f700"),
+    SHEX("fb447350c4e868c52ac3275cf9d4327e"),
+    SHEX("f3f6"), 2,
+    SHEX("f4c633c3044fc145f84f335cb81953de"));
+
+  test_poly1305
+   (SHEX("6acb5f61a7176dd320c5c1eb2edcdc74"
+         "48443d0bb0d21109c89a100b5ce2c208"),
+    SHEX("ae212a55399729595dea458bc621ff0e"),
+    SHEX("663cea190ffb83d89593f3f476b6bc24"
+         "d7e679107ea26adb8caf6652d0656136"), 32,
+    SHEX("0ee1c16bb73f0f4fd19881753c01cdbe"));
+
+  test_poly1305
+   (SHEX("e1a5668a4d5b66a5f68cc5424ed5982d12976a08c4426d0ce8a82407c4f48207"),
+    SHEX("9ae831e743978d3a23527c7128149e3a"),
+    SHEX("ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491"
+         "faf0990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb2109"
+         "5c1bf9"), 63,
+    SHEX("5154ad0d2cb26e01274fc51148491f1b"));
+
+}
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
new file mode 100644
index 0000000000000000000000000000000000000000..453c62b2bb99cc6c413620ececb62f61e795dd35
--- /dev/null
+++ b/x86_64/poly1305-internal.asm
@@ -0,0 +1,172 @@
+C nettle, low-level cryptographics library
+C 
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "poly1305-internal.asm"
+
+C Registers mainly used by poly1305_block
+define(<CTX>, <%rdi>)
+define(<T0>, <%rcx>)
+define(<T1>, <%rsi>)
+define(<T2>, <%r8>)
+define(<H0>, <%r9>)
+define(<H1>, <%r10>)
+define(<H2>, <%r11>)
+	
+	C poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
+	.text
+	C Registers:
+	C  %rdi: ctx
+	C  %rsi: key
+	C  %r8: mask
+	ALIGN(16)
+PROLOGUE(nettle_poly1305_set_key)
+	W64_ENTRY(2,0)
+	mov	$0x0ffffffc0fffffff, %r8
+	mov	(%rsi), %rax
+	and	%r8, %rax
+	and	$-4, %r8
+	mov	%rax, (CTX)
+	mov	8(%rsi), %rax
+	and	%r8, %rax
+	mov	%rax, P1305_R1 (CTX)
+	shr	$2, %rax
+	imul	$5, %rax
+	mov	%rax, P1305_S1 (CTX)
+	xor	XREG(%rax), XREG(%rax)
+	mov	%rax, P1305_H0 (CTX)
+	mov	%rax, P1305_H1 (CTX)
+	mov	XREG(%rax), P1305_H2 (CTX)
+	
+	W64_EXIT(2,0)
+	ret
+
+EPILOGUE(nettle_poly1305_set_key)
+
+C 64-bit multiplication mod 2^130 - 5
+C
+C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) =
+C     1   B B^2 B^3 
+C   x_0 r_0
+C       x_0 r_1
+C	x_1 r_0
+C	    x_1 r_1
+C	    x_2 r_0
+C               x_2 r_1
+C Then r_1 B^2 = r_1/4 (2^130) = 5/4 r_1.
+C and  r_1 B^3 = 5/4 B r_1
+C So we get
+C
+C  x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0)
+C     1   B B^2 B^3 
+C   x_0 r_0
+C   x_1 r'_1
+C       x_0 r_1
+C	x_1 r_0
+C       x_2 r'_1
+C           x_2 r_0
+
+	C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi)
+	
+PROLOGUE(_nettle_poly1305_block)
+	W64_ENTRY(3, 0)
+	mov	(%rsi), T0
+	mov	8(%rsi), T1
+	mov	XREG(%rdx),	XREG(T2)
+
+	C Registers:
+	C Inputs:  CTX, T0, T1, T2,
+	C Outputs: H0, H1, H2, stored into the context.
+
+	add	P1305_H0 (CTX), T0
+	adc	P1305_H1 (CTX), T1
+	adc	P1305_H2 (CTX), XREG(T2)
+	mov	P1305_R0 (CTX), %rax
+	mul	T0			C x0*r0
+	mov	%rax, H0
+	mov	%rdx, H1
+	mov	P1305_S1 (CTX), %rax	C 5/4 r1
+	mov	%rax, H2
+	mul	T1			C x1*r1'
+	imul	T2, H2			C x2*r1'
+	imul	P1305_R0 (CTX), T2	C x2*r0
+	add	%rax, H0
+	adc	%rdx, H1
+	mov	P1305_R0 (CTX), %rax
+	mul	T1			C x1*r0
+	add	%rax, H2
+	adc	%rdx, T2
+	mov	P1305_R1 (CTX), %rax
+	mul	T0			C x0*r1
+	add	%rax, H2
+	adc	%rdx, T2
+	mov	T2, %rax
+	shr	$2, %rax
+	imul	$5, %rax
+	and	$3, XREG(T2)
+	add	%rax, H0
+	adc	H2, H1
+	adc	$0, XREG(T2)
+	mov	H0, P1305_H0 (CTX)
+	mov	H1, P1305_H1 (CTX)
+	mov	XREG(T2), P1305_H2 (CTX)
+	W64_EXIT(3, 0)
+	ret
+EPILOGUE(_nettle_poly1305_block)
+
+	C poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s)
+	C Registers:
+	C   %rdi: ctx
+	C   %rsi: s
+	
+PROLOGUE(nettle_poly1305_digest)
+	W64_ENTRY(2, 0)
+
+	mov	P1305_H0 (CTX), H0
+	mov	P1305_H1 (CTX), H1
+	mov	P1305_H2 (CTX), XREG(H2)
+	mov	XREG(H2), XREG(%rax)
+	shr	$2, XREG(%rax)
+	and	$3, H2
+	imul	$5, XREG(%rax)
+	add	%rax, H0
+	adc	$0, H1
+	adc	$0, XREG(H2)
+
+C Use %rax instead of %rsi
+define(<T1>, <%rax>)
+	C Add 5, use result if >= 2^130
+	mov	$5, T0
+	xor	T1, T1
+	add	H0, T0
+	adc	H1, T1
+	adc	$0, XREG(H2)
+	cmp	$4, XREG(H2)
+	cmovnc	T0, H0
+	cmovnc	T1, H1
+
+	add	H0, (%rsi)
+	adc	H1, 8(%rsi)
+
+	xor	XREG(%rax), XREG(%rax)
+	mov	%rax, P1305_H0 (CTX)
+	mov	%rax, P1305_H1 (CTX)
+	mov	XREG(%rax), P1305_H2 (CTX)
+	W64_EXIT(2, 0)
+	ret
+