diff --git a/ChangeLog b/ChangeLog
index 30a4d327297c62c4395a1e0c7a5c35fbe3e7468e..7de86fe446c378e938055f1149fb6756df92c367 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2014-01-17  Niels Möller  <nisse@lysator.liu.se>
 
+	* poly1305-internal.c (poly1305_block): Additional argument with
+	the high bit.
+	(poly1305_block_internal): Deleted function, code moved into
+	poly1305_block.
+	(poly1305_digest): Simplified padding code, call poly1305_block
+	with high bit 0.
+	* poly1305.h (poly1305_block): Update prototype.
+	* poly1305.c (poly1305_update): Call poly1305_block with high bit 1.
+	* x86_64/poly1305-internal.asm (poly1305_block): Handle new
+	argument.
+
 	* poly1305.h (struct poly1305_ctx): Moved nonce field from here...
 	(struct poly1305_aes_ctx): ... to here.
 	* poly1305-aes.c (poly1305_aes_set_nonce, poly1305_aes_digest):
diff --git a/poly1305-internal.c b/poly1305-internal.c
index e53a210d48b861c19d914fd1c44c5e972de0e8e9..62c6976c72bb75d7cb4fc2310fe013af593a56bf 100644
--- a/poly1305-internal.c
+++ b/poly1305-internal.c
@@ -2,9 +2,11 @@
  *
  * Placed by the author under public domain or the MIT license.
  * (see https://github.com/floodyberry/poly1305-donna )
- * Modified for nettle by Nikos Mavrogiannopoulos.
+ * Modified for nettle by Nikos Mavrogiannopoulos and Niels Möller.
  *
  * Copyright: 2012-2013 Andrew M. (floodyberry)
+ * Copyright: 2013 Nikos Mavrogiannopoulos
+ * Copyright: 2013 Niels Möller
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -30,6 +32,7 @@
 #include "config.h"
 #endif
 
+#include <assert.h>
 #include <string.h>
 
 #include "poly1305.h"
@@ -82,20 +85,24 @@ poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
   ctx->h4 = 0;
 }
 
-static void
-poly1305_block_internal (struct poly1305_ctx *ctx,
-			 uint32_t t0, uint32_t t1, uint32_t t2, uint32_t t3,
-			 uint32_t t4)
+void
+poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned t4)
 {
+  uint32_t t0,t1,t2,t3;
   uint32_t b;
   uint64_t t[5];
   uint64_t c;
 
+  t0 = LE_READ_UINT32(m);
+  t1 = LE_READ_UINT32(m+4);
+  t2 = LE_READ_UINT32(m+8);
+  t3 = LE_READ_UINT32(m+12);
+
   ctx->h0 += t0 & 0x3ffffff;
   ctx->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
   ctx->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
   ctx->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
-  ctx->h4 += (t3 >> 8) | (t4 << 24);
+  ctx->h4 += (t3 >> 8) | ((uint32_t) t4 << 24);
 
   /* poly1305_donna_mul: */
   t[0]  = mul32x32_64(ctx->h0,ctx->r0) + mul32x32_64(ctx->h1,ctx->s4) + mul32x32_64(ctx->h2,ctx->s3) + mul32x32_64(ctx->h3,ctx->s2) + mul32x32_64(ctx->h4,ctx->s1);
@@ -112,20 +119,6 @@ poly1305_block_internal (struct poly1305_ctx *ctx,
   ctx->h0 += b * 5;
 }
 
-void
-poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16])
-{
-  uint32_t t0,t1,t2,t3;
-
-  /* full blocks */
-  t0 = LE_READ_UINT32(m);
-  t1 = LE_READ_UINT32(m+4);
-  t2 = LE_READ_UINT32(m+8);
-  t3 = LE_READ_UINT32(m+12);
-
-  poly1305_block_internal (ctx, t0, t1, t2, t3, 1);
-}
-
 void
 poly1305_digest (struct poly1305_ctx *ctx,
  		 size_t length, uint8_t *digest,
@@ -140,20 +133,13 @@ poly1305_digest (struct poly1305_ctx *ctx,
   /* poly1305_donna_atmost15bytes: */
   if (ctx->index > 0)
     {
-      uint32_t t0,t1,t2,t3;
-      size_t j;
-      uint8_t mp[16];
-
-      for (j = 0; j < ctx->index; j++) mp[j] = ctx->block[j];
-      mp[j++] = 1;
-      for (; j < 16; j++)	mp[j] = 0;
+      assert (ctx->index < POLY1305_BLOCK_SIZE);
 
-      t0 = LE_READ_UINT32(mp);
-      t1 = LE_READ_UINT32(mp+4);
-      t2 = LE_READ_UINT32(mp+8);
-      t3 = LE_READ_UINT32(mp+12);
+      ctx->block[ctx->index] = 1;
+      memset (ctx->block + ctx->index + 1,
+	      0, POLY1305_BLOCK_SIZE - 1 - ctx->index);
 
-      poly1305_block_internal (ctx, t0, t1, t2, t3, 0);
+      poly1305_block (ctx, ctx->block, 0);
     }
 
   b = ctx->h0 >> 26; ctx->h0 = ctx->h0 & 0x3ffffff;
diff --git a/poly1305.c b/poly1305.c
index 76b00ff1499c1ec840625745d7699091b0304541..20f669f5846c92f315529a05eaf7717495d10ee0 100644
--- a/poly1305.c
+++ b/poly1305.c
@@ -28,8 +28,10 @@
 
 #include "macros.h"
 
+#define COMPRESS(ctx, data) poly1305_block((ctx), (data), 1)
+
 void
 poly1305_update (struct poly1305_ctx *ctx, size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, poly1305_block, (void) 0);
+  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
 }
diff --git a/poly1305.h b/poly1305.h
index 3257bf6303fae0d564046b7dda67ed5627424d00..3517d9c3dcaaeb1f56901d9ddfd52d9b3904a8a9 100644
--- a/poly1305.h
+++ b/poly1305.h
@@ -72,7 +72,8 @@ struct poly1305_ctx {
 };
 
 void poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[POLY1305_KEY_SIZE]);
-void poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[POLY1305_BLOCK_SIZE]);
+void poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[POLY1305_BLOCK_SIZE],
+		     unsigned high);
 void poly1305_update (struct poly1305_ctx *ctx, size_t size, const uint8_t *data);
 void poly1305_digest (struct poly1305_ctx *ctx,
 		      size_t length, uint8_t *digest, const uint8_t *s);
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
index 076b30d8db636cc951c2420e828af72c63f5162d..9b8ae0139449585869096d477666c5467f401d1c 100644
--- a/x86_64/poly1305-internal.asm
+++ b/x86_64/poly1305-internal.asm
@@ -75,12 +75,12 @@ C So we get
 C
 C  x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0)	
 
-	C poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16])
+	C poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi)
 	
 PROLOGUE(nettle_poly1305_block)
 	mov	(%rsi), T0
 	mov	8(%rsi), T1
-	mov	$1,	T2
+	mov	XREG(%rdx),	XREG(T2)
 C FIXME: Support windows ABI 
 	C Registers:
 	C Inputs:  CTX, T0, T1, T2,