diff --git a/ChangeLog b/ChangeLog
index ce820fc87ef32256a7abac30d6b1e80d1f9acd7f..dcd2a7f2776d4fbf57dc2e475340c4b709de39a0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2014-08-26  Niels Möller  <nisse@lysator.liu.se>
 
+	* ecc-25519.c (ecc_25519_modq): New function.
+
+	* eccdata.c (output_curve): Precomputation for curve25519 mod q.
+
 	* mini-gmp.c (mpz_abs_sub_bit): Do full normalization, needed in
 	case the most significant bit is cleared.
 
diff --git a/ecc-25519.c b/ecc-25519.c
index 6781475a9dd38d1247f002ef5de7cb0f10246733..3cbc60edee3024e94c5bed407c99edaab7118f49 100644
--- a/ecc-25519.c
+++ b/ecc-25519.c
@@ -35,6 +35,8 @@
 # include "config.h"
 #endif
 
+#include <assert.h>
+
 #include "ecc.h"
 #include "ecc-internal.h"
 
@@ -49,9 +51,9 @@ void
 ecc_25519_modp (const struct ecc_curve *ecc, mp_limb_t *rp);
 #else
 
-#define HIGH_BITS (GMP_NUMB_BITS * ECC_LIMB_SIZE - 255)
+#define PHIGH_BITS (GMP_NUMB_BITS * ECC_LIMB_SIZE - 255)
 
-#if HIGH_BITS == 0
+#if PHIGH_BITS == 0
 #error Unsupported limb size */
 #endif
 
@@ -61,15 +63,45 @@ ecc_25519_modp(const struct ecc_curve *ecc UNUSED, mp_limb_t *rp)
   mp_limb_t hi, cy;
 
   cy = mpn_addmul_1 (rp, rp + ECC_LIMB_SIZE, ECC_LIMB_SIZE,
-		     (mp_limb_t) 19 << HIGH_BITS);
+		     (mp_limb_t) 19 << PHIGH_BITS);
   hi = rp[ECC_LIMB_SIZE-1];
-  cy = (cy << HIGH_BITS) + (hi >> (GMP_NUMB_BITS - HIGH_BITS));
-  rp[ECC_LIMB_SIZE-1] = (hi & (GMP_NUMB_MASK >> HIGH_BITS))
+  cy = (cy << PHIGH_BITS) + (hi >> (GMP_NUMB_BITS - PHIGH_BITS));
+  rp[ECC_LIMB_SIZE-1] = (hi & (GMP_NUMB_MASK >> PHIGH_BITS))
     + sec_add_1 (rp, rp, ECC_LIMB_SIZE - 1, 19 * cy);
 }
-
 #endif /* HAVE_NATIVE_ecc_25519_modp */
 
+/* Bits of an ECC_LIMB_SIZE-limb number at position 252 and above. */
+#define QHIGH_BITS (GMP_NUMB_BITS * ECC_LIMB_SIZE - 252)
+#if QHIGH_BITS == 0
+#error Unsupported limb size
+#endif
+
+/* Reduces the 2*ECC_LIMB_SIZE limbs at rp mod q, in place. The final
+   step works on the low ECC_LIMB_SIZE limbs, which hold the result. */
+static void
+ecc_25519_modq (const struct ecc_curve *ecc, mp_limb_t *rp)
+{
+  mp_size_t n;
+  mp_limb_t cy;
+
+  /* n is the limb offset at which the next term is subtracted */
+  for (n = ECC_LIMB_SIZE; n-- > 0;)
+    {
+      cy = mpn_submul_1 (rp + n,
+			 ecc->Bmodq_shifted, ECC_LIMB_SIZE,
+			 rp[n + ECC_LIMB_SIZE]);
+      /* Top limb of mBmodq_shifted is zero, so we get cy == 0 or 1 */
+      assert (cy < 2);
+      cnd_add_n (cy, rp+n, ecc_q, ECC_LIMB_SIZE);
+    }
+
+  cy = mpn_submul_1 (rp, ecc_q, ECC_LIMB_SIZE,
+		     rp[ECC_LIMB_SIZE-1] >> (GMP_NUMB_BITS - QHIGH_BITS));
+  assert (cy < 2);
+  cnd_add_n (cy, rp, ecc_q, ECC_LIMB_SIZE);
+}
+
 /* Needs 2*ecc->size limbs at rp, and 2*ecc->size additional limbs of
    scratch space. No overlap allowed. */
 static void
@@ -218,7 +250,8 @@ const struct ecc_curve nettle_curve25519 =
   ecc_25519_modp,
   NULL,
   ecc_25519_modp,
-  NULL,
+  ecc_25519_modq,
+
 
   ecc_mul_a_eh,
   ecc_mul_g_eh,
@@ -235,8 +268,8 @@ const struct ecc_curve nettle_curve25519 =
   ecc_pp1h,
   ecc_redc_ppm1,
   ecc_unit,
-  ecc_Bmodq,
-  ecc_Bmodq_shifted,
+  ecc_Bmodq,  
+  ecc_mBmodq_shifted, /* Use q - 2^{252} instead. */ 
   ecc_qp1h,
   ecc_table
 };
diff --git a/eccdata.c b/eccdata.c
index cd2c1fb1e9d4b8add69d1c57655744ac3b70fb3e..9069e61082467bb099788373755bb1f9b9d71737 100644
--- a/eccdata.c
+++ b/eccdata.c
@@ -952,6 +952,28 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
   bits = output_modulo ("ecc_Bmodq", ecc->q, limb_size, bits_per_limb);
   printf ("#define ECC_BMODQ_SIZE %u\n",
 	  (bits + bits_per_limb - 1) / bits_per_limb);
+  bits = mpz_sizeinbase (ecc->q, 2);
+  if (bits < ecc->bit_size)
+    {
+      /* for curve25519, with q = 2^k + q', with a much smaller q' */
+      unsigned mbits;
+      unsigned shift;
+
+      /* Shift to align the one bit at B */
+      shift = bits_per_limb * limb_size + 1 - bits;
+      
+      mpz_set (t, ecc->q);
+      mpz_clrbit (t, bits-1);
+      mbits = mpz_sizeinbase (t, 2);
+
+      /* The shifted value must be a limb smaller than q. */
+      if (mbits + shift + bits_per_limb <= bits)
+	{
+	  /* q of the form 2^k + q', with q' a limb smaller */
+	  mpz_mul_2exp (t, t, shift);
+	  output_bignum ("ecc_mBmodq_shifted", t, limb_size, bits_per_limb);
+	}
+    }
 
   if (ecc->bit_size < limb_size * bits_per_limb)
     {