diff --git a/ChangeLog b/ChangeLog
index 1d476b2394377854a29079e2aa37ba7dc829c060..12e048893d186ac8215e66dd28a9f80cb73f888e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2014-08-18  Niels Möller  <nisse@lysator.liu.se>
+
+	* eccdata.c (output_curve): Compute constants needed for
+	Shanks-Tonelli.
+	* ecc-25519.c (ecc_modp_powm_2kp1, ecc_25519_sqrt): New functions.
+	* ecc-internal.h (ecc_25519_sqrt): Declare it.
+
 2014-08-06  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/curve25519-dh-test.c (test_g): Use curve25519_base.
diff --git a/ecc-25519.c b/ecc-25519.c
index 18f38964de02fee4e399ab6b871f6bc551314025..e6d402f26addfae31af4567fa26a4d722fab9d73 100644
--- a/ecc-25519.c
+++ b/ecc-25519.c
@@ -60,9 +60,135 @@ ecc_25519_modp(const struct ecc_curve *ecc UNUSED, mp_limb_t *rp)
     + sec_add_1 (rp, rp, ECC_LIMB_SIZE - 1, 19 * cy);
 }
 
-/* We'll also need square roots, see
-   http://www.math.vt.edu/people/brown/doc/sqrts.pdf for a description
-   of Shanks-Tonelli. A quadratic non-residue is 2. */
+/* Needs 2*ecc->size limbs at rp, and 2*ecc->size additional limbs of
+   scratch space. No overlap allowed. */
+static void
+ecc_modp_powm_2kp1 (const struct ecc_curve *ecc,
+		    mp_limb_t *rp, const mp_limb_t *xp,
+		    unsigned k, mp_limb_t *tp)
+{
+  if (k & 1)
+    {
+      ecc_modp_sqr (ecc, tp, xp);
+      k--;
+    }
+  else
+    {
+      ecc_modp_sqr (ecc, rp, xp);
+      ecc_modp_sqr (ecc, tp, rp);
+      k -= 2;
+    }
+  while (k > 0)
+    {
+      ecc_modp_sqr (ecc, rp, tp);
+      ecc_modp_sqr (ecc, tp, rp);
+      k -= 2;
+    }
+  ecc_modp_mul (ecc, rp, tp, xp);
+#undef t1
+#undef t2
+}
+
+/* Compute x such that x^2 = a (mod p). Returns one on success, zero
+   on failure. using the e == 2 special case of the Shanks-Tonelli
+   algorithm (see http://www.math.vt.edu/people/brown/doc/sqrts.pdf,
+   or Henri Cohen, Computational Algebraic Number Theory, 1.5.1.
+
+   NOTE: Not side-channel silent. FIXME: Compute square root in the
+   extended field if a isn't a square (mod p)? FIXME: Accept scratch
+   space from caller (could allow scratch == rp). */
+#if ECC_SQRT_E != 2
+#error Broken curve25519 parameters
+#endif
+int
+ecc_25519_sqrt(mp_limb_t *rp, const mp_limb_t *ap)
+{
+  mp_size_t itch;
+  mp_limb_t *scratch;
+  int res;
+  const struct ecc_curve *ecc = &nettle_curve25519;
+
+  itch = 7*ECC_LIMB_SIZE;
+  scratch = gmp_alloc_limbs (itch);
+
+#define t0 scratch
+#define a7 (scratch + 2*ECC_LIMB_SIZE)
+#define t1 (scratch + 3*ECC_LIMB_SIZE)
+#define t2 (scratch + 5*ECC_LIMB_SIZE)
+#define scratch_out (scratch + 3*ECC_LIMB_SIZE) /* overlap t1, t2 */
+
+#define xp (scratch + ECC_LIMB_SIZE)
+#define bp (scratch + 2*ECC_LIMB_SIZE)
+
+  /* a^{2^252 - 3} = a^{(p-5)/8}, using the addition chain
+     2^252 - 3
+     = 1 + (2^252-4)
+     = 1 + 4 (2^250-1)
+     = 1 + 4 (2^125+1)(2^125-1)
+     = 1 + 4 (2^125+1)(1+2(2^124-1))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^62-1))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(2^31-1))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(7+8(2^28-1)))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(7+8(2^14+1)(2^14-1)))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(7+8(2^14+1)(2^7+1)(2^7-1)))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(7+8(2^14+1)(2^7+1)(1+2(2^6-1))))
+     = 1 + 4 (2^125+1)(1+2(2^62+1)(2^31+1)(7+8(2^14+1)(2^7+1)(1+2(2^3+1)*7)))
+  */ 
+     
+  ecc_modp_powm_2kp1 (ecc, t1, ap, 1, t2);  /* a^3 */
+  ecc_modp_sqr (ecc, t0, t1);		    /* a^6 */
+  ecc_modp_mul (ecc, a7, t0, ap);	    /* a^7 */
+  ecc_modp_powm_2kp1 (ecc, t0, a7, 3, t1);  /* a^63 = a^{2^6-1} */
+  ecc_modp_sqr (ecc, t1, t0);		    /* a^{2^7-2} */
+  ecc_modp_mul (ecc, t0, t1, ap);	    /* a^{2^7-1} */
+  ecc_modp_powm_2kp1 (ecc, t1, t0, 7, t2);  /* a^{2^14-1}*/
+  ecc_modp_powm_2kp1 (ecc, t0, t1, 14, t2); /* a^{2^28-1} */
+  ecc_modp_sqr (ecc, t1, t0);		    /* a^{2^29-2} */
+  ecc_modp_sqr (ecc, t2, t1);		    /* a^{2^30-4} */
+  ecc_modp_sqr (ecc, t1, t2);		    /* a^{2^31-8} */
+  ecc_modp_mul (ecc, t0, t1, a7);	    /* a^{2^31-1} */
+  ecc_modp_powm_2kp1 (ecc, t1, t0, 31, t2); /* a^{2^62-1} */  
+  ecc_modp_powm_2kp1 (ecc, t0, t1, 62, t2); /* a^{2^124-1}*/
+  ecc_modp_sqr (ecc, t1, t0);		    /* a^{2^125-2} */
+  ecc_modp_mul (ecc, t0, t1, ap);	    /* a^{2^125-1} */
+  ecc_modp_powm_2kp1 (ecc, t1, t0, 125, t2); /* a^{2^250-1} */
+  ecc_modp_sqr (ecc, t0, t1);		    /* a^{2^251-2} */
+  ecc_modp_sqr (ecc, t1, t0);		    /* a^{2^252-4} */
+  ecc_modp_mul (ecc, t0, t1, ap);	    /* a^{2^252-3} */
+
+  /* Compute candidate root x and fudgefactor b. */
+  ecc_modp_mul (ecc, xp, t0, ap); /* a^{(p+3)/8 */
+  ecc_modp_mul (ecc, bp, t0, xp); /* a^{(p-1)/4} */
+  /* Check if b == 1 (mod p) */
+  if (mpn_cmp (bp, ecc->p, ECC_LIMB_SIZE) >= 0)
+    mpn_sub_n (bp, bp, ecc->p, ECC_LIMB_SIZE);
+  if (mpn_cmp (bp, ecc->unit, ECC_LIMB_SIZE) == 0)
+    {
+      mpn_copyi (rp, xp, ECC_LIMB_SIZE);
+      res = 1;
+    }
+  else
+    {
+      mpn_add_1 (bp, bp, ECC_LIMB_SIZE, 1);
+      if (mpn_cmp (bp, ecc->p, ECC_LIMB_SIZE) == 0)
+	{
+	  ecc_modp_mul (&nettle_curve25519, bp, xp, ecc_sqrt_z);
+	  mpn_copyi (rp, bp, ECC_LIMB_SIZE);
+	  res = 1;
+	}
+      else
+	res = 0;
+    }
+  gmp_free_limbs (scratch, itch);
+  return res;
+#undef t0
+#undef t1
+#undef t2
+#undef a7
+#undef xp
+#undef bp
+#undef scratch_out
+}
 
 const struct ecc_curve nettle_curve25519 =
 {
diff --git a/ecc-internal.h b/ecc-internal.h
index 2ed15ca7e6b24eeacc10c2b908e6bd954aa951d4..2589eb608cb5604e46afc687372595a153e28c9e 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -63,6 +63,7 @@
 #define sec_sub_1 _nettle_sec_sub_1
 #define sec_tabselect _nettle_sec_tabselect
 #define sec_modinv _nettle_sec_modinv
+#define ecc_25519_sqrt _nettle_ecc_25519_sqrt
 
 #define ECC_MAX_SIZE ((521 + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS)
 
@@ -237,6 +238,9 @@ sec_modinv (mp_limb_t *vp, mp_limb_t *ap, mp_size_t n,
 	    const mp_limb_t *mp, const mp_limb_t *mp1h, mp_size_t bit_size,
 	    mp_limb_t *scratch);
 
+int
+ecc_25519_sqrt(mp_limb_t *rp, const mp_limb_t *ap);
+
 /* Current scratch needs: */
 #define ECC_MODINV_ITCH(size) (3*(size))
 #define ECC_J_TO_A_ITCH(size) (5*(size))
diff --git a/eccdata.c b/eccdata.c
index 297e32dd257a29e94ecdce002ee5179f68677588..cd2c1fb1e9d4b8add69d1c57655744ac3b70fb3e 100644
--- a/eccdata.c
+++ b/eccdata.c
@@ -926,7 +926,7 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
 {
   unsigned limb_size = (ecc->bit_size + bits_per_limb - 1)/bits_per_limb;
   unsigned i;
-  unsigned bits;
+  unsigned bits, e;
   int redc_limbs;
   mpz_t t;
 
@@ -1034,6 +1034,62 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
     }
   printf ("#define ECC_REDC_SIZE %d\n", redc_limbs);
 
+  /* For mod p square root computation. */
+  if (mpz_fdiv_ui (ecc->p, 4) == 3)
+    {
+      /* x = a^{(p+1)/4} gives square root of a (if it exists,
+	 otherwise the square root of -a). */
+      e = 1;
+      mpz_add_ui (t, ecc->p, 1);
+      mpz_fdiv_q_2exp (t, t, 2); 
+    }
+  else
+    {
+      /* p-1 = 2^e s, s odd, t = (s-1)/2*/
+      unsigned g, i;
+      mpz_t s;
+      mpz_t z;
+
+      mpz_init (s);
+      mpz_init (z);
+
+      mpz_sub_ui (s, ecc->p, 1);
+      e = mpz_scan1 (s, 0);
+      assert (e > 1);
+
+      mpz_fdiv_q_2exp (s, s, e);
+
+      /* Find a non-square g, g^{(p-1)/2} = -1,
+	 and z = g^{(p-1)/4 */
+      for (g = 2; ; g++)
+	{
+	  mpz_set_ui (z, g);
+	  mpz_powm (z, z, s, ecc->p);
+	  mpz_mul (t, z, z);
+	  mpz_mod (t, t, ecc->p);
+
+	  for (i = 2; i < e; i++)
+	    {
+	      mpz_mul (t, t, t);
+	      mpz_mod (t, t, ecc->p);
+	    }
+	  if (mpz_cmp_ui (t, 1) != 0)
+	    break;
+	}
+      mpz_add_ui (t, t, 1);
+      assert (mpz_cmp (t, ecc->p) == 0);
+      output_bignum ("ecc_sqrt_z", z, limb_size, bits_per_limb);
+
+      mpz_fdiv_q_2exp (t, s, 1);
+
+      mpz_clear (s);
+      mpz_clear (z);
+    }
+  printf ("#define ECC_SQRT_E %u\n", e);
+  printf ("#define ECC_SQRT_T_BITS %u\n",
+	  (unsigned) mpz_sizeinbase (t, 2));
+  output_bignum ("ecc_sqrt_t", t, limb_size, bits_per_limb);      
+
   printf ("#if USE_REDC\n");
   printf ("#define ecc_unit ecc_Bmodp\n");