From a3328c58174827dbec966748a074db439d8e151a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Tue, 23 Sep 2014 13:14:02 +0200
Subject: [PATCH] ecc_mod_inv interface and allocation changes.

---
 ChangeLog                   | 14 ++++++++++++++
 curve25519-eh-to-x.c        |  4 ++--
 curve25519-mul.c            |  2 +-
 ecc-ecdsa-sign.c            |  5 ++---
 ecc-ecdsa-verify.c          | 32 ++++++++++++++++----------------
 ecc-eh-to-a.c               |  5 ++---
 ecc-internal.h              |  4 ++--
 ecc-j-to-a.c                |  7 ++-----
 ecc-mod-inv.c               | 33 +++++++++++----------------------
 examples/ecc-benchmark.c    |  3 +--
 testsuite/ecc-modinv-test.c |  7 +++----
 11 files changed, 56 insertions(+), 60 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 2d084646..0c97135f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2014-09-23  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-mod-inv.c (ecc_mod_inv): Interface change, make ap input
+	const, and require 2n limbs at rp. Preparing for powm-based
+	alternative implementations. Drop #if:ed out code and dp
+	temporary. Updated all callers, more complicated cases described
+	below.
+	* ecc-internal.h (typedef ecc_mod_inv_func): Added const to input
+	argument.
+	(ECC_MOD_INV_ITCH): Renamed, was ECC_MODINV_ITCH, and reduced to
+	2*n.
+	* ecc-ecdsa-verify.c (ecc_ecdsa_verify): Overhauled allocation,
+	putting mod_inv scratch at the end.
+
 2014-09-22  Niels Möller  <nisse@lysator.liu.se>
 
 	* ecc-random.c (ecc_mod_random): Renamed, and take a const struct
diff --git a/curve25519-eh-to-x.c b/curve25519-eh-to-x.c
index 47f07129..8ba221f5 100644
--- a/curve25519-eh-to-x.c
+++ b/curve25519-eh-to-x.c
@@ -65,8 +65,8 @@ curve25519_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
      x = 0, and we should be fine, since ecc_modp_inv returns 0
      in this case. */
   ecc_modp_sub (ecc, t0, wp, vp);
-  /* Needs 3*size scratch, for a total of 5*size */
-  ecc->p.invert (&ecc->p, t1, t0, t2);
+  /* Needs a total of 5*size storage. */
+  ecc->p.invert (&ecc->p, t1, t0, t2 + ecc->p.size);
   
   ecc_modp_add (ecc, t0, wp, vp);
   ecc_modp_mul (ecc, t2, t0, t1);
diff --git a/curve25519-mul.c b/curve25519-mul.c
index f4e4fa7b..0e280244 100644
--- a/curve25519-mul.c
+++ b/curve25519-mul.c
@@ -132,7 +132,7 @@ curve25519_mul (uint8_t *q, const uint8_t *n, const uint8_t *p)
       ecc_modp_addmul_1 (ecc, AA, E, 121665);
       ecc_modp_mul (ecc, z2, E, AA);      
     }
-  ecc->p.invert (&ecc->p, x3, z2, z3);
+  ecc->p.invert (&ecc->p, x3, z2, z3 + ecc->p.size);
   ecc_modp_mul (ecc, z3, x2, x3);
   cy = mpn_sub_n (x2, z3, ecc->p.m, ecc->p.size);
   cnd_copy (cy, x2, z3, ecc->p.size);
diff --git a/ecc-ecdsa-sign.c b/ecc-ecdsa-sign.c
index de17ac4c..57e05a41 100644
--- a/ecc-ecdsa-sign.c
+++ b/ecc-ecdsa-sign.c
@@ -82,9 +82,8 @@ ecc_ecdsa_sign (const struct ecc_curve *ecc,
   /* x coordinate only, modulo q */
   ecc->h_to_a (ecc, 2, rp, P, P + 3*ecc->p.size);
 
-  /* Invert k, uses 5 * ecc->p.size including scratch */
-  mpn_copyi (hp, kp, ecc->p.size);
-  ecc->q.invert (&ecc->q, kinv, hp, tp);
+  /* Invert k, uses 4 * ecc->p.size including scratch */
+  ecc->q.invert (&ecc->q, kinv, kp, tp); /* NOTE: Also clobbers hp */
   
   /* Process hash digest */
   ecc_hash (ecc, hp, length, digest);
diff --git a/ecc-ecdsa-verify.c b/ecc-ecdsa-verify.c
index a259962e..a60c89d6 100644
--- a/ecc-ecdsa-verify.c
+++ b/ecc-ecdsa-verify.c
@@ -92,11 +92,12 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
   */
 
 #define P2 scratch
-#define P1 (scratch + 3*ecc->p.size)
-#define sinv (scratch + 3*ecc->p.size)
+#define u1 (scratch + 3*ecc->p.size)
 #define u2 (scratch + 4*ecc->p.size)
-#define hp (scratch + 4*ecc->p.size)
-#define u1 (scratch + 6*ecc->p.size)
+
+#define P1 (scratch + 4*ecc->p.size)
+#define sinv (scratch)
+#define hp (scratch + ecc->p.size)
 
   if (! (ecdsa_in_range (ecc, rp)
 	 && ecdsa_in_range (ecc, sp)))
@@ -105,10 +106,13 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
   /* FIXME: Micro optimizations: Either simultaneous multiplication.
      Or convert to projective coordinates (can be done without
      division, I think), and write an ecc_add_ppp. */
-  
-  /* Compute sinv, use P2 as scratch */
-  mpn_copyi (sinv + ecc->p.size, sp, ecc->p.size);
-  ecc->q.invert (&ecc->q, sinv, sinv + ecc->p.size, P2);
+
+  /* Compute sinv */
+  ecc->q.invert (&ecc->q, sinv, sp, sinv + 2*ecc->p.size);
+
+  /* u1 = h / s, P1 = u1 * G */
+  ecc_hash (ecc, hp, length, digest);
+  ecc_modq_mul (ecc, u1, hp, sinv);
 
   /* u2 = r / s, P2 = u2 * Y */
   ecc_modq_mul (ecc, u2, rp, sinv);
@@ -116,16 +120,12 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
    /* Total storage: 5*ecc->p.size + ecc->mul_itch */
   ecc->mul (ecc, P2, u2, pp, u2 + ecc->p.size);
 
-  /* u1 = h / s, P1 = u1 * G */
-  ecc_hash (ecc, hp, length, digest);
-  ecc_modq_mul (ecc, u1, hp, sinv);
-
   /* u = 0 can happen only if h = 0 or h = q, which is extremely
      unlikely. */
   if (!zero_p (u1, ecc->p.size))
     {
-      /* Total storage: 6*ecc->p.size + ecc->mul_g_itch (ecc->p.size) */
-      ecc->mul_g (ecc, P1, u1, u1 + ecc->p.size);
+      /* Total storage: 7*ecc->p.size + ecc->mul_g_itch (ecc->p.size) */
+      ecc->mul_g (ecc, P1, u1, P1 + 3*ecc->p.size);
 
       /* NOTE: ecc_add_jjj and/or ecc_j_to_a will produce garbage in
 	 case u1 G = +/- u2 V. However, anyone who gets his or her
@@ -142,10 +142,10 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
 	 private key by guessing.
        */
       /* Total storage: 6*ecc->p.size + ecc->add_hhh_itch */
-      ecc->add_hhh (ecc, P1, P1, P2, u1);
+      ecc->add_hhh (ecc, P1, P1, P2, P1 + 3*ecc->p.size);
     }
   /* x coordinate only, modulo q */
-  ecc->h_to_a (ecc, 2, P2, P1, u1);
+  ecc->h_to_a (ecc, 2, P2, P1, P1 + 3*ecc->p.size);
 
   return (mpn_cmp (rp, P2, ecc->p.size) == 0);
 #undef P2
diff --git a/ecc-eh-to-a.c b/ecc-eh-to-a.c
index a0182308..4cfcad3b 100644
--- a/ecc-eh-to-a.c
+++ b/ecc-eh-to-a.c
@@ -63,9 +63,8 @@ ecc_eh_to_a (const struct ecc_curve *ecc,
 
   mp_limb_t cy;
 
-  mpn_copyi (tp, zp, ecc->p.size);
-  /* Needs 3*size scratch */
-  ecc->p.invert (&ecc->p, izp, tp, tp + ecc->p.size);
+  /* Needs 2*size scratch */
+  ecc->p.invert (&ecc->p, izp, zp, tp + ecc->p.size);
 
   ecc_modp_mul (ecc, tp, xp, izp);
   cy = mpn_sub_n (r, tp, ecc->p.m, ecc->p.size);
diff --git a/ecc-internal.h b/ecc-internal.h
index b96751f8..fe9acfb3 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -80,7 +80,7 @@ struct ecc_modulo;
 typedef void ecc_mod_func (const struct ecc_modulo *m, mp_limb_t *rp);
 
 typedef void ecc_mod_inv_func (const struct ecc_modulo *m,
-			       mp_limb_t *vp, mp_limb_t *ap,
+			       mp_limb_t *vp, const mp_limb_t *ap,
 			       mp_limb_t *scratch);
 
 typedef void ecc_add_func (const struct ecc_curve *ecc,
@@ -263,7 +263,7 @@ curve25519_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
 		    mp_limb_t *scratch);
 
 /* Current scratch needs: */
-#define ECC_MODINV_ITCH(size) (3*(size))
+#define ECC_MOD_INV_ITCH(size) (2*(size))
 #define ECC_J_TO_A_ITCH(size) (5*(size))
 #define ECC_EH_TO_A_ITCH(size) (4*(size))
 #define ECC_DUP_JJ_ITCH(size) (5*(size))
diff --git a/ecc-j-to-a.c b/ecc-j-to-a.c
index 8c52eb36..2e48b94d 100644
--- a/ecc-j-to-a.c
+++ b/ecc-j-to-a.c
@@ -52,7 +52,7 @@ ecc_j_to_a (const struct ecc_curve *ecc,
 	    mp_limb_t *scratch)
 {
 #define izp   scratch
-#define up   (scratch + ecc->p.size)
+#define up   (scratch + 2*ecc->p.size)
 #define iz2p (scratch + ecc->p.size)
 #define iz3p (scratch + 2*ecc->p.size)
 #define izBp (scratch + 3*ecc->p.size)
@@ -65,10 +65,7 @@ ecc_j_to_a (const struct ecc_curve *ecc,
       /* Set v = (r_z / B^2)^-1,
 
 	 r_x = p_x v^2 / B^3 =  ((v/B * v)/B * p_x)/B
-	 r_y = p_y v^3 / B^4 = (((v/B * v)/B * v)/B * p_x)/B
-
-	 Skip the first redc, if we want to stay in Montgomery
-	 representation.
+	 r_y = p_y v^3 / B^4 = (((v/B * v)/B * v)/B * p_y)/B
       */
 
       mpn_copyi (up, p + 2*ecc->p.size, ecc->p.size);
diff --git a/ecc-mod-inv.c b/ecc-mod-inv.c
index 22e533f2..f65c9da4 100644
--- a/ecc-mod-inv.c
+++ b/ecc-mod-inv.c
@@ -56,18 +56,21 @@ cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
 
 /* Compute a^{-1} mod m, with running time depending only on the size.
    Returns zero if a == 0 (mod m), to be consistent with a^{phi(m)-1}.
-   Also needs (m+1)/2, and m must be odd. */
+   Also needs (m+1)/2, and m must be odd.
+
+   Needs 2n limbs available at rp, and 2n additional scratch limbs.
+*/
 
 /* FIXME: Could use mpn_sec_invert (in GMP-6), but with a bit more
    scratch need since it doesn't precompute (m+1)/2. */
 void
 ecc_mod_inv (const struct ecc_modulo *m,
-	     mp_limb_t *vp, mp_limb_t *ap,
+	     mp_limb_t *vp, const mp_limb_t *in_ap,
 	     mp_limb_t *scratch)
 {
-#define bp scratch
-#define dp (scratch + n)
-#define up (scratch + 2*n)
+#define ap scratch
+#define bp (scratch + n)
+#define up (vp + n)
 
   mp_size_t n = m->size;
   /* Avoid the mp_bitcnt_t type for compatibility with older GMP
@@ -91,6 +94,7 @@ ecc_mod_inv (const struct ecc_modulo *m,
   mpn_zero (up+1, n - 1);
   mpn_copyi (bp, m->m, n);
   mpn_zero (vp, n);
+  mpn_copyi (ap, in_ap, n);
 
   for (i = m->bit_size + GMP_NUMB_BITS * n; i-- > 0; )
     {
@@ -134,29 +138,14 @@ ecc_mod_inv (const struct ecc_modulo *m,
       assert (bp[0] & 1);
       odd = ap[0] & 1;
 
-      /* Which variant is fastest depends on the speed of the various
-	 cnd_* functions. Assembly implementation would help. */
-#if 1
       swap = cnd_sub_n (odd, ap, bp, n);
       cnd_add_n (swap, bp, ap, n);
       cnd_neg (swap, ap, ap, n);
-#else
-      swap = odd & mpn_sub_n (dp, ap, bp, n);
-      cnd_copy (swap, bp, ap, n);
-      cnd_neg (swap, dp, dp, n);
-      cnd_copy (odd, ap, dp, n);
-#endif
 
-#if 1
       cnd_swap (swap, up, vp, n);
       cy = cnd_sub_n (odd, up, vp, n);
       cy -= cnd_add_n (cy, up, m->m, n);
-#else
-      cy = cnd_sub_n (odd, up, vp, n);
-      cnd_add_n (swap, vp, up, n);
-      cnd_neg (swap, up, up, n);
-      cnd_add_n (cy ^ swap, up, m->p, n);
-#endif
+
       cy = mpn_rshift (ap, ap, n, 1);
       assert (cy == 0);
       cy = mpn_rshift (up, up, n, 1);
@@ -164,7 +153,7 @@ ecc_mod_inv (const struct ecc_modulo *m,
       assert (cy == 0);
     }
   assert ( (ap[0] | ap[n-1]) == 0);
+#undef ap
 #undef bp
-#undef dp
 #undef up
 }
diff --git a/examples/ecc-benchmark.c b/examples/ecc-benchmark.c
index 0ecf3658..edf30fc2 100644
--- a/examples/ecc-benchmark.c
+++ b/examples/ecc-benchmark.c
@@ -173,8 +173,7 @@ static void
 bench_modinv (void *p)
 {
   struct ecc_ctx *ctx = (struct ecc_ctx *) p;
-  mpn_copyi (ctx->rp + ctx->ecc->p.size, ctx->ap, ctx->ecc->p.size);
-  ctx->ecc->p.invert (&ctx->ecc->p, ctx->rp, ctx->rp + ctx->ecc->p.size, ctx->tp);
+  ctx->ecc->p.invert (&ctx->ecc->p, ctx->rp, ctx->ap, ctx->tp);
 }
 
 #if !NETTLE_USE_MINI_GMP
diff --git a/testsuite/ecc-modinv-test.c b/testsuite/ecc-modinv-test.c
index ea49c2b6..2faefc86 100644
--- a/testsuite/ecc-modinv-test.c
+++ b/testsuite/ecc-modinv-test.c
@@ -42,9 +42,9 @@ test_modulo (gmp_randstate_t rands, const char *name,
 	     const struct ecc_modulo *m)
 {
   mp_limb_t a[MAX_ECC_SIZE];
-  mp_limb_t ai[MAX_ECC_SIZE];
+  mp_limb_t ai[2*MAX_ECC_SIZE];
   mp_limb_t ref[MAX_ECC_SIZE];
-  mp_limb_t scratch[ECC_MODINV_ITCH (MAX_ECC_SIZE)];
+  mp_limb_t scratch[ECC_MOD_INV_ITCH (MAX_ECC_SIZE)];
   unsigned j;
   mpz_t r;
 
@@ -66,9 +66,8 @@ test_modulo (gmp_randstate_t rands, const char *name,
     }
 	  
   /* Check behaviour for a = m */
-  mpn_copyi (a, m->m, m->size);
   memset (ai, 17, m->size * sizeof(*ai));
-  m->invert (m, ai, a, scratch);
+  m->invert (m, ai, m->m, scratch);
   if (!mpn_zero_p (ai, m->size))
     {
       fprintf (stderr, "%s->invert failed for a = p input (bit size %u):\n",
-- 
GitLab