remove W array from s_mp_mul_comba and s_mp_sqr_comba

minad · minad · commit 827e6d1ee690 · 2019-11-06T08:56:06.000+01:00
remove calls to comba from s_mp_mul and s_mp_mul_high

TODO:
* Remove remaining W arrays
* Replace the remaining mp_exch/mp_clear pairs with mp_clear followed by a plain struct copy
* Check whether more mp_init* calls can be replaced with the MP_ALIAS/mp_init_size/mp_grow optimization
diff --git a/etc/tune.c b/etc/tune.c
@@ -58,7 +58,15 @@ static int s_number_of_test_loops;
 static int s_stabilization_extra;
 static int s_offset = 1;
 
-#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)
+static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c) /* c = a * b, asking for a->used + b->used + 1 digits */
+{
+   if (MP_HAS(S_MP_MUL_COMBA) /* gate on the routine actually called below, not the high-digit variant */
+       && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
+      return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
+   }
+   return s_mp_mul(a, b, c, a->used + b->used + 1); /* s_mp_mul no longer dispatches to comba on its own */
+}
+
 static uint64_t s_time_mul(int size)
 {
    int x;
@@ -87,7 +95,7 @@ static uint64_t s_time_mul(int size)
          goto LBL_ERR;
       }
       if (s_check_result == 1) {
-         if ((e = s_mp_mul_full(&a,&b,&d)) != MP_OKAY) {
+         if ((e = s_mul_full(&a,&b,&d)) != MP_OKAY) {
             t1 = UINT64_MAX;
             goto LBL_ERR;
          }
diff --git a/mp_mul.c b/mp_mul.c
@@ -31,14 +31,7 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
    } else if (MP_HAS(S_MP_MUL_KARATSUBA) &&
               (min >= MP_MUL_KARATSUBA_CUTOFF)) {
       err = s_mp_mul_karatsuba(a, b, c);
-   } else if (MP_HAS(S_MP_MUL_COMBA) &&
-              /* can we use the fast multiplier?
-               *
-               * The fast multiplier can be used if the output will
-               * have less than MP_WARRAY digits and the number of
-               * digits won't affect carry propagation
-               */
-              (digs < MP_WARRAY) &&
+   } else if (MP_HAS(S_MP_MUL_COMBA) && /* can we use the fast multiplier? */
               (min <= MP_MAX_COMBA)) {
       err = s_mp_mul_comba(a, b, c, digs);
    } else if (MP_HAS(S_MP_MUL)) {
diff --git a/mp_reduce.c b/mp_reduce.c
@@ -23,17 +23,11 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
 
    /* according to HAC this optimization is ok */
    if ((mp_digit)um > ((mp_digit)1 << (MP_DIGIT_BIT - 1))) {
-      if ((err = mp_mul(&q, mu, &q)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-   } else if (MP_HAS(S_MP_MUL_HIGH)) {
-      if ((err = s_mp_mul_high(&q, mu, &q, um)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
+      if ((err = mp_mul(&q, mu, &q)) != MP_OKAY)                    goto LBL_ERR;
    } else if (MP_HAS(S_MP_MUL_HIGH_COMBA)) {
-      if ((err = s_mp_mul_high_comba(&q, mu, &q, um)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
+      if ((err = s_mp_mul_high_comba(&q, mu, &q, um)) != MP_OKAY)   goto LBL_ERR;
+   } else if (MP_HAS(S_MP_MUL_HIGH)) {
+      if ((err = s_mp_mul_high(&q, mu, &q, um)) != MP_OKAY)         goto LBL_ERR;
    } else {
       err = MP_VAL;
       goto LBL_ERR;
@@ -43,41 +37,33 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
    mp_rshd(&q, um + 1);
 
    /* x = x mod b**(k+1), quick (no division) */
-   if ((err = mp_mod_2d(x, MP_DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
+   if ((err = mp_mod_2d(x, MP_DIGIT_BIT * (um + 1), x)) != MP_OKAY) goto LBL_ERR;
 
    /* q = q * m mod b**(k+1), quick (no division) */
-   if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY) {
-      goto LBL_ERR;
+   if (MP_HAS(S_MP_MUL_COMBA)
+       && (MP_MIN(q.used, m->used) < MP_MAX_COMBA)) {
+      if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY)     goto LBL_ERR;
+   } else {
+      if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY)           goto LBL_ERR;
    }
 
    /* x = x - q */
-   if ((err = mp_sub(x, &q, x)) != MP_OKAY) {
-      goto LBL_ERR;
-   }
+   if ((err = mp_sub(x, &q, x)) != MP_OKAY)                         goto LBL_ERR;
 
    /* If x < 0, add b**(k+1) to it */
    if (mp_cmp_d(x, 0uL) == MP_LT) {
       mp_set(&q, 1uL);
-      if ((err = mp_lshd(&q, um + 1)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-      if ((err = mp_add(x, &q, x)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
+      if ((err = mp_lshd(&q, um + 1)) != MP_OKAY)                   goto LBL_ERR;
+      if ((err = mp_add(x, &q, x)) != MP_OKAY)                      goto LBL_ERR;
    }
 
    /* Back off if it's too big */
    while (mp_cmp(x, m) != MP_LT) {
-      if ((err = s_mp_sub(x, m, x)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
+      if ((err = s_mp_sub(x, m, x)) != MP_OKAY)                     goto LBL_ERR;
    }
 
 LBL_ERR:
    mp_clear(&q);
-
    return err;
 }
 #endif
diff --git a/mp_sqr.c b/mp_sqr.c
@@ -14,7 +14,6 @@ mp_err mp_sqr(const mp_int *a, mp_int *b)
               (a->used >= MP_SQR_KARATSUBA_CUTOFF)) {
       err = s_mp_sqr_karatsuba(a, b);
    } else if (MP_HAS(S_MP_SQR_COMBA) && /* can we use the fast comba multiplier? */
-              (((a->used * 2) + 1) < MP_WARRAY) &&
               (a->used < (MP_MAX_COMBA / 2))) {
       err = s_mp_sqr_comba(a, b);
    } else if (MP_HAS(S_MP_SQR)) {
diff --git a/s_mp_mul.c b/s_mp_mul.c
@@ -9,20 +9,20 @@
  */
 mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
-   mp_int  t;
+   mp_int  tmp, *c_;
    mp_err  err;
    int     pa, ix;
 
-   /* can we use the fast multiplier? */
-   if ((digs < MP_WARRAY) &&
-       (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
-      return s_mp_mul_comba(a, b, c, digs);
-   }
-
-   if ((err = mp_init_size(&t, digs)) != MP_OKAY) {
+   /* prepare the destination */
+   err = (MP_ALIAS(a, c) || MP_ALIAS(b, c))
+         ? mp_init_size((c_ = &tmp), digs)
+         : mp_grow((c_ = c), digs);
+   if (err != MP_OKAY) {
       return err;
    }
-   t.used = digs;
+
+   s_mp_zero_digs(c_->dp, c_->used);
+   c_->used = digs;
 
    /* compute the digits of the product directly */
    pa = a->used;
@@ -36,26 +36,29 @@ mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs)
       /* compute the columns of the output and propagate the carry */
       for (iy = 0; iy < pb; iy++) {
          /* compute the column as a mp_word */
-         mp_word r = (mp_word)t.dp[ix + iy] +
+         mp_word r = (mp_word)c_->dp[ix + iy] +
                      ((mp_word)a->dp[ix] * (mp_word)b->dp[iy]) +
                      (mp_word)u;
 
          /* the new column is the lower part of the result */
-         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
+         c_->dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
 
          /* get the carry word from the result */
          u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
       }
       /* set carry if it is placed below digs */
       if ((ix + iy) < digs) {
-         t.dp[ix + pb] = u;
+         c_->dp[ix + pb] = u;
       }
    }
 
-   mp_clamp(&t);
-   mp_exch(&t, c);
+   mp_clamp(c_);
+
+   if (c_ == &tmp) {
+      mp_clear(c);
+      *c = *c_;
+   }
 
-   mp_clear(&t);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c
@@ -23,19 +23,22 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
    int      oldused, pa, ix;
    mp_err   err;
-   mp_digit W[MP_WARRAY];
-   mp_word  _W;
+   mp_word  W;
+   mp_int   tmp, *c_;
 
-   /* grow the destination as required */
-   if ((err = mp_grow(c, digs)) != MP_OKAY) {
+   /* prepare the destination */
+   err = (MP_ALIAS(a, c) || MP_ALIAS(b, c))
+         ? mp_init_size((c_ = &tmp), digs)
+         : mp_grow((c_ = c), digs);
+   if (err != MP_OKAY) {
       return err;
    }
 
    /* number of output digits to produce */
    pa = MP_MIN(digs, a->used + b->used);
 
    /* clear the carry */
-   _W = 0;
+   W = 0;
    for (ix = 0; ix < pa; ix++) {
       int tx, ty, iy, iz;
 
@@ -50,29 +53,30 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 
       /* execute loop */
       for (iz = 0; iz < iy; ++iz) {
-         _W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
+         W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
       }
 
       /* store term */
-      W[ix] = (mp_digit)_W & MP_MASK;
+      c_->dp[ix] = (mp_digit)W & MP_MASK;
 
       /* make next carry */
-      _W = _W >> (mp_word)MP_DIGIT_BIT;
+      W = W >> (mp_word)MP_DIGIT_BIT;
    }
 
    /* setup dest */
-   oldused  = c->used;
-   c->used = pa;
-
-   for (ix = 0; ix < pa; ix++) {
-      /* now extract the previous digit [below the carry] */
-      c->dp[ix] = W[ix];
-   }
+   oldused  = c_->used;
+   c_->used = pa;
 
    /* clear unused digits [that existed in the old copy of c] */
-   s_mp_zero_digs(c->dp + c->used, oldused - c->used);
+   s_mp_zero_digs(c_->dp + c_->used, oldused - c_->used);
+
+   mp_clamp(c_);
+
+   if (c_ == &tmp) {
+      mp_clear(c);
+      *c = *c_;
+   }
 
-   mp_clamp(c);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_mul_high.c b/s_mp_mul_high.c
@@ -12,13 +12,6 @@ mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs)
    int      pa, pb, ix;
    mp_err   err;
 
-   /* can we use the fast multiplier? */
-   if (MP_HAS(S_MP_MUL_HIGH_COMBA)
-       && ((a->used + b->used + 1) < MP_WARRAY)
-       && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
-      return s_mp_mul_high_comba(a, b, c, digs);
-   }
-
    if ((err = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
       return err;
    }
diff --git a/s_mp_sqr.c b/s_mp_sqr.c
@@ -6,29 +6,36 @@
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
 mp_err s_mp_sqr(const mp_int *a, mp_int *b)
 {
-   mp_int   t;
+   mp_int   tmp, *b_;
    int      ix, pa;
    mp_err   err;
 
    pa = a->used;
-   if ((err = mp_init_size(&t, (2 * pa) + 1)) != MP_OKAY) {
+
+   /* prepare the destination */
+   err = MP_ALIAS(a, b)
+         ? mp_init_size((b_ = &tmp), (2 * pa) + 1)
+         : mp_grow((b_ = b), (2 * pa + 1));
+   if (err != MP_OKAY) {
       return err;
    }
 
+   s_mp_zero_digs(b_->dp, b_->used);
+
    /* default used is maximum possible size */
-   t.used = (2 * pa) + 1;
+   b_->used = (2 * pa) + 1;
 
    for (ix = 0; ix < pa; ix++) {
       mp_digit u;
       int iy;
 
       /* first calculate the digit at 2*ix */
       /* calculate double precision result */
-      mp_word r = (mp_word)t.dp[2*ix] +
+      mp_word r = (mp_word)b_->dp[2*ix] +
                   ((mp_word)a->dp[ix] * (mp_word)a->dp[ix]);
 
       /* store lower part in result */
-      t.dp[ix+ix] = (mp_digit)(r & (mp_word)MP_MASK);
+      b_->dp[ix+ix] = (mp_digit)(r & (mp_word)MP_MASK);
 
       /* get the carry */
       u           = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
@@ -40,26 +47,30 @@ mp_err s_mp_sqr(const mp_int *a, mp_int *b)
          /* now calculate the double precision result, note we use
           * addition instead of *2 since it's easier to optimize
           */
-         r       = (mp_word)t.dp[ix + iy] + r + r + (mp_word)u;
+         r       = (mp_word)b_->dp[ix + iy] + r + r + (mp_word)u;
 
          /* store lower part */
-         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
+         b_->dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
 
          /* get carry */
          u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
       }
       /* propagate upwards */
       while (u != 0uL) {
-         r       = (mp_word)t.dp[ix + iy] + (mp_word)u;
-         t.dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
+         r       = (mp_word)b_->dp[ix + iy] + (mp_word)u;
+         b_->dp[ix + iy] = (mp_digit)(r & (mp_word)MP_MASK);
          u       = (mp_digit)(r >> (mp_word)MP_DIGIT_BIT);
          ++iy;
       }
    }
 
-   mp_clamp(&t);
-   mp_exch(&t, b);
-   mp_clear(&t);
+   mp_clamp(b_);
+
+   if (b_ == &tmp) {
+      mp_clear(b);
+      *b = *b_;
+   }
+
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_sqr_comba.c b/s_mp_sqr_comba.c
diff --git a/tommath_class.h b/tommath_class.h
diff --git a/tommath_private.h b/tommath_private.h