lift comba limit for s_mp_mul_comba

minad · minad · commit c421004becd3 · 2019-11-10T16:18:33.000+01:00
this is how it is done in tfm
diff --git a/demo/test.c b/demo/test.c
@@ -1895,7 +1895,7 @@ static int test_s_mp_mul_balance(void)
    return EXIT_FAILURE;
 }
 
-#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)
+#define s_mp_mul_full(a, b, c) s_mp_mul_comba(a, b, c, (a)->used + (b)->used + 1)
 static int test_s_mp_mul_karatsuba(void)
 {
    mp_int a, b, c, d;
diff --git a/etc/tune.c b/etc/tune.c
@@ -60,11 +60,7 @@ static int s_offset = 1;
 
 static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   if (MP_HAS(S_MP_MUL_HIGH_COMBA)
-       && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
-      return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
-   }
-   return s_mp_mul(a, b, c, a->used + b->used + 1);
+   return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
 }
 
 static uint64_t s_time_mul(int size)
diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
@@ -868,10 +868,6 @@
 			RelativePath="s_mp_montgomery_reduce_comba.c"
 			>
 		</File>
-		<File
-			RelativePath="s_mp_mul.c"
-			>
-		</File>
 		<File
 			RelativePath="s_mp_mul_balance.c"
 			>
diff --git a/makefile b/makefile
@@ -46,7 +46,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
 mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
 s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
 s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
-s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.mingw b/makefile.mingw
@@ -48,7 +48,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
 mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
 s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
 s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
-s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.msvc b/makefile.msvc
@@ -41,7 +41,7 @@ mp_sqrmod.obj mp_sqrt.obj mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod
 mp_to_ubin.obj mp_ubin_size.obj mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \
 s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_get_bit.obj \
 s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj s_mp_montgomery_reduce_comba.obj \
-s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
+s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
 s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_rand_jenkins.obj \
 s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj \
 s_mp_zero_buf.obj s_mp_zero_digs.obj
diff --git a/makefile.shared b/makefile.shared
@@ -43,7 +43,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
 mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
 s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
 s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
-s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/makefile.unix b/makefile.unix
@@ -49,7 +49,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
 mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
 s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
 s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
-s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
+s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
 s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
 s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
 s_mp_zero_buf.o s_mp_zero_digs.o
diff --git a/mp_mul.c b/mp_mul.c
@@ -46,11 +46,8 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
    } else if (MP_HAS(S_MP_MUL_KARATSUBA) &&
               (min >= MP_MUL_KARATSUBA_CUTOFF)) {
       err = s_mp_mul_karatsuba(a, b, c);
-   } else if (MP_HAS(S_MP_MUL_COMBA) && /* can we use the fast multiplier? */
-              (min <= MP_MAX_COMBA)) {
+   } else if (MP_HAS(S_MP_MUL_COMBA)) {
       err = s_mp_mul_comba(a, b, c, digs);
-   } else if (MP_HAS(S_MP_MUL)) {
-      err = s_mp_mul(a, b, c, digs);
    } else {
       err = MP_VAL;
    }
diff --git a/mp_reduce.c b/mp_reduce.c
@@ -40,12 +40,7 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
    if ((err = mp_mod_2d(x, MP_DIGIT_BIT * (um + 1), x)) != MP_OKAY) goto LBL_ERR;
 
    /* q = q * m mod b**(k+1), quick (no division) */
-   if (MP_HAS(S_MP_MUL_COMBA)
-       && (MP_MIN(q.used, m->used) < MP_MAX_COMBA)) {
-      if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY)     goto LBL_ERR;
-   } else {
-      if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY)           goto LBL_ERR;
-   }
+   if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY)        goto LBL_ERR;
 
    /* x = x - q */
    if ((err = mp_sub(x, &q, x)) != MP_OKAY)                         goto LBL_ERR;
diff --git a/s_mp_mul.c b/s_mp_mul.c
diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c
@@ -23,7 +23,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
    int      oldused, pa, ix;
    mp_err   err;
-   mp_word  W;
+   mp_digit c0, c1, c2;
    mp_int   tmp, *c_;
 
    /* prepare the destination */
@@ -38,7 +38,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
    pa = MP_MIN(digs, a->used + b->used);
 
    /* clear the carry */
-   W = 0;
+   c0 = c1 = c2 = 0;
    for (ix = 0; ix < pa; ix++) {
       int tx, ty, iy, iz;
 
@@ -51,16 +51,59 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
        */
       iy = MP_MIN(a->used-tx, ty+1);
 
-      /* execute loop */
-      for (iz = 0; iz < iy; ++iz) {
-         W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
+      /* execute loop
+       *
+       * Give the autovectorizer a hint! This might not be necessary.
+       * I don't think the generated code will be particularly good here;
+       * if we switch to full-width digits the masks will go away.
+       */
+      for (iz = 0; iz + 3 < iy;) {
+         mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+      }
+
+      /* execute rest of loop */
+      for (; iz < iy;) {
+         mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
       }
 
       /* store term */
-      c_->dp[ix] = (mp_digit)W & MP_MASK;
+      c_->dp[ix] = c0;
 
       /* make next carry */
-      W = W >> (mp_word)MP_DIGIT_BIT;
+      c0 = c1;
+      c1 = c2;
+      c2 = 0;
    }
 
    /* setup dest */
diff --git a/tommath_class.h b/tommath_class.h
@@ -150,7 +150,6 @@
 #   define S_MP_LOG_D_C
 #   define S_MP_LOG_POW2_C
 #   define S_MP_MONTGOMERY_REDUCE_COMBA_C
-#   define S_MP_MUL_C
 #   define S_MP_MUL_BALANCE_C
 #   define S_MP_MUL_COMBA_C
 #   define S_MP_MUL_HIGH_C
@@ -542,7 +541,6 @@
 
 #if defined(MP_MUL_C)
 #   define S_MP_MUL_BALANCE_C
-#   define S_MP_MUL_C
 #   define S_MP_MUL_COMBA_C
 #   define S_MP_MUL_KARATSUBA_C
 #   define S_MP_MUL_TOOM_C
@@ -737,7 +735,6 @@
 #   define MP_RSHD_C
 #   define MP_SET_C
 #   define MP_SUB_C
-#   define S_MP_MUL_C
 #   define S_MP_MUL_COMBA_C
 #   define S_MP_MUL_HIGH_C
 #   define S_MP_MUL_HIGH_COMBA_C
@@ -1125,14 +1122,6 @@
 #   define S_MP_ZERO_DIGS_C
 #endif
 
-#if defined(S_MP_MUL_C)
-#   define MP_CLAMP_C
-#   define MP_CLEAR_C
-#   define MP_GROW_C
-#   define MP_INIT_SIZE_C
-#   define S_MP_ZERO_DIGS_C
-#endif
-
 #if defined(S_MP_MUL_BALANCE_C)
 #   define MP_ADD_C
 #   define MP_CLAMP_C
diff --git a/tommath_private.h b/tommath_private.h
@@ -175,7 +175,6 @@ MP_PRIVATE mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c) MP_WU
 MP_PRIVATE mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_log(const mp_int *a, uint32_t base, uint32_t *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
-MP_PRIVATE mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_balance(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
 MP_PRIVATE mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;

Original file line number	Diff line number	Diff line change
`@@ -1895,7 +1895,7 @@ static int test_s_mp_mul_balance(void)`
`1895`	`1895`	`return EXIT_FAILURE;`
`1896`	`1896`	`}`
`1897`	`1897`
`1898`		`-#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)`
	`1898`	`+#define s_mp_mul_full(a, b, c) s_mp_mul_comba(a, b, c, (a)->used + (b)->used + 1)`
`1899`	`1899`	`static int test_s_mp_mul_karatsuba(void)`
`1900`	`1900`	`{`
`1901`	`1901`	`mp_int a, b, c, d;`
Original file line number	Diff line number	Diff line change
`@@ -60,11 +60,7 @@ static int s_offset = 1;`
`60`	`60`
`61`	`61`	`static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c)`
`62`	`62`	`{`
`63`		`- if (MP_HAS(S_MP_MUL_HIGH_COMBA)`
`64`		`- && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {`
`65`		`- return s_mp_mul_comba(a, b, c, a->used + b->used + 1);`
`66`		`- }`
`67`		`- return s_mp_mul(a, b, c, a->used + b->used + 1);`
	`63`	`+ return s_mp_mul_comba(a, b, c, a->used + b->used + 1);`
`68`	`64`	`}`
`69`	`65`
`70`	`66`	`static uint64_t s_time_mul(int size)`
Original file line number	Diff line number	Diff line change
`@@ -868,10 +868,6 @@`
`868`	`868`	`RelativePath="s_mp_montgomery_reduce_comba.c"`
`869`	`869`	`>`
`870`	`870`	`</File>`
`871`		`- <File`
`872`		`- RelativePath="s_mp_mul.c"`
`873`		`- >`
`874`		`- </File>`
`875`	`871`	`<File`
`876`	`872`	`RelativePath="s_mp_mul_balance.c"`
`877`	`873`	`>`