Skip to content

Commit c421004

Browse files
committed
lift comba limit for s_mp_mul_comba
this is how it is done in tfm
1 parent defc68b commit c421004

14 files changed

+59
-108
lines changed

demo/test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1895,7 +1895,7 @@ static int test_s_mp_mul_balance(void)
18951895
return EXIT_FAILURE;
18961896
}
18971897

1898-
#define s_mp_mul_full(a, b, c) s_mp_mul(a, b, c, (a)->used + (b)->used + 1)
1898+
#define s_mp_mul_full(a, b, c) s_mp_mul_comba(a, b, c, (a)->used + (b)->used + 1)
18991899
static int test_s_mp_mul_karatsuba(void)
19001900
{
19011901
mp_int a, b, c, d;

etc/tune.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,7 @@ static int s_offset = 1;
6060

6161
static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c)
6262
{
63-
if (MP_HAS(S_MP_MUL_HIGH_COMBA)
64-
&& (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
65-
return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
66-
}
67-
return s_mp_mul(a, b, c, a->used + b->used + 1);
63+
return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
6864
}
6965

7066
static uint64_t s_time_mul(int size)

libtommath_VS2008.vcproj

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -868,10 +868,6 @@
868868
RelativePath="s_mp_montgomery_reduce_comba.c"
869869
>
870870
</File>
871-
<File
872-
RelativePath="s_mp_mul.c"
873-
>
874-
</File>
875871
<File
876872
RelativePath="s_mp_mul_balance.c"
877873
>

makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
4646
mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
4747
s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
4848
s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
49-
s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
49+
s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
5050
s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
5151
s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
5252
s_mp_zero_buf.o s_mp_zero_digs.o

makefile.mingw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
4848
mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
4949
s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
5050
s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
51-
s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
51+
s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
5252
s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
5353
s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
5454
s_mp_zero_buf.o s_mp_zero_digs.o

makefile.msvc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ mp_sqrmod.obj mp_sqrt.obj mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod
4141
mp_to_ubin.obj mp_ubin_size.obj mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \
4242
s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_get_bit.obj \
4343
s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log.obj s_mp_log_d.obj s_mp_log_pow2.obj s_mp_montgomery_reduce_comba.obj \
44-
s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
44+
s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
4545
s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_rand_jenkins.obj \
4646
s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj \
4747
s_mp_zero_buf.obj s_mp_zero_digs.obj

makefile.shared

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
4343
mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
4444
s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
4545
s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
46-
s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
46+
s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
4747
s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
4848
s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
4949
s_mp_zero_buf.o s_mp_zero_digs.o

makefile.unix

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ mp_sqrmod.o mp_sqrt.o mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_r
4949
mp_to_ubin.o mp_ubin_size.o mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
5050
s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_get_bit.o \
5151
s_mp_invmod.o s_mp_invmod_odd.o s_mp_log.o s_mp_log_d.o s_mp_log_pow2.o s_mp_montgomery_reduce_comba.o \
52-
s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
52+
s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
5353
s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_rand_jenkins.o \
5454
s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o \
5555
s_mp_zero_buf.o s_mp_zero_digs.o

mp_mul.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,8 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
4646
} else if (MP_HAS(S_MP_MUL_KARATSUBA) &&
4747
(min >= MP_MUL_KARATSUBA_CUTOFF)) {
4848
err = s_mp_mul_karatsuba(a, b, c);
49-
} else if (MP_HAS(S_MP_MUL_COMBA) && /* can we use the fast multiplier? */
50-
(min <= MP_MAX_COMBA)) {
49+
} else if (MP_HAS(S_MP_MUL_COMBA)) {
5150
err = s_mp_mul_comba(a, b, c, digs);
52-
} else if (MP_HAS(S_MP_MUL)) {
53-
err = s_mp_mul(a, b, c, digs);
5451
} else {
5552
err = MP_VAL;
5653
}

mp_reduce.c

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,7 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
4040
if ((err = mp_mod_2d(x, MP_DIGIT_BIT * (um + 1), x)) != MP_OKAY) goto LBL_ERR;
4141

4242
/* q = q * m mod b**(k+1), quick (no division) */
43-
if (MP_HAS(S_MP_MUL_COMBA)
44-
&& (MP_MIN(q.used, m->used) < MP_MAX_COMBA)) {
45-
if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY) goto LBL_ERR;
46-
} else {
47-
if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY) goto LBL_ERR;
48-
}
43+
if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY) goto LBL_ERR;
4944

5045
/* x = x - q */
5146
if ((err = mp_sub(x, &q, x)) != MP_OKAY) goto LBL_ERR;

s_mp_mul.c

Lines changed: 0 additions & 64 deletions
This file was deleted.

s_mp_mul_comba.c

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
2323
{
2424
int oldused, pa, ix;
2525
mp_err err;
26-
mp_word W;
26+
mp_digit c0, c1, c2;
2727
mp_int tmp, *c_;
2828

2929
/* prepare the destination */
@@ -38,7 +38,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
3838
pa = MP_MIN(digs, a->used + b->used);
3939

4040
/* clear the carry */
41-
W = 0;
41+
c0 = c1 = c2 = 0;
4242
for (ix = 0; ix < pa; ix++) {
4343
int tx, ty, iy, iz;
4444

@@ -51,16 +51,59 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
5151
*/
5252
iy = MP_MIN(a->used-tx, ty+1);
5353

54-
/* execute loop */
55-
for (iz = 0; iz < iy; ++iz) {
56-
W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
54+
/* execute loop
55+
*
56+
* Give the autovectorizer a hint! This might not be necessary.
57+
* I don't think the generated code will be particularly good here,
58+
* if we use full-width digits, the masks will go away.
59+
*/
60+
for (iz = 0; iz + 3 < iy;) {
61+
mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
62+
c0 = (mp_digit)(w & MP_MASK);
63+
w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
64+
c1 = (mp_digit)(w & MP_MASK);
65+
c2 += (mp_digit)(w >> MP_DIGIT_BIT);
66+
++iz;
67+
68+
w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
69+
c0 = (mp_digit)(w & MP_MASK);
70+
w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
71+
c1 = (mp_digit)(w & MP_MASK);
72+
c2 += (mp_digit)(w >> MP_DIGIT_BIT);
73+
++iz;
74+
75+
w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
76+
c0 = (mp_digit)(w & MP_MASK);
77+
w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
78+
c1 = (mp_digit)(w & MP_MASK);
79+
c2 += (mp_digit)(w >> MP_DIGIT_BIT);
80+
++iz;
81+
82+
w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
83+
c0 = (mp_digit)(w & MP_MASK);
84+
w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
85+
c1 = (mp_digit)(w & MP_MASK);
86+
c2 += (mp_digit)(w >> MP_DIGIT_BIT);
87+
++iz;
88+
}
89+
90+
/* execute rest of loop */
91+
for (; iz < iy;) {
92+
mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
93+
c0 = (mp_digit)(w & MP_MASK);
94+
w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
95+
c1 = (mp_digit)(w & MP_MASK);
96+
c2 += (mp_digit)(w >> MP_DIGIT_BIT);
97+
++iz;
5798
}
5899

59100
/* store term */
60-
c_->dp[ix] = (mp_digit)W & MP_MASK;
101+
c_->dp[ix] = c0;
61102

62103
/* make next carry */
63-
W = W >> (mp_word)MP_DIGIT_BIT;
104+
c0 = c1;
105+
c1 = c2;
106+
c2 = 0;
64107
}
65108

66109
/* setup dest */

tommath_class.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@
150150
# define S_MP_LOG_D_C
151151
# define S_MP_LOG_POW2_C
152152
# define S_MP_MONTGOMERY_REDUCE_COMBA_C
153-
# define S_MP_MUL_C
154153
# define S_MP_MUL_BALANCE_C
155154
# define S_MP_MUL_COMBA_C
156155
# define S_MP_MUL_HIGH_C
@@ -542,7 +541,6 @@
542541

543542
#if defined(MP_MUL_C)
544543
# define S_MP_MUL_BALANCE_C
545-
# define S_MP_MUL_C
546544
# define S_MP_MUL_COMBA_C
547545
# define S_MP_MUL_KARATSUBA_C
548546
# define S_MP_MUL_TOOM_C
@@ -737,7 +735,6 @@
737735
# define MP_RSHD_C
738736
# define MP_SET_C
739737
# define MP_SUB_C
740-
# define S_MP_MUL_C
741738
# define S_MP_MUL_COMBA_C
742739
# define S_MP_MUL_HIGH_C
743740
# define S_MP_MUL_HIGH_COMBA_C
@@ -1125,14 +1122,6 @@
11251122
# define S_MP_ZERO_DIGS_C
11261123
#endif
11271124

1128-
#if defined(S_MP_MUL_C)
1129-
# define MP_CLAMP_C
1130-
# define MP_CLEAR_C
1131-
# define MP_GROW_C
1132-
# define MP_INIT_SIZE_C
1133-
# define S_MP_ZERO_DIGS_C
1134-
#endif
1135-
11361125
#if defined(S_MP_MUL_BALANCE_C)
11371126
# define MP_ADD_C
11381127
# define MP_CLAMP_C

tommath_private.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ MP_PRIVATE mp_err s_mp_invmod(const mp_int *a, const mp_int *b, mp_int *c) MP_WU
175175
MP_PRIVATE mp_err s_mp_invmod_odd(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
176176
MP_PRIVATE mp_err s_mp_log(const mp_int *a, uint32_t base, uint32_t *c) MP_WUR;
177177
MP_PRIVATE mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) MP_WUR;
178-
MP_PRIVATE mp_err s_mp_mul(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
179178
MP_PRIVATE mp_err s_mp_mul_balance(const mp_int *a, const mp_int *b, mp_int *c) MP_WUR;
180179
MP_PRIVATE mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;
181180
MP_PRIVATE mp_err s_mp_mul_high(const mp_int *a, const mp_int *b, mp_int *c, int digs) MP_WUR;

0 commit comments

Comments
 (0)