1 /* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
2 times as large as bn. Or more accurately, bn < an < 3bn.
4 Contributed to the GNU project by Torbjorn Granlund.
6 The idea of applying toom to unbalanced multiplication is due to Marco
7 Bodrato and Alberto Zanoni.
9 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
10 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
11 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
13 Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
15 This file is part of the GNU MP Library.
17 The GNU MP Library is free software; you can redistribute it and/or modify
18 it under the terms of the GNU Lesser General Public License as published by
19 the Free Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
25 License for more details.
27 You should have received a copy of the GNU Lesser General Public License
28 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* TODO:
   1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
   avoided by instead reusing the pp area and the scratch allocation.

   2. Apply optimizations also to mul_toom42.c.
*/
43 /* Evaluate in: -1, 0, +1, +inf
51 v0 = a0 * b0 # A(0)*B(0)
52 v1 = (a0+ a1+ a2)*(b0+ b1) # A(1)*B(1) ah <= 2 bh <= 1
53 vm1 = (a0- a1+ a2)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0
54 vinf= a2 * b1 # A(inf)*B(inf)
*/
57 #if TUNE_PROGRAM_BUILD
58 #define MAYBE_mul_toom22 1
60 #define MAYBE_mul_toom22 \
61 (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
64 #define TOOM22_MUL_N_REC(p, a, b, n, ws) \
66 if (! MAYBE_mul_toom22 \
67 || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
68 mpn_mul_basecase (p, a, n, b, n); \
70 mpn_toom22_mul (p, a, n, b, n, ws); \
74 mpn_toom32_mul (mp_ptr pp,
75 mp_srcptr ap, mp_size_t an,
76 mp_srcptr bp, mp_size_t bn,
81 #if HAVE_NATIVE_mpn_add_nc
93 #define a2 (ap + 2 * n)
97 n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
102 ASSERT (0 < s && s <= n);
103 ASSERT (0 < t && t <= n);
107 as1 = TMP_SALLOC_LIMBS (n + 1);
108 asm1 = TMP_SALLOC_LIMBS (n + 1);
110 bs1 = TMP_SALLOC_LIMBS (n + 1);
111 bsm1 = TMP_SALLOC_LIMBS (n);
115 /* Compute as1 and asm1. */
116 a0_a2[n] = mpn_add (a0_a2, a0, n, a2, s);
117 #if HAVE_NATIVE_mpn_addsub_n
118 if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
120 cy = mpn_addsub_n (as1, asm1, a1, a0_a2, n);
127 cy = mpn_addsub_n (as1, asm1, a0_a2, a1, n);
128 as1[n] = a0_a2[n] + (cy >> 1);
129 asm1[n] = a0_a2[n] - (cy & 1);
133 as1[n] = a0_a2[n] + mpn_add_n (as1, a0_a2, a1, n);
134 if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
136 mpn_sub_n (asm1, a1, a0_a2, n);
142 cy = mpn_sub_n (asm1, a0_a2, a1, n);
143 asm1[n] = a0_a2[n] - cy;
148 /* Compute bs1 and bsm1. */
151 #if HAVE_NATIVE_mpn_addsub_n
152 if (mpn_cmp (b0, b1, n) < 0)
154 cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
159 cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
163 bs1[n] = mpn_add_n (bs1, b0, b1, n);
165 if (mpn_cmp (b0, b1, n) < 0)
167 mpn_sub_n (bsm1, b1, b0, n);
172 mpn_sub_n (bsm1, b0, b1, n);
178 bs1[n] = mpn_add (bs1, b0, n, b1, t);
180 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
182 mpn_sub_n (bsm1, b1, b0, t);
183 MPN_ZERO (bsm1 + t, n - t);
188 mpn_sub (bsm1, b0, n, b1, t);
192 ASSERT (as1[n] <= 2);
193 ASSERT (bs1[n] <= 1);
194 ASSERT (asm1[n] <= 1);
195 /*ASSERT (bsm1[n] == 0); */
197 #define v0 pp /* 2n */
198 #define v1 (scratch) /* 2n+1 */
199 #define vinf (pp + 3 * n) /* s+t */
200 #define vm1 (scratch + 2 * n + 1) /* 2n+1 */
201 #define scratch_out scratch + 4 * n + 2
203 /* vm1, 2n+1 limbs */
204 TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
207 cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
210 /* vinf, s+t limbs */
211 if (s > t) mpn_mul (vinf, a2, s, b1, t);
212 else mpn_mul (vinf, b1, t, a2, s);
215 TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out);
218 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
220 else if (as1[n] == 2)
222 #if HAVE_NATIVE_mpn_addlsh1_n
223 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
225 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
231 cy += mpn_add_n (v1 + n, v1 + n, as1, n);
234 mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */
240 #if HAVE_NATIVE_mpn_rsh1add_n
241 mpn_rsh1add_n (vm1, v1, vm1, 2 * n + 1);
243 mpn_add_n (vm1, v1, vm1, 2 * n + 1);
244 mpn_rshift (vm1, vm1, 2 * n + 1, 1);
249 #if HAVE_NATIVE_mpn_rsh1sub_n
250 mpn_rsh1sub_n (vm1, v1, vm1, 2 * n + 1);
252 mpn_sub_n (vm1, v1, vm1, 2 * n + 1);
253 mpn_rshift (vm1, vm1, 2 * n + 1, 1);
257 mpn_sub_n (v1, v1, vm1, 2 * n + 1);
258 v1[2 * n] -= mpn_sub_n (v1, v1, v0, 2 * n);
/* pp[] prior to operations:
   |_H vinf|_L vinf|_______|_______|_______|

   summation scheme for remaining operations:
   |_______|_______|_______|_______|_______|
   |_Hvinf_|_Lvinf_|       |_H v0__|_L v0__|
*/
272 mpn_sub (vm1, vm1, 2 * n + 1, vinf, s + t);
273 #if HAVE_NATIVE_mpn_add_nc
274 cy = mpn_add_n (pp + n, pp + n, vm1, n);
275 cy = mpn_add_nc (pp + 2 * n, v1, vm1 + n, n, cy);
276 cy = mpn_add_nc (pp + 3 * n, pp + 3 * n, v1 + n, n, cy);
277 mpn_incr_u (pp + 3 * n, vm1[2 * n]);
278 if (LIKELY (n != s + t)) /* FIXME: Limit operand range to avoid condition */
279 mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);
281 cy2 = mpn_add_n (pp + n, pp + n, vm1, n);
282 cy = mpn_add_n (pp + 2 * n, v1, vm1 + n, n);
283 mpn_incr_u (pp + 2 * n, cy2);
284 mpn_incr_u (pp + 3 * n, cy + vm1[2 * n]);
285 cy = mpn_add_n (pp + 3 * n, pp + 3 * n, v1 + n, n);
286 if (LIKELY (n != s + t)) /* FIXME: Limit operand range to avoid condition */
287 mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);