/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying Toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"

/* Things to work on:

   1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
      avoided by instead reusing the pp area and the scratch allocation.  */
/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
               |_b2_|___b1_|___b0_|
               <-t--><--n--><--n-->

  v0  =    a0                  *  b0           #    A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2)  #    A(1)*B(1)      ah  <= 4   bh <= 2
  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2)  #   A(-1)*B(-1)    |ah| <= 2   bh <= 1
  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2)  #    A(2)*B(2)      ah  <= 30  bh <= 6
  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2)  #  A(1/2)*B(1/2)    ah  <= 30  bh <= 6
  vmh = (16a0-8a1+4a2-2a3+  a4)*(4b0-2b1+ b2)  # A(-1/2)*B(-1/2)  -9<=ah<=20 -1<=bh<=4
  vinf=                      a4 *          b2  #  A(inf)*B(inf)
*/
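/* A(x) has degree 4 and B(x) has degree 2, so the product C(x) = A(x)*B(x)
   has degree 6 and is determined by its values at the 7 points above; the
   coefficients are recovered by mpn_toom_interpolate_7pts.  The ah/bh
   columns bound the high (n-th) limb of each evaluated operand.  The
   half-integer points are scaled by 16 resp. 4 to keep all values integral.
   Recomposition uses x = 2^(GMP_NUMB_BITS * n).  */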
void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg;
  mp_limb_t cy;
  mp_ptr gp, hp;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;
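  /* n is ceil(max(an/5, bn/3)).  Example: an = 100, bn = 60 gives
     3*an = 300 >= 5*bn = 300, so n = 1 + 99/5 = 20, s = 100 - 4*20 = 20 and
     t = 60 - 2*20 = 20; both operands split into full-size pieces.  */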
  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
  hp = pp + n + 1;
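  /* gp and hp are scratch for the evaluations; they live in the low limbs of
     the product area pp, which is free during the evaluation phase.  For the
     +-1 points, gp accumulates the even-coefficient sum and hp the odd one,
     so that A(1) = gp + hp and A(-1) = gp - hp.  */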
  /* Compute as1 and asm1.  */
  gp[n] = mpn_add_n (gp, a0, a2, n);
  gp[n] += mpn_add (gp, gp, n, a4, s);
  hp[n] = mpn_add_n (hp, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#endif
  /* Compute as2.  4*a2 is precomputed into ash here, since it is needed
     both below and for the ash/asmh fallback path.  */
#if !HAVE_NATIVE_mpn_addlsh_n
  ash[n] = mpn_lshift (ash, a2, n, 2);			/* 4a2 */
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a3, a4, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a4, s, 1);
  cy += mpn_add_n (as2, a3, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  as2[n] = cy + mpn_add_n (as2, a0, as2, n);
  mpn_add_n (as2, ash, as2, n + 1);			/* ... + 4a2 */
#endif
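  /* With mpn_addlsh1_n this is Horner's rule at x = 2:
     A(2) = (((2*a4 + a3)*2 + a2)*2 + a1)*2 + a0; the carry out of each
     n-limb step accumulates into the high limb as2[n] (bounded by 30).  */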
  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (gp, a2, a0, n, 2);			/*  4a0 + a2       */
  cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2);	/* 16a0 + 4a2 + a4 */ /* FIXME s */
  gp[n] = cy;
  cy = mpn_addlsh_n (hp, a3, a1, n, 2);			/*  4a1 + a3       */
  cy = 2 * cy + mpn_lshift (hp, hp, n, 1);		/*  8a1 + 2a3      */
  hp[n] = cy;
#else
  gp[n] = mpn_lshift (gp, a0, n, 4);			/* 16a0            */
  mpn_add (gp, gp, n + 1, a4, s);			/* 16a0 + a4       */
  mpn_add_n (gp, ash, gp, n + 1);			/* 16a0 + 4a2 + a4 */
  cy = mpn_lshift (hp, a1, n, 3);			/*  8a1            */
  cy += mpn_lshift (ash, a3, n, 1);			/*  2a3            */
  cy += mpn_add_n (hp, ash, hp, n);			/*  8a1 + 2a3      */
  hp[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#endif
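  /* ash = 16*A(1/2) and asmh = |16*A(-1/2)|; scaling by 16 clears the
     denominators, and the sign of A(-1/2) is recorded in vmh_neg.  */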
  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);			/* b0 + b2 */
#if HAVE_NATIVE_mpn_addsub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy = mpn_addsub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += cy >> 1;
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);		/* b0 + b1 + b2 */
#endif
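  /* bsm1 = |B(-1)| = |b0 - b1 + b2|; when B(-1) < 0 the sign is folded into
     vm1_neg, so only the product's overall sign needs tracking.  */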
  /* Compute bs2 (and 2*b1, which is shared with bsh below).  */
  hp[n] = mpn_lshift (hp, b1, n, 1);			/* 2b1 */

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  bs2[n] = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  bs2[t] = mpn_lshift (bs2, b2, t, 2);
  mpn_add (bs2, hp, n + 1, bs2, t + 1);
  bs2[n] += mpn_add_n (bs2, bs2, b0, n);
#endif
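  /* bs2 = B(2) = b0 + 2*b1 + 4*b2, again by Horner's rule when
     mpn_addlsh1_n is available: (2*b2 + b1)*2 + b0.  The high limb
     bs2[n] is at most 6.  */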
  /* Compute bsh and bsmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  gp[n] = mpn_addlsh_n (gp, b2, b0, n, 2);		/* 4b0 + b2 */
#else
  cy = mpn_lshift (gp, b0, n, 2);			/* 4b0      */
  gp[n] = cy + mpn_add (gp, gp, n, b2, t);		/* 4b0 + b2 */
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (bsh, bsmh, hp, gp, n + 1);
      vmh_neg ^= 1;
    }
  else
    mpn_addsub_n (bsh, bsmh, gp, hp, n + 1);
#else
  mpn_add_n (bsh, gp, hp, n + 1);			/* 4b0 + 2b1 + b2 */
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (bsmh, hp, gp, n + 1);
      vmh_neg ^= 1;
    }
  else
    mpn_sub_n (bsmh, gp, hp, n + 1);
#endif
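  /* bsh = 4*B(1/2) = 4b0 + 2b1 + b2 and bsmh = |4*B(-1/2)|; a negative
     B(-1/2) flips vmh_neg, matching the a-side sign handling above.  */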
  /* The high limbs are bounded as in the evaluation table above.  */
  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);
  ASSERT (asmh[n] <= 20);
  ASSERT (bsmh[n] <= 4);
#define v0    pp				/* 2n   */
#define v1    (scratch + 6 * n + 6)		/* 2n+1 */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 2)		/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t  */
#define vh    (pp + 2 * n)			/* 2n+1 */
#define vmh   (scratch + 4 * n + 4)		/* 2n+1 */
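/* vm1, v2, vmh and v1 occupy four consecutive (2n+2)-limb blocks of scratch;
   v0, vh and vinf are laid out in the product area pp at offsets 0, 2n and
   6n.  Everything from scratch + 8n + 8 onward is handed to the
   interpolation routine as its own workspace.  */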
  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bsm1[n] + mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */
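  /* With SMALLER_RECURSION the recursive product is n x n limbs and the
     cross terms involving the small high limbs asm1[n] and bsm1[n] are
     added in by hand; otherwise the multiply is simply done on n+1 limbs
     whenever either high limb is nonzero.  */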
  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);
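  /* mpn_mul requires its first operand to be at least as long as its
     second, hence the branch on s > t.  */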
  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  cy = 0;
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */
  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */

  mpn_mul_n (vmh, asmh, bsmh, n + 1);		/* vmh, 2n+1 limbs */

  mpn_mul_n (v0, ap, bp, n);			/* v0, 2n limbs */
  flags = vm1_neg ? toom4_w3_neg : 0;
  flags |= vmh_neg ? toom4_w1_neg : 0;

  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);

  TMP_FREE;
}