1 /* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
2 as large as bn. Or more accurately, (5/2)bn < an < 6bn.
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6 The idea of applying toom to unbalanced multiplication is due to Marco
7 Bodrato and Alberto Zanoni.
9 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
10 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
11 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
13 Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
15 This file is part of the GNU MP Library.
17 The GNU MP Library is free software; you can redistribute it and/or modify
18 it under the terms of the GNU Lesser General Public License as published by
19 the Free Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
25 License for more details.
27 You should have received a copy of the GNU Lesser General Public License
28 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
34 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
35 avoided by instead reusing the pp area and the scratch allocation.
42 /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
44 <-s-><--n--><--n--><--n-->
45 ___ ______ ______ ______ ______ ______
46 |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
50 v0 = a0 * b0 # A(0)*B(0)
51 v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1
52 vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0
53 v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2
54 vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2
55 vmh = (32a0-16a1+8a2-4a3+ 2a4- a5)*(2b0- b1) # A(-1/2)*B(-1/2) -20<=ah<=41 0<=bh<=1
56 vinf= a5 * b1 # A(inf)*B(inf)
/* NOTE(review): this chunk is an excerpt of GMP's toom62_mul with lines
   elided (the embedded original line numbers skip): the declarations of
   s, t, cy, the a0..a5 / b0,b1 block pointers, TMP_DECL/TMP_MARK, and
   several #else/#endif partners and braces are not visible here.  Only
   comments are changed below; verify every claim against the full file.

   Multiplies {ap,an} by {bp,bn} with an nominally ~3*bn, evaluating both
   operands at -1, -1/2, 0, +1/2, +1, +2, +inf and interpolating the
   seven point products (see the evaluation table in the file header).  */
60 mpn_toom62_mul (mp_ptr pp,
61 mp_srcptr ap, mp_size_t an,
62 mp_srcptr bp, mp_size_t bn,
66 int vm1_neg, vmh_neg, bsm_neg;
69 mp_ptr as1, asm1, as2, ash, asmh;
70 mp_ptr bs1, bsm1, bs2, bsh, bsmh;
71 enum toom4_flags flags;
/* Block size: n = ceil(an/6) when an >= 3*bn, else n = ceil(bn/2).  */
83 n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);
/* s and t (declared in elided lines) are the lengths of the top blocks
   a5 and b1 respectively; both must be in (0, n].  */
88 ASSERT (0 < s && s <= n);
89 ASSERT (0 < t && t <= n);
/* Scratch for the evaluated points, n+1 limbs each; bsm1 = |b0-b1| needs
   only n limbs (no carry possible in a difference).  */
93 as1 = TMP_SALLOC_LIMBS (n + 1);
94 asm1 = TMP_SALLOC_LIMBS (n + 1);
95 as2 = TMP_SALLOC_LIMBS (n + 1);
96 ash = TMP_SALLOC_LIMBS (n + 1);
97 asmh = TMP_SALLOC_LIMBS (n + 1);
99 bs1 = TMP_SALLOC_LIMBS (n + 1);
100 bsm1 = TMP_SALLOC_LIMBS (n);
101 bs2 = TMP_SALLOC_LIMBS (n + 1);
102 bsh = TMP_SALLOC_LIMBS (n + 1);
103 bsmh = TMP_SALLOC_LIMBS (n + 1);
/* as1 = A(1) = sum of all blocks; asm1 = |A(-1)| = |even sum - odd sum|.
   a0_a2 accumulates a0+a2+a4, a1_a3 accumulates a1+a3+a5 (a5 has s limbs). */
108 /* Compute as1 and asm1. */
109 a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n);
110 a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n);
111 a1_a3[n] = mpn_add_n (a1_a3, a1, a3, n);
112 a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s);
113 #if HAVE_NATIVE_mpn_addsub_n
/* Fused add/sub: computes sum and |difference| in one pass; the branch
   ordering makes the subtraction non-negative (vm1_neg set in elided code). */
114 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
116 mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
121 mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
125 mpn_add_n (as1, a0_a2, a1_a3, n + 1);
126 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
128 mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
133 mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
/* as2 = A(2) = a0+2a1+4a2+8a3+16a4+32a5, evaluated by Horner's rule
   starting from the top block a5; carries accumulate in cy (doubled at
   each step to account for the shift).  */
139 #if HAVE_NATIVE_mpn_addlsh1_n
140 cy = mpn_addlsh1_n (as2, a4, a5, s);
142 cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
143 cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n);
144 cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
145 cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
146 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
148 cy = mpn_lshift (as2, a5, s, 1);
149 cy += mpn_add_n (as2, a4, as2, s);
151 cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
152 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
153 cy += mpn_add_n (as2, a3, as2, n);
154 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
155 cy += mpn_add_n (as2, a2, as2, n);
156 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
157 cy += mpn_add_n (as2, a1, as2, n);
158 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
159 cy += mpn_add_n (as2, a0, as2, n);
/* ash = 2^5*A(1/2) = 32a0+16a1+8a2+4a3+2a4+a5 split into even part
   (a0_a2) and odd part (a1_a3); asmh = |even - odd| = |2^5*A(-1/2)|.  */
163 /* Compute ash and asmh. */
164 #if HAVE_NATIVE_mpn_addlsh_n
165 cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2); /* 4a0 + a2 */
166 cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2); /* 16a0 + 4a2 + a4 */
167 cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */
169 cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2); /* 4a1 + a3 */
170 cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2); /* 16a1 + 4a3 + a5 */
173 cy = mpn_lshift (a0_a2, a0, n, 2); /* 4a0 */
174 cy += mpn_add_n (a0_a2, a2, a0_a2, n); /* 4a0 + a2 */
175 cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2); /* 16a0 + 4a2 */
176 cy += mpn_add_n (a0_a2, a4, a0_a2, n); /* 16a0 + 4a2 + a4 */
177 cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */
179 cy = mpn_lshift (a1_a3, a1, n, 2); /* 4a1 */
180 cy += mpn_add_n (a1_a3, a3, a1_a3, n); /* 4a1 + a3 */
181 cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2); /* 16a1 + 4a3 */
182 cy += mpn_add (a1_a3, a1_a3, n, a5, s); /* 16a1 + 4a3 + a5 */
185 #if HAVE_NATIVE_mpn_addsub_n
186 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
188 mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1);
193 mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1);
197 mpn_add_n (ash, a0_a2, a1_a3, n + 1);
198 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
200 mpn_sub_n (asmh, a1_a3, a0_a2, n + 1);
205 mpn_sub_n (asmh, a0_a2, a1_a3, n + 1);
/* bs1 = b0+b1, bsm1 = |b0-b1|.  The elided lines branch on t == n; the
   second group below (content lines 241+) handles t < n, where b1 is
   short and b0's high limbs decide the sign of bsm1.  */
210 /* Compute bs1 and bsm1. */
213 #if HAVE_NATIVE_mpn_addsub_n
214 if (mpn_cmp (b0, b1, n) < 0)
216 cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
221 cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
226 bs1[n] = mpn_add_n (bs1, b0, b1, n);
227 if (mpn_cmp (b0, b1, n) < 0)
229 mpn_sub_n (bsm1, b1, b0, n);
234 mpn_sub_n (bsm1, b0, b1, n);
241 bs1[n] = mpn_add (bs1, b0, n, b1, t);
242 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
244 mpn_sub_n (bsm1, b1, b0, t);
245 MPN_ZERO (bsm1 + t, n - t);
250 mpn_sub (bsm1, b0, n, b1, t);
257 /* Compute bs2, recycling bs1. bs2=bs1+b1 */
258 mpn_add (bs2, bs1, n + 1, b1, t);
260 /* Compute bsh and bsmh, recycling bs1 and bsm1. bsh=bs1+b0; bsmh=bsm1+b0 */
/* When bsm1 = b1-b0 (bsm_neg set in elided code), 2b0-b1 = b0-bsm1 and
   the sign may flip; otherwise bsmh = bsm1+b0 is plainly non-negative.  */
264 if (mpn_cmp (bsm1, b0, n) < 0)
267 mpn_sub_n (bsmh, b0, bsm1, n);
270 mpn_sub_n (bsmh, bsm1, b0, n);
273 bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n);
274 mpn_add (bsh, bs1, n + 1, b0, n);
/* Bounds on the carry limbs, matching the "ah"/"bh" limits documented in
   the evaluation table in the file header.  */
278 ASSERT (as1[n] <= 5);
279 ASSERT (bs1[n] <= 1);
280 ASSERT (asm1[n] <= 2);
281 /*ASSERT (bsm1[n] == 0);*/
282 ASSERT (as2[n] <= 62);
283 ASSERT (bs2[n] <= 2);
284 ASSERT (ash[n] <= 62);
285 ASSERT (bsh[n] <= 2);
286 ASSERT (asmh[n] <= 41);
287 ASSERT (bsmh[n] <= 1);
/* Layout of the seven point products: three go straight into pp, four
   into scratch; sizes in limbs noted on each line.  */
289 #define v0 pp /* 2n */
290 #define v1 (scratch + 6 * n + 6) /* 2n+1 */
291 #define vinf (pp + 6 * n) /* s+t */
292 #define vm1 scratch /* 2n+1 */
293 #define v2 (scratch + 2 * n + 2) /* 2n+1 */
294 #define vh (pp + 2 * n) /* 2n+1 */
295 #define vmh (scratch + 4 * n + 4) /* 2n+1 */
297 /* vm1, 2n+1 limbs */
298 mpn_mul_n (vm1, asm1, bsm1, n);
/* Fix up the top limb: add asm1[n] * bsm1 into the high half (asm1[n]
   is at most 2 per the ASSERT above; branches on 0/1/2 in elided lines). */
302 cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
304 else if (asm1[n] == 2)
306 #if HAVE_NATIVE_mpn_addlsh1_n
307 cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
309 cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
314 mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
316 /* vinf, s+t limbs */
/* mpn_mul requires the longer operand first, hence the s/t test.  */
317 if (s > t) mpn_mul (vinf, a5, s, b1, t);
318 else mpn_mul (vinf, b1, t, a5, s);
/* v1 = A(1)*B(1): n-limb product plus cross terms with the carry limbs
   as1[n] (<= 5) and bs1[n] (<= 1), accumulated into the high half.  */
321 mpn_mul_n (v1, as1, bs1, n);
324 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
326 else if (as1[n] == 2)
328 #if HAVE_NATIVE_mpn_addlsh1_n
329 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
331 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
334 else if (as1[n] != 0)
336 cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
341 cy += mpn_add_n (v1 + n, v1 + n, as1, n);
344 mpn_mul_n (vh, ash, bsh, n + 1);
346 mpn_mul_n (vmh, asmh, bsmh, n + 1);
348 mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */
/* Tell the interpolator which odd-point products carry a negated sign.  */
350 flags = vm1_neg ? toom4_w3_neg : 0;
351 flags |= vmh_neg ? toom4_w1_neg : 0;
353 mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);