1 /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2 size. Or more accurately, bn <= an < (4/3)bn.
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
12 This file is part of the GNU MP Library.
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 License for more details.
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
31 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
32 avoided by instead reusing the pp area and the scratch area.
33 2. Use new toom functions for the recursive calls.
39 /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
41 <-s--><--n--><--n--><--n-->
42 ____ ______ ______ ______
43 |_a3_|___a2_|___a1_|___a0_|
44 |b3_|___b2_|___b1_|___b0_|
45 <-t-><--n--><--n--><--n-->
47 v0 = a0 * b0 # A(0)*B(0)
48 v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3
49 vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1
50 v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14
51 vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14
52 vmh = (8a0-4a1+2a2- a3)*(8b0-4b1+2b2- b3) # A(-1/2)*B(-1/2) -4<=ah<=9 -4<=bh<=9
53 vinf= a3 * b3 # A(inf)*B(inf)
/* Compile-time pruning of the dispatch in TOOM44_MUL_N_REC: each MAYBE_*
   flag says whether the corresponding algorithm can ever be selected for
   the recursive operand sizes reached from toom44.  When building the
   tune program, thresholds are variable, so every branch must stay
   reachable (all flags forced to 1).
   NOTE(review): the #else separating the two groups of definitions and
   the closing #endif are elided from this excerpt. */
56 #if TUNE_PROGRAM_BUILD
57 #define MAYBE_mul_basecase 1
58 #define MAYBE_mul_toom22 1
59 #define MAYBE_mul_toom44 1
/* Normal build: a branch whose threshold test can never be true for the
   sizes recursed into (roughly n/4 of the outer size) compiles away. */
61 #define MAYBE_mul_basecase \
62 (MUL_TOOM44_THRESHOLD < 4 * MUL_KARATSUBA_THRESHOLD)
63 #define MAYBE_mul_toom22 \
64 (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
65 #define MAYBE_mul_toom44 \
66 (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
/* Balanced n x n recursive multiplication used for the seven pointwise
   products: p[0..2n-1] := a * b, with ws as scratch.  Dispatches on n to
   basecase, Karatsuba, Toom-3 or (recursively) Toom-4; the MAYBE_* flags
   let branches that can never trigger be discarded at compile time.
   NOTE(review): the do { ... } while (0) wrapper lines and the final
   `else` line before the mpn_toom44_mul call are elided from this
   excerpt. */
69 #define TOOM44_MUL_N_REC(p, a, b, n, ws) \
71 if (MAYBE_mul_basecase \
72 && BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
73 mpn_mul_basecase (p, a, n, b, n); \
74 else if (MAYBE_mul_toom22 \
75 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
76 mpn_kara_mul_n (p, a, b, n, ws); \
77 else if (! MAYBE_mul_toom44 \
78 || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \
79 mpn_toom3_mul_n (p, a, b, n, ws); \
81 mpn_toom44_mul (p, a, n, b, n, ws); \
/* Toom-4 multiplication: pp[0..an+bn-1] := {ap,an} * {bp,bn}, with
   bn <= an < (4/3)bn (see file header).  Each operand is split into four
   pieces of n limbs (the top pieces holding s resp. t limbs); both are
   evaluated at the seven points -1, -1/2, 0, +1/2, +1, +2, +inf, the
   seven pointwise products are computed recursively, and the result is
   recovered by mpn_toom_interpolate_7pts.
   NOTE(review): this excerpt elides many original lines — TMP_DECL/
   TMP_MARK, the computation of n, s, t, the gp/hp scratch pointers, the
   scratch parameter, flags initialization, and various #else/#endif and
   brace lines.  Comments below describe only what is visible. */
85 mpn_toom44_mul (mp_ptr pp,
86 mp_srcptr ap, mp_size_t an,
87 mp_srcptr bp, mp_size_t bn,
/* Evaluated operand values: xs1 = X(1), xsm1 = |X(-1)|, xs2 = X(2),
   xsh = 8*X(1/2), xsmh = |8*X(-1/2)|.  Signs of the two possibly
   negative values are tracked in flags. */
93 mp_ptr as1, asm1, as2, ash, asmh;
94 mp_ptr bs1, bsm1, bs2, bsh, bsmh;
95 enum toom4_flags flags;
/* Aliases for the upper operand pieces.  (The a0, a1, b0, b1 defines
   are elided from this excerpt; presumably ap + 0*n etc. — confirm
   against the full file.) */
100 #define a2 (ap + 2*n)
101 #define a3 (ap + 3*n)
104 #define b2 (bp + 2*n)
105 #define b3 (bp + 3*n)
/* The top pieces must be nonempty and at most n limbs. */
114 ASSERT (0 < s && s <= n);
115 ASSERT (0 < t && t <= n);
/* Each evaluated value needs n limbs plus one limb for the carry-out
   (bounded by the ASSERTs further down). */
119 as1 = TMP_SALLOC_LIMBS (n + 1);
120 asm1 = TMP_SALLOC_LIMBS (n + 1);
121 as2 = TMP_SALLOC_LIMBS (n + 1);
122 ash = TMP_SALLOC_LIMBS (n + 1);
123 asmh = TMP_SALLOC_LIMBS (n + 1);
125 bs1 = TMP_SALLOC_LIMBS (n + 1);
126 bsm1 = TMP_SALLOC_LIMBS (n + 1);
127 bs2 = TMP_SALLOC_LIMBS (n + 1);
128 bsh = TMP_SALLOC_LIMBS (n + 1);
129 bsmh = TMP_SALLOC_LIMBS (n + 1);
/* as1 = A(1) = (a0+a2) + (a1+a3); asm1 = |A(-1)| = |(a0+a2) - (a1+a3)|.
   gp holds the even part, hp the odd part; when the odd part is larger
   the subtraction is flipped and the sign recorded in flags. */
136 /* Compute as1 and asm1. */
137 gp[n] = mpn_add_n (gp, a0, a2, n);
138 hp[n] = mpn_add (hp, a1, n, a3, s);
139 #if HAVE_NATIVE_mpn_addsub_n
/* Fused add/sub variant: one pass produces both as1 and asm1. */
140 if (mpn_cmp (gp, hp, n + 1) < 0)
142 mpn_addsub_n (as1, asm1, hp, gp, n + 1);
143 flags ^= toom4_w3_neg;
147 mpn_addsub_n (as1, asm1, gp, hp, n + 1);
/* Fallback: separate add and (sign-adjusted) subtract. */
150 mpn_add_n (as1, gp, hp, n + 1);
151 if (mpn_cmp (gp, hp, n + 1) < 0)
153 mpn_sub_n (asm1, hp, gp, n + 1);
154 flags ^= toom4_w3_neg;
158 mpn_sub_n (asm1, gp, hp, n + 1);
/* as2 = A(2) = a0 + 2*a1 + 4*a2 + 8*a3, built by Horner's scheme:
   ((a3*2 + a2)*2 + a1)*2 + a0.  Carries are doubled along with the
   partial value at each step. */
163 #if HAVE_NATIVE_mpn_addlsh1_n
164 cy = mpn_addlsh1_n (as2, a2, a3, s);
166 cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
167 cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
168 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
/* Fallback without fused shift-add: explicit lshift then add. */
170 cy = mpn_lshift (as2, a3, s, 1);
171 cy += mpn_add_n (as2, a2, as2, s);
173 cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
174 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
175 cy += mpn_add_n (as2, a1, as2, n);
176 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
177 cy += mpn_add_n (as2, a0, as2, n);
/* ash = 8*A(1/2) = 8a0 + 4a1 + 2a2 + a3;
   asmh = |8*A(-1/2)| = |8a0 - 4a1 + 2a2 - a3|.
   Again gp holds the even part (8a0 + 2a2), hp the odd part (4a1 + a3),
   with the sign of asmh recorded in flags. */
181 /* Compute ash and asmh. */
182 cy = mpn_lshift (gp, a0, n, 3); /* 8a0 */
183 #if HAVE_NATIVE_mpn_addlsh1_n
184 gp[n] = cy + mpn_addlsh1_n (gp, gp, a2, n); /* 8a0 + 2a2 */
186 cy += mpn_lshift (hp, a2, n, 1); /* 2a2 */
187 gp[n] = cy + mpn_add_n (gp, gp, hp, n); /* 8a0 + 2a2 */
189 cy = mpn_lshift (hp, a1, n, 2); /* 4a1 */
190 hp[n] = cy + mpn_add (hp, hp, n, a3, s); /* 4a1 + a3 */
191 #if HAVE_NATIVE_mpn_addsub_n
192 if (mpn_cmp (gp, hp, n + 1) < 0)
194 mpn_addsub_n (ash, asmh, hp, gp, n + 1);
195 flags ^= toom4_w1_neg;
199 mpn_addsub_n (ash, asmh, gp, hp, n + 1);
202 mpn_add_n (ash, gp, hp, n + 1);
203 if (mpn_cmp (gp, hp, n + 1) < 0)
205 mpn_sub_n (asmh, hp, gp, n + 1);
206 flags ^= toom4_w1_neg;
210 mpn_sub_n (asmh, gp, hp, n + 1);
/* bs1 = B(1) and bsm1 = |B(-1)|: same scheme as for as1/asm1 above,
   with t playing the role of s. */
214 /* Compute bs1 and bsm1. */
215 gp[n] = mpn_add_n (gp, b0, b2, n);
216 hp[n] = mpn_add (hp, b1, n, b3, t);
217 #if HAVE_NATIVE_mpn_addsub_n
218 if (mpn_cmp (gp, hp, n + 1) < 0)
220 mpn_addsub_n (bs1, bsm1, hp, gp, n + 1);
221 flags ^= toom4_w3_neg;
225 mpn_addsub_n (bs1, bsm1, gp, hp, n + 1);
228 mpn_add_n (bs1, gp, hp, n + 1);
229 if (mpn_cmp (gp, hp, n + 1) < 0)
231 mpn_sub_n (bsm1, hp, gp, n + 1);
232 flags ^= toom4_w3_neg;
236 mpn_sub_n (bsm1, gp, hp, n + 1);
/* bs2 = B(2) = b0 + 2*b1 + 4*b2 + 8*b3: same Horner scheme as as2. */
241 #if HAVE_NATIVE_mpn_addlsh1_n
242 cy = mpn_addlsh1_n (bs2, b2, b3, t);
244 cy = mpn_add_1 (bs2 + t, b2 + t, n - t, cy);
245 cy = 2 * cy + mpn_addlsh1_n (bs2, b1, bs2, n);
246 cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
248 cy = mpn_lshift (bs2, b3, t, 1);
249 cy += mpn_add_n (bs2, b2, bs2, t);
251 cy = mpn_add_1 (bs2 + t, b2 + t, n - t, cy);
252 cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
253 cy += mpn_add_n (bs2, b1, bs2, n);
254 cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
255 cy += mpn_add_n (bs2, b0, bs2, n);
/* bsh = 8b0 + 4b1 + 2b2 + b3 and bsmh = |8b0 - 4b1 + 2b2 - b3|:
   same scheme as ash/asmh. */
259 /* Compute bsh and bsmh. */
260 cy = mpn_lshift (gp, b0, n, 3); /* 8b0 */
261 #if HAVE_NATIVE_mpn_addlsh1_n
262 gp[n] = cy + mpn_addlsh1_n (gp, gp, b2, n); /* 8b0 + 2b2 */
264 cy += mpn_lshift (hp, b2, n, 1); /* 2b2 */
265 gp[n] = cy + mpn_add_n (gp, gp, hp, n); /* 8b0 + 2b2 */
267 cy = mpn_lshift (hp, b1, n, 2); /* 4b1 */
268 hp[n] = cy + mpn_add (hp, hp, n, b3, t); /* 4b1 + b3 */
269 #if HAVE_NATIVE_mpn_addsub_n
270 if (mpn_cmp (gp, hp, n + 1) < 0)
272 mpn_addsub_n (bsh, bsmh, hp, gp, n + 1);
273 flags ^= toom4_w1_neg;
277 mpn_addsub_n (bsh, bsmh, gp, hp, n + 1);
280 mpn_add_n (bsh, gp, hp, n + 1);
281 if (mpn_cmp (gp, hp, n + 1) < 0)
283 mpn_sub_n (bsmh, hp, gp, n + 1);
284 flags ^= toom4_w1_neg;
288 mpn_sub_n (bsmh, gp, hp, n + 1);
/* Carry-limb bounds; these match the ah/bh bounds listed in the
   evaluation comment at the top of the file. */
292 ASSERT (as1[n] <= 3);
293 ASSERT (bs1[n] <= 3);
294 ASSERT (asm1[n] <= 1);
295 ASSERT (bsm1[n] <= 1);
296 ASSERT (as2[n] <= 14);
297 ASSERT (bs2[n] <= 14);
298 ASSERT (ash[n] <= 14);
299 ASSERT (bsh[n] <= 14);
300 ASSERT (asmh[n] <= 9);
301 ASSERT (bsmh[n] <= 9);
/* Layout of the seven pointwise products: v0, vh and vinf go directly
   into the result area pp; vm1, v2, vmh and v1 live in scratch, with
   scratch_out beyond them used by the recursive multiplications. */
303 #define v0 pp /* 2n */
304 #define v1 (scratch + 6 * n + 6) /* 2n+1 */
305 #define vm1 scratch /* 2n+1 */
306 #define v2 (scratch + 2 * n + 2) /* 2n+1 */
307 #define vinf (pp + 6 * n) /* s+t */
308 #define vh (pp + 2 * n) /* 2n+1 */
309 #define vmh (scratch + 4 * n + 4)
310 #define scratch_out (scratch + 8 * n + 8)
/* The seven pointwise products.  vinf = a3 * b3 uses the unbalanced
   mpn_mul when the top pieces differ in size (s > t). */
312 /* vm1, 2n+1 limbs */
313 TOOM44_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out); /* vm1, 2n+1 limbs */
315 TOOM44_MUL_N_REC (v2 , as2 , bs2 , n + 1, scratch_out); /* v2, 2n+1 limbs */
317 if (s > t) mpn_mul (vinf, a3, s, b3, t);
318 else TOOM44_MUL_N_REC (vinf, a3, b3, s, scratch_out); /* vinf, s+t limbs */
320 TOOM44_MUL_N_REC (v1 , as1 , bs1 , n + 1, scratch_out); /* v1, 2n+1 limbs */
322 TOOM44_MUL_N_REC (vh , ash , bsh , n + 1, scratch_out);
324 TOOM44_MUL_N_REC (vmh, asmh, bsmh, n + 1, scratch_out);
326 TOOM44_MUL_N_REC (v0 , ap , bp , n , scratch_out); /* v0, 2n limbs */
/* Interpolate the seven evaluations into the 6n + s + t limb product
   in pp; flags carries the signs of the w1/w3 odd values. */
328 mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch_out);