1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26 the size is returned (if inputs are non-normalized, result may be
27 non-normalized too). Temporary space needed is M->n + n.
30 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
35 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
48 mpn_mul (tp, M->p[0][0], M->n, ap, n);
49 mpn_mul (rp, M->p[1][0], M->n, bp, n);
53 mpn_mul (tp, ap, n, M->p[0][0], M->n);
54 mpn_mul (rp, bp, n, M->p[1][0], M->n);
57 ah = mpn_add_n (rp, rp, tp, n + M->n);
61 mpn_mul (tp, M->p[1][1], M->n, bp, n);
62 mpn_mul (bp, M->p[0][1], M->n, ap, n);
66 mpn_mul (tp, bp, n, M->p[1][1], M->n);
67 mpn_mul (bp, ap, n, M->p[0][1], M->n);
69 bh = mpn_add_n (bp, bp, tp, n + M->n);
81 while ( (rp[n-1] | bp[n-1]) == 0)
88 #define COMPUTE_V_ITCH(n) (2*(n) + 1)
90 /* Computes |v| = |(g - u a)| / b, where u may be positive or
91 negative, and v is of the opposite sign. a, b are of size n, u and
92 v at most size n, and v must have space for n+1 limbs. */
95 mp_srcptr ap, mp_srcptr bp, mp_size_t n,
96 mp_srcptr gp, mp_size_t gn,
97 mp_srcptr up, mp_size_t usize,
113 MPN_NORMALIZE (ap, an);
116 mpn_mul (tp, ap, an, up, size);
118 mpn_mul (tp, up, size, ap, an);
126 /* |v| = -v = (u a - g) / b */
128 ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
129 MPN_NORMALIZE (tp, size);
135 /* |v| = v = (c - u a) / b = (c + |u| a) / b */
136 mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
141 /* Now divide t / b. There must be no remainder */
143 MPN_NORMALIZE (bp, bn);
147 ASSERT (vn <= n + 1);
149 /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
151 mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
152 vn -= (vp[vn-1] == 0);
154 /* Remainder must be zero */
158 for (i = 0; i < bn; i++)
167 /* Temporary storage:
169 Initial division: Quotient of at most an - n + 1 <= an limbs.
171 Storage for u0 and u1: 2(n+1).
173 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
175 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
177 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
179 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
181 For the lehmer call after the loop, Let T denote
182 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
183 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
184 + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
188 /* Optimal choice of p seems difficult. In each iteration the division
189 * of work between hgcd and the updates of u0 and u1 depends on the
190 * current size of the u. It may be desirable to use a different
191 * choice of p in each iteration. Also the input size seems to matter;
192 * choosing p = n / 3 in the first iteration seems to improve
193 * performance slightly for input size just above the threshold, but
194 * degrade performance for larger inputs. */
195 #define CHOOSE_P_1(n) ((n) / 2)
196 #define CHOOSE_P_2(n) ((n) / 3)
199 mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
200 mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
204 mp_size_t matrix_scratch;
205 mp_size_t ualloc = n + 1;
220 /* FIXME: Check for small sizes first, before setting up temporary
222 talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
224 /* For initial division */
225 scratch = an - n + 1;
226 if (scratch > talloc)
229 if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
232 mp_size_t hgcd_scratch;
233 mp_size_t update_scratch;
234 mp_size_t p1 = CHOOSE_P_1 (n);
235 mp_size_t p2 = CHOOSE_P_2 (n);
236 mp_size_t min_p = MIN(p1, p2);
237 mp_size_t max_p = MAX(p1, p2);
238 matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
239 hgcd_scratch = mpn_hgcd_itch (n - min_p);
240 update_scratch = max_p + n - 1;
242 scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
243 if (scratch > talloc)
246 /* Final mpn_gcdext_lehmer_n call. Need space for u and for
247 copies of a and b. */
248 scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
249 + 3*GCDEXT_DC_THRESHOLD;
251 if (scratch > talloc)
254 /* Cofactors u0 and u1 */
258 tp = TMP_ALLOC_LIMBS(talloc);
262 mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
264 if (mpn_zero_p (ap, n))
266 MPN_COPY (gp, bp, n);
273 if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
275 mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
281 MPN_ZERO (tp, 2*ualloc);
282 u0 = tp; tp += ualloc;
283 u1 = tp; tp += ualloc;
286 /* For the first hgcd call, there are no u updates, and it makes
287 some sense to use a different choice for p. */
289 /* FIXME: We could trim use of temporary storage, since u0 and u1
290 are not used yet. For the hgcd call, we could swap in the u0
291 and u1 pointers for the relevant matrix elements. */
293 struct hgcd_matrix M;
294 mp_size_t p = CHOOSE_P_1 (n);
297 mpn_hgcd_matrix_init (&M, n - p, tp);
298 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
301 ASSERT (M.n <= (n - p - 1)/2);
302 ASSERT (M.n + p <= (p + n - 1) / 2);
304 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
305 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
307 MPN_COPY (u0, M.p[1][0], M.n);
308 MPN_COPY (u1, M.p[1][1], M.n);
310 while ( (u0[un-1] | u1[un-1] ) == 0)
315 /* mpn_hgcd has failed. Then either one of a or b is very
316 small, or the difference is very small. Perform one
317 subtraction followed by one division. */
319 mp_size_t updated_un = 1;
323 /* Temporary storage 2n + 1 */
324 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
325 u0, u1, &updated_un, tp, tp + n);
333 ASSERT (un < ualloc);
337 while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
339 struct hgcd_matrix M;
340 mp_size_t p = CHOOSE_P_2 (n);
343 mpn_hgcd_matrix_init (&M, n - p, tp);
344 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
349 t0 = tp + matrix_scratch;
350 ASSERT (M.n <= (n - p - 1)/2);
351 ASSERT (M.n + p <= (p + n - 1) / 2);
353 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
354 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
356 /* By the same analysis as for mpn_hgcd_matrix_mul */
357 ASSERT (M.n + un <= ualloc);
359 /* FIXME: This copying could be avoided by some swapping of
360 * pointers. May need more temporary storage, though. */
361 MPN_COPY (t0, u0, un);
363 /* Temporary storage ualloc */
364 un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
366 ASSERT (un < ualloc);
367 ASSERT ( (u0[un-1] | u1[un-1]) > 0);
371 /* mpn_hgcd has failed. Then either one of a or b is very
372 small, or the difference is very small. Perform one
373 subtraction followed by one division. */
375 mp_size_t updated_un = un;
377 /* Temporary storage 2n + 1 */
378 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
379 u0, u1, &updated_un, tp, tp + n);
387 ASSERT (un < ualloc);
391 if (mpn_zero_p (ap, n))
393 MPN_COPY (gp, bp, n);
394 MPN_NORMALIZE_NOT_ZERO (u0, un);
395 MPN_COPY (up, u0, un);
401 else if (mpn_zero_p (bp, n))
403 MPN_COPY (gp, ap, n);
404 MPN_NORMALIZE_NOT_ZERO (u1, un);
405 MPN_COPY (up, u1, un);
411 else if (mpn_zero_p (u0, un))
417 /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
418 gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
425 /* We have A = ... a + ... b
433 |u0|, |u1| <= B / min(a, b)
435 Compute g = u a + v b = (u u1 - v u0) A + (...) B
436 Here, u, v are bounded by
452 lehmer_up = tp; tp += n;
454 /* Call mpn_gcdext_lehmer_n with copies of a and b. */
455 MPN_COPY (tp, ap, n);
456 MPN_COPY (tp + n, bp, n);
457 gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
460 MPN_NORMALIZE (u0, u0n);
463 /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
464 MPN_COPY (up, u0, u0n);
472 /* Compute v = (g - u a) / b */
473 lehmer_vn = compute_v (lehmer_vp,
474 ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
480 lehmer_un = -lehmer_un;
485 MPN_NORMALIZE (u1, u1n);
487 /* It's possible that u0 = 1, u1 = 0 */
493 /* u1 == 0 ==> u u1 + v u0 = v */
494 MPN_COPY (up, lehmer_vp, lehmer_vn);
495 *usizep = negate ? lehmer_vn : - lehmer_vn;
501 ASSERT (lehmer_un + u1n <= ualloc);
502 ASSERT (lehmer_vn + u0n <= ualloc);
504 /* Now u0, u1, u are non-zero. We may still have v == 0 */
507 if (lehmer_un <= u1n)
508 /* Should be the common case */
509 mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
511 mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
513 un = u1n + lehmer_un;
514 un -= (up[un - 1] == 0);
520 /* Overwrites old u1 value */
521 if (lehmer_vn <= u0n)
522 /* Should be the common case */
523 mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
525 mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
527 u1n = u0n + lehmer_vn;
528 u1n -= (u1[u1n - 1] == 0);
532 cy = mpn_add (up, up, un, u1, u1n);
536 cy = mpn_add (up, u1, u1n, up, un);
542 ASSERT (un < ualloc);
544 *usizep = negate ? -un : un;