1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* NOTE(review): this file is a fragmentary extraction -- each line keeps
   its original line number fused at the start, and many intermediate
   lines (braces, declarations, conditionals) are missing.  The comments
   below describe only what the visible lines establish. */
25 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26 the size is returned (if inputs are non-normalized, result may be
27 non-normalized too). Temporary space needed is M->n + n.
30 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
35 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
/* First matrix column: r = u00 a + u10 b.  mpn_mul requires its larger
   operand first; the two call pairs below are presumably the two
   branches of a comparison of M->n against n, whose conditional lines
   are missing from this extraction -- TODO confirm against full file. */
48 mpn_mul (tp, M->p[0][0], M->n, ap, n);
49 mpn_mul (rp, M->p[1][0], M->n, bp, n);
53 mpn_mul (tp, ap, n, M->p[0][0], M->n);
54 mpn_mul (rp, bp, n, M->p[1][0], M->n);
/* Accumulate the two products of the first column; ah receives the
   carry limb out of mpn_add_n. */
57 ah = mpn_add_n (rp, rp, tp, n + M->n);
/* Second matrix column: b = u01 a + u11 b, with the same (missing)
   operand-order split as above. */
61 mpn_mul (tp, M->p[1][1], M->n, bp, n);
62 mpn_mul (bp, M->p[0][1], M->n, ap, n);
66 mpn_mul (tp, bp, n, M->p[1][1], M->n);
67 mpn_mul (bp, ap, n, M->p[0][1], M->n);
69 bh = mpn_add_n (bp, bp, tp, n + M->n);
/* Strip common high zero limbs before returning the size.  NOTE(review):
   n here presumably already includes M->n (+ carry) -- the size update
   is among the missing lines; verify against the full source. */
81 while ( (rp[n-1] | bp[n-1]) == 0)
/* Scratch (in limbs) needed by compute_v below. */
88 #define COMPUTE_V_ITCH(n) (2*(n) + 1)
90 /* Computes |v| = |(g - u a)| / b, where u may be positive or
91 negative, and v is of the opposite sign. a, b are of size n, u and
92 v at most size n, and v must have space for n+1 limbs. */
/* NOTE(review): the function's return-type/name line and the vp
   parameter line are missing from this extraction; the visible
   parameter lines are: */
95 mp_srcptr ap, mp_srcptr bp, mp_size_t n,
96 mp_srcptr gp, mp_size_t gn,
97 mp_srcptr up, mp_size_t usize,
/* Drop high zero limbs of a, then form t = |u| * a, larger operand
   first (the comparison selecting between the two mpn_mul orderings is
   among the missing lines). */
113 MPN_NORMALIZE (ap, an);
116 mpn_mul (tp, ap, an, up, size);
118 mpn_mul (tp, up, size, ap, an);
/* Product of an an-limb and a size-limb number has at most an+size
   limbs; trim one possible leading zero. */
121 size -= tp[size - 1] == 0;
/* Sign split (the selecting 'if' is missing): when u > 0,
   |v| = (u a - g) / b; subtraction cannot borrow since b | (u a - g). */
127 /* |v| = -v = (u a - g) / b */
129 ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
130 MPN_NORMALIZE (tp, size);
136 /* |v| = v = (c - u a) / b = (c + |u| a) / b */
137 mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
142 /* Now divide t / b. There must be no remainder */
144 MPN_NORMALIZE (bp, bn);
148 ASSERT (vn <= n + 1);
/* Exact division (no remainder by construction); trim a possible
   leading zero limb of the quotient. */
150 mpn_divexact (vp, tp, size, bp, bn);
151 vn -= (vp[vn-1] == 0);
156 /* Temporary storage:
158 Initial division: Quotient of at most an - n + 1 <= an limbs.
160 Storage for u0 and u1: 2(n+1).
162 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
164 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
166 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
168 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
170 For the lehmer call after the loop, let T denote
171 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
172 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
173 for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
174 sufficient for both operations.
178 /* Optimal choice of p seems difficult. In each iteration the division
179 * of work between hgcd and the updates of u0 and u1 depends on the
180 * current size of the u. It may be desirable to use a different
181 * choice of p in each iteration. Also the input size seems to matter;
182 * choosing p = n / 3 in the first iteration seems to improve
183 * performance slightly for input size just above the threshold, but
184 * degrade performance for larger inputs. */
/* Split point passed to mpn_hgcd: _1 is used for the first call in
   mpn_gcdext, _2 for the calls inside the reduction loop. */
185 #define CHOOSE_P_1(n) ((n) / 2)
186 #define CHOOSE_P_2(n) ((n) / 3)
/* NOTE(review): fragmentary extraction -- each line keeps its fused
   original line number and many intermediate lines (braces, variable
   declarations, else-branches, gotos) are missing.  Comments describe
   only what the visible lines establish.
   mpn_gcdext: stores gcd(A,B) at gp and a cofactor u at up, with u's
   signed size written through usizep.  Inputs are {ap,an} and {bp,n};
   an >= n is apparently assumed by the initial division below --
   TODO confirm against the full source. */
189 mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
190 mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
194 mp_size_t matrix_scratch;
195 mp_size_t ualloc = n + 1;
/* Scratch accounting: talloc is grown to the max of the several
   per-phase requirements computed below (the assignments taking the
   max are among the missing lines). */
210 /* FIXME: Check for small sizes first, before setting up temporary
212 talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
214 /* For initial division */
215 scratch = an - n + 1;
216 if (scratch > talloc)
219 if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
222 mp_size_t hgcd_scratch;
223 mp_size_t update_scratch;
224 mp_size_t p1 = CHOOSE_P_1 (n);
225 mp_size_t p2 = CHOOSE_P_2 (n);
/* The loop uses both split choices; size scratch for the worst case
   of either. */
226 mp_size_t min_p = MIN(p1, p2);
227 mp_size_t max_p = MAX(p1, p2);
228 matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
229 hgcd_scratch = mpn_hgcd_itch (n - min_p);
230 update_scratch = max_p + n - 1;
232 scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
233 if (scratch > talloc)
236 /* Final mpn_gcdext_lehmer_n call. Need space for u and for
237 copies of a and b. */
238 scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
239 + 3*GCDEXT_DC_THRESHOLD;
241 if (scratch > talloc)
244 /* Cofactors u0 and u1 */
248 tp = TMP_ALLOC_LIMBS(talloc);
/* Initial reduction: divide a by b; quotient goes to tp (discarded),
   remainder overwrites ap in place. */
252 mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
/* b | a  ==>  gcd is b itself (cofactor handling is in missing lines). */
254 if (mpn_zero_p (ap, n))
256 MPN_COPY (gp, bp, n);
/* Small operands: delegate entirely to the Lehmer-range routine. */
263 if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
265 mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
/* Cofactor accumulators, each ualloc = n+1 limbs, zero-initialized. */
271 MPN_ZERO (tp, 2*ualloc);
272 u0 = tp; tp += ualloc;
273 u1 = tp; tp += ualloc;
276 /* For the first hgcd call, there are no u updates, and it makes
277 some sense to use a different choice for p. */
279 /* FIXME: We could trim use of temporary storage, since u0 and u1
280 are not used yet. For the hgcd call, we could swap in the u0
281 and u1 pointers for the relevant matrix elements. */
/* First HGCD step on the high n-p limbs, with p = n/2. */
283 struct hgcd_matrix M;
284 mp_size_t p = CHOOSE_P_1 (n);
287 mpn_hgcd_matrix_init (&M, n - p, tp);
288 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
/* Success path (the selecting 'if (nn > 0)' is presumably among the
   missing lines -- TODO confirm). */
291 ASSERT (M.n <= (n - p - 1)/2);
292 ASSERT (M.n + p <= (p + n - 1) / 2);
294 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
/* Apply M^-1 to the full a, b; n becomes the reduced size. */
295 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
/* Seed the cofactors from the matrix bottom row. */
297 MPN_COPY (u0, M.p[1][0], M.n);
298 MPN_COPY (u1, M.p[1][1], M.n);
300 while ( (u0[un-1] | u1[un-1] ) == 0)
305 /* mpn_hgcd has failed. Then either one of a or b is very
306 small, or the difference is very small. Perform one
307 subtraction followed by one division. */
309 mp_size_t updated_un = 1;
313 /* Temporary storage 2n + 1 */
314 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
315 u0, u1, &updated_un, tp, tp + n);
323 ASSERT (un < ualloc);
/* Main reduction loop: repeat HGCD + cofactor update until the
   operands drop below the divide-and-conquer threshold. */
327 while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
329 struct hgcd_matrix M;
330 mp_size_t p = CHOOSE_P_2 (n);
333 mpn_hgcd_matrix_init (&M, n - p, tp);
334 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
339 t0 = tp + matrix_scratch;
340 ASSERT (M.n <= (n - p - 1)/2);
341 ASSERT (M.n + p <= (p + n - 1) / 2);
343 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
344 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
346 /* By the same analysis as for mpn_hgcd_matrix_mul */
347 ASSERT (M.n + un <= ualloc);
349 /* FIXME: This copying could be avoided by some swapping of
350 * pointers. May need more temporary storage, though. */
351 MPN_COPY (t0, u0, un);
353 /* Temporary storage ualloc */
/* Update cofactors: (u0;u1) <- (u0;u1) M, via the helper above. */
354 un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
356 ASSERT (un < ualloc);
357 ASSERT ( (u0[un-1] | u1[un-1]) > 0);
361 /* mpn_hgcd has failed. Then either one of a or b is very
362 small, or the difference is very small. Perform one
363 subtraction followed by one division. */
365 mp_size_t updated_un = un;
367 /* Temporary storage 2n + 1 */
368 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
369 u0, u1, &updated_un, tp, tp + n);
377 ASSERT (un < ualloc);
/* Special case a == b: gcd is a, and the smaller of the accumulated
   cofactors (+u1 or -u0) is returned (sign handling in missing lines). */
381 if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
383 /* Must return the smallest cofactor, +u1 or -u0 */
386 MPN_COPY (gp, ap, n);
388 MPN_CMP (c, u0, u1, un);
392 MPN_NORMALIZE (u0, un);
393 MPN_COPY (up, u0, un);
398 MPN_NORMALIZE_NOT_ZERO (u1, un);
399 MPN_COPY (up, u1, un);
/* u0 == 0: the accumulated transformation is trivial, so the Lehmer
   routine's cofactor is already the answer. */
406 else if (mpn_zero_p (u0, un))
412 /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
413 gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
420 /* We have A = ... a + ... b
428 |u0|, |u1| <= B / min(a, b)
430 Compute g = u a + v b = (u u1 - v u0) A + (...) B
431 Here, u, v are bounded by
/* General case: run Lehmer on copies of the reduced a, b, then combine
   its cofactor with the accumulated u0, u1. */
447 lehmer_up = tp; tp += n;
449 /* Call mpn_gcdext_lehmer_n with copies of a and b. */
450 MPN_COPY (tp, ap, n);
451 MPN_COPY (tp + n, bp, n);
452 gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
455 MPN_NORMALIZE (u0, u0n);
/* lehmer u == 0 branch (selecting 'if' among missing lines). */
458 /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
459 MPN_COPY (up, u0, u0n);
467 /* Compute v = (g - u a) / b */
468 lehmer_vn = compute_v (lehmer_vp,
469 ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
/* Negative lehmer_un encodes a negative cofactor; take magnitude
   (the sign flag handling is in missing lines). */
475 lehmer_un = -lehmer_un;
480 MPN_NORMALIZE (u1, u1n);
482 /* It's possible that u0 = 1, u1 = 0 */
488 /* u1 == 0 ==> u u1 + v u0 = v */
489 MPN_COPY (up, lehmer_vp, lehmer_vn);
490 *usizep = negate ? lehmer_vn : - lehmer_vn;
496 ASSERT (lehmer_un + u1n <= ualloc);
497 ASSERT (lehmer_vn + u0n <= ualloc);
499 /* Now u0, u1, u are non-zero. We may still have v == 0 */
/* up = u * u1, larger operand first for mpn_mul. */
502 if (lehmer_un <= u1n)
503 /* Should be the common case */
504 mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
506 mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
508 un = u1n + lehmer_un;
509 un -= (up[un - 1] == 0);
515 /* Overwrites old u1 value */
/* u1 = v * u0, same operand-order handling. */
516 if (lehmer_vn <= u0n)
517 /* Should be the common case */
518 mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
520 mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
522 u1n = u0n + lehmer_vn;
523 u1n -= (u1[u1n - 1] == 0);
/* Final cofactor = u*u1 + v*u0; the size comparison choosing which
   mpn_add ordering to use is among the missing lines. */
527 cy = mpn_add (up, up, un, u1, u1n);
531 cy = mpn_add (up, u1, u1n, up, un);
537 ASSERT (un < ualloc);
/* Report the cofactor's signed size. */
539 *usizep = negate ? -un : un;