1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* NOTE(review): this chunk is an elided sample of the original file --
   many source lines (braces, declarations, branch conditions) are
   missing between the statements below.  Comments annotate only what
   is visible; the fragment is not compilable as shown. */
25 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26 the size is returned (if inputs are non-normalized, result may be
27 non-normalized too). Temporary space needed is M->n + n.
30 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
/* Each product appears in two operand orders: mpn_mul requires its
   first operand to be at least as long as the second, so the (elided)
   branch presumably tests M->n against n -- confirm in full file. */
35 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
48 mpn_mul (tp, M->p[0][0], M->n, ap, n);
49 mpn_mul (rp, M->p[1][0], M->n, bp, n);
53 mpn_mul (tp, ap, n, M->p[0][0], M->n);
54 mpn_mul (rp, bp, n, M->p[1][0], M->n);
57 ah = mpn_add_n (rp, rp, tp, n + M->n);
61 mpn_mul (tp, M->p[1][1], M->n, bp, n);
62 mpn_mul (bp, M->p[0][1], M->n, ap, n);
66 mpn_mul (tp, bp, n, M->p[1][1], M->n);
67 mpn_mul (bp, ap, n, M->p[0][1], M->n);
69 bh = mpn_add_n (bp, bp, tp, n + M->n);
/* Strip common high zero limbs so the returned size covers both
   results (loop body elided in this view). */
81 while ( (rp[n-1] | bp[n-1]) == 0)
/* Scratch limbs needed by compute_v below. */
88 #define COMPUTE_V_ITCH(n) (2*(n) + 1)
90 /* Computes |v| = |(g - u a)| / b, where u may be positive or
91 negative, and v is of the opposite sign. a, b are of size n, u and
92 v at most size n, and v must have space for n+1 limbs. */
/* NOTE(review): return type, leading parameters and most control flow
   are elided from this view of the file. */
95 mp_srcptr ap, mp_srcptr bp, mp_size_t n,
96 mp_srcptr gp, mp_size_t gn,
97 mp_srcptr up, mp_size_t usize,
/* Trim high zero limbs of a (adjusts the size an). */
113 MPN_NORMALIZE (ap, an);
/* t <- |u| * a; the two operand orders satisfy mpn_mul's requirement
   that the first operand be the longer one (selection elided). */
116 mpn_mul (tp, ap, an, up, size);
118 mpn_mul (tp, up, size, ap, an);
121 size -= tp[size - 1] == 0;
127 /* |v| = -v = (u a - g) / b */
129 ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
130 MPN_NORMALIZE (tp, size);
136 /* |v| = v = (c - u a) / b = (c + |u| a) / b */
137 mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
142 /* Now divide t / b. There must be no remainder */
144 MPN_NORMALIZE (bp, bn);
148 ASSERT (vn <= n + 1);
/* Exact division; the "no remainder" contract above justifies it. */
150 mpn_divexact (vp, tp, size, bp, bn);
/* Normalize: at most one high zero limb can remain. */
151 vn -= (vp[vn-1] == 0);
156 /* Temporary storage:
158 Initial division: Quotient of at most an - n + 1 <= an limbs.
160 Storage for u0 and u1: 2(n+1).
162 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
164 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
166 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
168 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
170 For the lehmer call after the loop, let T denote
171 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
172 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
173 for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
174 sufficient for both operations.
178 /* Optimal choice of p seems difficult. In each iteration the division
179 * of work between hgcd and the updates of u0 and u1 depends on the
180 * current size of the u. It may be desirable to use a different
181 * choice of p in each iteration. Also the input size seems to matter;
182 * choosing p = n / 3 in the first iteration seems to improve
183 * performance slightly for input size just above the threshold, but
184 * degrade performance for larger inputs. */
/* Split point for the first hgcd call (no cofactor updates yet). */
185 #define CHOOSE_P_1(n) ((n) / 2)
/* Split point for subsequent hgcd calls in the main loop. */
186 #define CHOOSE_P_2(n) ((n) / 3)
/* Extended GCD of A = {ap,an} and B = {bp,n}: stores g = gcd(A,B) at
   gp, a cofactor at up with signed size *usizep, and returns the size
   of g.  Uses the hgcd-based divide-and-conquer algorithm above
   GCDEXT_DC_THRESHOLD and Lehmer's algorithm below it.
   NOTE(review): many source lines are elided in this view (braces,
   declarations, branch conditions); comments annotate visible code
   only -- confirm details against the full file. */
189 mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
190 mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
194 mp_size_t matrix_scratch;
195 mp_size_t ualloc = n + 1;
/* First, compute the worst-case scratch requirement (talloc). */
210 /* FIXME: Check for small sizes first, before setting up temporary
212 talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
214 /* For initial division */
215 scratch = an - n + 1;
216 if (scratch > talloc)
219 if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
222 mp_size_t hgcd_scratch;
223 mp_size_t update_scratch;
224 mp_size_t p1 = CHOOSE_P_1 (n);
225 mp_size_t p2 = CHOOSE_P_2 (n);
226 mp_size_t min_p = MIN(p1, p2);
227 mp_size_t max_p = MAX(p1, p2);
/* Sizes below cover both split points p1 and p2 used later. */
228 matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
229 hgcd_scratch = mpn_hgcd_itch (n - min_p);
230 update_scratch = max_p + n - 1;
232 scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
233 if (scratch > talloc)
236 /* Final mpn_gcdext_lehmer_n call. Need space for u and for
237 copies of a and b. */
238 scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
239 + 3*GCDEXT_DC_THRESHOLD;
241 if (scratch > talloc)
244 /* Cofactors u0 and u1 */
248 tp = TMP_ALLOC_LIMBS(talloc);
/* Reduce A mod B so both operands have size n; quotient (at tp) is
   not needed afterwards. */
252 mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
/* B divides A: the gcd is B itself (cofactor handling elided). */
254 if (mpn_zero_p (ap, n))
256 MPN_COPY (gp, bp, n);
/* Small operands: plain Lehmer extended gcd, done. */
263 if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
265 mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
/* Divide-and-conquer path: u0, u1 accumulate cofactors of A. */
271 MPN_ZERO (tp, 2*ualloc);
272 u0 = tp; tp += ualloc;
273 u1 = tp; tp += ualloc;
276 /* For the first hgcd call, there are no u updates, and it makes
277 some sense to use a different choice for p. */
279 /* FIXME: We could trim use of temporary storage, since u0 and u1
280 are not used yet. For the hgcd call, we could swap in the u0
281 and u1 pointers for the relevant matrix elements. */
283 struct hgcd_matrix M;
284 mp_size_t p = CHOOSE_P_1 (n);
287 mpn_hgcd_matrix_init (&M, n - p, tp);
/* hgcd on the high n-p limbs; its return value nn feeds the success
   branch below (branch condition elided in this view). */
288 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
291 ASSERT (M.n <= (n - p - 1)/2);
292 ASSERT (M.n + p <= (p + n - 1) / 2);
294 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
295 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
/* Initial cofactors come straight from the matrix's second row. */
297 MPN_COPY (u0, M.p[1][0], M.n);
298 MPN_COPY (u1, M.p[1][1], M.n);
/* Normalize the shared size un of u0/u1 (loop body elided). */
300 while ( (u0[un-1] | u1[un-1] ) == 0)
305 /* mpn_hgcd has failed. Then either one of a or b is very
306 small, or the difference is very small. Perform one
307 subtraction followed by one division. */
309 mp_size_t updated_un = 1;
313 /* Temporary storage 2n + 1 */
314 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
315 u0, u1, &updated_un, tp, tp + n);
323 ASSERT (un < ualloc);
/* Main loop: repeat hgcd + cofactor update while operands are big. */
327 while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
329 struct hgcd_matrix M;
330 mp_size_t p = CHOOSE_P_2 (n);
333 mpn_hgcd_matrix_init (&M, n - p, tp);
334 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
339 t0 = tp + matrix_scratch;
340 ASSERT (M.n <= (n - p - 1)/2);
341 ASSERT (M.n + p <= (p + n - 1) / 2);
343 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
344 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
346 /* By the same analysis as for mpn_hgcd_matrix_mul */
347 ASSERT (M.n + un <= ualloc);
349 /* FIXME: This copying could be avoided by some swapping of
350 * pointers. May need more temporary storage, though. */
351 MPN_COPY (t0, u0, un);
353 /* Temporary storage ualloc */
354 un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
356 ASSERT (un < ualloc);
357 ASSERT ( (u0[un-1] | u1[un-1]) > 0);
361 /* mpn_hgcd has failed. Then either one of a or b is very
362 small, or the difference is very small. Perform one
363 subtraction followed by one division. */
365 mp_size_t updated_un = un;
367 /* Temporary storage 2n + 1 */
368 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
369 u0, u1, &updated_un, tp, tp + n);
377 ASSERT (un < ualloc);
/* a == b: that common value is the gcd; pick the smaller cofactor. */
381 if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
383 /* Must return the smallest cofactor, +u1 or -u0 */
386 MPN_COPY (gp, ap, n);
388 MPN_CMP (c, u0, u1, un);
389 /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
390 this case we choose the cofactor + 1, corresponding to G = A
391 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
392 ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
395 MPN_NORMALIZE (u0, un);
396 MPN_COPY (up, u0, un);
401 MPN_NORMALIZE_NOT_ZERO (u1, un);
402 MPN_COPY (up, u1, un);
409 else if (mpn_zero_p (u0, un))
415 /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
416 gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
/* General case: finish with Lehmer on copies of a, b, then combine
   its cofactors (u, v) with u0/u1 to express g in terms of A, B. */
423 /* We have A = ... a + ... b
431 |u0|, |u1| <= B / min(a, b)
433 Compute g = u a + v b = (u u1 - v u0) A + (...) B
434 Here, u, v are bounded by
450 lehmer_up = tp; tp += n;
452 /* Call mpn_gcdext_lehmer_n with copies of a and b. */
453 MPN_COPY (tp, ap, n);
454 MPN_COPY (tp + n, bp, n);
455 gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
458 MPN_NORMALIZE (u0, u0n);
461 /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
462 MPN_COPY (up, u0, u0n);
470 /* Compute v = (g - u a) / b */
471 lehmer_vn = compute_v (lehmer_vp,
472 ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
478 lehmer_un = -lehmer_un;
483 MPN_NORMALIZE (u1, u1n);
485 /* It's possible that u0 = 1, u1 = 0 */
491 /* u1 == 0 ==> u u1 + v u0 = v */
492 MPN_COPY (up, lehmer_vp, lehmer_vn);
493 *usizep = negate ? lehmer_vn : - lehmer_vn;
499 ASSERT (lehmer_un + u1n <= ualloc);
500 ASSERT (lehmer_vn + u0n <= ualloc);
502 /* Now u0, u1, u are non-zero. We may still have v == 0 */
/* up <- u * u1; operand order chosen so mpn_mul's first operand is
   the longer one. */
505 if (lehmer_un <= u1n)
506 /* Should be the common case */
507 mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
509 mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
511 un = u1n + lehmer_un;
512 un -= (up[un - 1] == 0);
/* u1 <- v * u0; the old u1 value is no longer needed. */
518 /* Overwrites old u1 value */
519 if (lehmer_vn <= u0n)
520 /* Should be the common case */
521 mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
523 mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
525 u1n = u0n + lehmer_vn;
526 u1n -= (u1[u1n - 1] == 0);
/* Combine the two products (larger-size operand first for mpn_add). */
530 cy = mpn_add (up, up, un, u1, u1n);
534 cy = mpn_add (up, u1, u1n, up, un);
540 ASSERT (un < ualloc);
/* Signed size encodes the cofactor's sign. */
542 *usizep = negate ? -un : un;