3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
7 Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 License for more details.
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
28 /* For input of size n, matrix elements are of size at most ceil(n/2)
29 - 1, but we need two limbs extra. */
/* NOTE(review): this SOURCE is a line-numbered, partially sampled
   extraction of GMP's mpn/generic/hgcd.c.  The embedded numbering
   jumps (e.g. 33 -> 39 below), so return types, braces and several
   statements are missing from every function.  The annotations in
   this file describe only what the visible fragments show; anything
   inferred from the missing lines is marked as an assumption.  */
/* Lay out the four matrix elements M->p[i][j] as four consecutive
   stripes of s limbs each inside the caller-supplied buffer P, and
   set the diagonal elements to 1 (identity matrix).  Only the
   second-row pointer setup and the diagonal store are visible here;
   the first-row pointers (presumably p and p + s) and the zeroing of
   the element storage are among the missing lines -- TODO confirm
   against the full file.  */
31 mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
/* Stripe size: ceil(n/2) - 1 limbs per element, plus the two extra
   limbs the header comment calls for.  */
33 mp_size_t s = (n+1)/2 + 1;
39 M->p[1][0] = p + 2 * s;
40 M->p[1][1] = p + 3 * s;
/* M <- identity: low limb of both diagonal elements set to 1.  */
42 M->p[0][0][0] = M->p[1][1][0] = 1;
45 /* Updated column COL, adding in column (1-COL). */
/* Column update for a quotient of 1: M->p[*][col] += M->p[*][1-col].
   Both rows are added at the current element size M->n; the two
   carry limbs are stored just above the old size, and M->n grows by
   one exactly when at least one carry is nonzero.  */
47 hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
52 c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
53 c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);
/* Carry limbs stored one position above the old size.  */
55 M->p[0][col][M->n] = c0;
56 M->p[1][col][M->n] = c1;
/* Grow by one limb iff either row carried out.  */
58 M->n += (c0 | c1) != 0;
59 ASSERT (M->n < M->alloc);
62 /* Updated column COL, adding in column Q * (1-COL). Temporary
63 * storage: qn + n <= M->alloc, where n is the size of the largest
64 * element in column 1 - COL. */
/* General column update: M->p[*][col] += q * M->p[*][1-col], where
   the quotient is qp[0..qn-1].  Two paths are visible: a single-limb
   path using mpn_addmul_1 (presumably guarded by a missing
   `if (qn == 1)` -- TODO confirm), and a multi-limb path that
   normalizes column 1-col, multiplies with mpn_mul and adds with
   mpn_add.  TP provides scratch for the qn + n limb product.  */
66 hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
67 unsigned col, mp_ptr tp)
/* --- single-limb quotient path: q is presumably qp[0] loaded by a
   missing line.  Same carry/growth pattern as hgcd_matrix_update_1. */
76 c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
77 c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
79 M->p[0][col][M->n] = c0;
80 M->p[1][col][M->n] = c1;
82 M->n += (c0 | c1) != 0;
/* --- multi-limb quotient path below.  */
88 /* Carries for the unlikely case that we get both high words
89 from the multiplication and carries from the addition. */
93 /* The matrix will not necessarily grow in size by qn, so we
94 need normalization in order not to overflow M. */
/* Find the true (normalized) size n of the larger element in column
   1-col, scanning down from M->n; the loop body/break for the
   nonzero test is among the missing lines.  */
96 for (n = M->n; n + qn > M->n; n--)
99 if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
103 ASSERT (qn + n <= M->alloc);
105 for (row = 0; row < 2; row++)
/* Two mpn_mul calls for the two operand orders -- mpn_mul requires
   its first operand to be at least as long as its second, so these
   are presumably the branches of a missing `if (n >= qn)` test --
   TODO confirm.  */
108 mpn_mul (tp, M->p[row][1-col], n, qp, qn);
110 mpn_mul (tp, qp, qn, M->p[row][1-col], n);
112 ASSERT (n + qn >= M->n);
/* Add the q*(1-col) product into column col, keeping the carry.  */
113 c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
/* n has presumably been advanced to the new element size (missing
   lines); store the carries in the top limb position.  */
118 M->p[0][col][n-1] = c[0];
119 M->p[1][col][n-1] = c[1];
/* Normalize: drop the top limb when both stored carries were zero. */
124 n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
130 ASSERT (M->n < M->alloc);
133 /* Multiply M by M1 from the right. Since the M1 elements fit in
134 GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
135 temporary space M->n */
/* Each row (x, y) of M is replaced by (x, y) * M1 via
   mpn_hgcd_mul_matrix1_vector; TP holds a copy of the row's first
   element since the multiply overwrites it in place.  n0/n1 receive
   the per-row result sizes; the update of M->n from them (presumably
   their maximum) is among the missing lines -- TODO confirm.  */
137 hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
142 /* Could avoid copy by some swapping of pointers. */
143 MPN_COPY (tp, M->p[0][0], M->n);
144 n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
145 MPN_COPY (tp, M->p[1][0], M->n);
146 n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
148 /* Depends on zero initialization */
150 ASSERT (M->n < M->alloc);
153 /* Perform a few steps, using some of mpn_hgcd2, subtraction and
154 division. Reduces the size by almost one limb or more, but never
155 below the given size s. Return new size for a and b, or 0 if no
156 more steps are possible.
158 If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n
159 limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
160 fails, needs space for the quotient, qn <= n - s + 1 limbs, for and
161 hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
164 If N is the input size to the calling hgcd, then s = floor(N/2) +
165 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
166 < N, so N is sufficient.
170 hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
171 struct hgcd_matrix *M, mp_ptr tp)
173 struct hgcd_matrix1 M1;
175 mp_limb_t ah, al, bh, bl;
176 mp_size_t an, bn, qn;
/* --- Extract the two high "pseudo-limbs" of a and b for mpn_hgcd2.
   Three cases are visible; their guarding conditions are partly in
   missing lines: a presumably n == 2 case, a case where the top limb
   already has its high bit set (no shifting needed), and a general
   case that left-normalizes with count_leading_zeros + extraction.  */
181 mask = ap[n-1] | bp[n-1];
189 ah = ap[n-1]; al = ap[n-2];
190 bh = bp[n-1]; bl = bp[n-2];
192 else if (mask & GMP_NUMB_HIGHBIT)
194 ah = ap[n-1]; al = ap[n-2];
195 bh = bp[n-1]; bl = bp[n-2];
201 count_leading_zeros (shift, mask);
202 ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
203 al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
204 bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
205 bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
208 /* Try an mpn_hgcd2 step */
209 if (mpn_hgcd2 (ah, al, bh, bl, &M1))
211 /* Multiply M <- M * M1 */
212 hgcd_matrix_mul_1 (M, &M1, tp);
214 /* Can't swap inputs, so we need to copy. */
215 MPN_COPY (tp, ap, n);
216 /* Multiply M1^{-1} (a;b) */
217 return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
221 /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
222 bh was too small, or ah, bh were (almost) equal. Perform one
223 subtraction step (for possible cancellation of high limbs),
224 followed by one division. */
226 /* Since we must ensure that #(a-b) > s, we handle cancellation of
227 high limbs explicitly up front. (FIXME: Or is it better to just
228 subtract, normalize, and use an addition to undo if it turns out
229 the the difference is too small?) */
/* Scan down for the first limb where a and b differ, but never go at
   or below the floor size s.  */
230 for (an = n; an > s; an--)
231 if (ap[an-1] != bp[an-1])
237 /* Maintain a > b. When needed, swap a and b, and let col keep track
238 of how to update M. */
239 if (ap[an-1] > bp[an-1])
241 /* a is largest. In the subtraction step, we need to update
/* else-branch: b is largest; swap pointers so a is the larger
   operand.  The corresponding col assignments are in missing lines. */
247 MP_PTR_SWAP (ap, bp);
252 MPN_NORMALIZE (bp, bn);
256 /* We have #a, #b > s. When is it possible that #(a-b) < s? For
257 cancellation to happen, the numbers must be of the form
259 a = x + 1, 0, ..., 0, al
260 b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl
262 where al, bl denotes the least significant k limbs. If al < bl,
263 then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
264 then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
/* Detect the dangerous cancellation pattern described above.  */
266 if (ap[an-1] == bp[an-1] + 1)
/* Find k: the lowest position above which a is all zeros and b is
   all GMP_NUMB_MAX.  */
270 for (k = an-1; k > s; k--)
271 if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
274 MPN_CMP (c, ap, bp, k);
279 /* The limbs from k and up are cancelled. */
/* Subtract only the low k limbs; the borrow propagation into the
   upper part is in missing lines.  */
282 cy = mpn_sub_n (ap, ap, bp, k);
288 ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
/* Ordinary full-width subtraction path: a -= b over an limbs.  */
294 ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));
297 ASSERT (ap[an-1] > 0);
299 ASSERT (bp[bn-1] > 0);
/* Record the subtraction step (quotient 1) in the matrix.  */
301 hgcd_matrix_update_1 (M, col);
/* Presumably only when a < b after the subtraction: restore a > b,
   swapping sizes too -- the guarding comparison is in missing lines. */
305 MPN_PTR_SWAP (ap, an, bp, bn);
/* When an == bn, compare limb data to decide orientation.  */
311 MPN_CMP (c, ap, bp, an);
314 MP_PTR_SWAP (ap, bp);
322 /* FIXME: We could use an approximate division, that may return a
323 too small quotient, and only guarantee that the size of r is
324 almost the size of b. FIXME: Let ap and remainder overlap. */
/* Division step: a = q*b + r, quotient into TP, remainder replacing
   a in place.  qn is presumably set to an - bn + 1 by a missing
   line before the normalization below -- TODO confirm.  */
325 mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
326 qn -= (tp[qn -1] == 0);
328 /* Normalize remainder */
330 for ( ; an > s; an--)
336 /* Quotient is too large */
/* Undo one subtraction of b: the remainder dropped below the floor
   size s, so add b back and decrement the quotient.  */
339 cy = mpn_add (ap, bp, bn, ap, an);
349 MPN_DECR_U (tp, qn, 1);
350 qn -= (tp[qn-1] == 0);
/* Fold the quotient into the matrix; scratch starts after the
   quotient at tp + qn.  */
354 hgcd_matrix_update_q (M, tp, qn, col, tp + qn);
359 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
360 with elements of size at most (n+1)/2 - 1. Returns new size of a,
361 b, or zero if no reduction is possible. */
/* Base-case (Lehmer-style) half-GCD: repeatedly applies hgcd_step
   with floor size s = n/2 + 1.  The first call's failure handling
   and the loop around the second call are in missing lines; the
   visible structure suggests step-until-hgcd_step-returns-0 --
   TODO confirm against the full file.  */
363 mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
364 struct hgcd_matrix *M, mp_ptr tp)
/* Reduction floor: never shrink a, b to s = n/2 + 1 limbs or below. */
366 mp_size_t s = n/2 + 1;
/* Inputs must be nonzero (normalized at size n).  */
370 ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
372 nn = hgcd_step (n, ap, bp, s, M, tp);
380 nn = hgcd_step (n, ap, bp, s, M, tp);
386 /* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
387 of temporary storage (see mpn_matrix22_mul_itch). */
/* Full matrix-by-matrix product M <- M * M1 using mpn_matrix22_mul,
   followed by normalization of the common element size M->n.  The
   lines that load the post-multiply candidate size into n and store
   the normalized size back into M->n are missing from this view.  */
389 mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
394 /* About the new size of M:s elements. Since M1's diagonal elements
395 are > 0, no element can decrease. The new elements are of size
396 M->n + M1->n, one limb more or less. The computation of the
397 matrix product produces elements of size M->n + M1->n + 1. But
398 the true size, after normalization, may be three limbs smaller.
400 The reason that the product has normalized size >= M->n + M1->n -
401 2 is subtle. It depends on the fact that M and M1 can be factored
402 as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
403 M ending with a large power and M1 starting with a large power of
406 /* FIXME: Strassen multiplication gives only a small speedup. In FFT
407 multiplication range, this function could be sped up quite a lot
409 ASSERT (M->n + M1->n < M->alloc);
/* Both factors must be normalized: some element's top limb nonzero. */
411 ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
412 | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
414 ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
415 | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
417 mpn_matrix22_mul (M->p[0][0], M->p[0][1],
418 M->p[1][0], M->p[1][1], M->n,
419 M1->p[0][0], M1->p[0][1],
420 M1->p[1][0], M1->p[1][1], M1->n, tp);
422 /* Index of last potentially non-zero limb, size is one greater. */
/* At most three normalization steps, matching the "three limbs
   smaller" bound in the comment above.  */
425 n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
426 n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
427 n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
429 ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
434 /* Multiplies the least significant p limbs of (a;b) by M^-1.
435 Temporary space needed: 2 * (p + M->n)*/
/* Applies the adjoint of M (which is M^-1 up to sign, since
   det M = 1) to the low p limbs of (a;b), per the formula in the
   comment below.  t0/t1 are two (p + M->n)-limb scratch areas; t0's
   initialization (presumably t0 = tp) is among the missing lines.
   Paired mpn_mul calls again reflect the operand-length requirement
   (first operand >= second), presumably guarded by a missing size
   test -- TODO confirm.  */
437 mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
438 mp_size_t n, mp_ptr ap, mp_ptr bp,
439 mp_size_t p, mp_ptr tp)
441 /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
442 = (r11 a - r01 b; - r10 a + r00 b */
445 mp_ptr t1 = tp + p + M->n;
449 ASSERT (p + M->n < n);
451 /* First compute the two values depending on a, before overwriting a */
455 mpn_mul (t0, M->p[1][1], M->n, ap, p);
456 mpn_mul (t1, M->p[1][0], M->n, ap, p);
460 mpn_mul (t0, ap, p, M->p[1][1], M->n);
461 mpn_mul (t1, ap, p, M->p[1][0], M->n);
/* a <- r11 * a_low, written back over the low part of a, with the
   product's high part added into a's upper limbs (carry in ah).  */
465 MPN_COPY (ap, t0, p);
466 ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
/* t0 <- r01 * b_low, then a -= t0.  */
469 mpn_mul (t0, M->p[0][1], M->n, bp, p);
471 mpn_mul (t0, bp, p, M->p[0][1], M->n);
473 cy = mpn_sub (ap, ap, n, t0, p + M->n);
/* Symmetric computation for b: b <- r00 * b_low - r10 * a_low
   (the r10*a product was saved in t1 above).  */
479 mpn_mul (t0, M->p[0][0], M->n, bp, p);
481 mpn_mul (t0, bp, p, M->p[0][0], M->n);
483 MPN_COPY (bp, t0, p);
484 bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
485 cy = mpn_sub (bp, bp, n, t1, p + M->n);
/* Handle high carries from the additions; the body (presumably
   growing n by one and storing ah/bh) is in missing lines.  */
489 if (ah > 0 || bh > 0)
497 /* The subtraction can reduce the size by at most one limb. */
498 if (ap[n-1] == 0 && bp[n-1] == 0)
501 ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
505 /* Size analysis for hgcd:
507 For the recursive calls, we have n1 <= ceil(n / 2). Then the
508 storage need is determined by the storage for the recursive call
509 computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
510 (after this, the storage needed for M1 can be recycled).
512 Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
513 = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
514 and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
515 4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.
517 For the recursive call, we need S(n1) = S(ceil(n/2)).
519 S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
520 <= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
521 <= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
522 <= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
/* Scratch-space estimator for mpn_hgcd, implementing the closed-form
   bound derived in the comment above: Lehmer itch below the
   threshold, otherwise 20*ceil(n/4) + 22k plus the base-case cost,
   where k bounds the recursion depth.  */
526 mpn_hgcd_itch (mp_size_t n)
532 if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
533 return MPN_HGCD_LEHMER_ITCH (n);
535 /* Get the recursion depth. */
/* k = bit length of (n-1)/(HGCD_THRESHOLD-1), i.e. the number of
   halvings needed to reach the Lehmer base case.  */
536 nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
537 count_leading_zeros (count, nscaled);
538 k = GMP_LIMB_BITS - count;
540 return 20 * ((n+3) / 4) + 22 * k
541 + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
544 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
545 with elements of size at most (n+1)/2 - 1. Returns new size of a,
546 b, or zero if no reduction is possible. */
/* Top-level recursive half-GCD.  Structure visible in the fragments:
   (1) Lehmer base case below HGCD_THRESHOLD; (2) a first recursive
   call on the high n-p limbs accumulating directly into M, followed
   by hgcd_matrix_adjust and hgcd_step fix-ups; (3) a second
   recursive call into a local matrix M1 carved out of the scratch
   area, whose result is adjusted and folded into M with
   mpn_hgcd_matrix_mul; (4) a final hgcd_step loop.  The computation
   of the split point p (presumably n/2-ish, per the n + 2 bound in
   the comment at the adjust call) and the loop/branch headers are
   among the missing lines -- TODO confirm.  */
549 mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
550 struct hgcd_matrix *M, mp_ptr tp)
/* Reduction floor s and the size threshold n2 above which stepping
   continues before the second recursion.  */
552 mp_size_t s = n/2 + 1;
553 mp_size_t n2 = (3*n)/4 + 1;
559 /* Happens when n <= 2, a fairly uninteresting case but exercised
560 by the random inputs of the testsuite. */
563 ASSERT ((ap[n-1] | bp[n-1]) > 0);
565 ASSERT ((n+1)/2 - 1 < M->alloc);
567 if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
568 return mpn_hgcd_lehmer (ap, bp, n, M, tp);
/* First recursion: reduce the high n - p limbs, matrix goes
   straight into M.  */
571 nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
574 /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
/* Reattach the low p limbs: apply M^-1 to them and get the new
   overall size.  */
576 n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
581 /* Needs n + 1 storage */
/* Step until the size drops to n2; presumably inside a missing
   `while (n > n2)` loop with a failure break -- TODO confirm.  */
582 nn = hgcd_step (n, ap, bp, s, M, tp);
584 return success ? n : 0;
/* Second recursion, into a fresh matrix M1 placed at the start of
   the scratch area.  */
591 struct hgcd_matrix M1;
595 scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
597 mpn_hgcd_matrix_init(&M1, n - p, tp);
598 nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
601 /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
602 ASSERT (M->n + 2 >= M1.n);
604 /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
605 then either q or q + 1 is a correct quotient, and M1 will
606 start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
607 rules out the case that the size of M * M1 is much
608 smaller than the expected M->n + M1->n. */
610 ASSERT (M->n + M1.n < M->alloc);
612 /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
613 = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
614 n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
616 /* We need a bound for of M->n + M1.n. Let n be the original
619 ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
623 M.n + M1.n <= ceil(n/2) + 1
625 Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
626 amount of needed scratch space. */
/* Fold the second recursion's matrix into the accumulated M.  */
627 mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
632 /* This really is the base case */
/* Final stepping loop (header in missing lines) down toward the
   floor size s.  */
636 nn = hgcd_step (n, ap, bp, s, M, tp);
638 return success ? n : 0;