1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* NOTE(review): this file is a fragmentary extraction -- each line keeps
   its original line number fused at the start, and many intermediate
   lines (braces, declarations, conditionals) are missing.  The comments
   below describe only what the visible lines establish. */
25 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26 the size is returned (if inputs are non-normalized, result may be
27 non-normalized too). Temporary space needed is M->n + n.
30 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
35 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
/* First matrix column: r = u00 a + u10 b.  mpn_mul requires its larger
   operand first; the two call pairs below are presumably the two
   branches of a comparison of M->n against n, whose conditional lines
   are missing from this extraction -- TODO confirm against full file. */
48 mpn_mul (tp, M->p[0][0], M->n, ap, n);
49 mpn_mul (rp, M->p[1][0], M->n, bp, n);
53 mpn_mul (tp, ap, n, M->p[0][0], M->n);
54 mpn_mul (rp, bp, n, M->p[1][0], M->n);
/* Accumulate the two products of the first column; ah receives the
   carry limb out of mpn_add_n. */
57 ah = mpn_add_n (rp, rp, tp, n + M->n);
/* Second matrix column: b = u01 a + u11 b, with the same (missing)
   operand-order split as above. */
61 mpn_mul (tp, M->p[1][1], M->n, bp, n);
62 mpn_mul (bp, M->p[0][1], M->n, ap, n);
66 mpn_mul (tp, bp, n, M->p[1][1], M->n);
67 mpn_mul (bp, ap, n, M->p[0][1], M->n);
69 bh = mpn_add_n (bp, bp, tp, n + M->n);
/* Strip common high zero limbs before returning the size.  NOTE(review):
   n here presumably already includes M->n (+ carry) -- the size update
   is among the missing lines; verify against the full source. */
81 while ( (rp[n-1] | bp[n-1]) == 0)
/* Scratch (in limbs) needed by compute_v below. */
88 #define COMPUTE_V_ITCH(n) (2*(n) + 1)
90 /* Computes |v| = |(g - u a)| / b, where u may be positive or
91 negative, and v is of the opposite sign. a, b are of size n, u and
92 v at most size n, and v must have space for n+1 limbs. */
/* NOTE(review): the function's return-type/name line and the vp
   parameter line are missing from this extraction; the visible
   parameter lines are: */
95 mp_srcptr ap, mp_srcptr bp, mp_size_t n,
96 mp_srcptr gp, mp_size_t gn,
97 mp_srcptr up, mp_size_t usize,
/* Drop high zero limbs of a, then form t = |u| * a, larger operand
   first (the comparison selecting between the two mpn_mul orderings is
   among the missing lines). */
113 MPN_NORMALIZE (ap, an);
116 mpn_mul (tp, ap, an, up, size);
118 mpn_mul (tp, up, size, ap, an);
/* Product of an an-limb and a size-limb number has at most an+size
   limbs; trim one possible leading zero. */
121 size -= tp[size - 1] == 0;
/* Sign split (the selecting 'if' is missing): when u > 0,
   |v| = (u a - g) / b; subtraction cannot borrow since b | (u a - g). */
127 /* |v| = -v = (u a - g) / b */
129 ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
130 MPN_NORMALIZE (tp, size);
136 /* |v| = v = (c - u a) / b = (c + |u| a) / b */
137 mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
142 /* Now divide t / b. There must be no remainder */
144 MPN_NORMALIZE (bp, bn);
148 ASSERT (vn <= n + 1);
/* Exact division (no remainder by construction); trim a possible
   leading zero limb of the quotient. */
150 mpn_divexact (vp, tp, size, bp, bn);
151 vn -= (vp[vn-1] == 0);
156 /* Temporary storage:
158 Initial division: Quotient of at most an - n + 1 <= an limbs.
160 Storage for u0 and u1: 2(n+1).
162 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
164 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
166 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
168 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
170 For the lehmer call after the loop, let T denote
171 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
172 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
173 for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
174 sufficient for both operations.
178 /* Optimal choice of p seems difficult. In each iteration the division
179 * of work between hgcd and the updates of u0 and u1 depends on the
180 * current size of the u. It may be desirable to use a different
181 * choice of p in each iteration. Also the input size seems to matter;
182 * choosing p = n / 3 in the first iteration seems to improve
183 * performance slightly for input size just above the threshold, but
184 * degrade performance for larger inputs. */
/* Split point passed to mpn_hgcd: _1 is used for the first call in
   mpn_gcdext, _2 for the calls inside the reduction loop. */
185 #define CHOOSE_P_1(n) ((n) / 2)
186 #define CHOOSE_P_2(n) ((n) / 3)
/* NOTE(review): fragmentary extraction -- each line keeps its fused
   original line number and many intermediate lines (braces, variable
   declarations, else-branches, gotos) are missing.  Comments describe
   only what the visible lines establish.
   mpn_gcdext: stores gcd(A,B) at gp and a cofactor u at up, with u's
   signed size written through usizep.  Inputs are {ap,an} and {bp,n};
   an >= n is apparently assumed by the initial division below --
   TODO confirm against the full source. */
189 mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
190 mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
194 mp_size_t matrix_scratch;
195 mp_size_t ualloc = n + 1;
/* Scratch accounting: talloc is grown to the max of the several
   per-phase requirements computed below (the assignments taking the
   max are among the missing lines). */
210 /* FIXME: Check for small sizes first, before setting up temporary
212 talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
214 /* For initial division */
215 scratch = an - n + 1;
216 if (scratch > talloc)
219 if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
222 mp_size_t hgcd_scratch;
223 mp_size_t update_scratch;
224 mp_size_t p1 = CHOOSE_P_1 (n);
225 mp_size_t p2 = CHOOSE_P_2 (n);
/* The loop uses both split choices; size scratch for the worst case
   of either. */
226 mp_size_t min_p = MIN(p1, p2);
227 mp_size_t max_p = MAX(p1, p2);
228 matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
229 hgcd_scratch = mpn_hgcd_itch (n - min_p);
230 update_scratch = max_p + n - 1;
232 scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
233 if (scratch > talloc)
236 /* Final mpn_gcdext_lehmer_n call. Need space for u and for
237 copies of a and b. */
238 scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
239 + 3*GCDEXT_DC_THRESHOLD;
241 if (scratch > talloc)
244 /* Cofactors u0 and u1 */
248 tp = TMP_ALLOC_LIMBS(talloc);
/* Initial reduction: divide a by b; quotient goes to tp (discarded),
   remainder overwrites ap in place. */
252 mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
/* b | a  ==>  gcd is b itself (cofactor handling is in missing lines). */
254 if (mpn_zero_p (ap, n))
256 MPN_COPY (gp, bp, n);
/* Small operands: delegate entirely to the Lehmer-range routine. */
263 if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
265 mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
/* Cofactor accumulators, each ualloc = n+1 limbs, zero-initialized. */
271 MPN_ZERO (tp, 2*ualloc);
272 u0 = tp; tp += ualloc;
273 u1 = tp; tp += ualloc;
276 /* For the first hgcd call, there are no u updates, and it makes
277 some sense to use a different choice for p. */
279 /* FIXME: We could trim use of temporary storage, since u0 and u1
280 are not used yet. For the hgcd call, we could swap in the u0
281 and u1 pointers for the relevant matrix elements. */
/* First HGCD step on the high n-p limbs, with p = n/2. */
283 struct hgcd_matrix M;
284 mp_size_t p = CHOOSE_P_1 (n);
287 mpn_hgcd_matrix_init (&M, n - p, tp);
288 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
/* Success path (the selecting 'if (nn > 0)' is presumably among the
   missing lines -- TODO confirm). */
291 ASSERT (M.n <= (n - p - 1)/2);
292 ASSERT (M.n + p <= (p + n - 1) / 2);
294 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
/* Apply M^-1 to the full a, b; n becomes the reduced size. */
295 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
/* Seed the cofactors from the matrix bottom row. */
297 MPN_COPY (u0, M.p[1][0], M.n);
298 MPN_COPY (u1, M.p[1][1], M.n);
300 while ( (u0[un-1] | u1[un-1] ) == 0)
305 /* mpn_hgcd has failed. Then either one of a or b is very
306 small, or the difference is very small. Perform one
307 subtraction followed by one division. */
309 mp_size_t updated_un = 1;
313 /* Temporary storage 2n + 1 */
314 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
315 u0, u1, &updated_un, tp, tp + n);
323 ASSERT (un < ualloc);
/* Main reduction loop: repeat HGCD + cofactor update until the
   operands drop below the divide-and-conquer threshold. */
327 while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
329 struct hgcd_matrix M;
330 mp_size_t p = CHOOSE_P_2 (n);
333 mpn_hgcd_matrix_init (&M, n - p, tp);
334 nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
339 t0 = tp + matrix_scratch;
340 ASSERT (M.n <= (n - p - 1)/2);
341 ASSERT (M.n + p <= (p + n - 1) / 2);
343 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
344 n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
346 /* By the same analysis as for mpn_hgcd_matrix_mul */
347 ASSERT (M.n + un <= ualloc);
349 /* FIXME: This copying could be avoided by some swapping of
350 * pointers. May need more temporary storage, though. */
351 MPN_COPY (t0, u0, un);
353 /* Temporary storage ualloc */
/* Update cofactors: (u0;u1) <- (u0;u1) M, via the helper above. */
354 un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
356 ASSERT (un < ualloc);
357 ASSERT ( (u0[un-1] | u1[un-1]) > 0);
361 /* mpn_hgcd has failed. Then either one of a or b is very
362 small, or the difference is very small. Perform one
363 subtraction followed by one division. */
365 mp_size_t updated_un = un;
367 /* Temporary storage 2n + 1 */
368 n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
369 u0, u1, &updated_un, tp, tp + n);
377 ASSERT (un < ualloc);
/* Special case a == b: gcd is a, and the smaller of the accumulated
   cofactors (+u1 or -u0) is returned (sign handling in missing lines). */
381 if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
383 /* Must return the smallest cofactor, +u1 or -u0 */
386 MPN_COPY (gp, ap, n);
388 MPN_CMP (c, u0, u1, un);
392 MPN_NORMALIZE (u0, un);
393 MPN_COPY (up, u0, un);
398 MPN_NORMALIZE_NOT_ZERO (u1, un);
399 MPN_COPY (up, u1, un);
/* u0 == 0: the accumulated transformation is trivial, so the Lehmer
   routine's cofactor is already the answer. */
406 else if (mpn_zero_p (u0, un))
412 /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
413 gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
420 /* We have A = ... a + ... b
428 |u0|, |u1| <= B / min(a, b)
430 Compute g = u a + v b = (u u1 - v u0) A + (...) B
431 Here, u, v are bounded by
/* General case: run Lehmer on copies of the reduced a, b, then combine
   its cofactor with the accumulated u0, u1. */
447 lehmer_up = tp; tp += n;
449 /* Call mpn_gcdext_lehmer_n with copies of a and b. */
450 MPN_COPY (tp, ap, n);
451 MPN_COPY (tp + n, bp, n);
452 gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
455 MPN_NORMALIZE (u0, u0n);
/* lehmer u == 0 branch (selecting 'if' among missing lines). */
458 /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
459 MPN_COPY (up, u0, u0n);
467 /* Compute v = (g - u a) / b */
468 lehmer_vn = compute_v (lehmer_vp,
469 ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
/* Negative lehmer_un encodes a negative cofactor; take magnitude
   (the sign flag handling is in missing lines). */
475 lehmer_un = -lehmer_un;
480 MPN_NORMALIZE (u1, u1n);
482 /* It's possible that u0 = 1, u1 = 0 */
488 /* u1 == 0 ==> u u1 + v u0 = v */
489 MPN_COPY (up, lehmer_vp, lehmer_vn);
490 *usizep = negate ? lehmer_vn : - lehmer_vn;
496 ASSERT (lehmer_un + u1n <= ualloc);
497 ASSERT (lehmer_vn + u0n <= ualloc);
499 /* Now u0, u1, u are non-zero. We may still have v == 0 */
/* up = u * u1, larger operand first for mpn_mul. */
502 if (lehmer_un <= u1n)
503 /* Should be the common case */
504 mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
506 mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
508 un = u1n + lehmer_un;
509 un -= (up[un - 1] == 0);
515 /* Overwrites old u1 value */
/* u1 = v * u0, same operand-order handling. */
516 if (lehmer_vn <= u0n)
517 /* Should be the common case */
518 mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
520 mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
522 u1n = u0n + lehmer_vn;
523 u1n -= (u1[u1n - 1] == 0);
/* Final cofactor = u*u1 + v*u0; the size comparison choosing which
   mpn_add ordering to use is among the missing lines. */
527 cy = mpn_add (up, up, un, u1, u1n);
531 cy = mpn_add (up, u1, u1n, up, un);
537 ASSERT (un < ualloc);
/* Report the cofactor's signed size. */
539 *usizep = negate ? -un : un;