/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)
/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr, so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more
   trouble than it's worth.  */
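
/* Illustration only (not part of the original file): the plain
   square-and-multiply scheme that the single-limb pre-powering described
   above folds into limb arithmetic.  Overflow is deliberately ignored
   here; the real code below stops squaring once blimb would exceed
   GMP_NUMB_HALFMAX.  */
#if 0
static unsigned long
example_small_pow (unsigned long b, unsigned long e)
{
  unsigned long  result = 1;
  while (e != 0)        /* invariant: desired answer == result * b^e */
    {
      if ((e & 1) != 0)
        result *= b;    /* low bit set: multiply this power in */
      b *= b;           /* b^(2^n) -> b^(2^(n+1)) */
      e >>= 1;
    }
  return result;
}
#endif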
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
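
/* Worked check (added illustration): with h = GMP_NUMB_BITS/2 the value
   above is H = 2^h - 1, and H^2 = 2^(2h) - 2^(h+1) + 1, which fits in
   GMP_NUMB_BITS bits whenever 2h <= GMP_NUMB_BITS.  For a 32-bit limb,
   H = 0xFFFF and H^2 = 0xFFFE0001.  When GMP_NUMB_BITS is odd, 2h is only
   GMP_NUMB_BITS-1, leaving H nearly a factor of sqrt(2) below the true
   floor(sqrt(GMP_NUMB_MAX)), hence the FIXME above.  */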
/* The following are for convenience, they update the size and check the
   result fits in the allocated space.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
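
/* Illustration only (not from the original): how the helpers above are
   meant to compose.  One step of the binary powering squares the current
   result {rp,rsize} into tp, makes that the new current value, then
   multiplies the base back in when the exponent bit is set.  "bit_set" is
   a stand-in for the e-bit test used in the real loops below.  */
#if 0
  MPN_SQR (tp, talloc, rp, rsize);      /* tp = rp^2, updates rsize */
  SWAP_RP_TP;                           /* squared value becomes current */
  if (bit_set)
    MPN_MUL_1 (rp, rsize, ralloc, blimb);  /* fold in a 1-limb base */
#endif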
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp, tp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  mp_size_t talloc;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;
  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));
  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }
  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
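
  /* Worked example (added illustration): b = 2^k * b' with b' odd gives
     b^e = 2^(k*e) * b'^e, so every factor of 2 can be split off here and
     re-applied at the very end as rtwos_limbs whole zero limbs plus an
     rtwos_bits-bit left shift.  E.g. with GMP_NUMB_BITS==32, b=12 (k=2,
     b'=3) and e=20: k*e = 40 = 1*32 + 8, so one low zero limb and a final
     8-bit shift, with only 3^20 computed by the powering proper.  */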
  TMP_MARK;

  rl = 1;   /* multiplier to be applied at the end, if not 1 */
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));
      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */
      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */
      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }
  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);
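
  /* Worked instance (added illustration) of the bound above: with
     GMP_NUMB_BITS==32, GMP_NAIL_BITS==0, bsize=1, blimb=0x10001 (cnt=15)
     and e=100, ralloc = (32-15)*100/32 + 5 = 58 limbs, while 0x10001^100
     really needs ceil(100*log2(65537)/32) = 51 limbs -- the roughly 17/16
     overestimate described in the comment.  */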
  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
      goto done;
    }
  /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
     low bit of e is zero, tp only has to hold the second last power
     step, which is half the size of the final result.  There's no need
     to round up the divide by 2, since ralloc includes a +2 for rl
     which is not needed by tp.  In the mpn_mul loop when the low bit of e
     is 1, tp must hold nearly the full result, so just size it the same
     as r.  */

  talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
  if (bsize <= 2 || (e & 1) == 0)
    talloc /= 2;
#else
  if (bsize <= 1 || (e & 1) == 0)
    talloc /= 2;
#endif
  TRACE (printf ("talloc %ld\n", talloc));
  tp = TMP_ALLOC_LIMBS (talloc);
  /* Go from high to low over the bits of e, starting with i pointing at
     the bit below the highest 1 (which will mean i==-1 if e==1). */
  count_leading_zeros (cnt, e);
  i = GMP_LIMB_BITS - cnt - 2;
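
  /* Illustration only (added): the shape shared by the three loops below,
     sketched on a single limb and ignoring overflow.  The top 1 bit of e
     is covered by starting from the base itself; every remaining bit costs
     one squaring, plus one multiply when that bit is set.  */
#if 0
  {
    mp_limb_t  base = blimb;  /* stand-in for the real multi-limb base */
    mp_limb_t  acc = base;    /* accounts for the highest 1 bit of e */
    int  j;
    for (j = i; j >= 0; j--)
      {
        acc *= acc;                   /* square for every bit scanned */
        if ((e & (1L << j)) != 0)
          acc *= base;                /* multiply when the bit is set */
      }
    /* acc now holds blimb^e, truncated to one limb */
  }
#endif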
#if HAVE_NATIVE_mpn_mul_2
  if (bsize == 2)
    {
      mp_limb_t  mult[2];

      /* Any bsize==1 will have been powered above to be two limbs. */
      ASSERT (blimb != 0);

      /* Arrange the final result ends up in r, not in the temp space */
      if ((i & 1) == 0)
        SWAP_RP_TP;

      rp[0] = blimb_low;
      rp[1] = blimb;
      rsize = 2;

      mult[0] = blimb_low;
      mult[1] = blimb;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            MPN_MUL_2 (rp, rsize, ralloc, mult);
        }

      TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
      if (rl_high != 0)
        {
          mult[0] = rl;
          mult[1] = rl_high;
          MPN_MUL_2 (rp, rsize, ralloc, mult);
        }
      else if (rl != 1)
        MPN_MUL_1 (rp, rsize, ralloc, rl);
    }
#else
  if (bsize == 1)
    {
      /* Arrange the final result ends up in r, not in the temp space */
      if ((i & 1) == 0)
        SWAP_RP_TP;

      rp[0] = blimb;
      rsize = 1;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            MPN_MUL_1 (rp, rsize, ralloc, blimb);
        }

      TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
      if (rl != 1)
        MPN_MUL_1 (rp, rsize, ralloc, rl);
    }
#endif
  else
    {
      int  parity;

      /* Arrange the final result ends up in r, not in the temp space */
      ULONG_PARITY (parity, e);
      if (((parity ^ i) & 1) != 0)
        SWAP_RP_TP;

      MPN_COPY (rp, bp, bsize);
      rsize = bsize;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            {
              MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
              SWAP_RP_TP;
            }
        }
    }

 done:
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;
  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
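
/* Illustration only (hypothetical, added): the shape of a caller.  The
   real mpz_pow_ui wrapper lives elsewhere and may differ; this just shows
   the base being handed over as raw limbs plus a signed size.  */
#if 0
void
example_pow (mpz_ptr r, mpz_srcptr b, unsigned long e)
{
  mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), e);
}
#endif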