/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)
/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr, so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more
   trouble than it's worth.  */
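
/* Illustration only (not part of the original file): the plain
   square-and-multiply scheme that the single-limb pre-powering described
   above folds into limb arithmetic.  Overflow is deliberately ignored
   here; the real code below stops squaring once blimb would exceed
   GMP_NUMB_HALFMAX.  */
#if 0
static unsigned long
example_small_pow (unsigned long b, unsigned long e)
{
  unsigned long  result = 1;
  while (e != 0)        /* invariant: desired answer == result * b^e */
    {
      if ((e & 1) != 0)
        result *= b;    /* low bit set: multiply this power in */
      b *= b;           /* b^(2^n) -> b^(2^(n+1)) */
      e >>= 1;
    }
  return result;
}
#endif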
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
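
/* Worked check (added illustration): with h = GMP_NUMB_BITS/2 the value
   above is H = 2^h - 1, and H^2 = 2^(2h) - 2^(h+1) + 1, which fits in
   GMP_NUMB_BITS bits whenever 2h <= GMP_NUMB_BITS.  For a 32-bit limb,
   H = 0xFFFF and H^2 = 0xFFFE0001.  When GMP_NUMB_BITS is odd, 2h is only
   GMP_NUMB_BITS-1, leaving H nearly a factor of sqrt(2) below the true
   floor(sqrt(GMP_NUMB_MAX)), hence the FIXME above.  */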
/* The following are for convenience, they update the size and check the
   result fits in the allocated space.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
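
/* Illustration only (not from the original): how the helpers above are
   meant to compose.  One step of the binary powering squares the current
   result {rp,rsize} into tp, makes that the new current value, then
   multiplies the base back in when the exponent bit is set.  "bit_set" is
   a stand-in for the e-bit test used in the real loops below.  */
#if 0
  MPN_SQR (tp, talloc, rp, rsize);      /* tp = rp^2, updates rsize */
  SWAP_RP_TP;                           /* squared value becomes current */
  if (bit_set)
    MPN_MUL_1 (rp, rsize, ralloc, blimb);  /* fold in a 1-limb base */
#endif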
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp, tp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  mp_size_t talloc;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;
  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));
  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }
  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
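
  /* Worked example (added illustration): b = 2^k * b' with b' odd gives
     b^e = 2^(k*e) * b'^e, so every factor of 2 can be split off here and
     re-applied at the very end as rtwos_limbs whole zero limbs plus an
     rtwos_bits-bit left shift.  E.g. with GMP_NUMB_BITS==32, b=12 (k=2,
     b'=3) and e=20: k*e = 40 = 1*32 + 8, so one low zero limb and a final
     8-bit shift, with only 3^20 computed by the powering proper.  */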
  TMP_MARK;

  rl = 1;   /* multiplier to be applied at the end, if not 1 */
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));
      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */
      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */
      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }
  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);
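
  /* Worked instance (added illustration) of the bound above: with
     GMP_NUMB_BITS==32, GMP_NAIL_BITS==0, bsize=1, blimb=0x10001 (cnt=15)
     and e=100, ralloc = (32-15)*100/32 + 5 = 58 limbs, while 0x10001^100
     really needs ceil(100*log2(65537)/32) = 51 limbs -- the roughly 17/16
     overestimate described in the comment.  */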
  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
      goto done;
    }
  /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
     low bit of e is zero, tp only has to hold the second last power
     step, which is half the size of the final result.  There's no need
     to round up the divide by 2, since ralloc includes a +2 for rl
     which is not needed by tp.  In the mpn_mul loop when the low bit of e
     is 1, tp must hold nearly the full result, so just size it the same
     as r.  */

  talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
  if (bsize <= 2 || (e & 1) == 0)
    talloc /= 2;
#else
  if (bsize <= 1 || (e & 1) == 0)
    talloc /= 2;
#endif
  TRACE (printf ("talloc %ld\n", talloc));
  tp = TMP_ALLOC_LIMBS (talloc);
  /* Go from high to low over the bits of e, starting with i pointing at
     the bit below the highest 1 (which will mean i==-1 if e==1). */
  count_leading_zeros (cnt, e);
  i = GMP_LIMB_BITS - cnt - 2;
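
  /* Illustration only (added): the shape shared by the three loops below,
     sketched on a single limb and ignoring overflow.  The top 1 bit of e
     is covered by starting from the base itself; every remaining bit costs
     one squaring, plus one multiply when that bit is set.  */
#if 0
  {
    mp_limb_t  base = blimb;  /* stand-in for the real multi-limb base */
    mp_limb_t  acc = base;    /* accounts for the highest 1 bit of e */
    int  j;
    for (j = i; j >= 0; j--)
      {
        acc *= acc;                   /* square for every bit scanned */
        if ((e & (1L << j)) != 0)
          acc *= base;                /* multiply when the bit is set */
      }
    /* acc now holds blimb^e, truncated to one limb */
  }
#endif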
#if HAVE_NATIVE_mpn_mul_2
  if (bsize == 2)
    {
      mp_limb_t  mult[2];

      /* Any bsize==1 will have been powered above to be two limbs. */
      ASSERT (blimb != 0);

      /* Arrange the final result ends up in r, not in the temp space */
      if ((i & 1) == 0)
        SWAP_RP_TP;

      rp[0] = blimb_low;
      rp[1] = blimb;
      rsize = 2;

      mult[0] = blimb_low;
      mult[1] = blimb;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            MPN_MUL_2 (rp, rsize, ralloc, mult);
        }

      TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
      if (rl_high != 0)
        {
          mult[0] = rl;
          mult[1] = rl_high;
          MPN_MUL_2 (rp, rsize, ralloc, mult);
        }
      else if (rl != 1)
        MPN_MUL_1 (rp, rsize, ralloc, rl);
    }
#else
  if (bsize == 1)
    {
      /* Arrange the final result ends up in r, not in the temp space */
      if ((i & 1) == 0)
        SWAP_RP_TP;

      rp[0] = blimb;
      rsize = 1;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            MPN_MUL_1 (rp, rsize, ralloc, blimb);
        }

      TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
      if (rl != 1)
        MPN_MUL_1 (rp, rsize, ralloc, rl);
    }
#endif
  else
    {
      int  parity;

      /* Arrange the final result ends up in r, not in the temp space */
      ULONG_PARITY (parity, e);
      if (((parity ^ i) & 1) != 0)
        SWAP_RP_TP;

      MPN_COPY (rp, bp, bsize);
      rsize = bsize;

      for ( ; i >= 0; i--)
        {
          TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                         i, e, rsize, ralloc, talloc);
                 mpn_trace ("r", rp, rsize));

          MPN_SQR (tp, talloc, rp, rsize);
          SWAP_RP_TP;
          if ((e & (1L << i)) != 0)
            {
              MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
              SWAP_RP_TP;
            }
        }
    }

 done:
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;
  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
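
/* Illustration only (hypothetical, added): the shape of a caller.  The
   real mpz_pow_ui wrapper lives elsewhere and may differ; this just shows
   the base being handed over as raw limbs plus a signed size.  */
#if 0
void
example_pow (mpz_ptr r, mpz_srcptr b, unsigned long e)
{
  mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), e);
}
#endif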