contrib/gmp/mpz/fac_ui.c

   1 /* mpz_fac_ui(result, n) -- Set RESULT to N!.
   2
   3 Copyright 1991, 1993, 1994, 1995, 2000, 2001, 2002, 2003 Free Software
   4 Foundation, Inc.
   5
   6 This file is part of the GNU MP Library.
   7
   8 The GNU MP Library is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU Lesser General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at your
  11 option) any later version.
  12
  13 The GNU MP Library is distributed in the hope that it will be useful, but
  14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 License for more details.
  17
  18 You should have received a copy of the GNU Lesser General Public License
  19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  20
  21 #include "gmp.h"
  22 #include "gmp-impl.h"
  23 #include "longlong.h"
  24
  25 #include "fac_ui.h"
  26
  27
  28 static void odd_product __GMP_PROTO ((unsigned long, unsigned long, mpz_t *));
  29 static void ap_product_small __GMP_PROTO ((mpz_t, mp_limb_t, mp_limb_t, unsigned long, unsigned long));
  30
  31
  32 /* must be >=2  */
  33 #define APCONST 5
  34
  35 /* for single non-zero limb */
  36 #define MPZ_SET_1_NZ(z,n)       \
  37   do {                          \
  38     mpz_ptr  __z = (z);         \
  39     ASSERT ((n) != 0);          \
  40     PTR(__z)[0] = (n);          \
  41     SIZ(__z) = 1;               \
  42   } while (0)
  43
  44 /* for src>0 and n>0 */
  45 #define MPZ_MUL_1_POS(dst,src,n)                        \
  46   do {                                                  \
  47     mpz_ptr    __dst = (dst);                           \
  48     mpz_srcptr __src = (src);                           \
  49     mp_size_t  __size = SIZ(__src);                     \
  50     mp_ptr     __dst_p;                                 \
  51     mp_limb_t  __c;                                     \
  52                                                         \
  53     ASSERT (__size > 0);                                \
  54     ASSERT ((n) != 0);                                  \
  55                                                         \
  56     MPZ_REALLOC (__dst, __size+1);                      \
  57     __dst_p = PTR(__dst);                               \
  58                                                         \
  59     __c = mpn_mul_1 (__dst_p, PTR(__src), __size, n);   \
  60     __dst_p[__size] = __c;                              \
  61     SIZ(__dst) = __size + (__c != 0);                   \
  62   } while (0)
  63
  64
  65 #if BITS_PER_ULONG == GMP_LIMB_BITS
  66 #define BSWAP_ULONG(x,y)        BSWAP_LIMB(x,y)
  67 #endif
  68
  69 /* We used to have a case here for limb==2*long, doing a BSWAP_LIMB followed
  70    by a shift down to get the high part.  But it provoked incorrect code
  71    from "HP aC++/ANSI C B3910B A.05.52 [Sep 05 2003]" in ILP32 mode.  This
  72    case would have been nice for gcc ia64 where BSWAP_LIMB is a mux1, but we
  73    can get that directly muxing a 4-byte ulong if it matters enough.  */
  74
  75 #if ! defined (BSWAP_ULONG)
  76 #define BSWAP_ULONG(dst, src)                                           \
  77   do {                                                                  \
  78     unsigned long  __bswapl_src = (src);                                \
  79     unsigned long  __bswapl_dst = 0;                                    \
  80     int        __i;                                                     \
  81     for (__i = 0; __i < sizeof(unsigned long); __i++)                   \
  82       {                                                                 \
  83         __bswapl_dst = (__bswapl_dst << 8) | (__bswapl_src & 0xFF);     \
  84         __bswapl_src >>= 8;                                             \
  85       }                                                                 \
  86     (dst) = __bswapl_dst;                                               \
  87   } while (0)
  88 #endif
  89
  90 /* x is bit reverse of y */
  91 /* Note the divides below are all exact */
  92 #define BITREV_ULONG(x,y)                                                  \
  93   do {                                                                     \
  94    unsigned long __dst;                                                    \
  95    BSWAP_ULONG(__dst,y);                                                   \
  96    __dst = ((__dst>>4)&(ULONG_MAX/17)) | ((__dst<<4)&((ULONG_MAX/17)*16)); \
  97    __dst = ((__dst>>2)&(ULONG_MAX/5) ) | ((__dst<<2)&((ULONG_MAX/5)*4)  ); \
  98    __dst = ((__dst>>1)&(ULONG_MAX/3) ) | ((__dst<<1)&((ULONG_MAX/3)*2)  ); \
  99    (x) = __dst;                                                            \
 100   } while(0)
 101 /* above could be improved if cpu has a nibble/bit swap/muxing instruction */
 102 /* above code is serialized, possible to write as a big parallel expression */
 103
 104
 105
 106 void
 107 mpz_fac_ui (mpz_ptr x, unsigned long n)
 108 {
 109   unsigned long z, stt;
 110   int i, j;
 111   mpz_t t1, st[8 * sizeof (unsigned long) + 1 - APCONST];
 112   mp_limb_t d[4];
 113
 114   static const mp_limb_t table[] = { ONE_LIMB_FACTORIAL_TABLE };
 115
 116   if (n < numberof (table))
 117     {
 118       MPZ_SET_1_NZ (x, table[n]);
 119       return;
 120     }
 121
 122   /*  NOTE : MUST have n>=3 here */
 123   ASSERT (n >= 3);
 124   /* for estimating the alloc sizes the calculation of these formula's is not
 125      exact and also the formulas are only approximations, also we ignore
 126      the few "side" calculations, correct allocation seems to speed up the
 127      small sizes better, having very little effect on the large sizes */
 128
 129   /* estimate space for stack entries see below
 130      number of bits for n! is
 131      (1+log_2(2*pi)/2)-n*log_2(exp(1))+(n+1/2)*log_2(n)=
 132      2.325748065-n*1.442695041+(n+0.5)*log_2(n)  */
 133   umul_ppmm (d[1], d[0], (mp_limb_t) n, (mp_limb_t) FAC2OVERE);
 134   /* d[1] is 2n/e, d[0] ignored        */
 135   count_leading_zeros (z, d[1]);
 136   z = GMP_LIMB_BITS - z - 1;    /* z=floor(log_2(2n/e))   */
 137   umul_ppmm (d[1], d[0], (mp_limb_t) n, (mp_limb_t) z);
 138   /* d=n*floor(log_2(2n/e))   */
 139   d[0] = (d[0] >> 2) | (d[1] << (GMP_LIMB_BITS - 2));
 140   d[1] >>= 2;
 141   /* d=n*floor(log_2(2n/e))/4   */
 142   z = d[0] + 1;                 /* have to ignore any overflow */
 143   /* so z is the number of bits wanted for st[0]    */
 144
 145
 146   if (n <= ((unsigned long) 1) << (APCONST))
 147     {
 148       mpz_realloc2 (x, 4 * z);
 149       ap_product_small (x, CNST_LIMB(2), CNST_LIMB(1), n - 1, 4L);
 150       return;
 151     }
 152   if (n <= ((unsigned long) 1) << (APCONST + 1))
 153     {                           /*  use n!=odd(1,n)*(n/2)!*2^(n/2)         */
 154       mpz_init2 (t1, 2 * z);
 155       mpz_realloc2 (x, 4 * z);
 156       ap_product_small (x, CNST_LIMB(2), CNST_LIMB(1), n / 2 - 1, 4L);
 157       ap_product_small (t1, CNST_LIMB(3), CNST_LIMB(2), (n - 1) / 2, 4L);
 158       mpz_mul (x, x, t1);
 159       mpz_clear (t1);
 160       mpz_mul_2exp (x, x, n / 2);
 161       return;
 162     }
 163   if (n <= ((unsigned long) 1) << (APCONST + 2))
 164     {
 165       /* use n!=C_2(1,n/2)^2*C_2(n/2,n)*(n/4)!*2^(n/2+n/4) all int divs
 166          so need (BITS_IN_N-APCONST+1)=(APCONST+3-APCONST+1)=4 stack entries */
 167       mpz_init2 (t1, 2 * z);
 168       mpz_realloc2 (x, 4 * z);
 169       for (i = 0; i < 4; i++)
 170         {
 171           mpz_init2 (st[i], z);
 172           z >>= 1;
 173         }
 174       odd_product (1, n / 2, st);
 175       mpz_set (x, st[0]);
 176       odd_product (n / 2, n, st);
 177       mpz_mul (x, x, x);
 178       ASSERT (n / 4 <= FACMUL4 + 6);
 179       ap_product_small (t1, CNST_LIMB(2), CNST_LIMB(1), n / 4 - 1, 4L);
 180       /* must have 2^APCONST odd numbers max */
 181       mpz_mul (t1, t1, st[0]);
 182       for (i = 0; i < 4; i++)
 183         mpz_clear (st[i]);
 184       mpz_mul (x, x, t1);
 185       mpz_clear (t1);
 186       mpz_mul_2exp (x, x, n / 2 + n / 4);
 187       return;
 188     }
 189
 190   count_leading_zeros (stt, (mp_limb_t) n);
 191   stt = GMP_LIMB_BITS - stt + 1 - APCONST;
 192
 193   for (i = 0; i < (signed long) stt; i++)
 194     {
 195       mpz_init2 (st[i], z);
 196       z >>= 1;
 197     }
 198
 199   count_leading_zeros (z, (mp_limb_t) (n / 3));
 200   /* find z st 2^z>n/3 range for z is 1 <= z <= 8 * sizeof(unsigned long)-1 */
 201   z = GMP_LIMB_BITS - z;
 202
 203   /*
 204      n! = 2^e * PRODUCT_{i=0}^{i=z-1} C_2( n/2^{i+1}, n/2^i )^{i+1}
 205      where 2^e || n!   3.2^z>n   C_2(a,b)=PRODUCT of odd z such that a<z<=b
 206    */
 207
 208
 209   mpz_init_set_ui (t1, 1);
 210   for (j = 8 * sizeof (unsigned long) / 2; j != 0; j >>= 1)
 211     {
 212       MPZ_SET_1_NZ (x, 1);
 213       for (i = 8 * sizeof (unsigned long) - j; i >= j; i -= 2 * j)
 214         if ((signed long) z >= i)
 215           {
 216             odd_product (n >> i, n >> (i - 1), st);
 217             /* largest odd product when j=i=1 then we have
 218                odd_product(n/2,n,st) which is approx (2n/e)^(n/4)
 219                so log_base2(largest oddproduct)=n*log_base2(2n/e)/4
 220                number of bits is n*log_base2(2n/e)/4+1  */
 221             if (i != j)
 222               mpz_pow_ui (st[0], st[0], i / j);
 223             mpz_mul (x, x, st[0]);
 224           }
 225       if ((signed long) z >= j && j != 1)
 226         {
 227           mpz_mul (t1, t1, x);
 228           mpz_mul (t1, t1, t1);
 229         }
 230     }
 231   for (i = 0; i < (signed long) stt; i++)
 232     mpz_clear (st[i]);
 233   mpz_mul (x, x, t1);
 234   mpz_clear (t1);
 235   popc_limb (i, (mp_limb_t) n);
 236   mpz_mul_2exp (x, x, n - i);
 237   return;
 238 }
 239
 240 /* start,step are mp_limb_t although they will fit in unsigned long     */
 241 static void
 242 ap_product_small (mpz_t ret, mp_limb_t start, mp_limb_t step,
 243                   unsigned long count, unsigned long nm)
 244 {
 245   unsigned long a;
 246   mp_limb_t b;
 247
 248   ASSERT (count <= (((unsigned long) 1) << APCONST));
 249 /* count can never be zero ? check this and remove test below */
 250   if (count == 0)
 251     {
 252       MPZ_SET_1_NZ (ret, 1);
 253       return;
 254     }
 255   if (count == 1)
 256     {
 257       MPZ_SET_1_NZ (ret, start);
 258       return;
 259     }
 260   switch (nm)
 261     {
 262     case 1:
 263       MPZ_SET_1_NZ (ret, start);
 264       b = start + step;
 265       for (a = 0; a < count - 1; b += step, a++)
 266         MPZ_MUL_1_POS (ret, ret, b);
 267       return;
 268     case 2:
 269       MPZ_SET_1_NZ (ret, start * (start + step));
 270       if (count == 2)
 271         return;
 272       for (b = start + 2 * step, a = count / 2 - 1; a != 0;
 273            a--, b += 2 * step)
 274         MPZ_MUL_1_POS (ret, ret, b * (b + step));
 275       if (count % 2 == 1)
 276         MPZ_MUL_1_POS (ret, ret, b);
 277       return;
 278     case 3:
 279       if (count == 2)
 280         {
 281           MPZ_SET_1_NZ (ret, start * (start + step));
 282           return;
 283         }
 284       MPZ_SET_1_NZ (ret, start * (start + step) * (start + 2 * step));
 285       if (count == 3)
 286         return;
 287       for (b = start + 3 * step, a = count / 3 - 1; a != 0;
 288            a--, b += 3 * step)
 289         MPZ_MUL_1_POS (ret, ret, b * (b + step) * (b + 2 * step));
 290       if (count % 3 == 2)
 291         b = b * (b + step);
 292       if (count % 3 != 0)
 293         MPZ_MUL_1_POS (ret, ret, b);
 294       return;
 295     default:                    /* ie nm=4      */
 296       if (count == 2)
 297         {
 298           MPZ_SET_1_NZ (ret, start * (start + step));
 299           return;
 300         }
 301       if (count == 3)
 302         {
 303           MPZ_SET_1_NZ (ret, start * (start + step) * (start + 2 * step));
 304           return;
 305         }
 306       MPZ_SET_1_NZ (ret,
 307                     start * (start + step) * (start + 2 * step) * (start +
 308                                                                    3 * step));
 309       if (count == 4)
 310         return;
 311       for (b = start + 4 * step, a = count / 4 - 1; a != 0;
 312            a--, b += 4 * step)
 313         MPZ_MUL_1_POS (ret, ret,
 314                        b * (b + step) * (b + 2 * step) * (b + 3 * step));
 315       if (count % 4 == 2)
 316         b = b * (b + step);
 317       if (count % 4 == 3)
 318         b = b * (b + step) * (b + 2 * step);
 319       if (count % 4 != 0)
 320         MPZ_MUL_1_POS (ret, ret, b);
 321       return;
 322     }
 323 }
 324
 325 /* return value in st[0]
 326    odd_product(l,h)=sqrt((h/e)^h/(l/e)^l) using Stirling approx and e=exp(1)
 327    so st[0] needs enough bits for above, st[1] needs half these bits and
 328    st[2] needs 1/4 of these bits etc    */
 329 static void
 330 odd_product (unsigned long low, unsigned long high, mpz_t * st)
 331 {
 332   unsigned long stc = 1, stn = 0, n, y, mask, a, nm = 1;
 333   signed long z;
 334
 335   low++;
 336   if (low % 2 == 0)
 337     low++;
 338   if (high == 0)
 339     high = 1;
 340   if (high % 2 == 0)
 341     high--;
 342 /* must have high>=low ? check this and remove test below */
 343   if (high < low)
 344     {
 345       MPZ_SET_1_NZ (st[0], 1);
 346       return;
 347     }
 348   if (high == low)
 349     {
 350       MPZ_SET_1_NZ (st[0], low);
 351       return;
 352     }
 353   if (high <= FACMUL2 + 2)
 354     {
 355       nm = 2;
 356       if (high <= FACMUL3 + 4)
 357         {
 358           nm = 3;
 359           if (high <= FACMUL4 + 6)
 360             nm = 4;
 361         }
 362     }
 363   high = (high - low) / 2 + 1;  /* high is now count,high<=2^(BITS_PER_ULONG-1) */
 364   if (high <= (((unsigned long) 1) << APCONST))
 365     {
 366       ap_product_small (st[0], (mp_limb_t) low, CNST_LIMB(2), high, nm);
 367       return;
 368     }
 369   count_leading_zeros (n, (mp_limb_t) high);
 370 /* assumes clz above is LIMB based not NUMB based */
 371   n = GMP_LIMB_BITS - n - APCONST;
 372   mask = (((unsigned long) 1) << n);
 373   a = mask << 1;
 374   mask--;
 375 /* have 2^(BITS_IN_N-APCONST) iterations so need
 376    (BITS_IN_N-APCONST+1) stack entries  */
 377   for (z = mask; z >= 0; z--)
 378     {
 379       BITREV_ULONG (y, z);
 380       y >>= (BITS_PER_ULONG - n);
 381       ap_product_small (st[stn],
 382                         (mp_limb_t) (low + 2 * ((~y) & mask)), (mp_limb_t) a,
 383                         (high + y) >> n, nm);
 384       ASSERT (((high + y) >> n) <= (((unsigned long) 1) << APCONST));
 385       stn++;
 386       y = stc++;
 387       while ((y & 1) == 0)
 388         {
 389           mpz_mul (st[stn - 2], st[stn - 2], st[stn - 1]);
 390           stn--;
 391           y >>= 1;
 392         }
 393     }
 394   ASSERT (stn == 1);
 395   return;
 396 }