contrib/gmp/gen-psqr.c

   1 /* Generate perfect square testing data.
   2
   3 Copyright 2002, 2003, 2004 Free Software Foundation, Inc.
   4
   5 This file is part of the GNU MP Library.
   6
   7 The GNU MP Library is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU Lesser General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at your
  10 option) any later version.
  11
  12 The GNU MP Library is distributed in the hope that it will be useful, but
  13 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 License for more details.
  16
  17 You should have received a copy of the GNU Lesser General Public License
  18 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  19
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22
  23 #include "dumbmp.c"
  24
  25
  26 /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
  27    (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
  28    residues and non-residues modulo small factors of that modulus.
  29
  30    For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
  31    function exists specifically because 2^24-1 and 2^48-1 have nice sets of
  32    prime factors.  For other limb sizes it's considered, but if it doesn't
  33    have good factors then mpn_mod_1 will be used instead.
  34
  35    When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
  36    selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
  37    with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
  38    GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
  39    calculation within a single limb.
  40
  41    In either case primes can be combined to make divisors.  The table data
  42    then effectively indicates remainders which are quadratic residues mod
  43    all the primes.  This sort of combining reduces the number of steps
  44    needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
  45    Nothing is gained or lost in terms of detections, the same total fraction
  46    of non-residues will be identified.
  47
  48    Nothing particularly sophisticated is attempted for combining factors to
  49    make divisors.  This is probably a kind of knapsack problem so it'd be
  50    too hard to attempt anything completely general.  For the usual 32 and 64
  51    bit limbs we get a good enough result just pairing the biggest and
  52    smallest which fit together, repeatedly.
  53
  54    Another aim is to get powerful combinations, ie. divisors which identify
  55    biggest fraction of non-residues, and have those run first.  Again for
  56    the usual 32 and 64 bits it seems good enough just to pair for big
  57    divisors then sort according to the resulting fraction of non-residues
  58    identified.
  59
  60    Also in this program, a table sq_res_0x100 of residues modulo 256 is
  61    generated.  This simply fills bits into limbs of the appropriate
  62    build-time GMP_LIMB_BITS each.
  63
  64 */
  65
  66
  67 /* Normally we aren't using const in gen*.c programs, so as not to have to
  68    bother figuring out if it works, but using it with f_cmp_divisor and
  69    f_cmp_fraction avoids warnings from the qsort calls. */
  70
  71 /* Same tests as gmp.h. */
  72 #if  defined (__STDC__)                                 \
  73   || defined (__cplusplus)                              \
  74   || defined (_AIX)                                     \
  75   || defined (__DECC)                                   \
  76   || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
  77   || defined (_MSC_VER)                                 \
  78   || defined (_WIN32)
  79 #define HAVE_CONST        1
  80 #endif
  81
  82 #if ! HAVE_CONST
  83 #define const
  84 #endif
  85
  86
  87 mpz_t  *sq_res_0x100;          /* table of limbs */
  88 int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
  89 int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
  90 double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
  91
  92 int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
  93 int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
  94 int     max_divisor;       /* all divisors <= max_divisor */
  95 int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
  96 double  total_fraction;    /* of squares */
  97 mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
  98 mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
  99 mpz_t   pp_inverted;       /* invert_limb style inverse */
 100 mpz_t   mod_mask;          /* 2^mod_bits-1 */
 101 char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
 102
 103 /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
 104 struct rawfactor_t {
 105   int     divisor;
 106   int     multiplicity;
 107 };
 108 struct rawfactor_t  *rawfactor;
 109 int                 nrawfactor;
 110
 111 /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
 112 struct factor_t {
 113   int     divisor;
 114   mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
 115   mpz_t   mask;      /* indicating squares mod divisor */
 116   double  fraction;  /* squares/total */
 117 };
 118 struct factor_t  *factor;
 119 int              nfactor;       /* entries in use in factor array */
 120 int              factor_alloc;  /* entries allocated to factor array */
 121
 122
 123 int
 124 f_cmp_divisor (const void *parg, const void *qarg)
 125 {
 126   const struct factor_t *p, *q;
 127   p = parg;
 128   q = qarg;
 129   if (p->divisor > q->divisor)
 130     return 1;
 131   else if (p->divisor < q->divisor)
 132     return -1;
 133   else
 134     return 0;
 135 }
 136
 137 int
 138 f_cmp_fraction (const void *parg, const void *qarg)
 139 {
 140   const struct factor_t *p, *q;
 141   p = parg;
 142   q = qarg;
 143   if (p->fraction > q->fraction)
 144     return 1;
 145   else if (p->fraction < q->fraction)
 146     return -1;
 147   else
 148     return 0;
 149 }
 150
 151 /* Remove array[idx] by copying the remainder down, and adjust narray
 152    accordingly.  */
 153 #define COLLAPSE_ELEMENT(array, idx, narray)                    \
 154   do {                                                          \
 155     mem_copyi ((char *) &(array)[idx],                          \
 156                (char *) &(array)[idx+1],                        \
 157                ((narray)-((idx)+1)) * sizeof (array[0]));       \
 158     (narray)--;                                                 \
 159   } while (0)
 160
 161
 162 /* return n*2^p mod m */
 163 int
 164 mul_2exp_mod (int n, int p, int m)
 165 {
 166   int  i;
 167   for (i = 0; i < p; i++)
 168     n = (2 * n) % m;
 169   return n;
 170 }
 171
 172 /* return -n mod m */
 173 int
 174 neg_mod (int n, int m)
 175 {
 176   ASSERT (n >= 0 && n < m);
 177   return (n == 0 ? 0 : m-n);
 178 }
 179
 180 /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
 181    "-(idx<<mod_bits)" can be a square modulo m.  */
 182 void
 183 square_mask (mpz_t mask, int m)
 184 {
 185   int    p, i, r, idx;
 186
 187   p = mul_2exp_mod (1, mod_bits, m);
 188   p = neg_mod (p, m);
 189
 190   mpz_set_ui (mask, 0L);
 191   for (i = 0; i < m; i++)
 192     {
 193       r = (i * i) % m;
 194       idx = (r * p) % m;
 195       mpz_setbit (mask, (unsigned long) idx);
 196     }
 197 }
 198
 199 void
 200 generate_sq_res_0x100 (int limb_bits)
 201 {
 202   int  i, res;
 203
 204   nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
 205   sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
 206
 207   for (i = 0; i < nsq_res_0x100; i++)
 208     mpz_init_set_ui (sq_res_0x100[i], 0L);
 209
 210   for (i = 0; i < 0x100; i++)
 211     {
 212       res = (i * i) % 0x100;
 213       mpz_setbit (sq_res_0x100[res / limb_bits],
 214                   (unsigned long) (res % limb_bits));
 215     }
 216
 217   sq_res_0x100_num = 0;
 218   for (i = 0; i < nsq_res_0x100; i++)
 219     sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
 220   sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
 221 }
 222
 223 void
 224 generate_mod (int limb_bits, int nail_bits)
 225 {
 226   int    numb_bits = limb_bits - nail_bits;
 227   int    i, divisor;
 228
 229   mpz_init_set_ui (pp, 0L);
 230   mpz_init_set_ui (pp_norm, 0L);
 231   mpz_init_set_ui (pp_inverted, 0L);
 232
 233   /* no more than limb_bits many factors in a one limb modulus (and of
 234      course in reality nothing like that many) */
 235   factor_alloc = limb_bits;
 236   factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
 237   rawfactor = (struct rawfactor_t *)
 238     xmalloc (factor_alloc * sizeof (*rawfactor));
 239
 240   if (numb_bits % 4 != 0)
 241     {
 242       strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
 243       goto use_pp;
 244     }
 245
 246   max_divisor = 2*limb_bits;
 247   max_divisor_bits = log2_ceil (max_divisor);
 248
 249   if (numb_bits / 4 < max_divisor_bits)
 250     {
 251       /* Wind back to one limb worth of max_divisor, if that will let us use
 252          mpn_mod_34lsub1.  */
 253       max_divisor = limb_bits;
 254       max_divisor_bits = log2_ceil (max_divisor);
 255
 256       if (numb_bits / 4 < max_divisor_bits)
 257         {
 258           strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
 259           goto use_pp;
 260         }
 261     }
 262
 263   {
 264     /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
 265     mpz_t  m, q, r;
 266     int    multiplicity;
 267
 268     mod34_bits = (numb_bits / 4) * 3;
 269
 270     /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
 271        the mod34_bits mark, adding the two halves for a remainder of at most
 272        mod34_bits+1 many bits */
 273     mod_bits = mod34_bits + 1;
 274
 275     mpz_init_set_ui (m, 1L);
 276     mpz_mul_2exp (m, m, mod34_bits);
 277     mpz_sub_ui (m, m, 1L);
 278
 279     mpz_init (q);
 280     mpz_init (r);
 281
 282     for (i = 3; i <= max_divisor; i++)
 283       {
 284         if (! isprime (i))
 285           continue;
 286
 287         mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 288         if (mpz_sgn (r) != 0)
 289           continue;
 290
 291         /* if a repeated prime is found it's used as an i^n in one factor */
 292         divisor = 1;
 293         multiplicity = 0;
 294         do
 295           {
 296             if (divisor > max_divisor / i)
 297               break;
 298             multiplicity++;
 299             mpz_set (m, q);
 300             mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 301           }
 302         while (mpz_sgn (r) == 0);
 303
 304         ASSERT (nrawfactor < factor_alloc);
 305         rawfactor[nrawfactor].divisor = i;
 306         rawfactor[nrawfactor].multiplicity = multiplicity;
 307         nrawfactor++;
 308       }
 309
 310     mpz_clear (m);
 311     mpz_clear (q);
 312     mpz_clear (r);
 313   }
 314
 315   if (nrawfactor <= 2)
 316     {
 317       mpz_t  new_pp;
 318
 319       sprintf (mod34_excuse, "only %d small factor%s",
 320                nrawfactor, nrawfactor == 1 ? "" : "s");
 321
 322     use_pp:
 323       /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
 324          tried with just one */
 325       max_divisor = 2*limb_bits;
 326       max_divisor_bits = log2_ceil (max_divisor);
 327
 328       mpz_init (new_pp);
 329       nrawfactor = 0;
 330       mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
 331
 332       /* one copy of each small prime */
 333       mpz_set_ui (pp, 1L);
 334       for (i = 3; i <= max_divisor; i++)
 335         {
 336           if (! isprime (i))
 337             continue;
 338
 339           mpz_mul_ui (new_pp, pp, (unsigned long) i);
 340           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 341             break;
 342           mpz_set (pp, new_pp);
 343
 344           ASSERT (nrawfactor < factor_alloc);
 345           rawfactor[nrawfactor].divisor = i;
 346           rawfactor[nrawfactor].multiplicity = 1;
 347           nrawfactor++;
 348         }
 349
 350       /* Plus an extra copy of one or more of the primes selected, if that
 351          still fits in max_divisor and the total in mod_bits.  Usually only
 352          3 or 5 will be candidates */
 353       for (i = nrawfactor-1; i >= 0; i--)
 354         {
 355           if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
 356             continue;
 357           mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
 358           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 359             continue;
 360           mpz_set (pp, new_pp);
 361
 362           rawfactor[i].multiplicity++;
 363         }
 364
 365       mod_bits = mpz_sizeinbase (pp, 2);
 366
 367       mpz_set (pp_norm, pp);
 368       while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
 369         mpz_add (pp_norm, pp_norm, pp_norm);
 370
 371       mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
 372
 373       mpz_clear (new_pp);
 374     }
 375
 376   /* start the factor array */
 377   for (i = 0; i < nrawfactor; i++)
 378     {
 379       int  j;
 380       ASSERT (nfactor < factor_alloc);
 381       factor[nfactor].divisor = 1;
 382       for (j = 0; j < rawfactor[i].multiplicity; j++)
 383         factor[nfactor].divisor *= rawfactor[i].divisor;
 384       nfactor++;
 385     }
 386
 387  combine:
 388   /* Combine entries in the factor array.  Combine the smallest entry with
 389      the biggest one that will fit with it (ie. under max_divisor), then
 390      repeat that with the new smallest entry. */
 391   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
 392   for (i = nfactor-1; i >= 1; i--)
 393     {
 394       if (factor[i].divisor <= max_divisor / factor[0].divisor)
 395         {
 396           factor[0].divisor *= factor[i].divisor;
 397           COLLAPSE_ELEMENT (factor, i, nfactor);
 398           goto combine;
 399         }
 400     }
 401
 402   total_fraction = 1.0;
 403   for (i = 0; i < nfactor; i++)
 404     {
 405       mpz_init (factor[i].inverse);
 406       mpz_invert_ui_2exp (factor[i].inverse,
 407                           (unsigned long) factor[i].divisor,
 408                           (unsigned long) mod_bits);
 409
 410       mpz_init (factor[i].mask);
 411       square_mask (factor[i].mask, factor[i].divisor);
 412
 413       /* fraction of possible squares */
 414       factor[i].fraction = (double) mpz_popcount (factor[i].mask)
 415         / factor[i].divisor;
 416
 417       /* total fraction of possible squares */
 418       total_fraction *= factor[i].fraction;
 419     }
 420
 421   /* best tests first (ie. smallest fraction) */
 422   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
 423 }
 424
 425 void
 426 print (int limb_bits, int nail_bits)
 427 {
 428   int    i;
 429   mpz_t  mhi, mlo;
 430
 431   printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
 432   printf ("\n");
 433
 434   printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
 435           limb_bits, nail_bits);
 436   printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
 437           limb_bits, nail_bits);
 438   printf ("#endif\n");
 439   printf ("\n");
 440
 441   printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
 442   printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
 443           (1.0 - sq_res_0x100_fraction) * 100.0,
 444           0x100 - sq_res_0x100_num);
 445   printf ("static const mp_limb_t\n");
 446   printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
 447   for (i = 0; i < nsq_res_0x100; i++)
 448     {
 449       printf ("  CNST_LIMB(0x");
 450       mpz_out_str (stdout, 16, sq_res_0x100[i]);
 451       printf ("),\n");
 452     }
 453   printf ("};\n");
 454   printf ("\n");
 455
 456   if (mpz_sgn (pp) != 0)
 457     {
 458       printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
 459       printf ("/* PERFSQR_PP = ");
 460     }
 461   else
 462     printf ("/* 2^%d-1 = ", mod34_bits);
 463   for (i = 0; i < nrawfactor; i++)
 464     {
 465       if (i != 0)
 466         printf (" * ");
 467       printf ("%d", rawfactor[i].divisor);
 468       if (rawfactor[i].multiplicity != 1)
 469         printf ("^%d", rawfactor[i].multiplicity);
 470     }
 471   printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
 472
 473   printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
 474   if (mpz_sgn (pp) != 0)
 475     {
 476       printf ("#define PERFSQR_PP            CNST_LIMB(0x");
 477       mpz_out_str (stdout, 16, pp);
 478       printf (")\n");
 479       printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
 480       mpz_out_str (stdout, 16, pp_norm);
 481       printf (")\n");
 482       printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
 483       mpz_out_str (stdout, 16, pp_inverted);
 484       printf (")\n");
 485     }
 486   printf ("\n");
 487
 488   mpz_init (mhi);
 489   mpz_init (mlo);
 490
 491   printf ("/* This test identifies %.2f%% as non-squares. */\n",
 492           (1.0 - total_fraction) * 100.0);
 493   printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
 494   printf ("  do {                              \\\n");
 495   printf ("    mp_limb_t  r;                   \\\n");
 496   if (mpz_sgn (pp) != 0)
 497     printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
 498   else
 499     printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
 500
 501   for (i = 0; i < nfactor; i++)
 502     {
 503       printf ("                                    \\\n");
 504       printf ("    /* %5.2f%% */                    \\\n",
 505               (1.0 - factor[i].fraction) * 100.0);
 506
 507       printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
 508               factor[i].divisor <= limb_bits ? 1 : 2,
 509               factor[i].divisor);
 510       mpz_out_str (stdout, 16, factor[i].inverse);
 511       printf ("), \\\n");
 512       printf ("                   CNST_LIMB(0x");
 513
 514       if ( factor[i].divisor <= limb_bits)
 515         {
 516           mpz_out_str (stdout, 16, factor[i].mask);
 517         }
 518       else
 519         {
 520           mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
 521           mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
 522           mpz_out_str (stdout, 16, mhi);
 523           printf ("), CNST_LIMB(0x");
 524           mpz_out_str (stdout, 16, mlo);
 525         }
 526       printf (")); \\\n");
 527     }
 528
 529   printf ("  } while (0)\n");
 530   printf ("\n");
 531
 532   printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
 533           (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
 534   printf ("\n");
 535
 536   printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
 537   printf ("#define PERFSQR_DIVISORS  { 256,");
 538   for (i = 0; i < nfactor; i++)
 539       printf (" %d,", factor[i].divisor);
 540   printf (" }\n");
 541
 542
 543   mpz_clear (mhi);
 544   mpz_clear (mlo);
 545 }
 546
 547 int
 548 main (int argc, char *argv[])
 549 {
 550   int  limb_bits, nail_bits;
 551
 552   if (argc != 3)
 553     {
 554       fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
 555       exit (1);
 556     }
 557
 558   limb_bits = atoi (argv[1]);
 559   nail_bits = atoi (argv[2]);
 560
 561   if (limb_bits <= 0
 562       || nail_bits < 0
 563       || nail_bits >= limb_bits)
 564     {
 565       fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
 566                limb_bits, nail_bits);
 567       exit (1);
 568     }
 569
 570   generate_sq_res_0x100 (limb_bits);
 571   generate_mod (limb_bits, nail_bits);
 572
 573   print (limb_bits, nail_bits);
 574
 575   return 0;
 576 }