contrib/gmp/mpn/generic/mu_divappr_q.c

   1 /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
   2
   3    Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   4    normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   5    is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   6    to let N be unmodified during the operation.
   7
   8    Contributed to the GNU project by Torbjorn Granlund.
   9
  10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
  11    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
  12    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
  13
  14 Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
  15
  16 This file is part of the GNU MP Library.
  17
  18 The GNU MP Library is free software; you can redistribute it and/or modify
  19 it under the terms of the GNU Lesser General Public License as published by
  20 the Free Software Foundation; either version 3 of the License, or (at your
  21 option) any later version.
  22
  23 The GNU MP Library is distributed in the hope that it will be useful, but
  24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  26 License for more details.
  27
  28 You should have received a copy of the GNU Lesser General Public License
  29 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  30
  31
  32 /*
  33    The idea of the algorithm used herein is to compute a smaller inverted value
  34    than used in the standard Barrett algorithm, and thus save time in the
  35    Newton iterations, and pay just a small price when using the inverted value
  36    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  37 */
  38
  39 /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
  40
  41  Things to work on:
  42
  43   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
  44     demonstrated by the fact that the mpn_invertappr function's scratch needs
  45     mean that we need to keep a large allocation long after it is needed.
  46     Things are worse as mpn_mul_fft does not accept any scratch parameter,
  47     which means we'll have a large memory hole while in mpn_mul_fft.  In
  48     general, a peak scratch need in the beginning of a function isn't
  49     well-handled by the itch/scratch scheme.
  50 */
  51
  52 #ifdef STAT
  53 #undef STAT
  54 #define STAT(x) x
  55 #else
  56 #define STAT(x)
  57 #endif
  58
  59 #include <stdlib.h>             /* for NULL */
  60 #include "gmp.h"
  61 #include "gmp-impl.h"
  62
  63
  64 mp_limb_t
  65 mpn_mu_divappr_q (mp_ptr qp,
  66                   mp_srcptr np,
  67                   mp_size_t nn,
  68                   mp_srcptr dp,
  69                   mp_size_t dn,
  70                   mp_ptr scratch)
  71 {
  72   mp_size_t qn, in;
  73   mp_limb_t cy, qh;
  74   mp_ptr ip, tp;
  75
  76   ASSERT (dn > 1);
  77
  78   qn = nn - dn;
  79
  80   /* If Q is smaller than D, truncate operands. */
  81   if (qn + 1 < dn)
  82     {
  83       np += dn - (qn + 1);
  84       nn -= dn - (qn + 1);
  85       dp += dn - (qn + 1);
  86       dn = qn + 1;
  87     }
  88
  89   /* Compute the inverse size.  */
  90   in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  91   ASSERT (in <= dn);
  92
  93 #if 1
  94   /* This alternative inverse computation method gets slightly more accurate
  95      results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
  96      not adapted (3) mpn_invertappr scratch needs not met.  */
  97   ip = scratch;
  98   tp = scratch + in + 1;
  99
 100   /* compute an approximate inverse on (in+1) limbs */
 101   if (dn == in)
 102     {
 103       MPN_COPY (tp + 1, dp, in);
 104       tp[0] = 1;
 105       mpn_invertappr (ip, tp, in + 1, NULL);
 106       MPN_COPY_INCR (ip, ip + 1, in);
 107     }
 108   else
 109     {
 110       cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
 111       if (UNLIKELY (cy != 0))
 112         MPN_ZERO (ip, in);
 113       else
 114         {
 115           mpn_invertappr (ip, tp, in + 1, NULL);
 116           MPN_COPY_INCR (ip, ip + 1, in);
 117         }
 118     }
 119 #else
 120   /* This older inverse computation method gets slightly worse results than the
 121      one above.  */
 122   ip = scratch;
 123   tp = scratch + in;
 124
 125   /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
 126      inversion function should do this automatically.  */
 127   if (dn == in)
 128     {
 129       tp[in + 1] = 0;
 130       MPN_COPY (tp + in + 2, dp, in);
 131       mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
 132     }
 133   else
 134     {
 135       mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
 136     }
 137   cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
 138   if (UNLIKELY (cy != 0))
 139     MPN_ZERO (tp + 1, in);
 140   MPN_COPY (ip, tp + 1, in);
 141 #endif
 142
 143   qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
 144
 145   return qh;
 146 }
 147
 148 mp_limb_t
 149 mpn_preinv_mu_divappr_q (mp_ptr qp,
 150                          mp_srcptr np,
 151                          mp_size_t nn,
 152                          mp_srcptr dp,
 153                          mp_size_t dn,
 154                          mp_srcptr ip,
 155                          mp_size_t in,
 156                          mp_ptr scratch)
 157 {
 158   mp_size_t qn;
 159   mp_limb_t cy, cx, qh;
 160   mp_limb_t r;
 161   mp_size_t tn, wn;
 162
 163 #define rp           scratch
 164 #define tp           (scratch + dn)
 165 #define scratch_out  (scratch + dn + tn)
 166
 167   qn = nn - dn;
 168
 169   np += qn;
 170   qp += qn;
 171
 172   qh = mpn_cmp (np, dp, dn) >= 0;
 173   if (qh != 0)
 174     mpn_sub_n (rp, np, dp, dn);
 175   else
 176     MPN_COPY (rp, np, dn);
 177
 178   if (qn == 0)
 179     return qh;                  /* Degenerate use.  Should we allow this? */
 180
 181   while (qn > 0)
 182     {
 183       if (qn < in)
 184         {
 185           ip += in - qn;
 186           in = qn;
 187         }
 188       np -= in;
 189       qp -= in;
 190
 191       /* Compute the next block of quotient limbs by multiplying the inverse I
 192          by the upper part of the partial remainder R.  */
 193       mpn_mul_n (tp, rp + dn - in, ip, in);             /* mulhi  */
 194       cy = mpn_add_n (qp, tp + in, rp + dn - in, in);   /* I's msb implicit */
 195       ASSERT_ALWAYS (cy == 0);
 196
 197       qn -= in;
 198       if (qn == 0)
 199         break;
 200
 201       /* Compute the product of the quotient block and the divisor D, to be
 202          subtracted from the partial remainder combined with new limbs from the
 203          dividend N.  We only really need the low dn limbs.  */
 204
 205       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 206         mpn_mul (tp, dp, dn, qp, in);           /* dn+in limbs, high 'in' cancels */
 207       else
 208         {
 209           tn = mpn_mulmod_bnm1_next_size (dn + 1);
 210           mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 211           wn = dn + in - tn;                    /* number of wrapped limbs */
 212           if (wn > 0)
 213             {
 214               cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
 215               cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
 216               cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
 217               ASSERT_ALWAYS (cx >= cy);
 218               mpn_incr_u (tp, cx - cy);
 219             }
 220         }
 221
 222       r = rp[dn - in] - tp[dn];
 223
 224       /* Subtract the product from the partial remainder combined with new
 225          limbs from the dividend N, generating a new partial remainder R.  */
 226       if (dn != in)
 227         {
 228           cy = mpn_sub_n (tp, np, tp, in);      /* get next 'in' limbs from N */
 229           cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
 230           MPN_COPY (rp, tp, dn);                /* FIXME: try to avoid this */
 231         }
 232       else
 233         {
 234           cy = mpn_sub_n (rp, np, tp, in);      /* get next 'in' limbs from N */
 235         }
 236
 237       STAT (int i; int err = 0;
 238             static int errarr[5]; static int err_rec; static int tot);
 239
 240       /* Check the remainder R and adjust the quotient as needed.  */
 241       r -= cy;
 242       while (r != 0)
 243         {
 244           /* We loop 0 times with about 69% probability, 1 time with about 31%
 245              probability, 2 times with about 0.6% probability, if inverse is
 246              computed as recommended.  */
 247           mpn_incr_u (qp, 1);
 248           cy = mpn_sub_n (rp, rp, dp, dn);
 249           r -= cy;
 250           STAT (err++);
 251         }
 252       if (mpn_cmp (rp, dp, dn) >= 0)
 253         {
 254           /* This is executed with about 76% probability.  */
 255           mpn_incr_u (qp, 1);
 256           cy = mpn_sub_n (rp, rp, dp, dn);
 257           STAT (err++);
 258         }
 259
 260       STAT (
 261             tot++;
 262             errarr[err]++;
 263             if (err > err_rec)
 264               err_rec = err;
 265             if (tot % 0x10000 == 0)
 266               {
 267                 for (i = 0; i <= err_rec; i++)
 268                   printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
 269                 printf ("\n");
 270               }
 271             );
 272     }
 273
 274   /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
 275      quotient.  For now, just make sure the returned quotient is >= the real
 276      quotient; add 3 with saturating arithmetic.  */
 277   qn = nn - dn;
 278   cy += mpn_add_1 (qp, qp, qn, 3);
 279   if (cy != 0)
 280     {
 281       if (qh != 0)
 282         {
 283           /* Return a quotient of just 1-bits, with qh set.  */
 284           mp_size_t i;
 285           for (i = 0; i < qn; i++)
 286             qp[i] = GMP_NUMB_MAX;
 287         }
 288       else
 289         {
 290           /* Propagate carry into qh.  */
 291           qh = 1;
 292         }
 293     }
 294
 295   return qh;
 296 }
 297
 298 /* In case k=0 (automatic choice), we distinguish 3 cases:
 299    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
 300    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
 301    (c) qn < dn/3:       in = qn
 302    In all cases we have in <= dn.
 303  */
 304 mp_size_t
 305 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
 306 {
 307   mp_size_t in;
 308
 309   if (k == 0)
 310     {
 311       mp_size_t b;
 312       if (qn > dn)
 313         {
 314           /* Compute an inverse size that is a nice partition of the quotient.  */
 315           b = (qn - 1) / dn + 1;        /* ceil(qn/dn), number of blocks */
 316           in = (qn - 1) / b + 1;        /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 317         }
 318       else if (3 * qn > dn)
 319         {
 320           in = (qn - 1) / 2 + 1;        /* b = 2 */
 321         }
 322       else
 323         {
 324           in = (qn - 1) / 1 + 1;        /* b = 1 */
 325         }
 326     }
 327   else
 328     {
 329       mp_size_t xn;
 330       xn = MIN (dn, qn);
 331       in = (xn - 1) / k + 1;
 332     }
 333
 334   return in;
 335 }
 336
 337 mp_size_t
 338 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
 339 {
 340   mp_size_t qn, in, itch_local, itch_out;
 341
 342   qn = nn - dn;
 343   if (qn + 1 < dn)
 344     {
 345       dn = qn + 1;
 346     }
 347   in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
 348
 349   itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
 350   itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
 351   return in + dn + itch_local + itch_out;
 352 }