contrib/gmp/mpn/generic/mu_div_q.c

   1 /* mpn_mu_div_q.
   2
   3    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
   4
   5    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   6    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
   8
   9 Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU Lesser General Public License as published by
  15 the Free Software Foundation; either version 3 of the License, or (at your
  16 option) any later version.
  17
  18 The GNU MP Library is distributed in the hope that it will be useful, but
  19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  21 License for more details.
  22
  23 You should have received a copy of the GNU Lesser General Public License
  24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  25
  26
  27 /*
  28    The idea of the algorithm used herein is to compute a smaller inverted value
  29    than used in the standard Barrett algorithm, and thus save time in the
  30    Newton iterations, and pay just a small price when using the inverted value
  31    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  32 */
  33
  34 /*
  35   Things to work on:
  36
  37   1. This is a rudimentary implementation of mpn_mu_div_q.  The algorithm is
  38      probably close to optimal, except when mpn_mu_divappr_q fails.
  39
  40      An alternative which could be considered for much simpler code for the
  41      complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
  42      simply call mpn_mu_divappr_q.  Such a temporary allocation is
  43      unfortunately very large.
  44
  45   2. We used to fall back to mpn_mu_div_qr when we detect a possible
  46      mpn_mu_divappr_q rounding problem, now we multiply and compare.
  47      Unfortunately, since mpn_mu_divappr_q does not return the partial
  48      remainder, this also doesn't become optimal.  A mpn_mu_divappr_qr could
  49      solve that.
  50
  51   3. The allocations done here should be made from the scratch area, which
  52      then would need to be amended.
  53 */
  54
  55 #include <stdlib.h>             /* for NULL */
  56 #include "gmp.h"
  57 #include "gmp-impl.h"
  58
  59
  60 mp_limb_t
  61 mpn_mu_div_q (mp_ptr qp,
  62               mp_srcptr np, mp_size_t nn,
  63               mp_srcptr dp, mp_size_t dn,
  64               mp_ptr scratch)
  65 {
  66   mp_ptr tp, rp, ip, this_ip;
  67   mp_size_t qn, in, this_in;
  68   mp_limb_t cy, qh;
  69   TMP_DECL;
  70
  71   TMP_MARK;
  72
  73   qn = nn - dn;
  74
  75   tp = TMP_BALLOC_LIMBS (qn + 1);
  76
  77   if (qn >= dn)                 /* nn >= 2*dn + 1 */
  78     {
  79       /* Find max inverse size needed by the two preinv calls.  FIXME: This is
  80          not optimal, it underestimates the invariance.  */
  81       if (dn != qn)
  82         {
  83           mp_size_t in1, in2;
  84
  85           in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
  86           in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
  87           in = MAX (in1, in2);
  88         }
  89       else
  90         {
  91           in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
  92         }
  93
  94       ip = TMP_BALLOC_LIMBS (in + 1);
  95
  96       if (dn == in)
  97         {
  98           MPN_COPY (scratch + 1, dp, in);
  99           scratch[0] = 1;
 100           mpn_invertappr (ip, scratch, in + 1, NULL);
 101           MPN_COPY_INCR (ip, ip + 1, in);
 102         }
 103       else
 104         {
 105           cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
 106           if (UNLIKELY (cy != 0))
 107             MPN_ZERO (ip, in);
 108           else
 109             {
 110               mpn_invertappr (ip, scratch, in + 1, NULL);
 111               MPN_COPY_INCR (ip, ip + 1, in);
 112             }
 113         }
 114
 115        /* |_______________________|   dividend
 116                          |________|   divisor  */
 117       rp = TMP_BALLOC_LIMBS (2 * dn + 1);
 118
 119       this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
 120       this_ip = ip + in - this_in;
 121       qh = mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
 122                                  this_ip, this_in, scratch);
 123
 124       MPN_COPY (rp + 1, np, dn);
 125       rp[0] = 0;
 126       this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
 127       this_ip = ip + in - this_in;
 128       cy = mpn_preinv_mu_divappr_q (tp, rp, 2 * dn + 1, dp, dn,
 129                                     this_ip, this_in, scratch);
 130
 131       if (UNLIKELY (cy != 0))
 132         {
 133           /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
 134              canonically reduced, replace the returned value of B^(qn-dn)+eps
 135              by the largest possible value.  */
 136           mp_size_t i;
 137           for (i = 0; i < dn + 1; i++)
 138             tp[i] = GMP_NUMB_MAX;
 139         }
 140
 141       /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
 142          greater than the max error, we cannot trust the quotient.  */
 143       if (tp[0] > 4)
 144         {
 145           MPN_COPY (qp, tp + 1, qn);
 146         }
 147       else
 148         {
 149           mp_limb_t cy;
 150           mp_ptr pp;
 151
 152           /* FIXME: can we use already allocated space? */
 153           pp = TMP_BALLOC_LIMBS (nn);
 154           mpn_mul (pp, tp + 1, qn, dp, dn);
 155
 156           cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
 157
 158           if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
 159             qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
 160           else /* Same as above */
 161             MPN_COPY (qp, tp + 1, qn);
 162         }
 163     }
 164   else
 165     {
 166        /* |_______________________|   dividend
 167                  |________________|   divisor  */
 168
 169       /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
 170          here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
 171          the most significant dn-1 limbs will actually be read, but it is not
 172          pretty.  */
 173
 174       qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
 175                              dp + dn - (qn + 1), qn + 1, scratch);
 176
 177       /* The max error of mpn_mu_divappr_q is +4, but we get an additional
 178          error from the divisor truncation.  */
 179       if (tp[0] > 6)
 180         {
 181           MPN_COPY (qp, tp + 1, qn);
 182         }
 183       else
 184         {
 185           mp_limb_t cy;
 186
 187           /* FIXME: a shorter product should be enough; we may use already
 188              allocated space... */
 189           rp = TMP_BALLOC_LIMBS (nn);
 190           mpn_mul (rp, dp, dn, tp + 1, qn);
 191
 192           cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
 193
 194           if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
 195             qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
 196           else /* Same as above */
 197             MPN_COPY (qp, tp + 1, qn);
 198         }
 199     }
 200
 201   TMP_FREE;
 202   return qh;
 203 }
 204
 205 mp_size_t
 206 mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
 207 {
 208   mp_size_t qn, itch1, itch2;
 209
 210   qn = nn - dn;
 211   if (qn >= dn)
 212     {
 213       itch1 = mpn_mu_div_qr_itch (qn, dn, mua_k);
 214       itch2 = mpn_mu_divappr_q_itch (2 * dn + 1, dn, mua_k);
 215       return MAX (itch1, itch2);
 216     }
 217   else
 218     {
 219       itch1 = mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
 220       return itch1;
 221     }
 222 }