/* Mulders' MulHigh function (short product)

Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
Contributed by the Arenaire and Caramel projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

/* References:
   [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
       Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
       July 25-27, 2011, pages 7-14.
*/

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

#ifndef MUL_FFT_THRESHOLD
#define MUL_FFT_THRESHOLD 8448
#endif

/* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
#ifdef MPFR_MULHIGH_TAB_SIZE
static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
#else
static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
#define MPFR_MULHIGH_TAB_SIZE \
  ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
#endif

/* Put in  rp[n..2n-1] an approximation of the n high limbs
   of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
   approximation is always less or equal to the truncated full product).
   Assume 2n limbs are allocated at rp.

   Implements Algorithm ShortMulNaive from [1].
*/
static void
mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
                         mpfr_limb_srcptr vp, mp_size_t n)
{
  mp_size_t i;

  rp += n - 1;
  umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
                                               which is less than B^n */
  for (i = 1 ; i < n ; i++)
    /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
    rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
  /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
}

/* Put in  rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
   Assume 2n limbs are allocated at rp. */
static void
mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
                        mpfr_limb_srcptr vp, mp_size_t n)
{
  mp_size_t i;

  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
  for (i = 1 ; i < n ; i++)
    mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
}

/* Put in  rp[n..2n-1] an approximation of the n high limbs
   of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
   approximation is always less or equal to the truncated full product).

   Implements Algorithm ShortMul from [1].
*/
void
mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
                mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
  /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
     into k >= (n+4)/2 in the C language. */
  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
  if (k < 0)
    mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
  else if (k == 0)
    mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
  else if (n > MUL_FFT_THRESHOLD)
    mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
  else
    {
      mp_size_t l = n - k;
      mp_limb_t cy;

      mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
      mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
      cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
    }
}

/* Put in  rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
   Assume 2n limbs are allocated at rp. */
void
mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
               mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
  MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
  if (k < 0)
    mpn_mul_basecase (rp, np, n, mp, n);
  else if (k == 0)
    mpfr_mullow_n_basecase (rp, np, mp, n);
  else if (n > MUL_FFT_THRESHOLD)
    mpn_mul_n (rp, np, mp, n);
  else
    {
      mp_size_t l = n - k;

      mpn_mul_n (rp, np, mp, k);                      /* fills rp[0..2k] */
      mpfr_mullow_n (rp + n, np + k, mp, l);          /* fills rp[n..n+2l] */
      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
      mpfr_mullow_n (rp + n, np, mp + k, l);          /* fills rp[n..n+2l] */
      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
    }
}

#ifdef MPFR_SQRHIGH_TAB_SIZE
static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
#else
static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
#define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
#endif

/* Put in  rp[n..2n-1] an approximation of the n high limbs
   of {np, n}^2. The error is less than n ulps of rp[n]. */
void
mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
  k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
    : (n+4)/2; /* ensures that k >= (n+3)/2 */
  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
  if (k < 0)
    /* we can't use mpn_sqr_basecase here, since it requires
       n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
       is not exported by GMP */
    mpn_sqr_n (rp, np, n);
  else if (k == 0)
    mpfr_mulhigh_n_basecase (rp, np, np, n);
  else
    {
      mp_size_t l = n - k;
      mp_limb_t cy;

      mpn_sqr_n (rp + 2 * l, np + l, k);          /* fills rp[2l..2n-1] */
      mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
      /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
      cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
    }
}

#ifdef MPFR_DIVHIGH_TAB_SIZE
static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
#else
static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
#define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
#endif

#ifndef __GMPFR_GMP_H__
#define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
#endif

#if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
/* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
   with the most significant limb of the quotient as return value (0 or 1).
   Assumes the most significant bit of D is set. Clobbers N.

   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
*/
static mp_limb_t
mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
                         mpfr_limb_srcptr dp, mp_size_t n)
{
  mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
  mpfr_pi1_t dinv2;

  np += n;

  if ((qh = (mpn_cmp (np, dp, n) >= 0)))
    mpn_sub_n (np, np, dp, n);

  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */

  d1 = dp[n - 1];

  if (n == 1)
    {
      invert_limb (dinv, d1);
      umul_ppmm (q1, q0, np[0], dinv);
      qp[0] = np[0] + q1;
      return qh;
    }

  /* now n >= 2 */
  d0 = dp[n - 2];
  invert_pi1 (dinv2, d1, d0);
  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
  while (n > 1)
    {
      /* Invariant: it remains to reduce n limbs from N (in addition to the
         initial low n limbs).
         Since n >= 2 here, necessarily we had n >= 2 initially, which means
         that in addition to the limb np[n-1] to reduce, we have at least 2
         extra limbs, thus accessing np[n-3] is valid. */

      /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
         the largest possible partial quotient is B-1 */
      if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
        q2 = ~ (mp_limb_t) 0;
      else
        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
                      d1, d0, dinv2.inv32);
      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
         which proves that at most one correction is needed */
      q0 = mpn_submul_1 (np - 1, dp, n, q2);
      if (MPFR_UNLIKELY(q0 > np[n - 1]))
        {
          mpn_add_n (np - 1, np - 1, dp, n);
          q2 --;
        }
      qp[--n] = q2;
      dp ++;
    }

  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
        <= floor((np[0]*B+np[1])/d1)
     thus q1 is not larger than the true quotient.
     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.

     For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
     np[0]*B/d1 - 2.

     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
     q - 4 <= q1 <= q
  */
  umul_ppmm (q1, q0, np[0], dinv2.inv32);
  qp[0] = np[0] + q1;

  return qh;
}
#endif

/* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
   with the most significant limb of the quotient as return value (0 or 1).
   Assumes the most significant bit of D is set. Clobbers N.

   This implements the ShortDiv algorithm from reference [1].
*/
#if 1
mp_limb_t
mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
                mp_size_t n)
{
  mp_size_t k, l;
  mp_limb_t qh, cy;
  mpfr_limb_ptr tp;
  MPFR_TMP_DECL(marker);

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
  k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);

  if (k == 0)
#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
  {
    mpfr_pi1_t dinv2;
    invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
    return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
  }
#else /* use our own code for base-case short division */
    return mpfr_divhigh_n_basecase (qp, np, dp, n);
#endif
  else if (k == n)
    /* for k=n, we use a division with remainder (mpn_divrem),
     which computes the exact quotient */
    return mpn_divrem (qp, 0, np, 2 * n, dp, n);

  MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
  MPFR_TMP_MARK (marker);
  l = n - k;
  /* first divide the most significant 2k limbs from N by the most significant
     k limbs of D */
  qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */

  /* it remains {np,2l+k} = {np,n+l} as remainder */

  /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
     D0={dp,l} */
  tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
  mpfr_mulhigh_n (tp, qp + k, dp, l);
  /* we are only interested in the upper l limbs from {tp,2l} */
  cy = mpn_sub_n (np + n, np + n, tp + l, l);
  if (qh)
    cy += mpn_sub_n (np + n, np + n, dp, l);
  while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
    {
      qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
      cy -= mpn_add_n (np + l, np + l, dp, n);
    }

  /* now it remains {np,n+l} to divide by D */
  cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
  qh += mpn_add_1 (qp + l, qp + l, k, cy);
  MPFR_TMP_FREE(marker);

  return qh;
}
#else /* below is the FoldDiv(K) algorithm from [1] */
mp_limb_t
mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
                mp_size_t n)
{
  mp_size_t k, r;
  mpfr_limb_ptr ip, tp, up;
  mp_limb_t qh = 0, cy, cc;
  int count;
  MPFR_TMP_DECL(marker);

#define K 3
  if (n < K)
    return mpn_divrem (qp, 0, np, 2 * n, dp, n);

  k = (n - 1) / K + 1; /* ceil(n/K) */

  MPFR_TMP_MARK (marker);
  ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
  tp = MPFR_TMP_LIMBS_ALLOC (n + k);
  up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
  mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
  /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
  for (r = n, cc = 0UL; r > 0;)
    {
      /* cc is the carry at np[n+r] */
      MPFR_ASSERTD(cc <= 1);
      /* FIXME: why can we have cc as large as say 8? */
      count = 0;
      while (cc > 0)
        {
          count ++;
          MPFR_ASSERTD(count <= 1);
          /* subtract {dp+n-r,r} from {np+n,r} */
          cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
          /* add 1 at qp[r] */
          qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
        }
      /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
      if (r < k)
        {
          ip += k - r;
          k = r;
        }
      /* now r >= k */
      /* qp + r - 2 * k -> up */
      mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
      /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
      cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
      /* since we need only r limbs of tp (below), it suffices to consider
         r high limbs of dp */
      if (r > k)
        {
#if 0
          mpn_mul (tp, dp + n - r, r, qp + r - k, k);
#else  /* use a short product for the low k x k limbs */
          /* we know the upper k limbs of the r-limb product cancel with the
             remainder, thus we only need to compute the low r-k limbs */
          if (r - k >= k)
            mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
          else /* r-k < k */
            {
/* #define LOW */
#ifndef LOW
              mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
#else
              mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
              /* take into account qp[2r-2k] * dp[n - r + k] */
              tp[r] += qp[2*r-2*k] * dp[n - r + k];
#endif
              /* tp[k..r] is filled */
            }
#if 0
          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
#else /* compute one more limb. FIXME: we could add one limb of dp in the
         above, to save one mpn_addmul_1 call */
          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
          /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
          up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
          /* add {dp+n-r, k} * qp[r-1] */
          up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
#endif
#ifndef LOW
          cc = mpn_add_n (tp + k, tp + k, up + k, k);
          mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
#else
          /* update tp[k..r] */
          if (r - k + 1 <= k)
            mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
          else /* r - k >= k */
            {
              cc = mpn_add_n (tp + k, tp + k, up + k, k);
              mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
            }
#endif
#endif
        }
      else /* last step: since we only want the quotient, no need to update,
              just propagate the carry cy */
        {
          MPFR_ASSERTD(r < n);
          if (cy > 0)
            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
          break;
        }
      /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
         update {np+n, n} */
      /* we should have tp[r] = np[n+r-k] up to 1 */
      MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
#ifndef LOW
      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
#else
      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
#endif
      /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
         {dp+n-r,r} from {np+n,r} */
      if (cy)
        {
          if (r < n)
            cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
          else
            cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
          /* propagate cy */
          if (r == n)
            qh = cy;
          else
            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
        }
      /* cc is the borrow at np[n+r] */
      count = 0;
      while (cc > 0) /* quotient was too large */
        {
          count++;
          MPFR_ASSERTD (count <= 1);
          cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
          cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
          qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
        }
      r -= k;
      cc = np[n + r];
    }
  MPFR_TMP_FREE(marker);

  return qh;
}
#endif