contrib/gmp/mpn/generic/toom_interpolate_6pts.c

   1 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
   2
   3    Contributed to the GNU project by Marco Bodrato.
   4
   5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   8
   9 Copyright 2009, 2010 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU Lesser General Public License as published by
  15 the Free Software Foundation; either version 3 of the License, or (at your
  16 option) any later version.
  17
  18 The GNU MP Library is distributed in the hope that it will be useful, but
  19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  21 License for more details.
  22
  23 You should have received a copy of the GNU Lesser General Public License
  24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  25
  26 #include "gmp.h"
  27 #include "gmp-impl.h"
  28
  29 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
  30 #ifndef mpn_divexact_by3
  31 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
  32 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
  33 #else
  34 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
  35 #endif
  36 #endif
  37
  38 /* Interpolation for Toom-3.5, using the evaluation points: infinity,
  39    1, -1, 2, -2. More precisely, we want to compute
  40    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
  41    six values
  42
  43      w5 = f(0),
  44      w4 = f(-1),
  45      w3 = f(1)
  46      w2 = f(-2),
  47      w1 = f(2),
  48      w0 = limit at infinity of f(x) / x^5,
  49
  50    The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
  51    {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
  52    {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
  53    significant limbs small). f(-1) and f(-2) may be negative, signs
  54    determined by the flag bits. All intermediate results are positive.
  55    Inputs are destroyed.
  56
  57    Interpolation sequence was taken from the paper: "Integer and
  58    Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
  59    Some slight variations were introduced: adaptation to "gmp
  60    instruction set", and a final saving of an operation by interlacing
  61    interpolation and recomposition phases.
  62 */
  63
  64 void
  65 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
  66                            mp_ptr w4, mp_ptr w2, mp_ptr w1,
  67                            mp_size_t w0n)
  68 {
  69   mp_limb_t cy;
  70   /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  71   mp_limb_t cy4, cy6, embankment;
  72
  73   ASSERT( n > 0 );
  74   ASSERT( 2*n >= w0n && w0n > 0 );
  75
  76 #define w5  pp                                  /* 2n   */
  77 #define w3  (pp + 2 * n)                        /* 2n+1 */
  78 #define w0  (pp + 5 * n)                        /* w0n  */
  79
  80   /* Interpolate with sequence:
  81      W2 =(W1 - W2)>>2
  82      W1 =(W1 - W5)>>1
  83      W1 =(W1 - W2)>>1
  84      W4 =(W3 - W4)>>1
  85      W2 =(W2 - W4)/3
  86      W3 = W3 - W4 - W5
  87      W1 =(W1 - W3)/3
  88      // Last steps are mixed with recomposition...
  89      W2 = W2 - W0<<2
  90      W4 = W4 - W2
  91      W3 = W3 - W1
  92      W2 = W2 - W0
  93   */
  94
  95   /* W2 =(W1 - W2)>>2 */
  96   if (flags & toom6_vm2_neg)
  97     mpn_add_n (w2, w1, w2, 2 * n + 1);
  98   else
  99     mpn_sub_n (w2, w1, w2, 2 * n + 1);
 100   mpn_rshift (w2, w2, 2 * n + 1, 2);
 101
 102   /* W1 =(W1 - W5)>>1 */
 103   w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
 104   mpn_rshift (w1, w1, 2 * n + 1, 1);
 105
 106   /* W1 =(W1 - W2)>>1 */
 107 #if HAVE_NATIVE_mpn_rsh1sub_n
 108   mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
 109 #else
 110   mpn_sub_n (w1, w1, w2, 2 * n + 1);
 111   mpn_rshift (w1, w1, 2 * n + 1, 1);
 112 #endif
 113
 114   /* W4 =(W3 - W4)>>1 */
 115   if (flags & toom6_vm1_neg)
 116     {
 117 #if HAVE_NATIVE_mpn_rsh1add_n
 118       mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
 119 #else
 120       mpn_add_n (w4, w3, w4, 2 * n + 1);
 121       mpn_rshift (w4, w4, 2 * n + 1, 1);
 122 #endif
 123     }
 124   else
 125     {
 126 #if HAVE_NATIVE_mpn_rsh1sub_n
 127       mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
 128 #else
 129       mpn_sub_n (w4, w3, w4, 2 * n + 1);
 130       mpn_rshift (w4, w4, 2 * n + 1, 1);
 131 #endif
 132     }
 133
 134   /* W2 =(W2 - W4)/3 */
 135   mpn_sub_n (w2, w2, w4, 2 * n + 1);
 136   mpn_divexact_by3 (w2, w2, 2 * n + 1);
 137
 138   /* W3 = W3 - W4 - W5 */
 139   mpn_sub_n (w3, w3, w4, 2 * n + 1);
 140   w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
 141
 142   /* W1 =(W1 - W3)/3 */
 143   mpn_sub_n (w1, w1, w3, 2 * n + 1);
 144   mpn_divexact_by3 (w1, w1, 2 * n + 1);
 145
 146   /*
 147     [1 0 0 0 0 0;
 148      0 1 0 0 0 0;
 149      1 0 1 0 0 0;
 150      0 1 0 1 0 0;
 151      1 0 1 0 1 0;
 152      0 0 0 0 0 1]
 153
 154     pp[] prior to operations:
 155      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
 156
 157     summation scheme for remaining operations:
 158      |______________5|n_____4|n_____3|n_____2|n______|n______|pp
 159      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
 160                                     || H w4  | L w4  |
 161                     || H w2  | L w2  |
 162             || H w1  | L w1  |
 163                             ||-H w1  |-L w1  |
 164                      |-H w0  |-L w0 ||-H w2  |-L w2  |
 165   */
 166   cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
 167   MPN_INCR_U (pp + 3 * n + 1, n, cy);
 168
 169   /* W2 -= W0<<2 */
 170 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
 171 #if HAVE_NATIVE_mpn_sublsh2_n
 172   cy = mpn_sublsh2_n(w2, w2, w0, w0n);
 173 #else
 174   cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
 175 #endif
 176 #else
 177   /* {W4,2*n+1} is now free and can be overwritten. */
 178   cy = mpn_lshift(w4, w0, w0n, 2);
 179   cy+= mpn_sub_n(w2, w2, w4, w0n);
 180 #endif
 181   MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
 182
 183   /* W4L = W4L - W2L */
 184   cy = mpn_sub_n (pp + n, pp + n, w2, n);
 185   MPN_DECR_U (w3, 2 * n + 1, cy);
 186
 187   /* W3H = W3H + W2L */
 188   cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
 189   /* W1L + W2H */
 190   cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
 191   MPN_INCR_U (w1 + n, n + 1, cy);
 192
 193   /* W0 = W0 + W1H */
 194   if (LIKELY (w0n > n))
 195     cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
 196   else
 197     cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
 198
 199   /*
 200     summation scheme for the next operation:
 201      |...____5|n_____4|n_____3|n_____2|n______|n______|pp
 202      |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
 203                      ...-w0___|-w1_w2 |
 204   */
 205   /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
 206   cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
 207
 208   /* embankment is a "dirty trick" to avoid carry/borrow propagation
 209      beyond allocated memory */
 210   embankment = w0[w0n - 1] - 1;
 211   w0[w0n - 1] = 1;
 212   if (LIKELY (w0n > n)) {
 213     if ( LIKELY(cy4 > cy6) )
 214       MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
 215     else
 216       MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
 217     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
 218     MPN_INCR_U (w0 + n, w0n - n, cy6);
 219   } else {
 220     MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
 221     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
 222   }
 223   w0[w0n - 1] += embankment;
 224
 225 #undef w5
 226 #undef w3
 227 #undef w0
 228
 229 }