contrib/gmp/mpn/generic/toom_interpolate_16pts.c

   1 /* Interpolaton for the algorithm Toom-Cook 8.5-way.
   2
   3    Contributed to the GNU project by Marco Bodrato.
   4
   5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   8
   9 Copyright 2009, 2010 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU Lesser General Public License as published by
  15 the Free Software Foundation; either version 3 of the License, or (at your
  16 option) any later version.
  17
  18 The GNU MP Library is distributed in the hope that it will be useful, but
  19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  21 License for more details.
  22
  23 You should have received a copy of the GNU Lesser General Public License
  24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  25
  26
  27 #include "gmp.h"
  28 #include "gmp-impl.h"
  29
  30 #if GMP_NUMB_BITS < 29
  31 #error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
  32 #endif
  33
  34 #if GMP_NUMB_BITS < 28
  35 #error Not implemented: divexact_by188513325 and _by182712915 will not work.
  36 #endif
  37
  38
  39 #if HAVE_NATIVE_mpn_sublsh_n
  40 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
  41 #else
  42 static mp_limb_t
  43 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
  44 {
  45 #if USE_MUL_1 && 0
  46   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
  47 #else
  48   mp_limb_t __cy;
  49   __cy = mpn_lshift(ws,src,n,s);
  50   return    __cy + mpn_sub_n(dst,dst,ws,n);
  51 #endif
  52 }
  53 #endif
  54
  55 #if HAVE_NATIVE_mpn_addlsh_n
  56 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
  57 #else
  58 static mp_limb_t
  59 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
  60 {
  61 #if USE_MUL_1 && 0
  62   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
  63 #else
  64   mp_limb_t __cy;
  65   __cy = mpn_lshift(ws,src,n,s);
  66   return    __cy + mpn_add_n(dst,dst,ws,n);
  67 #endif
  68 }
  69 #endif
  70
  71 #if HAVE_NATIVE_mpn_subrsh
  72 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
  73 #else
  74 /* FIXME: This is not a correct definition, it assumes no carry */
  75 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)                               \
  76 do {                                                                    \
  77   mp_limb_t __cy;                                                       \
  78   MPN_DECR_U (dst, nd, src[0] >> s);                                    \
  79   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
  80   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);                         \
  81 } while (0)
  82 #endif
  83
  84
  85 /* FIXME: tuneup should decide the best variant */
  86 #ifndef AORSMUL_FASTER_AORS_AORSLSH
  87 #define AORSMUL_FASTER_AORS_AORSLSH 1
  88 #endif
  89 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
  90 #define AORSMUL_FASTER_AORS_2AORSLSH 1
  91 #endif
  92 #ifndef AORSMUL_FASTER_2AORSLSH
  93 #define AORSMUL_FASTER_2AORSLSH 1
  94 #endif
  95 #ifndef AORSMUL_FASTER_3AORSLSH
  96 #define AORSMUL_FASTER_3AORSLSH 1
  97 #endif
  98
  99 #if GMP_NUMB_BITS < 43
 100 #define BIT_CORRECTION 1
 101 #define CORRECTION_BITS GMP_NUMB_BITS
 102 #else
 103 #define BIT_CORRECTION 0
 104 #define CORRECTION_BITS 0
 105 #endif
 106
 107 #define BINVERT_9 \
 108   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
 109
 110 #define BINVERT_255 \
 111   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
 112
 113   /* FIXME: find some more general expressions for inverses */
 114 #if GMP_LIMB_BITS == 32
 115 #define BINVERT_2835  (GMP_NUMB_MASK &          CNST_LIMB(0x53E3771B))
 116 #define BINVERT_42525 (GMP_NUMB_MASK &          CNST_LIMB(0x9F314C35))
 117 #define BINVERT_182712915 (GMP_NUMB_MASK &      CNST_LIMB(0x550659DB))
 118 #define BINVERT_188513325 (GMP_NUMB_MASK &      CNST_LIMB(0xFBC333A5))
 119 #define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
 120 #define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
 121 #if GMP_NAIL_BITS == 0
 122 #define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
 123 #define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
 124 #else /* GMP_NAIL_BITS != 0 */
 125 #define BINVERT_255x182712915H \
 126   (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
 127 #define BINVERT_255x188513325H \
 128   (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
 129 #endif
 130 #else
 131 #if GMP_LIMB_BITS == 64
 132 #define BINVERT_2835  (GMP_NUMB_MASK &  CNST_LIMB(0x938CC70553E3771B))
 133 #define BINVERT_42525 (GMP_NUMB_MASK &  CNST_LIMB(0xE7B40D449F314C35))
 134 #define BINVERT_255x182712915  (GMP_NUMB_MASK & CNST_LIMB(0x1B649A076FC4CB25))
 135 #define BINVERT_255x188513325  (GMP_NUMB_MASK & CNST_LIMB(0x06DB993A6864275B))
 136 #endif
 137 #endif
 138
 139 #ifndef mpn_divexact_by255
 140 #if GMP_NUMB_BITS % 8 == 0
 141 #define mpn_divexact_by255(dst,src,size) \
 142   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
 143 #else
 144 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 145 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
 146 #else
 147 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
 148 #endif
 149 #endif
 150 #endif
 151
 152 #ifndef mpn_divexact_by255x4
 153 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 154 #define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
 155 #else
 156 #define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
 157 #endif
 158 #endif
 159
 160 #ifndef mpn_divexact_by9x16
 161 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 162 #define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
 163 #else
 164 #define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
 165 #endif
 166 #endif
 167
 168 #ifndef mpn_divexact_by42525x16
 169 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
 170 #define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
 171 #else
 172 #define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
 173 #endif
 174 #endif
 175
 176 #ifndef mpn_divexact_by2835x64
 177 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
 178 #define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
 179 #else
 180 #define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
 181 #endif
 182 #endif
 183
 184 #ifndef  mpn_divexact_by255x182712915
 185 #if GMP_NUMB_BITS < 36
 186 #if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
 187 /* FIXME: use mpn_bdiv_q_2_pi2 */
 188 #endif
 189 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
 190 #define mpn_divexact_by255x182712915(dst,src,size)                              \
 191   do {                                                                          \
 192     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);    \
 193     mpn_divexact_by255(dst,dst,size);                                           \
 194   } while(0)
 195 #else
 196 #define mpn_divexact_by255x182712915(dst,src,size)      \
 197   do {                                                  \
 198     mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));  \
 199     mpn_divexact_by255(dst,dst,size);                   \
 200   } while(0)
 201 #endif
 202 #else /* GMP_NUMB_BITS > 35 */
 203 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
 204 #define mpn_divexact_by255x182712915(dst,src,size) \
 205   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
 206 #else
 207 #define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
 208 #endif
 209 #endif /* GMP_NUMB_BITS >?< 36 */
 210 #endif
 211
 212 #ifndef  mpn_divexact_by255x188513325
 213 #if GMP_NUMB_BITS < 36
 214 #if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
 215 /* FIXME: use mpn_bdiv_q_1_pi2 */
 216 #endif
 217 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
 218 #define mpn_divexact_by255x188513325(dst,src,size)                      \
 219   do {                                                                  \
 220     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);    \
 221     mpn_divexact_by255(dst,dst,size);                                   \
 222   } while(0)
 223 #else
 224 #define mpn_divexact_by255x188513325(dst,src,size)      \
 225   do {                                                  \
 226     mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));  \
 227     mpn_divexact_by255(dst,dst,size);                   \
 228   } while(0)
 229 #endif
 230 #else /* GMP_NUMB_BITS > 35 */
 231 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
 232 #define mpn_divexact_by255x188513325(dst,src,size) \
 233   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
 234 #else
 235 #define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
 236 #endif
 237 #endif /* GMP_NUMB_BITS >?< 36 */
 238 #endif
 239
 240 /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
 241    points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
 242    +-1/8, 0. More precisely, we want to compute
 243    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
 244    14), given the 16 (rsp. 15) values:
 245
 246      r0 = limit at infinity of f(x) / x^7,
 247      r1 = f(8),f(-8),
 248      r2 = f(4),f(-4),
 249      r3 = f(2),f(-2),
 250      r4 = f(1),f(-1),
 251      r5 = f(1/4),f(-1/4),
 252      r6 = f(1/2),f(-1/2),
 253      r7 = f(1/8),f(-1/8),
 254      r8 = f(0).
 255
 256    All couples of the form f(n),f(-n) must be already mixed with
 257    toom_couple_handling(f(n),...,f(-n),...)
 258
 259    The result is stored in {pp, spt + 7*n (or 8*n)}.
 260    At entry, r8 is stored at {pp, 2n},
 261    r6 is stored at {pp + 3n, 3n + 1}.
 262    r4 is stored at {pp + 7n, 3n + 1}.
 263    r2 is stored at {pp +11n, 3n + 1}.
 264    r0 is stored at {pp +15n, spt}.
 265
 266    The other values are 3n+1 limbs each (with most significant limbs small).
 267
 268    Negative intermediate results are stored two-complemented.
 269    Inputs are destroyed.
 270 */
 271
 272 void
 273 mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
 274                         mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
 275 {
 276   mp_limb_t cy;
 277   mp_size_t n3;
 278   mp_size_t n3p1;
 279   n3 = 3 * n;
 280   n3p1 = n3 + 1;
 281
 282 #define   r6    (pp + n3)                       /* 3n+1 */
 283 #define   r4    (pp + 7 * n)                    /* 3n+1 */
 284 #define   r2    (pp +11 * n)                    /* 3n+1 */
 285 #define   r0    (pp +15 * n)                    /* s+t <= 2*n */
 286
 287   ASSERT( spt <= 2 * n );
 288   /******************************* interpolation *****************************/
 289   if( half != 0) {
 290     cy = mpn_sub_n (r4, r4, r0, spt);
 291     MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
 292
 293     cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
 294     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
 295     DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
 296
 297     cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
 298     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
 299     DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
 300
 301     cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
 302     MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
 303 #if BIT_CORRECTION
 304     /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
 305     cy = r7[n3p1];
 306     r7[n3p1] = 0x80;
 307 #endif
 308     DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
 309 #if BIT_CORRECTION
 310     /* FIXME: assumes r7[n3p1] is writable. */
 311     ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
 312     r7[n3p1] = cy;
 313 #endif
 314   };
 315
 316   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
 317   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
 318
 319 #if HAVE_NATIVE_mpn_add_n_sub_n
 320   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
 321 #else
 322   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
 323   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
 324   MP_PTR_SWAP(r5, wsi);
 325 #endif
 326
 327   r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
 328   DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
 329
 330 #if HAVE_NATIVE_mpn_add_n_sub_n
 331   mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
 332 #else
 333   ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
 334   mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
 335   MP_PTR_SWAP(r3, wsi);
 336 #endif
 337
 338   cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
 339 #if BIT_CORRECTION
 340   MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
 341   cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
 342   cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
 343   ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
 344 #else
 345   r7[n3] -= cy;
 346   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
 347 #endif
 348
 349 #if HAVE_NATIVE_mpn_add_n_sub_n
 350   mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
 351 #else
 352   mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
 353   mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
 354   MP_PTR_SWAP(r7, wsi);
 355 #endif
 356
 357   r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
 358
 359 #if AORSMUL_FASTER_2AORSLSH
 360   mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
 361 #else
 362   DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
 363   DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
 364 #endif
 365
 366   mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
 367 #if AORSMUL_FASTER_3AORSLSH
 368   mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
 369 #else
 370   DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
 371   DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
 372   DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
 373 #endif
 374   mpn_divexact_by255x188513325(r7, r7, n3p1);
 375
 376   mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
 377   /* A division by 2835x64 followsi. Warning: the operand can be negative! */
 378   mpn_divexact_by2835x64(r5, r5, n3p1);
 379   if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
 380     r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
 381
 382 #if AORSMUL_FASTER_AORS_AORSLSH
 383   mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
 384 #else
 385   mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
 386   DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
 387 #endif
 388 #if AORSMUL_FASTER_2AORSLSH
 389   mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
 390 #else
 391   DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
 392   DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
 393 #endif
 394   /* A division by 255x4 followsi. Warning: the operand can be negative! */
 395   mpn_divexact_by255x4(r6, r6, n3p1);
 396   if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
 397     r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
 398
 399   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
 400
 401   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
 402   ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
 403
 404   /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
 405   DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
 406   mpn_submul_1 (r1, r2, n3p1, 1428);
 407   mpn_submul_1 (r1, r3, n3p1, 112896);
 408   mpn_divexact_by255x182712915(r1, r1, n3p1);
 409
 410   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
 411   mpn_divexact_by42525x16(r2, r2, n3p1);
 412
 413 #if AORSMUL_FASTER_AORS_2AORSLSH
 414   ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
 415 #else
 416   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
 417   ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
 418   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
 419 #endif
 420   ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
 421   mpn_divexact_by9x16(r3, r3, n3p1);
 422
 423   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
 424   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
 425   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
 426
 427   mpn_add_n (r6, r2, r6, n3p1);
 428   ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
 429   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
 430
 431   mpn_sub_n (r5, r3, r5, n3p1);
 432   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
 433   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
 434
 435   mpn_add_n (r7, r1, r7, n3p1);
 436   ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
 437   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
 438
 439   /* last interpolation steps... */
 440   /* ... could be mixed with recomposition
 441         ||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
 442   */
 443
 444   /***************************** recomposition *******************************/
 445   /*
 446     pp[] prior to operations:
 447     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
 448
 449     summation scheme for remaining operations:
 450     |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
 451     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
 452         ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
 453   */
 454
 455   cy = mpn_add_n (pp + n, pp + n, r7, n);
 456   cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
 457 #if HAVE_NATIVE_mpn_add_nc
 458   cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
 459 #else
 460   MPN_INCR_U (r7 + 2 * n, n + 1, cy);
 461   cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
 462 #endif
 463   MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
 464
 465   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
 466   cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
 467 #if HAVE_NATIVE_mpn_add_nc
 468   cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
 469 #else
 470   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
 471   cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
 472 #endif
 473   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
 474
 475   pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
 476   cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
 477 #if HAVE_NATIVE_mpn_add_nc
 478   cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
 479 #else
 480   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
 481   cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
 482 #endif
 483   MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
 484
 485   pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
 486   if ( half ) {
 487     cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
 488 #if HAVE_NATIVE_mpn_add_nc
 489     if(LIKELY(spt > n)) {
 490       cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
 491       MPN_INCR_U (pp + 16 * n, spt - n, cy);
 492     } else {
 493       ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
 494     }
 495 #else
 496     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
 497     if(LIKELY(spt > n)) {
 498       cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
 499       MPN_INCR_U (pp + 16 * n, spt - n, cy);
 500     } else {
 501       ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
 502     }
 503 #endif
 504   } else {
 505     ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
 506   }
 507
 508 #undef   r0
 509 #undef   r2
 510 #undef   r4
 511 #undef   r6
 512 }