/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this file; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype.
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types.
   DItype, UDItype -- Signed and unsigned 64 bit types.

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.
*/
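
/* For illustration, one plausible set of such definitions for a 64-bit
   machine, using gcc mode attributes.  Disabled: users of this file supply
   their own definitions before including it, and the __int128 double-word
   type here is an assumption about the compiler.  */
#if 0
typedef int                SItype  __attribute__ ((mode (SI)));
typedef unsigned int       USItype __attribute__ ((mode (SI)));
typedef int                DItype  __attribute__ ((mode (DI)));
typedef unsigned int       UDItype __attribute__ ((mode (DI)));
typedef UDItype            UWtype;   /* one "word" is 64 bits */
typedef USItype            UHWtype;  /* half a word */
typedef unsigned __int128  UDWtype;  /* a double word */
#define W_TYPE_SIZE 64
#endif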

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

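/* For illustration, a sketch (disabled) of the generic C scheme these
   half-word helpers support: form a two-word product from four half-word
   products and propagate the one possible carry.  The macro name is
   hypothetical; the real generic umul_ppmm lives elsewhere in this file.  */
#if 0
#define umul_ppmm_sketch(w1, w0, u, v)                                  \
  do {                                                                  \
    UWtype __x0, __x1, __x2, __x3;                                      \
    UHWtype __ul, __vl, __uh, __vh;                                     \
    __ul = __ll_lowpart (u);   __uh = __ll_highpart (u);                \
    __vl = __ll_lowpart (v);   __vh = __ll_highpart (v);                \
    __x0 = (UWtype) __ul * __vl;                                        \
    __x1 = (UWtype) __ul * __vh;                                        \
    __x2 = (UWtype) __uh * __vl;                                        \
    __x3 = (UWtype) __uh * __vh;                                        \
    __x1 += __ll_highpart (__x0);   /* this cannot carry */             \
    __x1 += __x2;                   /* but this can */                  \
    if (__x1 < __x2)                                                    \
      __x3 += __ll_B;               /* carry into the high product */   \
    (w1) = __x3 + __ll_highpart (__x1);                                 \
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
  } while (0)
#endif
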
/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

#ifndef _PROTO
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, DENOMINATOR
   must have its most significant bit set, then the pre-processor symbol
   UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

   */
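
/* For illustration, a sketch (disabled) of how a caller combines the macros
   described above: multiply two words, then divide the two-word product by
   d.  The function name is hypothetical; it assumes ph < d, and a divisor
   pre-normalized where the configuration defines UDIV_NEEDS_NORMALIZATION.  */
#if 0
static void
example_muldiv (UWtype u, UWtype v, UWtype d, UWtype *q, UWtype *r)
{
  UWtype ph, pl;
  umul_ppmm (ph, pl, u, v);        /* (ph,pl) = u * v */
  udiv_qrnnd (*q, *r, ph, pl, d);  /* (ph,pl) = *q * d + *r */
}
#endif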

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we have checked what code comes out:
   on some chips they're merely libgcc calls, and in that case we want an
   inline instead (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzll (x);              \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzl (x);               \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzll (x);              \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzl (x);               \
  } while (0)
#endif
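
/* For illustration, a sketch (disabled) of the normalization that
   UDIV_NEEDS_NORMALIZATION calls for, done with count_leading_zeros: shift
   the divisor until its msb is set, shift the numerator by the same amount,
   divide, then shift the remainder back.  The function name is
   hypothetical; it assumes d != 0 and n1 < d on entry.  */
#if 0
static void
example_udiv_normalized (UWtype n1, UWtype n0, UWtype d,
                         UWtype *q, UWtype *r)
{
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;                                  /* normalize divisor */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (*q, *r, n1, n0, d);
  *r >>= cnt;                                     /* undo normalization */
}
#endif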


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("umulh %r1,%2,%0"                                          \
             : "=r" (ph)                                                \
             : "%rJ" (__m0), "rI" (__m1));                              \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __UMULH (__m0, __m1);                                        \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)                             \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)                                        \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;                                              \
    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
    __clz__b = 65 - __clz__b;                                           \
    (count) = __clz__b - __clz__c;                                      \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = _int_mult_upper (__m0, __m1);                                \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {                                          \
    UWtype __x;                                 \
    __x = (al) - (bl);                          \
    if ((al) < (bl))                            \
      (sh) = (ah) - (bh) - 1;                   \
    else                                        \
      (sh) = (ah) - (bh);                       \
    (sl) = __x;                                 \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
             : "=&f" (ph), "=f" (pl)                                    \
             : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype _x = (x), _y, _a, _c;                                        \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
    _c = (_a - 1) << 3;                                                 \
    _x >>= _c;                                                          \
    if (_x >= 1 << 4)                                                   \
      _x >>= 4, _c += 4;                                                \
    if (_x >= 1 << 2)                                                   \
      _x >>= 2, _c += 2;                                                \
    _c += _x >> 1;                                                      \
    (count) =  W_TYPE_SIZE - 1 - _c;                                    \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    __asm__ ("popcnt %0 = %1"                                           \
             : "=r" (count)                                             \
             : "r" ((__ctz_x-1) & ~__ctz_x));                           \
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)                                       \
  do {                                                                  \
    UWtype _m0 = (m0), _m1 = (m1);                                      \
    ph = _m64_xmahu (_m0, _m1, 0);                                      \
    pl = _m0 * _m1;                                                     \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("multiplu %0,%1,%2"                                        \
             : "=r" (xl)                                                \
             : "r" (__m0), "r" (__m1));                                 \
    __asm__ ("multmu %0,%1,%2"                                          \
             : "=r" (xh)                                                \
             : "r" (__m0), "r" (__m1));                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"                                            \
           : "=r" (q), "=q" (r)                                         \
           : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"                                                \
             : "=r" (count)                                             \
             : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r"  ((USItype) (ah)),                                     \
             "rIJ" ((USItype) (bh)),                                    \
             "%r" ((USItype) (al)),                                     \
             "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r" ((USItype) (ah)),                                      \
             "rIJ" ((USItype) (bh)),                                    \
             "r" ((USItype) (al)),                                      \
             "rIJ" ((USItype) (bl)))
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (al))                                      \
      {                                                                 \
        if (__builtin_constant_p (ah))                                  \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (ah))                                 \
      {                                                                 \
        if (__builtin_constant_p (bl))                                  \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (bl))                                 \
      {                                                                 \
        if (__builtin_constant_p (bh))                                  \
          __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }                                                                 \
    else /* only bh might be a constant */                              \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  } while (0)
#if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#else
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n"                                     \
"       mov     %|r0, %2, lsr #16\n"                                    \
"       mov     %|r2, %3, lsr #16\n"                                    \
"       bic     %|r1, %2, %|r0, lsl #16\n"                              \
"       bic     %|r2, %3, %|r2, lsl #16\n"                              \
"       mul     %1, %|r1, %|r2\n"                                       \
"       mul     %|r2, %|r0, %|r2\n"                                     \
"       mul     %|r1, %0, %|r1\n"                                       \
"       mul     %0, %|r0, %0\n"                                         \
"       adds    %|r1, %|r2, %|r1\n"                                     \
"       addcs   %0, %0, #65536\n"                                       \
"       adds    %1, %1, %|r1, lsl #16\n"                                \
"       adc     %0, %0, %|r1, lsr #16"                                  \
           : "=&r" (xh), "=r" (xl)                                      \
           : "r" (a), "r" (b)                                           \
           : "r0", "r1", "r2")
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#endif /* __arm__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("mulwux %2,%0"                                               \
           : "=r" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
  __asm__ ("mulwx %2,%0"                                                \
           : "=r" (__x.__ll)                                            \
           : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("mulwux %2,%0"                                             \
             : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"                                              \
           : "=g" (ph), "=r" (pl)                                       \
           : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"                                              \
           : "=g" (q), "=r" (r)                                         \
           : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"                                               \
           : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __tmp;                                                      \
    __asm__ (                                                           \
       "ldi             1,%0\n"                                         \
"       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
"       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
"       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
"       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
"       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
"       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
"       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
"       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
"       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
"       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
"       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
"       sub             %0,%1,%0        ; Subtract it.\n"               \
        : "=r" (count), "=r" (__tmp) : "1" (x));                        \
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("lr %N0,%1\n\tmr %0,%2"                                    \
             : "=&r" (__x.__ll)                                         \
             : "r" (m0), "r" (m1));                                     \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dr %0,%2"                                                 \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" (d));                                \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#endif

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */
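
/* For illustration, a sketch (disabled) of how such a generic
   count_trailing_zeros can be derived from count_leading_zeros: isolate the
   low set bit with x & -x, then convert its clz into a bit position.  The
   macro name is hypothetical.  */
#if 0
#define count_trailing_zeros_via_clz(count, x)                          \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    UWtype __ctz_c;                                                     \
    ASSERT (__ctz_x != 0);                                              \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
    (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
  } while (0)
#endif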

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    USItype  __n = (n);                                                 \
    USItype  __shift;                                                   \
    __asm__ ("cmpl  $0x1000000, %1\n"                                   \
             "sbbl  %0, %0\n"                                           \
             "cmpl  $0x10000, %1\n"                                     \
             "sbbl  $0, %0\n"                                           \
             "cmpl  $0x100, %1\n"                                       \
             "sbbl  $0, %0\n"                                           \
             : "=&r" (__shift) : "r"  (__n));                           \
    __shift = __shift*8 + 24 + 1;                                       \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    union {                                                             \
      double    d;                                                      \
      unsigned  a[2];                                                   \
    } __u;                                                              \
    ASSERT ((n) != 0);                                                  \
    __u.d = (UWtype) (n);                                               \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummmx */

#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
  && (HAVE_HOST_CPU_i386                                                \
      || HAVE_HOST_CPU_i686                                             \
      || HAVE_HOST_CPU_pentiumpro                                       \
      || HAVE_HOST_CPU_pentium2                                         \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = 31 - __cbtmp;                                             \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)));        \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UDItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
    (count) = __cbtmp ^ 63;                                             \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
  } while (0)
#endif /* x86_64 */

#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
           : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */

#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("emul %2,%1,%0"                                              \
           : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn, __rq;                                                 \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
    __asm__ ("ediv %2,%1,%0"                                            \
           : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
  } while (0)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)          /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn;                                                       \
    __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
  } while (0)
#endif /* i960mx */
#endif /* i960 */

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"                                           \
           : "=d" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;                                \
        __asm__ ("| Inlined umul_ppmm\n"                                \
"       move%.l %5,%3\n"                                                \
"       move%.l %2,%0\n"                                                \
"       move%.w %3,%1\n"                                                \
"       swap    %3\n"                                                   \
"       swap    %0\n"                                                   \
"       mulu%.w %2,%1\n"                                                \
"       mulu%.w %3,%0\n"                                                \
"       mulu%.w %2,%3\n"                                                \
"       swap    %2\n"                                                   \
"       mulu%.w %5,%2\n"                                                \
"       add%.l  %3,%2\n"                                                \
"       jcc     1f\n"                                                   \
"       add%.l  %#0x10000,%0\n"                                         \
"1:     move%.l %2,%3\n"                                                \
"       clr%.w  %2\n"                                                   \
"       swap    %2\n"                                                   \
"       swap    %3\n"                                                   \
"       clr%.w  %3\n"                                                   \
"       add%.l  %3,%1\n"                                                \
"       addx%.l %2,%0\n"                                                \
"       | End inlined umul_ppmm"                                        \
              : "=&d" (xh), "=&d" (xl),                                 \
                "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
              : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))                        \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
           : "=d" (count)                                               \
           : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */

#if defined (__m88000__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x, __q;                                                   \
  __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
  __asm__ ("divu.d %0,%1,%2"                                            \
           : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __m88110__ */
#endif /* __m88000__ */

1013 #if defined (__mips) && W_TYPE_SIZE == 32
1014 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
1015 #define umul_ppmm(w1, w0, u, v) \
1016   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1017 #else
1018 #define umul_ppmm(w1, w0, u, v) \
1019   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1020            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1021 #endif
1022 #define UMUL_TIME 10
1023 #define UDIV_TIME 100
1024 #endif /* __mips */
1025
1026 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1027 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
1028 #define umul_ppmm(w1, w0, u, v) \
1029   __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1030 #else
1031 #define umul_ppmm(w1, w0, u, v) \
1032   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1033            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1034 #endif
1035 #define UMUL_TIME 20
1036 #define UDIV_TIME 140
1037 #endif /* __mips */
1038
1039 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1040 #define umul_ppmm(w1, w0, u, v) \
1041   ({union {UDItype __ll;                                                \
1042            struct {USItype __l, __h;} __i;                              \
1043           } __x;                                                        \
1044   __asm__ ("meid %2,%0"                                                 \
1045            : "=g" (__x.__ll)                                            \
1046            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1047   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1048 #define __umulsidi3(u, v) \
1049   ({UDItype __w;                                                        \
1050     __asm__ ("meid %2,%0"                                               \
1051              : "=g" (__w)                                               \
1052              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1053     __w; })
1054 #define udiv_qrnnd(q, r, n1, n0, d) \
1055   ({union {UDItype __ll;                                                \
1056            struct {USItype __l, __h;} __i;                              \
1057           } __x;                                                        \
1058   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1059   __asm__ ("deid %2,%0"                                                 \
1060            : "=g" (__x.__ll)                                            \
1061            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1062   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1063 #define count_trailing_zeros(count,x) \
1064   do {                                                                  \
1065     __asm__ ("ffsd      %2,%0"                                          \
1066              : "=r" (count)                                             \
1067              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1068   } while (0)
1069 #endif /* __ns32000__ */
1070
1071 /* In the past we had a block of various #defines tested
1072        _ARCH_PPC    - AIX
1073        _ARCH_PWR    - AIX
1074        __powerpc__  - gcc
1075        __POWERPC__  - BEOS
1076        __ppc__      - Darwin
1077        PPC          - old gcc, GNU/Linux, SysV
1078    The plain PPC test was not good for vxWorks, since PPC is defined on all
1079    CPUs there (eg. m68k too), as a constant one is expected to compare
1080    CPU_FAMILY against.
1081
1082    At any rate, this was pretty unattractive and a bit fragile.  The use of
1083    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1084    getting the desired effect.
1085
1086    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1087    the system vendor compilers.  (Is that vendor compilers with inline asm,
1088    or what?)  */

#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (ah) && (ah) == 0)                         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)                    \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else                                                                \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if HAVE_HOST_CPU_FAMILY_powerpc
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    SItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#else
#define UMUL_TIME 8
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define SMUL_TIME 4
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants.  */

/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (ah) && (ah) == 0)                         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)                    \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else                                                                \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    DItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC.  */

#if defined (__pyr__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
           : "=&r" (__x.__ll)                                           \
           : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */

#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "r" ((USItype)(bl)))
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (                                                             \
       "s       r2,r2\n"                                                \
"       mts r10,%2\n"                                                   \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       cas     %0,r2,r0\n"                                             \
"       mfs     r10,%1"                                                 \
           : "=r" (ph), "=r" (pl)                                       \
           : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
           : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
#define count_leading_zeros(count, x) \
  do {                                                                  \
    if ((x) >= 0x10000)                                                 \
      __asm__ ("clz     %0,%1"                                          \
               : "=r" (count) : "r" ((USItype)(x) >> 16));              \
    else                                                                \
      {                                                                 \
        __asm__ ("clz   %0,%1"                                          \
                 : "=r" (count) : "r" ((USItype)(x)));                  \
        (count) += 16;                                                  \
      }                                                                 \
  } while (0)
#endif /* RT/ROMP */

#if defined (__sh2__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
           : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
           __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)                 \
           __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
           "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until the above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */                           \
  || defined (__sparcv8)     /* gcc solaris */                          \
  || HAVE_HOST_CPU_supersparc
/* Don't match the immediate range because (1) it is not often useful, and
   (2) the 'I' constraint thinks of the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60            /* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"                                     \
"       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
"       tst     %%g0\n"                                                 \
"       divscc  %3,%4,%%g1\n"                                           \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%0\n"                                           \
"       rd      %%y,%1\n"                                               \
"       bl,a 1f\n"                                                      \
"       add     %1,%4,%1\n"                                             \
"1:     ! End of inline udiv_qrnnd"                                     \
           : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
           : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"                                      \
"       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
"       sra     %3,31,%%g2      ! Don't move this insn\n"               \
"       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
"       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,0,%%g1\n"                                          \
"       add     %%g1,%%g2,%0\n"                                         \
"       rd      %%y,%1"                                                 \
           : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
           : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39            /* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "addcc   %r4,%5,%1\n"                                            \
      " addccc  %r6,%7,%%g0\n"                                          \
      " addc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
            "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
           __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "subcc   %r4,%5,%1\n"                                            \
      " subccc  %r6,%7,%%g0\n"                                          \
      " subc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),                 \
            "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
           __CLOBBER_CC)
#endif

#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("emul %1,%2,$0,%0"                                         \
             : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("ediv %3,%2,%0,%1"                                         \
             : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)                                   \
  do {                                                                  \
    __asm__ ("ffs 0, 31, %1, %0"                                        \
             : "=g" (count)                                             \
             : "g" ((USItype) (x)));                                    \
  } while (0)
#endif
#endif /* __vax__ */

#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
             "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
             "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {long int __ll;                                               \
           struct {unsigned int __h, __l;} __i;                         \
          } __x;                                                        \
    unsigned int __m0 = (m0), __m1 = (m1);                              \
    __asm__ ("mult      %S0,%H3"                                        \
             : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
             : "%1" (__m0), "rQR" (__m1));                              \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
    (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
             + (((signed int) __m1 >> 15) & __m0));                     \
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */


#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDWtype __ll = __umulsidi3 (m0, m1);                                \
    ph = (UWtype) (__ll >> W_TYPE_SIZE);                                \
    pl = (UWtype) __ll;                                                 \
  } while (0)
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;                                                  \
    umul_ppmm (__hi, __lo, u, v);                                       \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
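
/* Illustrative sketch only, kept under #if 0 (the example_ function name is
   made up for this comment): umul_ppmm yields the same product that
   __umulsidi3 returns, just split into its high and low UWtype halves.  */
#if 0
static void
example_umul_ppmm_vs_umulsidi3 (UWtype u, UWtype v)
{
  UWtype __hi, __lo;
  umul_ppmm (__hi, __lo, u, v);       /* product split into two words */
  /* __umulsidi3 packs the same two halves into one double word.  */
  ASSERT (__umulsidi3 (u, v) == (((UDWtype) __hi << W_TYPE_SIZE) | __lo));
}
#endif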


/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__p0;                                                   \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
    (wl) = __umul_ppmm__p0;                                                   \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__p0;                                                   \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
    (wl) = __umul_ppmm__p0;                                                   \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
                            &__udiv_qrnnd__r);                          \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
#endif


/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __x;                                                         \
    __x = (al) + (bl);                                                  \
    (sh) = (ah) + (bh) + (__x < (al));                                  \
    (sl) = __x;                                                         \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __x;                                                         \
    __x = (al) - (bl);                                                  \
    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
    (sl) = __x;                                                         \
  } while (0)
#endif
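
/* How the two generic macros above detect carry and borrow in plain C: an
   unsigned addition wrapped around exactly when the low-word sum __x is
   smaller than an operand, and a subtraction borrowed exactly when the
   minuend is smaller than the subtrahend, hence the (__x < (al)) and
   ((al) < (bl)) terms folded into the high words.  For example, with
   32-bit words, al = 0xffffffff and bl = 2 give __x = 1; since 1 < al,
   a carry of 1 is propagated into sh.  */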

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __w1;                                                        \
    UWtype __xm0 = (u), __xm1 = (v);                                    \
    smul_ppmm (__w1, w0, __xm0, __xm1);                                 \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
                + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
  } while (0)
#endif
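
/* Why the two correction terms above are right: reading an N-bit word u as
   signed changes its value by -2^N exactly when its top bit u>>(N-1) is
   set, so modulo 2^(2N)
     u * v  =  u_signed * v_signed  +  ((u>>(N-1))*v + (v>>(N-1))*u) * 2^N.
   With t in {0,1}, -t & w equals t*w in two's complement, which is how the
   macro adds those terms into the high word.  The smul_ppmm fallback
   further down applies the same identity in the other direction.  */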

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __x0, __x1, __x2, __x3;                                      \
    UHWtype __ul, __vl, __uh, __vh;                                     \
    UWtype __u = (u), __v = (v);                                        \
                                                                        \
    __ul = __ll_lowpart (__u);                                          \
    __uh = __ll_highpart (__u);                                         \
    __vl = __ll_lowpart (__v);                                          \
    __vh = __ll_highpart (__v);                                         \
                                                                        \
    __x0 = (UWtype) __ul * __vl;                                        \
    __x1 = (UWtype) __ul * __vh;                                        \
    __x2 = (UWtype) __uh * __vl;                                        \
    __x3 = (UWtype) __uh * __vh;                                        \
                                                                        \
    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
    __x1 += __x2;               /* but this indeed can */               \
    if (__x1 < __x2)            /* did we get it? */                    \
      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
                                                                        \
    (w1) = __x3 + __ll_highpart (__x1);                                 \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);               \
  } while (0)
#endif
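
/* A worked instance of the scheme above, taking W_TYPE_SIZE == 8 for
   readability (4-bit halves): u = 0xAB, v = 0xCD gives
     __x0 = 0xB*0xD = 0x8F,  __x1 = 0xB*0xC = 0x84,
     __x2 = 0xA*0xD = 0x82,  __x3 = 0xA*0xC = 0x78.
   __x1 += highpart(__x0) makes 0x8C; __x1 += __x2 wraps to 0x0E, which is
   < __x2, so __ll_B = 0x10 is added to __x3 (now 0x88).  Finally
   w1 = 0x88 + highpart(0x0E) = 0x88 and w0 = (0x0E << 4) + 0xF = 0xEF,
   matching 0xAB * 0xCD = 0x88EF.  */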

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __w1;                                                        \
    UWtype __xm0 = (u), __xm1 = (v);                                    \
    umul_ppmm (__w1, w0, __xm0, __xm1);                                 \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
                - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
  } while (0)
#endif

/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
                                                                        \
    ASSERT ((d) != 0);                                                  \
    ASSERT ((n1) < (d));                                                \
                                                                        \
    __d1 = __ll_highpart (d);                                           \
    __d0 = __ll_lowpart (d);                                            \
                                                                        \
    __q1 = (n1) / __d1;                                                 \
    __r1 = (n1) - __q1 * __d1;                                          \
    __m = __q1 * __d0;                                                  \
    __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
    if (__r1 < __m)                                                     \
      {                                                                 \
        __q1--, __r1 += (d);                                            \
        if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
          if (__r1 < __m)                                               \
            __q1--, __r1 += (d);                                        \
      }                                                                 \
    __r1 -= __m;                                                        \
                                                                        \
    __q0 = __r1 / __d1;                                                 \
    __r0 = __r1 - __q0 * __d1;                                          \
    __m = __q0 * __d0;                                                  \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
    if (__r0 < __m)                                                     \
      {                                                                 \
        __q0--, __r0 += (d);                                            \
        if (__r0 >= (d))                                                \
          if (__r0 < __m)                                               \
            __q0--, __r0 += (d);                                        \
      }                                                                 \
    __r0 -= __m;                                                        \
                                                                        \
    (q) = __q1 * __ll_B | __q0;                                         \
    (r) = __r0;                                                         \
  } while (0)
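
/* Usage note for __udiv_qrnnd_c: it is schoolbook division with half-words
   as digits, estimating each quotient half from the high half of d and
   correcting the estimate at most twice, so it needs d normalized (most
   significant bit set); that is why UDIV_NEEDS_NORMALIZATION is set to 1
   below when this version is selected.  A small worked case with 32-bit
   words: n1 = 1, n0 = 0, d = 0x80000000 (i.e. 2^32 divided by 2^31) takes
   both halves through with no correction and yields q = 2, r = 0.  */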

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __r;                                                         \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                         \
    (r) = __r;                                                          \
  } while (0)
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __xr = (x);                                                  \
    UWtype __a;                                                         \
                                                                        \
    if (W_TYPE_SIZE == 32)                                              \
      {                                                                 \
        __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
          ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
          : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
          : 3*__BITS4 + 1);                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
          if (((__xr >> __a) & 0xff) != 0)                              \
            break;                                                      \
        ++__a;                                                          \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];           \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
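
/* Mechanics of the generic count_leading_zeros: __a is chosen so that
   __xr >> __a falls in [0, 128) with the most significant nonzero bits
   retained, and the 128-entry table finishes the job.  For the final
   formula to return W_TYPE_SIZE - bitlength(x), the table must satisfy
   __clz_tab[i] == bitlength(i) + 1, with __clz_tab[0] == 1.  Example for
   32-bit words: x = 0x00ffab00 selects __a = 2*__BITS4 + 1 = 17, so
   __xr >> __a = 0x7f, __clz_tab[0x7f] = 8, and count = 33 - 17 - 8 = 8.  */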

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
# ifdef MPFR_HAVE_GMP_IMPL
    extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
# else
    extern const unsigned char __clz_tab[128];
# endif
#endif

#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.  */
#define count_trailing_zeros(count, x) \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    UWtype __ctz_c;                                                     \
    ASSERT (__ctz_x != 0);                                              \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
    (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
  } while (0)
#endif
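
/* The reduction above rests on a two's complement identity:
   __ctz_x & -__ctz_x isolates the lowest set bit, i.e. yields 2^t where
   t is the sought count.  E.g. for 32-bit words, x = 0x68 (binary 1101000)
   gives x & -x = 0x08; count_leading_zeros returns 28, and
   32 - 1 - 28 = 3 trailing zeros.  */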

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif