/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this file; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types.
   DItype, UDItype -- Signed and unsigned 64 bit types.

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.
*/

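/* For illustration only: one plausible set of definitions for a 64-bit
   host with gcc.  These are assumptions of this sketch, not requirements;
   the includer may choose any types meeting the constraints above.

       typedef int           SItype   __attribute__ ((mode (SI)));
       typedef unsigned int  USItype  __attribute__ ((mode (SI)));
       typedef int           DItype   __attribute__ ((mode (DI)));
       typedef unsigned int  UDItype  __attribute__ ((mode (DI)));
       typedef unsigned long UWtype;             // one full word (limb)
       typedef unsigned int  UHWtype;            // at least half a word
       typedef unsigned int  UDWtype __attribute__ ((mode (TI)));  // two words
       #define W_TYPE_SIZE 64
*/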
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
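/* For example, with W_TYPE_SIZE == 32: __BITS4 is 8, __ll_B is 0x10000,
   __ll_lowpart (0x12345678) is 0x5678 and __ll_highpart (0x12345678)
   is 0x1234.  */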

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

#ifndef _PROTO
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If the macro in addition requires
   the most significant bit of DENOMINATOR to be 1, then the pre-processor
   symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded toward 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

   */
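/* For illustration only, a sketch of typical umul_ppmm use: multiplying two
   two-word operands (xh,xl) and (yh,yl) modulo 2^(2*W_TYPE_SIZE).  The
   helper name mul2 is hypothetical, not part of this file, and it assumes
   UWtype is exactly W_TYPE_SIZE bits wide so the high word wraps naturally.

       static void
       mul2 (UWtype *ph, UWtype *pl,
             UWtype xh, UWtype xl, UWtype yh, UWtype yl)
       {
         UWtype hi, lo;
         umul_ppmm (hi, lo, xl, yl);   // xl*yl gives both product words
         hi += xh * yl + xl * yh;      // cross products hit the high word only
         *ph = hi;
         *pl = lo;
       }
*/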

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used once we have checked what code comes out:
   on some chips they're merely libgcc calls, and in that case we instead
   want an inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzll (x);              \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzl (x);               \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzll (x);              \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzl (x);               \
  } while (0)
#endif

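/* For illustration only: when udiv_qrnnd requires a normalized divisor
   (UDIV_NEEDS_NORMALIZATION non-zero), count_leading_zeros is the usual
   way to pre-shift the operands.  A sketch, assuming n1 < d and d != 0:

       UWtype q, r, cnt;
       count_leading_zeros (cnt, d);
       if (cnt != 0)
         {
           d = d << cnt;                // set the msb of the divisor
           n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
           n0 = n0 << cnt;
         }
       udiv_qrnnd (q, r, n1, n0, d);
       r = r >> cnt;                    // undo the shift on the remainder
*/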

/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("umulh %r1,%2,%0"                                          \
             : "=r" (ph)                                                \
             : "%rJ" (__m0), "rI" (__m1));                              \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __UMULH (__m0, __m1);                                        \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)                             \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)                                        \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;                                              \
    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
    __clz__b = 65 - __clz__b;                                           \
    (count) = __clz__b - __clz__c;                                      \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = _int_mult_upper (__m0, __m1);                                \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {                                          \
    UWtype __x;                                 \
    __x = (al) - (bl);                          \
    if ((al) < (bl))                            \
      (sh) = (ah) - (bh) - 1;                   \
    else                                        \
      (sh) = (ah) - (bh);                       \
    (sl) = __x;                                 \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
             : "=&f" (ph), "=f" (pl)                                    \
             : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype _x = (x), _y, _a, _c;                                        \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
    _c = (_a - 1) << 3;                                                 \
    _x >>= _c;                                                          \
    if (_x >= 1 << 4)                                                   \
      _x >>= 4, _c += 4;                                                \
    if (_x >= 1 << 2)                                                   \
      _x >>= 2, _c += 2;                                                \
    _c += _x >> 1;                                                      \
    (count) = W_TYPE_SIZE - 1 - _c;                                     \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    __asm__ ("popcnt %0 = %1"                                           \
             : "=r" (count)                                             \
             : "r" ((__ctz_x-1) & ~__ctz_x));                           \
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)                                       \
  do {                                                                  \
    UWtype _m0 = (m0), _m1 = (m1);                                      \
    ph = _m64_xmahu (_m0, _m1, 0);                                      \
    pl = _m0 * _m1;                                                     \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("multiplu %0,%1,%2"                                        \
             : "=r" (xl)                                                \
             : "r" (__m0), "r" (__m1));                                 \
    __asm__ ("multmu %0,%1,%2"                                          \
             : "=r" (xh)                                                \
             : "r" (__m0), "r" (__m1));                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"                                            \
           : "=r" (q), "=q" (r)                                         \
           : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"                                                \
             : "=r" (count)                                             \
             : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r"  ((USItype) (ah)),                                     \
             "rIJ" ((USItype) (bh)),                                    \
             "%r" ((USItype) (al)),                                     \
             "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r" ((USItype) (ah)),                                      \
             "rIJ" ((USItype) (bh)),                                    \
             "r" ((USItype) (al)),                                      \
             "rIJ" ((USItype) (bl)))
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (al))                                      \
      {                                                                 \
        if (__builtin_constant_p (ah))                                  \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (ah))                                 \
      {                                                                 \
        if (__builtin_constant_p (bl))                                  \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (bl))                                 \
      {                                                                 \
        if (__builtin_constant_p (bh))                                  \
          __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }                                                                 \
    else /* only bh might be a constant */                              \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  } while (0)
#if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#else
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n"                                     \
"       mov     %|r0, %2, lsr #16\n"                                    \
"       mov     %|r2, %3, lsr #16\n"                                    \
"       bic     %|r1, %2, %|r0, lsl #16\n"                              \
"       bic     %|r2, %3, %|r2, lsl #16\n"                              \
"       mul     %1, %|r1, %|r2\n"                                       \
"       mul     %|r2, %|r0, %|r2\n"                                     \
"       mul     %|r1, %0, %|r1\n"                                       \
"       mul     %0, %|r0, %0\n"                                         \
"       adds    %|r1, %|r2, %|r1\n"                                     \
"       addcs   %0, %0, #65536\n"                                       \
"       adds    %1, %1, %|r1, lsl #16\n"                                \
"       adc     %0, %0, %|r1, lsr #16"                                  \
           : "=&r" (xh), "=r" (xl)                                      \
           : "r" (a), "r" (b)                                           \
           : "r0", "r1", "r2")
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#endif /* __arm__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("mulwux %2,%0"                                               \
           : "=r" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
  __asm__ ("mulwx %2,%0"                                                \
           : "=r" (__x.__ll)                                            \
           : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("mulwux %2,%0"                                             \
             : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"                                              \
           : "=g" (ph), "=r" (pl)                                       \
           : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"                                              \
           : "=g" (q), "=r" (r)                                         \
           : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"                                               \
           : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __tmp;                                                      \
    __asm__ (                                                           \
       "ldi             1,%0\n"                                         \
"       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
"       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
"       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
"       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
"       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
"       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
"       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
"       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
"       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
"       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
"       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
"       sub             %0,%1,%0        ; Subtract it.\n"               \
        : "=r" (count), "=r" (__tmp) : "1" (x));                        \
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts a long long into two adjacent 32-bit registers.  Presumably
   this is just a case of no direct support for 2.0n but treating it like
   1.0.  */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("lr %N0,%1\n\tmr %0,%2"                                    \
             : "=&r" (__x.__ll)                                         \
             : "r" (m0), "r" (m1));                                     \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dr %0,%2"                                                 \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" (d));                                \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#endif

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    USItype  __n = (n);                                                 \
    USItype  __shift;                                                   \
    __asm__ ("cmpl  $0x1000000, %1\n"                                   \
             "sbbl  %0, %0\n"                                           \
             "cmpl  $0x10000, %1\n"                                     \
             "sbbl  $0, %0\n"                                           \
             "cmpl  $0x100, %1\n"                                       \
             "sbbl  $0, %0\n"                                           \
             : "=&r" (__shift) : "r"  (__n));                           \
    __shift = __shift*8 + 24 + 1;                                       \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    union {                                                             \
      double    d;                                                      \
      unsigned  a[2];                                                   \
    } __u;                                                              \
    ASSERT ((n) != 0);                                                  \
    __u.d = (UWtype) (n);                                               \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummmx */

#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
  && (HAVE_HOST_CPU_i386                                                \
      || HAVE_HOST_CPU_i686                                             \
      || HAVE_HOST_CPU_pentiumpro                                       \
      || HAVE_HOST_CPU_pentium2                                         \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = 31 - __cbtmp;                                             \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)));        \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UDItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
    (count) = __cbtmp ^ 63;                                             \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
  } while (0)
#endif /* x86_64 */

#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
           : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */

#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("emul %2,%1,%0"                                              \
           : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn, __rq;                                                 \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
    __asm__ ("ediv %d,%n,%0"                                            \
           : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
  } while (0)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)          /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn;                                                       \
    __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
  } while (0)
#endif /* i960mx */
#endif /* i960 */

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"                                           \
           : "=d" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;                                \
        __asm__ ("| Inlined umul_ppmm\n"                                \
"       move%.l %5,%3\n"                                                \
"       move%.l %2,%0\n"                                                \
"       move%.w %3,%1\n"                                                \
"       swap    %3\n"                                                   \
"       swap    %0\n"                                                   \
"       mulu%.w %2,%1\n"                                                \
"       mulu%.w %3,%0\n"                                                \
"       mulu%.w %2,%3\n"                                                \
"       swap    %2\n"                                                   \
"       mulu%.w %5,%2\n"                                                \
"       add%.l  %3,%2\n"                                                \
"       jcc     1f\n"                                                   \
"       add%.l  %#0x10000,%0\n"                                         \
"1:     move%.l %2,%3\n"                                                \
"       clr%.w  %2\n"                                                   \
"       swap    %2\n"                                                   \
"       swap    %3\n"                                                   \
"       clr%.w  %3\n"                                                   \
"       add%.l  %3,%1\n"                                                \
"       addx%.l %2,%0\n"                                                \
"       | End inlined umul_ppmm"                                        \
              : "=&d" (xh), "=&d" (xl),                                 \
                "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
              : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))                        \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
           : "=d" (count)                                               \
           : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */

#if defined (__m88000__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x, __q;                                                   \
  __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
  __asm__ ("divu.d %0,%1,%2"                                            \
           : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __m88110__ */
#endif /* __m88000__ */

1013 #if defined (__mips) && W_TYPE_SIZE == 32
1014 #if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
1015 #define umul_ppmm(w1, w0, u, v) \
1016   do {                                                                  \
1017     UDItype _r;                                                 \
    _r = (UDItype) (u) * (v);                                           \
    (w1) = _r >> 32;                                                    \
    (w0) = (USItype) _r;                                                \
  } while (0)
#elif __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */

#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
typedef unsigned int UTItype __attribute__ ((mode (TI)));
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    UTItype _r;                                                         \
    _r = (UTItype) (u) * (v);                                           \
    (w1) = _r >> 64;                                                    \
    (w0) = (UDItype) _r;                                                \
  } while (0)
#elif __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips */

#if defined (__ns32000__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("meid %2,%0"                                                 \
           : "=g" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("meid %2,%0"                                               \
             : "=g" (__w)                                               \
             : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
    __w; })
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
  __asm__ ("deid %2,%0"                                                 \
           : "=g" (__x.__ll)                                            \
           : "0" (__x.__ll), "g" ((USItype)(d)));                       \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
#define count_trailing_zeros(count,x) \
  do {                                                                  \
    __asm__ ("ffsd      %2,%0"                                          \
             : "=r" (count)                                             \
             : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
  } while (0)
#endif /* __ns32000__ */

/* In the past a block of various #defines was tested here:
       _ARCH_PPC    - AIX
       _ARCH_PWR    - AIX
       __powerpc__  - gcc
       __POWERPC__  - BEOS
       __ppc__      - Darwin
       PPC          - old gcc, GNU/Linux, SysV
   The plain PPC test was no good for vxWorks, since PPC is defined on all
   CPUs there (eg. m68k too), as a constant that one is expected to compare
   CPU_FAMILY against.

   At any rate, this was pretty unattractive and a bit fragile.  The use of
   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   getting the desired effect.

   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   the system vendor compilers.  (Does that mean vendor compilers with
   inline asm support, or something else?)  */

#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (ah) && (ah) == 0)                         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)                    \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else                                                                \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if HAVE_HOST_CPU_FAMILY_powerpc
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    SItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#else
#define UMUL_TIME 8
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define SMUL_TIME 4
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants.  */

/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (bh) && (bh) == 0)                         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
             : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else                                                                \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
             : "=r" (sh), "=&r" (sl)                                    \
             : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (ah) && (ah) == 0)                         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)                    \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else                                                                \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    DItype __m0 = (m0), __m1 = (m1);                                    \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC.  */

#if defined (__pyr__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
           : "=&r" (__x.__ll)                                           \
           : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */

#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"                                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "r" ((USItype)(bl)))
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (                                                             \
       "s       r2,r2\n"                                                \
"       mts r10,%2\n"                                                   \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       m       r2,%3\n"                                                \
"       cas     %0,r2,r0\n"                                             \
"       mfs     r10,%1"                                                 \
           : "=r" (ph), "=r" (pl)                                       \
           : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
           : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
#define count_leading_zeros(count, x) \
  do {                                                                  \
    if ((x) >= 0x10000)                                                 \
      __asm__ ("clz     %0,%1"                                          \
               : "=r" (count) : "r" ((USItype)(x) >> 16));              \
    else                                                                \
      {                                                                 \
        __asm__ ("clz   %0,%1"                                          \
                 : "=r" (count) : "r" ((USItype)(x)));                  \
        (count) += 16;                                                  \
      }                                                                 \
  } while (0)
#endif /* RT/ROMP */

#if defined (__sh2__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
           : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
           __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)                 \
           __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us; it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
           "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */                           \
  || defined (__sparcv8)     /* gcc solaris */                          \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because: 1) it is not often useful;
   2) the 'I' flag thinks of the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign-extended to 32 bits
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60            /* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    USItype __q;                                                        \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
    (r) = (n0) - __q * (d);                                             \
    (q) = __q;                                                          \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"                                     \
"       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
"       tst     %%g0\n"                                                 \
"       divscc  %3,%4,%%g1\n"                                           \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%%g1\n"                                         \
"       divscc  %%g1,%4,%0\n"                                           \
"       rd      %%y,%1\n"                                               \
"       bl,a 1f\n"                                                      \
"       add     %1,%4,%1\n"                                             \
"1:     ! End of inline udiv_qrnnd"                                     \
           : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
           : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"                                      \
"       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
"       sra     %3,31,%%g2      ! Don't move this insn\n"               \
"       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
"       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,%3,%%g1\n"                                         \
"       mulscc  %%g1,0,%%g1\n"                                          \
"       add     %%g1,%%g2,%0\n"                                         \
"       rd      %%y,%1"                                                 \
           : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
           : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39            /* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "addcc   %r4,%5,%1\n"                                            \
      " addccc  %r6,%7,%%g0\n"                                          \
      " addc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
            "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
           __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (                                                             \
       "subcc   %r4,%5,%1\n"                                            \
      " subccc  %r6,%7,%%g0\n"                                          \
      " subc    %r2,%3,%0"                                              \
          : "=r" (sh), "=&r" (sl)                                       \
          : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),                 \
            "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
           __CLOBBER_CC)
#endif

#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("emul %1,%2,$0,%0"                                         \
             : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("ediv %3,%2,%0,%1"                                         \
             : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)                                   \
  do {                                                                  \
    __asm__ ("ffs 0, 31, %1, %0"                                        \
             : "=g" (count)                                             \
             : "g" ((USItype) (x)));                                    \
  } while (0)
#endif
#endif /* __vax__ */

#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
             "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
             "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {long int __ll;                                               \
           struct {unsigned int __h, __l;} __i;                         \
          } __x;                                                        \
    unsigned int __m0 = (m0), __m1 = (m1);                              \
    __asm__ ("mult      %S0,%H3"                                        \
             : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
             : "%1" (__m0), "rQR" (__m1));                              \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
    (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
             + (((signed int) __m1 >> 15) & __m0));                     \
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */


#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDWtype __ll = __umulsidi3 (m0, m1);                                \
    (ph) = (UWtype) (__ll >> W_TYPE_SIZE);                              \
    (pl) = (UWtype) __ll;                                               \
  } while (0)
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;                                                  \
    umul_ppmm (__hi, __lo, u, v);                                       \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
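
/* Illustrative sketch, not part of the interface proper: the two macros
   above are inverses of one another.  Assuming a 32-bit UWtype and a
   64-bit UDWtype, either direction splits or joins a double-word product:

     UWtype hi, lo;
     umul_ppmm (hi, lo, (UWtype) 0xFFFFFFFF, (UWtype) 0xFFFFFFFF);
     // hi == 0xFFFFFFFE, lo == 0x00000001, since
     // (2^32 - 1)^2 == 0xFFFFFFFE00000001
     UDWtype p = __umulsidi3 ((UWtype) 0xFFFFFFFF, (UWtype) 0xFFFFFFFF);
     // p == ((UDWtype) hi << 32) | lo
*/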


/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__p0;                                                   \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
    (wl) = __umul_ppmm__p0;                                                   \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)                                               \
  do {                                                                        \
    UWtype __umul_ppmm__p0;                                                   \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
    (wl) = __umul_ppmm__p0;                                                   \
  } while (0)
#endif
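
/* Side-by-side sketch of the two calling conventions wrapped above; both
   return the high word and store the low word through the pointer, the
   only difference being the pointer's position in the argument list:

     UWtype lo;
     UWtype hi  = mpn_umul_ppmm (&lo, u, v);    // pointer first
     UWtype hi2 = mpn_umul_ppmm_r (u, v, &lo);  // pointer last ("_r")
*/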

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
                          (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    UWtype __udiv_qrnnd__r;                                             \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
                            &__udiv_qrnnd__r);                          \
    (r) = __udiv_qrnnd__r;                                              \
  } while (0)
#endif


/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __x;                                                         \
    __x = (al) + (bl);                                                  \
    (sh) = (ah) + (bh) + (__x < (al));                                  \
    (sl) = __x;                                                         \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    UWtype __x;                                                         \
    __x = (al) - (bl);                                                  \
    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
    (sl) = __x;                                                         \
  } while (0)
#endif
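
/* Worked example of the two generic macros above, assuming a 32-bit
   UWtype.  add_ssaaaa propagates carry via the unsigned wraparound test
   (__x < (al)); sub_ddmmss borrows via ((al) < (bl)):

     UWtype sh, sl;
     add_ssaaaa (sh, sl, 0x00000001, 0xFFFFFFFF, 0x00000000, 0x00000001);
     // 0x1FFFFFFFF + 1 == 0x200000000, so sh == 2, sl == 0
     sub_ddmmss (sh, sl, sh, sl, 0x00000000, 0x00000001);
     // back to sh == 1, sl == 0xFFFFFFFF
*/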

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __w1;                                                        \
    UWtype __xm0 = (u), __xm1 = (v);                                    \
    smul_ppmm (__w1, w0, __xm0, __xm1);                                 \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
                + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
  } while (0)
#endif
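
/* Why the correction terms work: reading a word u as signed gives
   u_s = u - 2^W * msb(u), where W = W_TYPE_SIZE and msb(u) = u >> (W - 1).
   Hence u*v = u_s*v_s + 2^W * (msb(u)*v + msb(v)*u) modulo 2^(2W), so the
   unsigned high word is the signed high word plus msb(u)*v + msb(v)*u
   reduced mod 2^W, which is exactly what the two masked additions compute:
   -(msb) & x is x when msb is 1, and 0 when msb is 0.  */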

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (i.e. u and v are
   identical expressions), gcc recognises that __x1 and __x2 are the same
   and generates 3 multiplies, not 4.  The subsequent additions could be
   optimized a bit, but the only place GMP currently uses such a square is
   mpn_sqr_basecase, and chips obliged to use this generic C umul will have
   plenty of worse performance problems than a couple of extra instructions
   on the diagonal of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __x0, __x1, __x2, __x3;                                      \
    UHWtype __ul, __vl, __uh, __vh;                                     \
    UWtype __u = (u), __v = (v);                                        \
                                                                        \
    __ul = __ll_lowpart (__u);                                          \
    __uh = __ll_highpart (__u);                                         \
    __vl = __ll_lowpart (__v);                                          \
    __vh = __ll_highpart (__v);                                         \
                                                                        \
    __x0 = (UWtype) __ul * __vl;                                        \
    __x1 = (UWtype) __ul * __vh;                                        \
    __x2 = (UWtype) __uh * __vl;                                        \
    __x3 = (UWtype) __uh * __vh;                                        \
                                                                        \
    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
    __x1 += __x2;               /* but this indeed can */               \
    if (__x1 < __x2)            /* did we get it? */                    \
      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
                                                                        \
    (w1) = __x3 + __ll_highpart (__x1);                                 \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);               \
  } while (0)
#endif
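
/* A worked instance of the schoolbook splitting above, assuming
   W_TYPE_SIZE == 32 so half-words are 16 bits: for u == v == 0x00010001,
   all four partial products __x0..__x3 are 1, and the recombination gives
   w1 == 0x00000001, w0 == 0x00020001, i.e.
   0x00010001 * 0x00010001 == 0x0000000100020001.  */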

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __w1;                                                        \
    UWtype __xm0 = (u), __xm1 = (v);                                    \
    umul_ppmm (__w1, w0, __xm0, __xm1);                                 \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
                - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
  } while (0)
#endif
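
/* This is the mirror image of the umul-from-smul correction above: the
   signed high word equals the unsigned high word minus msb(u)*v and minus
   msb(v)*u, reduced mod 2^W.  For instance, with W_TYPE_SIZE == 32 and
   u == v == 0xFFFFFFFF (i.e. -1 signed), umul_ppmm gives high word
   0xFFFFFFFE, and subtracting 0xFFFFFFFF twice (mod 2^32) leaves w1 == 0,
   the high word of (-1)*(-1) == 1.  */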

/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
                                                                        \
    ASSERT ((d) != 0);                                                  \
    ASSERT ((n1) < (d));                                                \
                                                                        \
    __d1 = __ll_highpart (d);                                           \
    __d0 = __ll_lowpart (d);                                            \
                                                                        \
    __q1 = (n1) / __d1;                                                 \
    __r1 = (n1) - __q1 * __d1;                                          \
    __m = __q1 * __d0;                                                  \
    __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
    if (__r1 < __m)                                                     \
      {                                                                 \
        __q1--, __r1 += (d);                                            \
        if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
          if (__r1 < __m)                                               \
            __q1--, __r1 += (d);                                        \
      }                                                                 \
    __r1 -= __m;                                                        \
                                                                        \
    __q0 = __r1 / __d1;                                                 \
    __r0 = __r1 - __q0 * __d1;                                          \
    __m = __q0 * __d0;                                                  \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
    if (__r0 < __m)                                                     \
      {                                                                 \
        __q0--, __r0 += (d);                                            \
        if (__r0 >= (d))                                                \
          if (__r0 < __m)                                               \
            __q0--, __r0 += (d);                                        \
      }                                                                 \
    __r0 -= __m;                                                        \
                                                                        \
    (q) = __q1 * __ll_B | __q0;                                         \
    (r) = __r0;                                                         \
  } while (0)
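
/* A small worked case, assuming W_TYPE_SIZE == 32 and a normalized divisor
   (most significant bit set), which this schoolbook-by-halves routine
   relies on: dividing n1:n0 = 0x00000001:0x00000000 (i.e. 2^32) by
   d = 0x80000000 gives __q1 == 0 and __q0 == 2, so q == 2 and r == 0,
   matching 2^32 == 2 * 2^31 exactly.  */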

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __r;                                                         \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                         \
    (r) = __r;                                                          \
  } while (0)
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
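
/* When UDIV_NEEDS_NORMALIZATION is 1, callers are expected to shift the
   divisor so its most significant bit is set, shift the numerator by the
   same amount, and shift the remainder back.  A minimal sketch of that
   convention, using count_leading_zeros and udiv_qrnnd as defined in this
   file (the cnt != 0 test avoids an undefined full-width shift):

     UWtype q, r, cnt;
     count_leading_zeros (cnt, d);
     if (cnt != 0)
       {
         d = d << cnt;
         n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
         n0 = n0 << cnt;
       }
     udiv_qrnnd (q, r, n1, n0, d);
     r = r >> cnt;
*/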

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __xr = (x);                                                  \
    UWtype __a;                                                         \
                                                                        \
    if (W_TYPE_SIZE == 32)                                              \
      {                                                                 \
        __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
          ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
          : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
          : 3*__BITS4 + 1);                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
          if (((__xr >> __a) & 0xff) != 0)                              \
            break;                                                      \
        ++__a;                                                          \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];           \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
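
/* Example of the generic version above on a 32-bit word: for
   x == 0x00F00000 the highest set bit is bit 23, so the macro yields
   count == 8; a caller can then normalize with x << count to bring that
   bit to the top of the word.  */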

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
# ifdef MPFR_HAVE_GMP_IMPL
    extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
# else
    extern const unsigned char __clz_tab[128];
# endif
#endif

#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.  */
#define count_trailing_zeros(count, x) \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    UWtype __ctz_c;                                                     \
    ASSERT (__ctz_x != 0);                                              \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
    (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
  } while (0)
#endif
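
/* The trick above: __ctz_x & -__ctz_x isolates the lowest set bit, since
   two's complement negation flips all bits above it.  E.g. on a 32-bit
   word, x == 0x28 gives x & -x == 0x08 == 2^3; count_leading_zeros of
   that is 28, so count == 31 - 28 == 3, the trailing zeros of 0x28.  */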

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif