Upgrade GMP from 5.0.2 to 5.0.5 on the vendor branch
contrib/gmp/longlong.h
/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License
along with this file.  If not, see http://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/
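
/* A minimal sketch of the definitions described above, for a hypothetical
   32-bit host; within GMP these come from gmp.h and gmp-impl.h, so this
   block is illustrative only and is never compiled.  */
#if 0
typedef int                SItype;      /* signed 32 bits */
typedef unsigned int       USItype;     /* unsigned 32 bits */
typedef long long          DItype;      /* signed 64 bits */
typedef unsigned long long UDItype;     /* unsigned 64 bits */
typedef USItype        UWtype;          /* the full word type */
typedef unsigned short UHWtype;         /* at least half of UWtype */
typedef UDItype        UDWtype;         /* at least twice UWtype */
#define W_TYPE_SIZE 32
#endif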

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
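
/* For example, with W_TYPE_SIZE == 32, __ll_B is 0x10000, so
   __ll_lowpart (0x12345678) is 0x5678 and __ll_highpart (0x12345678) is
   0x1234.  The generic C fallbacks (used when no asm macro is defined for
   a CPU) build double-word results from such half-word pieces.  */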

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

#ifndef _PROTO
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two-word
   UWtype product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the macro
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed of
   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (e.g. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

   */
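
/* Illustrative use of the macros documented above (a sketch, never
   compiled): multiply two non-zero words, then recover the multiplier by
   division.  This assumes UDIV_NEEDS_NORMALIZATION is 0; otherwise the
   divisor would first have to be shifted so its msb is set.  */
#if 0
UWtype u = 123, v = 456;       /* arbitrary non-zero example words */
UWtype hi, lo, q, r, cnt;
umul_ppmm (hi, lo, u, v);      /* (hi,lo) = u * v, so hi < v holds */
udiv_qrnnd (q, r, hi, lo, v);  /* recovers q == u, r == 0 */
count_leading_zeros (cnt, u);  /* valid since u != 0; u << cnt sets the msb */
#endif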

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we have checked what code comes out:
   on some chips they're merely libgcc calls, and in that case we instead
   want an inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzll (x);              \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzl (x);               \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzll (x);              \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzl (x);               \
  } while (0)
#endif
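
/* For example (illustrative only; both wrappers require a non-zero
   argument, per the ASSERTs above, and mp_limb_t is assumed from gmp.h):
   count_leading_zeros_gcc_clz (c, (mp_limb_t) 0x1000) sets c to
   W_TYPE_SIZE - 13, since bit 12 is the highest set bit, while
   count_trailing_zeros_gcc_ctz (c, (mp_limb_t) 0x1000) sets c to 12.  */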


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("umulh %r1,%2,%0"                                          \
             : "=r" (ph)                                                \
             : "%rJ" (__m0), "rI" (__m1)); /* temporaries, so m0 and m1 \
                                              are evaluated only once */\
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __UMULH (__m0, __m1);  /* temporaries, so m0 and m1 are      \
                                     evaluated only once */             \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)                             \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", i.e. it tests src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for Cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)                                        \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;                                              \
    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
    __clz__b = 65 - __clz__b;                                           \
    (count) = __clz__b - __clz__c;                                      \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = _int_mult_upper (__m0, __m1); /* temporaries, so m0 and m1   \
                                            are evaluated only once */  \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {                                          \
    UWtype __x;                                 \
    __x = (al) - (bl);                          \
    if ((al) < (bl))                            \
      (sh) = (ah) - (bh) - 1;                   \
    else                                        \
      (sh) = (ah) - (bh);                       \
    (sl) = __x;                                 \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
             : "=&f" (ph), "=f" (pl)                                    \
             : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype _x = (x), _y, _a, _c;                                        \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
    _c = (_a - 1) << 3;                                                 \
    _x >>= _c;                                                          \
    if (_x >= 1 << 4)                                                   \
      _x >>= 4, _c += 4;                                                \
    if (_x >= 1 << 2)                                                   \
      _x >>= 2, _c += 2;                                                \
    _c += _x >> 1;                                                      \
    (count) =  W_TYPE_SIZE - 1 - _c;                                    \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    __asm__ ("popcnt %0 = %1"                                           \
             : "=r" (count)                                             \
             : "r" ((__ctz_x-1) & ~__ctz_x));                           \
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)                                       \
  do {                                                                  \
    UWtype _m0 = (m0), _m1 = (m1);                                      \
    ph = _m64_xmahu (_m0, _m1, 0);                                      \
    pl = _m0 * _m1;                                                     \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("multiplu %0,%1,%2"                                        \
             : "=r" (xl)                                                \
             : "r" (__m0), "r" (__m1));                                 \
    __asm__ ("multmu %0,%1,%2"                                          \
             : "=r" (xh)                                                \
             : "r" (__m0), "r" (__m1));                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"                                            \
           : "=r" (q), "=q" (r)                                         \
           : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"                                                \
             : "=r" (count)                                             \
             : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r"  ((USItype) (ah)),                                     \
             "rIJ" ((USItype) (bh)),                                    \
             "%r" ((USItype) (al)),                                     \
             "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r" ((USItype) (ah)),                                      \
             "rIJ" ((USItype) (bh)),                                    \
             "r" ((USItype) (al)),                                      \
             "rIJ" ((USItype) (bl)))
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (al))                                      \
      {                                                                 \
        if (__builtin_constant_p (ah))                                  \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (ah))                                 \
      {                                                                 \
        if (__builtin_constant_p (bl))                                  \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (bl))                                 \
      {                                                                 \
        if (__builtin_constant_p (bh))                                  \
          __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }                                                                 \
    else /* only bh might be a constant */                              \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
    } while (0)
#if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#else
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n"                                     \
"       mov     %|r0, %2, lsr #16\n"                                    \
"       mov     %|r2, %3, lsr #16\n"                                    \
"       bic     %|r1, %2, %|r0, lsl #16\n"                              \
"       bic     %|r2, %3, %|r2, lsl #16\n"                              \
"       mul     %1, %|r1, %|r2\n"                                       \
"       mul     %|r2, %|r0, %|r2\n"                                     \
"       mul     %|r1, %0, %|r1\n"                                       \
"       mul     %0, %|r0, %0\n"                                         \
"       adds    %|r1, %|r2, %|r1\n"                                     \
"       addcs   %0, %0, #65536\n"                                       \
"       adds    %1, %1, %|r1, lsl #16\n"                                \
"       adc     %0, %0, %|r1, lsr #16"                                  \
           : "=&r" (xh), "=r" (xl)                                      \
           : "r" (a), "r" (b)                                           \
           : "r0", "r1", "r2")
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#if defined (__ARM_ARCH_5__)
/* This actually requires arm 5 */
#define count_leading_zeros(count, x) \
  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* __arm__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("mulwux %2,%0"                                               \
           : "=r" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
  __asm__ ("mulwx %2,%0"                                                \
           : "=r" (__x.__ll)                                            \
           : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("mulwux %2,%0"                                             \
             : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"                                              \
           : "=g" (ph), "=r" (pl)                                       \
           : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"                                              \
           : "=g" (q), "=r" (r)                                         \
           : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"                                               \
           : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __tmp;                                                      \
    __asm__ (                                                           \
       "ldi             1,%0\n"                                         \
"       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
"       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
"       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
"       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
"       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
"       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
"       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
"       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
"       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
"       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
"       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
"       sub             %0,%1,%0        ; Subtract it.\n"               \
        : "=r" (count), "=r" (__tmp) : "1" (x));                        \
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts a long long into two adjacent 32-bit registers.  Presumably
   this is just a case of no direct support for 2.0n but treating it like
   1.0.  */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
#define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
  do {                                                                  \
/*  if (__builtin_constant_p (bl))                                      \
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                            \
               : "=r" (sh), "=&r" (sl)                                  \
               : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
    else                                                                \
*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                              \
               : "=r" (sh), "=&r" (sl)                                  \
               : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
  do {                                                                  \
/*  if (__builtin_constant_p (bl))                                      \
      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                            \
               : "=r" (sh), "=&r" (sl)                                  \
               : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
    else                                                                \
*/    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                              \
               : "=r" (sh), "=&r" (sl)                                  \
               : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
  } while (0)
#if __GMP_GNUC_PREREQ (4,5)
#define umul_ppmm(xh, xl, m0, m1)                                       \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__ll = (UDItype) (m0) * (UDItype) (m1);                         \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#else
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define umul_ppmm(xh, xl, m0, m1)                                       \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mlr\t%0,%2"                                               \
             : "=r" (__x.__ll)                                          \
             : "%0" (m0), "r" (m1));                                    \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#else
#define umul_ppmm(xh, xl, m0, m1)                                       \
  do {                                                                  \
  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
     DImode for the product, since that would be allocated to a single 64-bit
     register, whereas mlr uses the low 32-bits of an even-odd register pair.
  */                                                                    \
    register USItype __r0 __asm__ ("0");                                \
    register USItype __r1 __asm__ ("1") = (m0);                         \
    __asm__ ("mlr\t%0,%3"                                               \
             : "=r" (__r0), "=r" (__r1)                                 \
             : "r" (__r1), "r" (m1));                                   \
    (xh) = __r0; (xl) = __r1;                                           \
  } while (0)
#endif /* if 0 */
#endif
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dlr\t%0,%2"                                               \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" (d));                                \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    register USItype __r0 __asm__ ("0") = (n1);                         \
    register USItype __r1 __asm__ ("1") = (n0);                         \
    __asm__ ("dlr\t%0,%4"                                               \
             : "=r" (__r0), "=r" (__r1)                                 \
             : "r" (__r0), "r" (__r1), "r" (d));                        \
    (q) = __r1; (r) = __r0;                                             \
  } while (0)
#endif /* if 0 */
#else /* if __zarch__ */
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define smul_ppmm(xh, xl, m0, m1)                                       \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mr\t%0,%2"                                                \
             : "=r" (__x.__ll)                                          \
             : "%0" (m0), "r" (m1));                                    \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define sdiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dr\t%0,%2"                                                \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" (d));                                \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#endif /* if __zarch__ */
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc.  For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
  do {                                                                  \
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                              \
               : "=r" (sh), "=&r" (sl)                                  \
               : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
                 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
  do {                                                                  \
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                              \
             : "=r" (sh), "=&r" (sl)                                    \
             : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
               "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
  } while (0)
#define umul_ppmm(xh, xl, m0, m1)                                       \
  do {                                                                  \
    union {unsigned int __attribute__ ((mode(TI))) __ll;                \
           struct {UDItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mlgr\t%0,%2"                                              \
             : "=r" (__x.__ll)                                          \
             : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d)                                     \
  do {                                                                  \
    union {unsigned int __attribute__ ((mode(TI))) __ll;                \
           struct {UDItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dlgr\t%0,%2"                                              \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#if 0 /* FIXME: Enable for z10 (?) */
#define count_leading_zeros(cnt, x)                                     \
  do {                                                                  \
    union {unsigned int __attribute__ ((mode(TI))) __ll;                \
           struct {UDItype __h, __l;} __i;                              \
          } __clr_cnt;                                                  \
    __asm__ ("flogr\t%0,%1"                                             \
             : "=r" (__clr_cnt.__ll)                                    \
             : "r" (x) __CLOBBER_CC);                                   \
    (cnt) = __clr_cnt.__i.__h;                                          \
  } while (0)
#endif
#endif

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending on where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    USItype  __n = (n);                                                 \
    USItype  __shift;                                                   \
    __asm__ ("cmpl  $0x1000000, %1\n"                                   \
             "sbbl  %0, %0\n"                                           \
             "cmpl  $0x10000, %1\n"                                     \
             "sbbl  $0, %0\n"                                           \
             "cmpl  $0x100, %1\n"                                       \
             "sbbl  $0, %0\n"                                           \
             : "=&r" (__shift) : "r"  (__n));                           \
    __shift = __shift*8 + 24 + 1;                                       \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    union {                                                             \
      double    d;                                                      \
      unsigned  a[2];                                                   \
    } __u;                                                              \
    ASSERT ((n) != 0);                                                  \
    __u.d = (UWtype) (n);                                               \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
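/* Worked check of the conversion above (assuming the usual x86
   little-endian IEEE layout, where __u.a[1] holds the sign and exponent):
   for n == 1 the biased exponent of 1.0 is 0x3FF, so __u.a[1] >> 20 is
   0x3FF and c becomes 0x3FF + 31 - 0x3FF = 31, the correct count.  */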
#endif /* pentiummmx */

#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
  && (HAVE_HOST_CPU_i386                                                \
      || HAVE_HOST_CPU_i686                                             \
      || HAVE_HOST_CPU_pentiumpro                                       \
      || HAVE_HOST_CPU_pentium2                                         \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = 31 - __cbtmp;                                             \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UDItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
    (count) = __cbtmp ^ 63;                                             \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
  } while (0)
#endif /* x86_64 */
1001
1002 #if defined (__i860__) && W_TYPE_SIZE == 32
1003 #define rshift_rhlc(r,h,l,c) \
1004   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
1005            : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1006 #endif /* i860 */
1007
1008 #if defined (__i960__) && W_TYPE_SIZE == 32
1009 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1010   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
1011            : "=r" (sh), "=&r" (sl)                                      \
1012            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1013 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1014   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
1015            : "=r" (sh), "=&r" (sl)                                      \
1016            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1017 #define umul_ppmm(w1, w0, u, v) \
1018   ({union {UDItype __ll;                                                \
1019            struct {USItype __l, __h;} __i;                              \
1020           } __x;                                                        \
1021   __asm__ ("emul %2,%1,%0"                                              \
1022            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
1023   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1024 #define __umulsidi3(u, v) \
1025   ({UDItype __w;                                                        \
1026     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
1027     __w; })
1028 #define udiv_qrnnd(q, r, nh, nl, d) \
1029   do {                                                                  \
1030     union {UDItype __ll;                                                \
1031            struct {USItype __l, __h;} __i;                              \
1032           } __nn, __rq;                                                 \
1033     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
1034     __asm__ ("ediv %d,%n,%0"                                            \
1035            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
1036     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
1037   } while (0)
1038 #define count_leading_zeros(count, x) \
1039   do {                                                                  \
1040     USItype __cbtmp;                                                    \
1041     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
1042     (count) = __cbtmp ^ 31;                                             \
1043   } while (0)
1044 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1045 #if defined (__i960mx)          /* what is the proper symbol to test??? */
1046 #define rshift_rhlc(r,h,l,c) \
1047   do {                                                                  \
1048     union {UDItype __ll;                                                \
1049            struct {USItype __l, __h;} __i;                              \
1050           } __nn;                                                       \
1051     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
1052     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
1053   } while (0)
1054 #endif /* i960mx */
1055 #endif /* i960 */
1056
1057 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1058      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1059      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1060 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1061   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
1062            : "=d" (sh), "=&d" (sl)                                      \
1063            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
1064              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1065 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1066   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
1067            : "=d" (sh), "=&d" (sl)                                      \
1068            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
1069              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1070 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1071 #if defined (__mc68020__) || defined(mc68020) \
1072      || defined (__mc68030__) || defined (mc68030) \
1073      || defined (__mc68040__) || defined (mc68040) \
1074      || defined (__mcpu32__) || defined (mcpu32) \
1075      || defined (__NeXT__)
1076 #define umul_ppmm(w1, w0, u, v) \
1077   __asm__ ("mulu%.l %3,%1:%0"                                           \
1078            : "=d" (w0), "=d" (w1)                                       \
1079            : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1080 #define UMUL_TIME 45
1081 #define udiv_qrnnd(q, r, n1, n0, d) \
1082   __asm__ ("divu%.l %4,%1:%0"                                           \
1083            : "=d" (q), "=d" (r)                                         \
1084            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1085 #define UDIV_TIME 90
1086 #define sdiv_qrnnd(q, r, n1, n0, d) \
1087   __asm__ ("divs%.l %4,%1:%0"                                           \
1088            : "=d" (q), "=d" (r)                                         \
1089            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1090 #else /* for other 68k family members use 16x16->32 multiplication */
1091 #define umul_ppmm(xh, xl, a, b) \
1092   do { USItype __umul_tmp1, __umul_tmp2;                                \
1093         __asm__ ("| Inlined umul_ppmm\n"                                \
1094 "       move%.l %5,%3\n"                                                \
1095 "       move%.l %2,%0\n"                                                \
1096 "       move%.w %3,%1\n"                                                \
1097 "       swap    %3\n"                                                   \
1098 "       swap    %0\n"                                                   \
1099 "       mulu%.w %2,%1\n"                                                \
1100 "       mulu%.w %3,%0\n"                                                \
1101 "       mulu%.w %2,%3\n"                                                \
1102 "       swap    %2\n"                                                   \
1103 "       mulu%.w %5,%2\n"                                                \
1104 "       add%.l  %3,%2\n"                                                \
1105 "       jcc     1f\n"                                                   \
1106 "       add%.l  %#0x10000,%0\n"                                         \
1107 "1:     move%.l %2,%3\n"                                                \
1108 "       clr%.w  %2\n"                                                   \
1109 "       swap    %2\n"                                                   \
1110 "       swap    %3\n"                                                   \
1111 "       clr%.w  %3\n"                                                   \
1112 "       add%.l  %3,%1\n"                                                \
1113 "       addx%.l %2,%0\n"                                                \
1114 "       | End inlined umul_ppmm"                                        \
1115               : "=&d" (xh), "=&d" (xl),                                 \
1116                 "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
1117               : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
1118   } while (0)
1119 #define UMUL_TIME 100
1120 #define UDIV_TIME 400
1121 #endif /* not mc68020 */
1122 /* The '020, '030, '040 and '060 have bitfield insns.
1123    GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
1124    exclude bfffo on that chip (bitfield insns are not available there).  */
1125 #if (defined (__mc68020__) || defined (mc68020)    \
1126      || defined (__mc68030__) || defined (mc68030) \
1127      || defined (__mc68040__) || defined (mc68040) \
1128      || defined (__mc68060__) || defined (mc68060) \
1129      || defined (__NeXT__))                        \
1130   && ! defined (__mcpu32__)
1131 #define count_leading_zeros(count, x) \
1132   __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
1133            : "=d" (count)                                               \
1134            : "od" ((USItype) (x)), "n" (0))
1135 #define COUNT_LEADING_ZEROS_0 32
1136 #endif
1137 #endif /* mc68000 */
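
/* Illustration only, kept disabled: the 16x16->32 asm above is the same
   schoolbook split as the generic C umul_ppmm near the end of this file.
   A C restatement for readers who would rather not decode the m68k asm
   (the function name is invented for the sketch):  */
#if 0
static void
m68k_umul_sketch (USItype *xh, USItype *xl, USItype a, USItype b)
{
  USItype al = a & 0xffff, ah = a >> 16;
  USItype bl = b & 0xffff, bh = b >> 16;
  USItype p0 = al * bl, p1 = al * bh, p2 = ah * bl, p3 = ah * bh;
  USItype mid = p1 + p2;        /* cross products, weight 2^16 */
  USItype hi, lo;
  if (mid < p1)                 /* mid wrapped: the "jcc 1f" fixup */
    p3 += 0x10000;              /* restore the lost 2^32, in high-word units */
  hi = p3 + (mid >> 16);
  lo = p0 + (mid << 16);
  if (lo < p0)                  /* carry out of the low word */
    hi++;
  *xh = hi;
  *xl = lo;
}
#endif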
1138
1139 #if defined (__m88000__) && W_TYPE_SIZE == 32
1140 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1141   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
1142            : "=r" (sh), "=&r" (sl)                                      \
1143            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1144 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1145   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
1146            : "=r" (sh), "=&r" (sl)                                      \
1147            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1148 #define count_leading_zeros(count, x) \
1149   do {                                                                  \
1150     USItype __cbtmp;                                                    \
1151     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
1152     (count) = __cbtmp ^ 31;                                             \
1153   } while (0)
1154 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1155 #if defined (__m88110__)
1156 #define umul_ppmm(wh, wl, u, v) \
1157   do {                                                                  \
1158     union {UDItype __ll;                                                \
1159            struct {USItype __h, __l;} __i;                              \
1160           } __x;                                                        \
1161     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1162     (wh) = __x.__i.__h;                                                 \
1163     (wl) = __x.__i.__l;                                                 \
1164   } while (0)
1165 #define udiv_qrnnd(q, r, n1, n0, d) \
1166   ({union {UDItype __ll;                                                \
1167            struct {USItype __h, __l;} __i;                              \
1168           } __x, __q;                                                   \
1169   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1170   __asm__ ("divu.d %0,%1,%2"                                            \
1171            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1172   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1173 #define UMUL_TIME 5
1174 #define UDIV_TIME 25
1175 #else
1176 #define UMUL_TIME 17
1177 #define UDIV_TIME 150
1178 #endif /* __m88110__ */
1179 #endif /* __m88000__ */
1180
1181 #if defined (__mips) && W_TYPE_SIZE == 32
1182 #if __GMP_GNUC_PREREQ (4,4)
1183 #define umul_ppmm(w1, w0, u, v) \
1184   do {                                                                  \
1185     UDItype __ll = (UDItype)(u) * (v);                                  \
1186     w1 = __ll >> 32;                                                    \
1187     w0 = __ll;                                                          \
1188   } while (0)
1189 #endif
1190 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1191 #define umul_ppmm(w1, w0, u, v) \
1192   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1193 #endif
1194 #if !defined (umul_ppmm)
1195 #define umul_ppmm(w1, w0, u, v) \
1196   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1197            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1198 #endif
1199 #define UMUL_TIME 10
1200 #define UDIV_TIME 100
1201 #endif /* __mips */
1202
1203 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1204 #if __GMP_GNUC_PREREQ (4,4)
1205 #define umul_ppmm(w1, w0, u, v) \
1206   do {                                                                  \
1207     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1208     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1209     w1 = __ll >> 64;                                                    \
1210     w0 = __ll;                                                          \
1211   } while (0)
1212 #endif
1213 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1214 #define umul_ppmm(w1, w0, u, v) \
1215   __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1216 #endif
1217 #if !defined (umul_ppmm)
1218 #define umul_ppmm(w1, w0, u, v) \
1219   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1220            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1221 #endif
1222 #define UMUL_TIME 20
1223 #define UDIV_TIME 140
1224 #endif /* __mips */
1225
1226 #if defined (__mmix__) && W_TYPE_SIZE == 64
1227 #define umul_ppmm(w1, w0, u, v) \
1228   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1229 #endif
1230
1231 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1232 #define umul_ppmm(w1, w0, u, v) \
1233   ({union {UDItype __ll;                                                \
1234            struct {USItype __l, __h;} __i;                              \
1235           } __x;                                                        \
1236   __asm__ ("meid %2,%0"                                                 \
1237            : "=g" (__x.__ll)                                            \
1238            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1239   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1240 #define __umulsidi3(u, v) \
1241   ({UDItype __w;                                                        \
1242     __asm__ ("meid %2,%0"                                               \
1243              : "=g" (__w)                                               \
1244              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1245     __w; })
1246 #define udiv_qrnnd(q, r, n1, n0, d) \
1247   ({union {UDItype __ll;                                                \
1248            struct {USItype __l, __h;} __i;                              \
1249           } __x;                                                        \
1250   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1251   __asm__ ("deid %2,%0"                                                 \
1252            : "=g" (__x.__ll)                                            \
1253            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1254   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1255 #define count_trailing_zeros(count,x) \
1256   do {                                                                  \
1257     __asm__ ("ffsd      %2,%0"                                          \
1258              : "=r" (count)                                             \
1259              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1260   } while (0)
1261 #endif /* __ns32000__ */
1262
1263 /* In the past we had a block of various #defines tested
1264        _ARCH_PPC    - AIX
1265        _ARCH_PWR    - AIX
1266        __powerpc__  - gcc
1267        __POWERPC__  - BEOS
1268        __ppc__      - Darwin
1269        PPC          - old gcc, GNU/Linux, SysV
1270    The plain PPC test was not good for vxWorks, since PPC is defined on all
1271    CPUs there (eg. m68k too), as a constant that CPU_FAMILY is expected to
1272    be compared against.
1273
1274    At any rate, this was pretty unattractive and a bit fragile.  The use of
1275    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1276    getting the desired effect.
1277
1278    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1279    the system vendor compilers.  (Is that vendor compilers with inline asm,
1280    or what?)  */
1281
1282 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1283   && W_TYPE_SIZE == 32
1284 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1285   do {                                                                  \
1286     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1287       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1288              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1289     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1290       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1291              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1292     else                                                                \
1293       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1294              : "=r" (sh), "=&r" (sl)                                    \
1295              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1296   } while (0)
1297 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1298   do {                                                                  \
1299     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1300       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
1301                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1302     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1303       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
1304                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1305     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1306       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
1307                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1308     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1309       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
1310                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1311     else                                                                \
1312       __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
1313                : "=r" (sh), "=&r" (sl)                                  \
1314                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1315   } while (0)
1316 #define count_leading_zeros(count, x) \
1317   __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1318 #define COUNT_LEADING_ZEROS_0 32
1319 #if HAVE_HOST_CPU_FAMILY_powerpc
1320 #if __GMP_GNUC_PREREQ (4,4)
1321 #define umul_ppmm(w1, w0, u, v) \
1322   do {                                                                  \
1323     UDItype __ll = (UDItype)(u) * (v);                                  \
1324     w1 = __ll >> 32;                                                    \
1325     w0 = __ll;                                                          \
1326   } while (0)
1327 #endif
1328 #if !defined (umul_ppmm)
1329 #define umul_ppmm(ph, pl, m0, m1) \
1330   do {                                                                  \
1331     USItype __m0 = (m0), __m1 = (m1);                                   \
1332     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1333     (pl) = __m0 * __m1;                                                 \
1334   } while (0)
1335 #endif
1336 #define UMUL_TIME 15
1337 #define smul_ppmm(ph, pl, m0, m1) \
1338   do {                                                                  \
1339     SItype __m0 = (m0), __m1 = (m1);                                    \
1340     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1341     (pl) = __m0 * __m1;                                                 \
1342   } while (0)
1343 #define SMUL_TIME 14
1344 #define UDIV_TIME 120
1345 #else
1346 #define UMUL_TIME 8
1347 #define smul_ppmm(xh, xl, m0, m1) \
1348   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1349 #define SMUL_TIME 4
1350 #define sdiv_qrnnd(q, r, nh, nl, d) \
1351   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1352 #define UDIV_TIME 100
1353 #endif
1354 #endif /* 32-bit POWER architecture variants.  */
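
/* Illustration only, kept disabled: what the __builtin_constant_p tests
   above buy.  With a literal high word gcc resolves the dispatch at
   compile time and folds away the dead branches, so a known-zero bh gets
   the shorter addze sequence.  The function name is invented.  */
#if 0
static void
power_sketch (USItype ah, USItype al, USItype bl)
{
  USItype sh, sl;
  /* bh is the literal 0: the first branch is chosen statically.  */
  add_ssaaaa (sh, sl, ah, al, (USItype) 0, bl);
}
#endif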
1355
1356 /* We should test _IBMR2 here when we add assembly support for the system
1357    vendor compilers.  */
1358 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1359 #if !defined (_LONG_LONG_LIMB)
1360 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1361    use adde etc only when not _LONG_LONG_LIMB.  */
1362 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1363   do {                                                                  \
1364     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1365       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1366              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1367     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1368       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1369              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1370     else                                                                \
1371       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1372              : "=r" (sh), "=&r" (sl)                                    \
1373              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1374   } while (0)
1375 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1376    This might seem strange, but gcc folds away the dead code late.  */
1377 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1378   do {                                                                        \
1379     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {          \
1380         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1381           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"               \
1382                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1383         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1384           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"               \
1385                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1386         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1387           __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"                 \
1388                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1389         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1390           __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"                 \
1391                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1392         else                                                                  \
1393           __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"              \
1394                    : "=r" (sh), "=&r" (sl)                                    \
1395                    : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));             \
1396       } else {                                                                \
1397         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1398           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"         \
1399                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1400         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1401           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"         \
1402                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1403         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1404           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"           \
1405                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1406         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1407           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"           \
1408                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1409         else                                                                  \
1410           __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"        \
1411                    : "=r" (sh), "=&r" (sl)                                    \
1412                    : "r" (ah), "r" (bh), "rI" (al), "r" (bl));                \
1413       }                                                                       \
1414   } while (0)
1415 #endif /* ! _LONG_LONG_LIMB */
1416 #define count_leading_zeros(count, x) \
1417   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1418 #define COUNT_LEADING_ZEROS_0 64
1419 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1420 #define umul_ppmm(w1, w0, u, v) \
1421   do {                                                                  \
1422     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1423     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1424     w1 = __ll >> 64;                                                    \
1425     w0 = __ll;                                                          \
1426   } while (0)
1427 #endif
1428 #if !defined (umul_ppmm)
1429 #define umul_ppmm(ph, pl, m0, m1) \
1430   do {                                                                  \
1431     UDItype __m0 = (m0), __m1 = (m1);                                   \
1432     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1433     (pl) = __m0 * __m1;                                                 \
1434   } while (0)
1435 #endif
1436 #define UMUL_TIME 15
1437 #define smul_ppmm(ph, pl, m0, m1) \
1438   do {                                                                  \
1439     DItype __m0 = (m0), __m1 = (m1);                                    \
1440     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));   \
1441     (pl) = __m0 * __m1;                                                 \
1442   } while (0)
1443 #define SMUL_TIME 14  /* ??? */
1444 #define UDIV_TIME 120 /* ??? */
1445 #endif /* 64-bit PowerPC.  */
1446
1447 #if defined (__pyr__) && W_TYPE_SIZE == 32
1448 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1449   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1450            : "=r" (sh), "=&r" (sl)                                      \
1451            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1452              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1454   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1455            : "=r" (sh), "=&r" (sl)                                      \
1456            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1457              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1458 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1459 #define umul_ppmm(w1, w0, u, v) \
1460   ({union {UDItype __ll;                                                \
1461            struct {USItype __h, __l;} __i;                              \
1462           } __x;                                                        \
1463   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1464            : "=&r" (__x.__ll)                                           \
1465            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1466   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1467 #endif /* __pyr__ */
1468
1469 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1470 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1471   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1472            : "=r" (sh), "=&r" (sl)                                      \
1473            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1474              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1475 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1476   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1477            : "=r" (sh), "=&r" (sl)                                      \
1478            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1479              "1" ((USItype)(al)), "r" ((USItype)(bl)))
1480 #define smul_ppmm(ph, pl, m0, m1) \
1481   __asm__ (                                                             \
1482        "s       r2,r2\n"                                                \
1483 "       mts r10,%2\n"                                                   \
1484 "       m       r2,%3\n"                                                \
1485 "       m       r2,%3\n"                                                \
1486 "       m       r2,%3\n"                                                \
1487 "       m       r2,%3\n"                                                \
1488 "       m       r2,%3\n"                                                \
1489 "       m       r2,%3\n"                                                \
1490 "       m       r2,%3\n"                                                \
1491 "       m       r2,%3\n"                                                \
1492 "       m       r2,%3\n"                                                \
1493 "       m       r2,%3\n"                                                \
1494 "       m       r2,%3\n"                                                \
1495 "       m       r2,%3\n"                                                \
1496 "       m       r2,%3\n"                                                \
1497 "       m       r2,%3\n"                                                \
1498 "       m       r2,%3\n"                                                \
1499 "       m       r2,%3\n"                                                \
1500 "       cas     %0,r2,r0\n"                                             \
1501 "       mfs     r10,%1"                                                 \
1502            : "=r" (ph), "=r" (pl)                                       \
1503            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1504            : "r2")
1505 #define UMUL_TIME 20
1506 #define UDIV_TIME 200
1507 #define count_leading_zeros(count, x) \
1508   do {                                                                  \
1509     if ((x) >= 0x10000)                                                 \
1510       __asm__ ("clz     %0,%1"                                          \
1511                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1512     else                                                                \
1513       {                                                                 \
1514         __asm__ ("clz   %0,%1"                                          \
1515                  : "=r" (count) : "r" ((USItype)(x)));                  \
1516         (count) += 16;                                                  \
1517       }                                                                 \
1518   } while (0)
1519 #endif /* RT/ROMP */
1520
1521 #if defined (__sh2__) && W_TYPE_SIZE == 32
1522 #define umul_ppmm(w1, w0, u, v) \
1523   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1524            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1525 #define UMUL_TIME 5
1526 #endif
1527
1528 #if defined (__sparc__) && W_TYPE_SIZE == 32
1529 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1530   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1531            : "=r" (sh), "=&r" (sl)                                      \
1532            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1533            __CLOBBER_CC)
1534 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1535   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1536            : "=r" (sh), "=&r" (sl)                                      \
1537            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1538            __CLOBBER_CC)
1539 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1540    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1541 #if defined (__sparc_v9__) || defined (__sparcv9)
1542 /* Perhaps we should use floating-point operations here?  */
1543 #if 0
1544 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1545    Perhaps we simply need to explicitly zero-extend the inputs?  */
1546 #define umul_ppmm(w1, w0, u, v) \
1547   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1548            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1549 #else
1550 /* Use v8 umul until above bug is fixed.  */
1551 #define umul_ppmm(w1, w0, u, v) \
1552   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1553 #endif
1554 /* Use a plain v8 divide for v9.  */
1555 #define udiv_qrnnd(q, r, n1, n0, d) \
1556   do {                                                                  \
1557     USItype __q;                                                        \
1558     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1559              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1560     (r) = (n0) - __q * (d);                                             \
1561     (q) = __q;                                                          \
1562   } while (0)
1563 #else
1564 #if defined (__sparc_v8__)   /* gcc normal */                           \
1565   || defined (__sparcv8)     /* gcc solaris */                          \
1566   || HAVE_HOST_CPU_supersparc
1567 /* Don't match immediate range because: 1) it is not often useful,
1568    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1569    while we want to match a 13 bit interval, sign extended to 32 bits,
1570    but INTERPRETED AS UNSIGNED.  */
1571 #define umul_ppmm(w1, w0, u, v) \
1572   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1573 #define UMUL_TIME 5
1574
1575 #if HAVE_HOST_CPU_supersparc
1576 #define UDIV_TIME 60            /* SuperSPARC timing */
1577 #else
1578 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1579    dividends and will trap to the kernel for the rest. */
1580 #define udiv_qrnnd(q, r, n1, n0, d) \
1581   do {                                                                  \
1582     USItype __q;                                                        \
1583     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1584              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1585     (r) = (n0) - __q * (d);                                             \
1586     (q) = __q;                                                          \
1587   } while (0)
1588 #define UDIV_TIME 25
1589 #endif /* HAVE_HOST_CPU_supersparc */
1590
1591 #else /* ! __sparc_v8__ */
1592 #if defined (__sparclite__)
1593 /* This has hardware multiply but not divide.  It also has two additional
1594    instructions scan (ffs from high bit) and divscc.  */
1595 #define umul_ppmm(w1, w0, u, v) \
1596   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1597 #define UMUL_TIME 5
1598 #define udiv_qrnnd(q, r, n1, n0, d) \
1599   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1600 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1601 "       tst     %%g0\n"                                                 \
1602 "       divscc  %3,%4,%%g1\n"                                           \
1603 "       divscc  %%g1,%4,%%g1\n"                                         \
1604 "       divscc  %%g1,%4,%%g1\n"                                         \
1605 "       divscc  %%g1,%4,%%g1\n"                                         \
1606 "       divscc  %%g1,%4,%%g1\n"                                         \
1607 "       divscc  %%g1,%4,%%g1\n"                                         \
1608 "       divscc  %%g1,%4,%%g1\n"                                         \
1609 "       divscc  %%g1,%4,%%g1\n"                                         \
1610 "       divscc  %%g1,%4,%%g1\n"                                         \
1611 "       divscc  %%g1,%4,%%g1\n"                                         \
1612 "       divscc  %%g1,%4,%%g1\n"                                         \
1613 "       divscc  %%g1,%4,%%g1\n"                                         \
1614 "       divscc  %%g1,%4,%%g1\n"                                         \
1615 "       divscc  %%g1,%4,%%g1\n"                                         \
1616 "       divscc  %%g1,%4,%%g1\n"                                         \
1617 "       divscc  %%g1,%4,%%g1\n"                                         \
1618 "       divscc  %%g1,%4,%%g1\n"                                         \
1619 "       divscc  %%g1,%4,%%g1\n"                                         \
1620 "       divscc  %%g1,%4,%%g1\n"                                         \
1621 "       divscc  %%g1,%4,%%g1\n"                                         \
1622 "       divscc  %%g1,%4,%%g1\n"                                         \
1623 "       divscc  %%g1,%4,%%g1\n"                                         \
1624 "       divscc  %%g1,%4,%%g1\n"                                         \
1625 "       divscc  %%g1,%4,%%g1\n"                                         \
1626 "       divscc  %%g1,%4,%%g1\n"                                         \
1627 "       divscc  %%g1,%4,%%g1\n"                                         \
1628 "       divscc  %%g1,%4,%%g1\n"                                         \
1629 "       divscc  %%g1,%4,%%g1\n"                                         \
1630 "       divscc  %%g1,%4,%%g1\n"                                         \
1631 "       divscc  %%g1,%4,%%g1\n"                                         \
1632 "       divscc  %%g1,%4,%%g1\n"                                         \
1633 "       divscc  %%g1,%4,%0\n"                                           \
1634 "       rd      %%y,%1\n"                                               \
1635 "       bl,a 1f\n"                                                      \
1636 "       add     %1,%4,%1\n"                                             \
1637 "1:     ! End of inline udiv_qrnnd"                                     \
1638            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1639            : "%g1" __AND_CLOBBER_CC)
1640 #define UDIV_TIME 37
1641 #define count_leading_zeros(count, x) \
1642   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1643 /* Early sparclites return 63 for an argument of 0, but the documentation
1644    warns that future implementations might change this.  Therefore, leave
1645    COUNT_LEADING_ZEROS_0 undefined.  */
1646 #endif /* __sparclite__ */
1647 #endif /* __sparc_v8__ */
1648 #endif /* __sparc_v9__ */
1649 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1650 #ifndef umul_ppmm
1651 #define umul_ppmm(w1, w0, u, v) \
1652   __asm__ ("! Inlined umul_ppmm\n"                                      \
1653 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1654 "       sra     %3,31,%%g2      ! Don't move this insn\n"               \
1655 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
1656 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1657 "       mulscc  %%g1,%3,%%g1\n"                                         \
1658 "       mulscc  %%g1,%3,%%g1\n"                                         \
1659 "       mulscc  %%g1,%3,%%g1\n"                                         \
1660 "       mulscc  %%g1,%3,%%g1\n"                                         \
1661 "       mulscc  %%g1,%3,%%g1\n"                                         \
1662 "       mulscc  %%g1,%3,%%g1\n"                                         \
1663 "       mulscc  %%g1,%3,%%g1\n"                                         \
1664 "       mulscc  %%g1,%3,%%g1\n"                                         \
1665 "       mulscc  %%g1,%3,%%g1\n"                                         \
1666 "       mulscc  %%g1,%3,%%g1\n"                                         \
1667 "       mulscc  %%g1,%3,%%g1\n"                                         \
1668 "       mulscc  %%g1,%3,%%g1\n"                                         \
1669 "       mulscc  %%g1,%3,%%g1\n"                                         \
1670 "       mulscc  %%g1,%3,%%g1\n"                                         \
1671 "       mulscc  %%g1,%3,%%g1\n"                                         \
1672 "       mulscc  %%g1,%3,%%g1\n"                                         \
1673 "       mulscc  %%g1,%3,%%g1\n"                                         \
1674 "       mulscc  %%g1,%3,%%g1\n"                                         \
1675 "       mulscc  %%g1,%3,%%g1\n"                                         \
1676 "       mulscc  %%g1,%3,%%g1\n"                                         \
1677 "       mulscc  %%g1,%3,%%g1\n"                                         \
1678 "       mulscc  %%g1,%3,%%g1\n"                                         \
1679 "       mulscc  %%g1,%3,%%g1\n"                                         \
1680 "       mulscc  %%g1,%3,%%g1\n"                                         \
1681 "       mulscc  %%g1,%3,%%g1\n"                                         \
1682 "       mulscc  %%g1,%3,%%g1\n"                                         \
1683 "       mulscc  %%g1,%3,%%g1\n"                                         \
1684 "       mulscc  %%g1,%3,%%g1\n"                                         \
1685 "       mulscc  %%g1,%3,%%g1\n"                                         \
1686 "       mulscc  %%g1,%3,%%g1\n"                                         \
1687 "       mulscc  %%g1,%3,%%g1\n"                                         \
1688 "       mulscc  %%g1,%3,%%g1\n"                                         \
1689 "       mulscc  %%g1,0,%%g1\n"                                          \
1690 "       add     %%g1,%%g2,%0\n"                                         \
1691 "       rd      %%y,%1"                                                 \
1692            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1693            : "%g1", "%g2" __AND_CLOBBER_CC)
1694 #define UMUL_TIME 39            /* 39 instructions */
1695 #endif
1696 #ifndef udiv_qrnnd
1697 #ifndef LONGLONG_STANDALONE
1698 #define udiv_qrnnd(q, r, n1, n0, d) \
1699   do { UWtype __r;                                                      \
1700     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
1701     (r) = __r;                                                          \
1702   } while (0)
1703 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1704 #ifndef UDIV_TIME
1705 #define UDIV_TIME 140
1706 #endif
1707 #endif /* LONGLONG_STANDALONE */
1708 #endif /* udiv_qrnnd */
1709 #endif /* __sparc__ */
1710
1711 #if defined (__sparc__) && W_TYPE_SIZE == 64
1712 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1713   __asm__ (                                                             \
1714        "addcc   %r4,%5,%1\n"                                            \
1715       " addccc  %r6,%7,%%g0\n"                                          \
1716       " addc    %r2,%3,%0"                                              \
1717           : "=r" (sh), "=&r" (sl)                                       \
1718           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
1719             "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
1720            __CLOBBER_CC)
1721 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1722   __asm__ (                                                             \
1723        "subcc   %r4,%5,%1\n"                                            \
1724       " subccc  %r6,%7,%%g0\n"                                          \
1725       " subc    %r2,%3,%0"                                              \
1726           : "=r" (sh), "=&r" (sl)                                       \
1727           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),         \
1728             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
1729            __CLOBBER_CC)
1730 #endif
1731
1732 #if defined (__vax__) && W_TYPE_SIZE == 32
1733 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1734   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
1735            : "=g" (sh), "=&g" (sl)                                      \
1736            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1737              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1738 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1739   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
1740            : "=g" (sh), "=&g" (sl)                                      \
1741            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1742              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1743 #define smul_ppmm(xh, xl, m0, m1) \
1744   do {                                                                  \
1745     union {UDItype __ll;                                                \
1746            struct {USItype __l, __h;} __i;                              \
1747           } __x;                                                        \
1748     USItype __m0 = (m0), __m1 = (m1);                                   \
1749     __asm__ ("emul %1,%2,$0,%0"                                         \
1750              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1751     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1752   } while (0)
1753 #define sdiv_qrnnd(q, r, n1, n0, d) \
1754   do {                                                                  \
1755     union {DItype __ll;                                                 \
1756            struct {SItype __l, __h;} __i;                               \
1757           } __x;                                                        \
1758     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
1759     __asm__ ("ediv %3,%2,%0,%1"                                         \
1760              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1761   } while (0)
1762 #if 0
1763 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1764    8800 maybe). */
1765 #define count_trailing_zeros(count,x)                                   \
1766   do {                                                                  \
1767     __asm__ ("ffs 0, 31, %1, %0"                                        \
1768              : "=g" (count)                                             \
1769              : "g" ((USItype) (x)));                                    \
1770   } while (0)
1771 #endif
1772 #endif /* __vax__ */
1773
1774 #if defined (__z8000__) && W_TYPE_SIZE == 16
1775 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1776   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
1777            : "=r" (sh), "=&r" (sl)                                      \
1778            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
1779              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1780 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1781   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
1782            : "=r" (sh), "=&r" (sl)                                      \
1783            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
1784              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1785 #define umul_ppmm(xh, xl, m0, m1) \
1786   do {                                                                  \
1787     union {long int __ll;                                               \
1788            struct {unsigned int __h, __l;} __i;                         \
1789           } __x;                                                        \
1790     unsigned int __m0 = (m0), __m1 = (m1);                              \
1791     __asm__ ("mult      %S0,%H3"                                        \
1792              : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
1793              : "%1" (m0), "rQR" (m1));                                  \
1794     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1795     (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
1796              + (((signed int) __m1 >> 15) & __m0));                     \
1797   } while (0)
1798 #endif /* __z8000__ */
1799
1800 #endif /* __GNUC__ */
1801
1802 #endif /* NO_ASM */
1803
1804
1805 #if !defined (umul_ppmm) && defined (__umulsidi3)
1806 #define umul_ppmm(ph, pl, m0, m1) \
1807   do {                                                                  \
1808     UDWtype __ll = __umulsidi3 (m0, m1);                                \
1809     ph = (UWtype) (__ll >> W_TYPE_SIZE);                                \
1810     pl = (UWtype) __ll;                                                 \
1811   } while (0)
1812 #endif
1813
1814 #if !defined (__umulsidi3)
1815 #define __umulsidi3(u, v) \
1816   ({UWtype __hi, __lo;                                                  \
1817     umul_ppmm (__hi, __lo, u, v);                                       \
1818     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1819 #endif
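
/* Illustration only, kept disabled: the identity the two fallbacks above
   rely on -- __umulsidi3 is umul_ppmm repackaged as one UDWtype, and vice
   versa.  The function name is invented.  */
#if 0
static UDWtype
umulsidi3_sketch (UWtype u, UWtype v)
{
  UWtype hi, lo;
  umul_ppmm (hi, lo, u, v);
  /* Equal by construction to __umulsidi3 (u, v).  */
  return ((UDWtype) hi << W_TYPE_SIZE) | lo;
}
#endif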
1820
1821
1822 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1823    forms have "reversed" arguments, meaning the pointer is last, which
1824    sometimes allows better parameter passing, in particular on 64-bit
1825    hppa. */
1826
1827 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1828 extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
1829
1830 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1831   && ! defined (LONGLONG_STANDALONE)
1832 #define umul_ppmm(wh, wl, u, v)                                               \
1833   do {                                                                        \
1834     UWtype __umul_ppmm__p0;                                                   \
1835     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
1836     (wl) = __umul_ppmm__p0;                                                   \
1837   } while (0)
1838 #endif
1839
1840 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1841 extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
1842
1843 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1844   && ! defined (LONGLONG_STANDALONE)
1845 #define umul_ppmm(wh, wl, u, v)                                               \
1846   do {                                                                        \
1847     UWtype __umul_ppmm__p0;                                                   \
1848     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
1849     (wl) = __umul_ppmm__p0;                                                   \
1850   } while (0)
1851 #endif
1852
1853 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1854 extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1855
1856 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1857   && ! defined (LONGLONG_STANDALONE)
1858 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1859   do {                                                                  \
1860     UWtype __udiv_qrnnd__r;                                             \
1861     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
1862                           (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
1863     (r) = __udiv_qrnnd__r;                                              \
1864   } while (0)
1865 #endif
1866
1867 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1868 extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
1869
1870 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1871   && ! defined (LONGLONG_STANDALONE)
1872 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1873   do {                                                                  \
1874     UWtype __udiv_qrnnd__r;                                             \
1875     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
1876                             &__udiv_qrnnd__r);                          \
1877     (r) = __udiv_qrnnd__r;                                              \
1878   } while (0)
1879 #endif
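
/* Illustration only, kept disabled: whichever udiv_qrnnd definition is
   ultimately selected, it satisfies the contract described at the top of
   this file, restated here as code.  The function name is invented.  */
#if 0
static void
udiv_qrnnd_sketch (UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  ASSERT (d != 0 && n1 < d);    /* required: high word below the divisor */
  udiv_qrnnd (q, r, n1, n0, d);
  /* Now ((UDWtype) n1 << W_TYPE_SIZE | n0) == (UDWtype) q * d + r, with
     r < d.  If UDIV_NEEDS_NORMALIZATION, d must also have its most
     significant bit set.  */
}
#endif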
1880
1881
1882 /* If this machine has no inline assembler, use C macros.  */
1883
1884 #if !defined (add_ssaaaa)
1885 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1886   do {                                                                  \
1887     UWtype __x;                                                         \
1888     __x = (al) + (bl);                                                  \
1889     (sh) = (ah) + (bh) + (__x < (al));                                  \
1890     (sl) = __x;                                                         \
1891   } while (0)
1892 #endif
1893
1894 #if !defined (sub_ddmmss)
1895 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1896   do {                                                                  \
1897     UWtype __x;                                                         \
1898     __x = (al) - (bl);                                                  \
1899     (sh) = (ah) - (bh) - ((al) < (bl));                                 \
1900     (sl) = __x;                                                         \
1901   } while (0)
1902 #endif
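
/* Illustration only, kept disabled: the C macros recover the carry or
   borrow from an unsigned comparison, since a wrapped sum is smaller than
   either addend.  A spot check (the function name is invented):  */
#if 0
static void
carry_sketch (void)
{
  UWtype sh, sl;
  /* (0, ~0) + (0, 2): the low word wraps to 1 and carries.  */
  add_ssaaaa (sh, sl, (UWtype) 0, ~(UWtype) 0, (UWtype) 0, (UWtype) 2);
  /* Here sh == 1 and sl == 1.  */
  /* (1, 0) - (0, 1): the low word borrows from the high.  */
  sub_ddmmss (sh, sl, (UWtype) 1, (UWtype) 0, (UWtype) 0, (UWtype) 1);
  /* Here sh == 0 and sl == ~(UWtype) 0.  */
}
#endif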
1903
1904 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1905    smul_ppmm.  */
1906 #if !defined (umul_ppmm) && defined (smul_ppmm)
1907 #define umul_ppmm(w1, w0, u, v)                                         \
1908   do {                                                                  \
1909     UWtype __w1;                                                        \
1910     UWtype __xm0 = (u), __xm1 = (v);                                    \
1911     smul_ppmm (__w1, w0, __xm0, __xm1);                                 \
1912     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
1913                 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
1914   } while (0)
1915 #endif
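
/* Why that correction is right: reading a UWtype u as signed yields the
   value u - 2^W*msb(u), where msb(u) is u's top bit and W is W_TYPE_SIZE.
   Expanding u*v with both factors rewritten this way, modulo 2^(2W),
   shows the unsigned high word equals the signed high word plus v when
   msb(u) is set and plus u when msb(v) is set, while the low words agree.
   The expression -(__xm0 >> (W_TYPE_SIZE - 1)) is an all-ones mask
   exactly when __xm0's top bit is set, selecting those additions
   branch-free.  */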
1916
1917 /* If we still don't have umul_ppmm, define it using plain C.
1918
1919    For reference, when this code is used for squaring (ie. u and v identical
1920    expressions), gcc recognises __x1 and __x2 are the same and generates 3
1921    multiplies, not 4.  The subsequent additions could be optimized a bit,
1922    but the only place GMP currently uses such a square is mpn_sqr_basecase,
1923    and chips obliged to use this generic C umul will have plenty of worse
1924    performance problems than a couple of extra instructions on the diagonal
1925    of sqr_basecase.  */
1926
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __x0, __x1, __x2, __x3;                                      \
    UHWtype __ul, __vl, __uh, __vh;                                     \
    UWtype __u = (u), __v = (v);                                        \
                                                                        \
    __ul = __ll_lowpart (__u);                                          \
    __uh = __ll_highpart (__u);                                         \
    __vl = __ll_lowpart (__v);                                          \
    __vh = __ll_highpart (__v);                                         \
                                                                        \
    __x0 = (UWtype) __ul * __vl;                                        \
    __x1 = (UWtype) __ul * __vh;                                        \
    __x2 = (UWtype) __uh * __vl;                                        \
    __x3 = (UWtype) __uh * __vh;                                        \
                                                                        \
    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
    __x1 += __x2;               /* but this addition can */             \
    if (__x1 < __x2)            /* did we get a carry? */               \
      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
                                                                        \
    (w1) = __x3 + __ll_highpart (__x1);                                 \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);               \
  } while (0)
#endif

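/* Worked example of the half-word scheme above (explanatory note, not from
   the original header), for W_TYPE_SIZE == 32 and the hypothetical inputs
   u = 0x89ABCDEF, v = 0x01234567.  The 16-bit halves are uh = 0x89AB,
   ul = 0xCDEF, vh = 0x0123, vl = 0x4567, giving the partial products
   __x0 = 0x37D44629, __x1 = 0x00EA16AD, __x2 = 0x25527ACD and
   __x3 = 0x009C7D61.  Folding the high part of __x0 and then __x2 into
   __x1 gives __x1 = 0x263CC94E with no carry out, so the result is
   w1 = 0x009C7D61 + 0x263C = 0x009CA39D and
   w0 = 0xC94E0000 + 0x4629 = 0xC94E4629, i.e. the full 64-bit product
   0x009CA39DC94E4629.  */
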
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)                                         \
  do {                                                                  \
    UWtype __w1;                                                        \
    UWtype __xm0 = (u), __xm1 = (v);                                    \
    umul_ppmm (__w1, w0, __xm0, __xm1);                                 \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
                - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
  } while (0)
#endif

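/* Note (explanatory, not from the original header): this is the same
   identity sketched after the umul-from-smul definition above, solved the
   other way around: hi(us*vs) = hi(u*v) - su*v - sv*u (mod B).  */
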
/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
                                                                        \
    ASSERT ((d) != 0);                                                  \
    ASSERT ((n1) < (d));                                                \
                                                                        \
    __d1 = __ll_highpart (d);                                           \
    __d0 = __ll_lowpart (d);                                            \
                                                                        \
    __q1 = (n1) / __d1;                                                 \
    __r1 = (n1) - __q1 * __d1;                                          \
    __m = __q1 * __d0;                                                  \
    __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
    if (__r1 < __m)                                                     \
      {                                                                 \
        __q1--, __r1 += (d);                                            \
        if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
          if (__r1 < __m)                                               \
            __q1--, __r1 += (d);                                        \
      }                                                                 \
    __r1 -= __m;                                                        \
                                                                        \
    __q0 = __r1 / __d1;                                                 \
    __r0 = __r1 - __q0 * __d1;                                          \
    __m = __q0 * __d0;                                                  \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
    if (__r0 < __m)                                                     \
      {                                                                 \
        __q0--, __r0 += (d);                                            \
        if (__r0 >= (d))                                                \
          if (__r0 < __m)                                               \
            __q0--, __r0 += (d);                                        \
      }                                                                 \
    __r0 -= __m;                                                        \
                                                                        \
    (q) = __q1 * __ll_B | __q0;                                         \
    (r) = __r0;                                                         \
  } while (0)

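/* Sketch of how __udiv_qrnnd_c proceeds (explanatory note, not from the
   original header): it is schoolbook division with half-word digits.  The
   divisor is split as d = __d1*b + __d0 with b = 2^(W_TYPE_SIZE/2); each
   of the two quotient digits is estimated from a division by __d1 alone
   and then corrected downwards at most twice.  Note that d is expected to
   be normalized (most significant bit set), which is why
   UDIV_NEEDS_NORMALIZATION is defined to 1 below when this macro serves as
   udiv_qrnnd.  Hypothetical, disabled usage demo:  */
#if 0
static void
__gmp_demo_udiv_qrnnd_c (void)
{
  UWtype q, r;
  /* (B + 7) / (B/2), with B = 2^W_TYPE_SIZE: the divisor has its most
     significant bit set, as required.  */
  __udiv_qrnnd_c (q, r, (UWtype) 1, (UWtype) 7,
                  (UWtype) 1 << (W_TYPE_SIZE - 1));
  ASSERT (q == 2 && r == 7);
}
#endif
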
/* If the processor has no udiv_qrnnd but has sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    UWtype __r;                                                         \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                         \
    (r) = __r;                                                          \
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __xr = (x);                                                  \
    UWtype __a;                                                         \
                                                                        \
    if (W_TYPE_SIZE == 32)                                              \
      {                                                                 \
        __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
          ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
          : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
          : 3*__BITS4 + 1);                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
          if (((__xr >> __a) & 0xff) != 0)                              \
            break;                                                      \
        ++__a;                                                          \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];           \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

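/* How the generic version above works (explanatory note, not from the
   original header): __a is chosen so that __xr >> __a is a small index
   (at most 127) which the __clz_tab lookup resolves to the position of
   the highest set bit.  For example, with W_TYPE_SIZE == 32 and
   x = 0x00800000 the comparisons select __a = 17, so
   count = 32 + 1 - 17 - __clz_tab[0x00800000 >> 17]
         = 33 - 17 - __clz_tab[64] = 33 - 17 - 8 = 8,
   the expected number of leading zeros.  */
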
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
#endif

#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.  */
#define count_trailing_zeros(count, x) \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    UWtype __ctz_c;                                                     \
    ASSERT (__ctz_x != 0);                                              \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
    (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
  } while (0)
#endif

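/* Worked example (explanatory note, not from the original header):
   __ctz_x & -__ctz_x isolates the lowest set bit.  For x = 0x28 (binary
   101000), x & -x = 0x8 = 2^3, whose leading-zero count is
   W_TYPE_SIZE - 4, so count = W_TYPE_SIZE - 1 - (W_TYPE_SIZE - 4) = 3,
   the number of trailing zeros.  */
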
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif