/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License
along with this file.  If not, see http://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
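/* For example, with W_TYPE_SIZE == 32 these give __ll_B == 0x10000,
   __ll_lowpart (0x12345678) == 0x5678 and __ll_highpart (0x12345678) ==
   0x1234, i.e. the two half-words of a word.  */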

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

#ifndef _PROTO
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the macro
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (e.g. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

   */
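/* Illustrative only: a minimal sketch (not part of this file) of how the
   macros above compose, assuming a working umul_ppmm and udiv_qrnnd and
   gmp-impl.h's ASSERT.  It forms a two-word product and divides it back
   by a third operand.  */
#if 0
static void
example_mul_div (UWtype a, UWtype b, UWtype d)
{
  UWtype hi, lo, q, r;
  umul_ppmm (hi, lo, a, b);      /* (hi,lo) = a * b */
  ASSERT (hi < d);               /* required for udiv_qrnnd */
  udiv_qrnnd (q, r, hi, lo, d);  /* q = (hi,lo) / d, r = (hi,lo) % d */
}
#endif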

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we have checked what code comes out:
   on some chips they're merely libgcc calls, in which case we instead want
   an inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzll (x);              \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzl (x);               \
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzll (x);              \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzl (x);               \
  } while (0)
#endif
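/* A sketch, not used by GMP: where neither ctz builtin is usable,
   count_trailing_zeros can be derived from count_leading_zeros, since
   x & -x isolates the lowest set bit of x; if that bit is bit k then
   count_leading_zeros gives W_TYPE_SIZE-1-k.  Assumes x != 0.  */
#if 0
#define count_trailing_zeros_from_clz(count, x)         \
  do {                                                  \
    UWtype __ctz_x = (x), __ctz_c;                      \
    ASSERT (__ctz_x != 0);                              \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);  \
    (count) = W_TYPE_SIZE - 1 - __ctz_c;                \
  } while (0)
#endif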


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("umulh %r1,%2,%0"                                          \
             : "=r" (ph)                                                \
             : "%rJ" (__m0), "rI" (__m1));                              \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = __UMULH (__m0, __m1);                                        \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)                             \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", i.e. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)                                        \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;                                              \
    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
    __clz__b = 65 - __clz__b;                                           \
    (count) = __clz__b - __clz__c;                                      \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */
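/* Worked example (an illustration added here, not in the original): for
   x = 0x0000000000FF0000 cmpbge sets a bit per zero byte, giving 0xFB;
   (0xFB>>1)^0x7F is 2 and __clz_tab[2] is 3, i.e. the third byte is the
   highest non-zero.  The shift is then 3*8-7 = 17, x>>17 is 0x7F, and
   __clz_tab[0x7F] is 8, so count = (65-17) - 8 = 40, the number of
   leading zeros of x.  */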

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {                                                                  \
    UDItype __m0 = (m0), __m1 = (m1);                                   \
    (ph) = _int_mult_upper (__m0, __m1);                                \
    (pl) = __m0 * __m1;                                                 \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {                                          \
    UWtype __x;                                 \
    __x = (al) - (bl);                          \
    if ((al) < (bl))                            \
      (sh) = (ah) - (bh) - 1;                   \
    else                                        \
      (sh) = (ah) - (bh);                       \
    (sl) = __x;                                 \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
             : "=&f" (ph), "=f" (pl)                                    \
             : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype _x = (x), _y, _a, _c;                                        \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
    _c = (_a - 1) << 3;                                                 \
    _x >>= _c;                                                          \
    if (_x >= 1 << 4)                                                   \
      _x >>= 4, _c += 4;                                                \
    if (_x >= 1 << 2)                                                   \
      _x >>= 2, _c += 2;                                                \
    _c += _x >> 1;                                                      \
    (count) =  W_TYPE_SIZE - 1 - _c;                                    \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    UWtype __ctz_x = (x);                                               \
    __asm__ ("popcnt %0 = %1"                                           \
             : "=r" (count)                                             \
             : "r" ((__ctz_x-1) & ~__ctz_x));                           \
  } while (0)
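/* For example (illustration only), __ctz_x = 0b101000: __ctz_x-1 is
   0b100111 and ~__ctz_x ends in ...010111, so (__ctz_x-1) & ~__ctz_x is
   0b000111, whose population count 3 is the number of trailing zeros.  */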
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)                                       \
  do {                                                                  \
    UWtype _m0 = (m0), _m1 = (m1);                                      \
    ph = _m64_xmahu (_m0, _m1, 0);                                      \
    pl = _m0 * _m1;                                                     \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    USItype __m0 = (m0), __m1 = (m1);                                   \
    __asm__ ("multiplu %0,%1,%2"                                        \
             : "=r" (xl)                                                \
             : "r" (__m0), "r" (__m1));                                 \
    __asm__ ("multmu %0,%1,%2"                                          \
             : "=r" (xh)                                                \
             : "r" (__m0), "r" (__m1));                                 \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"                                            \
           : "=r" (q), "=q" (r)                                         \
           : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"                                                \
             : "=r" (count)                                             \
             : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r"  ((USItype) (ah)),                                     \
             "rIJ" ((USItype) (bh)),                                    \
             "%r" ((USItype) (al)),                                     \
             "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
           : "=r" (sh),                                                 \
             "=&r" (sl)                                                 \
           : "r" ((USItype) (ah)),                                      \
             "rIJ" ((USItype) (bh)),                                    \
             "r" ((USItype) (al)),                                      \
             "rIJ" ((USItype) (bl)))
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {                                                                  \
    if (__builtin_constant_p (al))                                      \
      {                                                                 \
        if (__builtin_constant_p (ah))                                  \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (ah))                                 \
      {                                                                 \
        if (__builtin_constant_p (bl))                                  \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }                                                                 \
    else if (__builtin_constant_p (bl))                                 \
      {                                                                 \
        if (__builtin_constant_p (bh))                                  \
          __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
        else                                                            \
          __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
                   : "=r" (sh), "=&r" (sl)                              \
                   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }                                                                 \
    else /* only bh might be a constant */                              \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
               : "=r" (sh), "=&r" (sl)                                  \
               : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  } while (0)
#if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;                                                     \
    __di = __MPN(invert_limb) (d);                                      \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#else
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n"                                     \
"       mov     %|r0, %2, lsr #16\n"                                    \
"       mov     %|r2, %3, lsr #16\n"                                    \
"       bic     %|r1, %2, %|r0, lsl #16\n"                              \
"       bic     %|r2, %3, %|r2, lsl #16\n"                              \
"       mul     %1, %|r1, %|r2\n"                                       \
"       mul     %|r2, %|r0, %|r2\n"                                     \
"       mul     %|r1, %0, %|r1\n"                                       \
"       mul     %0, %|r0, %0\n"                                         \
"       adds    %|r1, %|r2, %|r1\n"                                     \
"       addcs   %0, %0, #65536\n"                                       \
"       adds    %1, %1, %|r1, lsl #16\n"                                \
"       adc     %0, %0, %|r1, lsr #16"                                  \
           : "=&r" (xh), "=r" (xl)                                      \
           : "r" (a), "r" (b)                                           \
           : "r0", "r1", "r2")
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;                                                      \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
    (r) = __r;                                                          \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#if defined (__ARM_ARCH_5__)
/* This actually requires arm 5 */
#define count_leading_zeros(count, x) \
  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* __arm__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("mulwux %2,%0"                                               \
           : "=r" (__x.__ll)                                            \
           : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;                                                 \
           struct {SItype __l, __h;} __i;                               \
          } __x;                                                        \
  __asm__ ("mulwx %2,%0"                                                \
           : "=r" (__x.__ll)                                            \
           : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("mulwux %2,%0"                                             \
             : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h;                                                 \
    (pl) = __x.__i.__l;                                                 \
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
           : "=g" (sh), "=&g" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"                                              \
           : "=g" (ph), "=r" (pl)                                       \
           : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"                                              \
           : "=g" (q), "=r" (r)                                         \
           : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"                                               \
           : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h;                                                 \
    (wl) = __x.__i.__l;                                                 \
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __tmp;                                                      \
    __asm__ (                                                           \
       "ldi             1,%0\n"                                         \
"       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
"       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
"       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
"       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
"       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
"       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
"       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
"       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
"       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
"       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
"       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
"       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
"       sub             %0,%1,%0        ; Subtract it.\n"               \
        : "=r" (count), "=r" (__tmp) : "1" (x));                        \
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
           : "=r" (sh), "=&r" (sl)                                      \
           : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __asm__ ("lr %N0,%1\n\tmr %0,%2"                                    \
             : "=&r" (__x.__ll)                                         \
             : "r" (m0), "r" (m1));                                     \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {                                                                  \
    union {DItype __ll;                                                 \
           struct {USItype __h, __l;} __i;                              \
          } __x;                                                        \
    __x.__i.__h = n1; __x.__i.__l = n0;                                 \
    __asm__ ("dr %0,%2"                                                 \
             : "=r" (__x.__ll)                                          \
             : "0" (__x.__ll), "r" (d));                                \
    (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
  } while (0)
#endif

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending on where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    USItype  __n = (n);                                                 \
    USItype  __shift;                                                   \
    __asm__ ("cmpl  $0x1000000, %1\n"                                   \
             "sbbl  %0, %0\n"                                           \
             "cmpl  $0x10000, %1\n"                                     \
             "sbbl  $0, %0\n"                                           \
             "cmpl  $0x100, %1\n"                                       \
             "sbbl  $0, %0\n"                                           \
             : "=&r" (__shift) : "r"  (__n));                           \
    __shift = __shift*8 + 24 + 1;                                       \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
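/* Worked example (illustration only): for __n = 0x12345 only the first
   compare borrows, so __shift becomes -1 and then -1*8 + 24 + 1 = 17;
   __n>>17 is 0, __clz_tab[0] is 1, and c = 32 + 1 - 17 - 1 = 15, matching
   the 15 leading zeros of 0x12345.  */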

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, since it ensures the
   "double" is correctly aligned on the stack.  */
#define count_leading_zeros(c,n)                                        \
  do {                                                                  \
    union {                                                             \
      double    d;                                                      \
      unsigned  a[2];                                                   \
    } __u;                                                              \
    ASSERT ((n) != 0);                                                  \
    __u.d = (UWtype) (n);                                               \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
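/* This works because converting n to an IEEE double puts 0x3FF plus
   floor(log2(n)) in the exponent field, read here from the high word
   __u.a[1] (little-endian layout, as on x86).  E.g. (illustration only)
   n = 0x12345 gives an exponent field of 0x3FF+16, so c = 31 - 16 = 15.  */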
#endif /* pentiummmx */

#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
  && (HAVE_HOST_CPU_i386                                                \
      || HAVE_HOST_CPU_i686                                             \
      || HAVE_HOST_CPU_pentiumpro                                       \
      || HAVE_HOST_CPU_pentium2                                         \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = 31 - __cbtmp;                                             \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
           : "=r" (sh), "=&r" (sl)                                      \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3"                                                    \
           : "=a" (w0), "=d" (w1)                                       \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"                 /* stringification in K&R C */     \
           : "=a" (q), "=d" (r)                                         \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UDItype __cbtmp;                                                    \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
    (count) = __cbtmp ^ 63;                                             \
  } while (0)
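/* bsrq leaves the bit index of the most significant set bit in __cbtmp,
   so for 0 <= __cbtmp <= 63 the xor with 63 computes 63 - __cbtmp, the
   leading zero count, in a single extra instruction.  */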
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x)                                  \
  do {                                                                  \
    ASSERT ((x) != 0);                                                  \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
  } while (0)
#endif /* x86_64 */

#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
           : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */

#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
           : "=r" (sh), "=&r" (sl)                                      \
           : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __x;                                                        \
  __asm__ ("emul %2,%1,%0"                                              \
           : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;                                                        \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn, __rq;                                                 \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
    __asm__ ("ediv %d,%n,%0"                                            \
           : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
  } while (0)
#define count_leading_zeros(count, x) \
  do {                                                                  \
    USItype __cbtmp;                                                    \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
    (count) = __cbtmp ^ 31;                                             \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)          /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do {                                                                  \
    union {UDItype __ll;                                                \
           struct {USItype __l, __h;} __i;                              \
          } __nn;                                                       \
    __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
  } while (0)
#endif /* i960mx */
#endif /* i960 */

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
           : "=d" (sh), "=&d" (sl)                                      \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"                                           \
           : "=d" (w0), "=d" (w1)                                       \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"                                           \
           : "=d" (q), "=d" (r)                                         \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;                                \
        __asm__ ("| Inlined umul_ppmm\n"                                \
"       move%.l %5,%3\n"                                                \
"       move%.l %2,%0\n"                                                \
"       move%.w %3,%1\n"                                                \
"       swap    %3\n"                                                   \
"       swap    %0\n"                                                   \
"       mulu%.w %2,%1\n"                                                \
"       mulu%.w %3,%0\n"                                                \
"       mulu%.w %2,%3\n"                                                \
"       swap    %2\n"                                                   \
"       mulu%.w %5,%2\n"                                                \
"       add%.l  %3,%2\n"                                                \
"       jcc     1f\n"                                                   \
"       add%.l  %#0x10000,%0\n"                                         \
"1:     move%.l %2,%3\n"                                                \
"       clr%.w  %2\n"                                                   \
"       swap    %2\n"                                                   \
"       swap    %3\n"                                                   \
"       clr%.w  %3\n"                                                   \
"       add%.l  %3,%1\n"                                                \
"       addx%.l %2,%0\n"                                                \
"       | End inlined umul_ppmm"                                        \
              : "=&d" (xh), "=&d" (xl),                                 \
                "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
              : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))                        \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
           : "=d" (count)                                               \
           : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */

994 #if defined (__m88000__) && W_TYPE_SIZE == 32
995 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
996   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
997            : "=r" (sh), "=&r" (sl)                                      \
998            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
999 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1000   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
1001            : "=r" (sh), "=&r" (sl)                                      \
1002            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1003 #define count_leading_zeros(count, x) \
1004   do {                                                                  \
1005     USItype __cbtmp;                                                    \
1006     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
1007     (count) = __cbtmp ^ 31;                                             \
1008   } while (0)
1009 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1010 #if defined (__m88110__)
1011 #define umul_ppmm(wh, wl, u, v) \
1012   do {                                                                  \
1013     union {UDItype __ll;                                                \
1014            struct {USItype __h, __l;} __i;                              \
1015           } __x;                                                        \
1016     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1017     (wh) = __x.__i.__h;                                                 \
1018     (wl) = __x.__i.__l;                                                 \
1019   } while (0)
1020 #define udiv_qrnnd(q, r, n1, n0, d) \
1021   ({union {UDItype __ll;                                                \
1022            struct {USItype __h, __l;} __i;                              \
1023           } __x, __q;                                                   \
1024   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1025   __asm__ ("divu.d %0,%1,%2"                                            \
1026            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1027   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1028 #define UMUL_TIME 5
1029 #define UDIV_TIME 25
1030 #else
1031 #define UMUL_TIME 17
1032 #define UDIV_TIME 150
1033 #endif /* __m88110__ */
1034 #endif /* __m88000__ */
1035
1036 #if defined (__mips) && W_TYPE_SIZE == 32
1037 #if __GMP_GNUC_PREREQ (4,4)
1038 #define umul_ppmm(w1, w0, u, v) \
1039   do {                                                                  \
1040     UDItype __ll = (UDItype)(u) * (v);                                  \
1041     w1 = __ll >> 32;                                                    \
1042     w0 = __ll;                                                          \
1043   } while (0)
1044 #endif
1045 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1046 #define umul_ppmm(w1, w0, u, v) \
1047   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1048 #endif
1049 #if !defined (umul_ppmm)
1050 #define umul_ppmm(w1, w0, u, v) \
1051   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1052            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1053 #endif
1054 #define UMUL_TIME 10
1055 #define UDIV_TIME 100
1056 #endif /* __mips */
1057
1058 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1059 #if __GMP_GNUC_PREREQ (4,4)
1060 #define umul_ppmm(w1, w0, u, v) \
1061   do {                                                                  \
1062     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1063     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1064     w1 = __ll >> 64;                                                    \
1065     w0 = __ll;                                                          \
1066   } while (0)
1067 #endif
1068 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1069 #define umul_ppmm(w1, w0, u, v) \
1070   __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1071 #endif
1072 #if !defined (umul_ppmm)
1073 #define umul_ppmm(w1, w0, u, v) \
1074   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1075            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1076 #endif
1077 #define UMUL_TIME 20
1078 #define UDIV_TIME 140
1079 #endif /* __mips */
1080
1081 #if defined (__mmix__) && W_TYPE_SIZE == 64
1082 #define umul_ppmm(w1, w0, u, v) \
1083   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1084 #endif
1085
1086 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1087 #define umul_ppmm(w1, w0, u, v) \
1088   ({union {UDItype __ll;                                                \
1089            struct {USItype __l, __h;} __i;                              \
1090           } __x;                                                        \
1091   __asm__ ("meid %2,%0"                                                 \
1092            : "=g" (__x.__ll)                                            \
1093            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1094   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1095 #define __umulsidi3(u, v) \
1096   ({UDItype __w;                                                        \
1097     __asm__ ("meid %2,%0"                                               \
1098              : "=g" (__w)                                               \
1099              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1100     __w; })
1101 #define udiv_qrnnd(q, r, n1, n0, d) \
1102   ({union {UDItype __ll;                                                \
1103            struct {USItype __l, __h;} __i;                              \
1104           } __x;                                                        \
1105   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1106   __asm__ ("deid %2,%0"                                                 \
1107            : "=g" (__x.__ll)                                            \
1108            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1109   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1110 #define count_trailing_zeros(count,x) \
1111   do {                                                                  \
1112     __asm__ ("ffsd      %2,%0"                                          \
1113              : "=r" (count)                                             \
1114              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1115   } while (0)
1116 #endif /* __ns32000__ */
1117
1118 /* In the past we had a block that tested various #defines:
1119        _ARCH_PPC    - AIX
1120        _ARCH_PWR    - AIX
1121        __powerpc__  - gcc
1122        __POWERPC__  - BEOS
1123        __ppc__      - Darwin
1124        PPC          - old gcc, GNU/Linux, SysV
1125    The plain PPC test was not good for vxWorks, since PPC is defined on all
1126    CPUs there (e.g. m68k too), as a constant that one is expected to compare
1127    CPU_FAMILY against.
1128
1129    At any rate, this was pretty unattractive and a bit fragile.  The use of
1130    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1131    getting the desired effect.
1132
1133    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1134    the system vendor compilers.  (Is that vendor compilers with inline asm,
1135    or what?)  */
1136
1137 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1138   && W_TYPE_SIZE == 32
1139 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1140   do {                                                                  \
1141     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1142       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1143              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1144     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1145       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1146              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1147     else                                                                \
1148       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1149              : "=r" (sh), "=&r" (sl)                                    \
1150              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1151   } while (0)
1152 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1153   do {                                                                  \
1154     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1155       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
1156                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1157     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1158       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
1159                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1160     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1161       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
1162                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1163     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1164       __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
1165                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1166     else                                                                \
1167       __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
1168                : "=r" (sh), "=&r" (sl)                                  \
1169                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1170   } while (0)
1171 #define count_leading_zeros(count, x) \
1172   __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1173 #define COUNT_LEADING_ZEROS_0 32
1174 #if HAVE_HOST_CPU_FAMILY_powerpc
1175 #if __GMP_GNUC_PREREQ (4,4)
1176 #define umul_ppmm(w1, w0, u, v) \
1177   do {                                                                  \
1178     UDItype __ll = (UDItype)(u) * (v);                                  \
1179     w1 = __ll >> 32;                                                    \
1180     w0 = __ll;                                                          \
1181   } while (0)
1182 #endif
1183 #if !defined (umul_ppmm)
1184 #define umul_ppmm(ph, pl, m0, m1) \
1185   do {                                                                  \
1186     USItype __m0 = (m0), __m1 = (m1);                                   \
1187     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1188     (pl) = __m0 * __m1;                                                 \
1189   } while (0)
1190 #endif
1191 #define UMUL_TIME 15
1192 #define smul_ppmm(ph, pl, m0, m1) \
1193   do {                                                                  \
1194     SItype __m0 = (m0), __m1 = (m1);                                    \
1195     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1196     (pl) = __m0 * __m1;                                                 \
1197   } while (0)
1198 #define SMUL_TIME 14
1199 #define UDIV_TIME 120
1200 #else
1201 #define UMUL_TIME 8
1202 #define smul_ppmm(xh, xl, m0, m1) \
1203   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1204 #define SMUL_TIME 4
1205 #define sdiv_qrnnd(q, r, nh, nl, d) \
1206   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1207 #define UDIV_TIME 100
1208 #endif
1209 #endif /* 32-bit POWER architecture variants.  */
1210
1211 /* We should test _IBMR2 here when we add assembly support for the system
1212    vendor compilers.  */
1213 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1214 #if !defined (_LONG_LONG_LIMB)
1215 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1216    use adde etc only when not _LONG_LONG_LIMB.  */
1217 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1218   do {                                                                  \
1219     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1220       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
1221              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1222     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1223       __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
1224              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1225     else                                                                \
1226       __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
1227              : "=r" (sh), "=&r" (sl)                                    \
1228              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1229   } while (0)
1230 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1231    This might seem strange, but gcc folds away the dead code late.  */
1232 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1233   do {                                                                        \
1234     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {          \
1235         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1236           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"               \
1237                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1238         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1239           __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"               \
1240                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1241         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1242           __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"                 \
1243                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1244         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1245           __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"                 \
1246                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1247         else                                                                  \
1248           __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"              \
1249                    : "=r" (sh), "=&r" (sl)                                    \
1250                    : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));             \
1251       } else {                                                                \
1252         if (__builtin_constant_p (ah) && (ah) == 0)                           \
1253           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"         \
1254                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1255         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
1256           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"         \
1257                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1258         else if (__builtin_constant_p (bh) && (bh) == 0)                      \
1259           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"           \
1260                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1261         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
1262           __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"           \
1263                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1264         else                                                                  \
1265           __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"        \
1266                    : "=r" (sh), "=&r" (sl)                                    \
1267                    : "r" (ah), "r" (bh), "rI" (al), "r" (bl));                \
1268       }                                                                       \
1269   } while (0)
1270 #endif /* ! _LONG_LONG_LIMB */
1271 #define count_leading_zeros(count, x) \
1272   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1273 #define COUNT_LEADING_ZEROS_0 64
1274 #if __GMP_GNUC_PREREQ (4,4)
1275 #define umul_ppmm(w1, w0, u, v) \
1276   do {                                                                  \
1277     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1278     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1279     w1 = __ll >> 64;                                                    \
1280     w0 = __ll;                                                          \
1281   } while (0)
1282 #endif
1283 #if !defined (umul_ppmm)
1284 #define umul_ppmm(ph, pl, m0, m1) \
1285   do {                                                                  \
1286     UDItype __m0 = (m0), __m1 = (m1);                                   \
1287     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1288     (pl) = __m0 * __m1;                                                 \
1289   } while (0)
1290 #endif
1291 #define UMUL_TIME 15
1292 #define smul_ppmm(ph, pl, m0, m1) \
1293   do {                                                                  \
1294     DItype __m0 = (m0), __m1 = (m1);                                    \
1295     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));  \
1296     (pl) = __m0 * __m1;                                                 \
1297   } while (0)
1298 #define SMUL_TIME 14  /* ??? */
1299 #define UDIV_TIME 120 /* ??? */
1300 #endif /* 64-bit PowerPC.  */
1301
1302 #if defined (__pyr__) && W_TYPE_SIZE == 32
1303 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1304   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1305            : "=r" (sh), "=&r" (sl)                                      \
1306            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1307              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1308 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1309   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1310            : "=r" (sh), "=&r" (sl)                                      \
1311            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1312              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1313 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1314 #define umul_ppmm(w1, w0, u, v) \
1315   ({union {UDItype __ll;                                                \
1316            struct {USItype __h, __l;} __i;                              \
1317           } __x;                                                        \
1318   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1319            : "=&r" (__x.__ll)                                           \
1320            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1321   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1322 #endif /* __pyr__ */
1323
1324 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1325 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1326   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1327            : "=r" (sh), "=&r" (sl)                                      \
1328            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1329              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1330 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1331   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1332            : "=r" (sh), "=&r" (sl)                                      \
1333            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1334              "1" ((USItype)(al)), "r" ((USItype)(bl)))
1335 #define smul_ppmm(ph, pl, m0, m1) \
1336   __asm__ (                                                             \
1337        "s       r2,r2\n"                                                \
1338 "       mts r10,%2\n"                                                   \
1339 "       m       r2,%3\n"                                                \
1340 "       m       r2,%3\n"                                                \
1341 "       m       r2,%3\n"                                                \
1342 "       m       r2,%3\n"                                                \
1343 "       m       r2,%3\n"                                                \
1344 "       m       r2,%3\n"                                                \
1345 "       m       r2,%3\n"                                                \
1346 "       m       r2,%3\n"                                                \
1347 "       m       r2,%3\n"                                                \
1348 "       m       r2,%3\n"                                                \
1349 "       m       r2,%3\n"                                                \
1350 "       m       r2,%3\n"                                                \
1351 "       m       r2,%3\n"                                                \
1352 "       m       r2,%3\n"                                                \
1353 "       m       r2,%3\n"                                                \
1354 "       m       r2,%3\n"                                                \
1355 "       cas     %0,r2,r0\n"                                             \
1356 "       mfs     r10,%1"                                                 \
1357            : "=r" (ph), "=r" (pl)                                       \
1358            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1359            : "r2")
1360 #define UMUL_TIME 20
1361 #define UDIV_TIME 200
1362 #define count_leading_zeros(count, x) \
1363   do {                                                                  \
1364     if ((x) >= 0x10000)                                                 \
1365       __asm__ ("clz     %0,%1"                                          \
1366                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1367     else                                                                \
1368       {                                                                 \
1369         __asm__ ("clz   %0,%1"                                          \
1370                  : "=r" (count) : "r" ((USItype)(x)));                  \
1371         (count) += 16;                                                  \
1372       }                                                                 \
1373   } while (0)
1374 #endif /* RT/ROMP */
1375
1376 #if defined (__sh2__) && W_TYPE_SIZE == 32
1377 #define umul_ppmm(w1, w0, u, v) \
1378   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1379            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1380 #define UMUL_TIME 5
1381 #endif
1382
1383 #if defined (__sparc__) && W_TYPE_SIZE == 32
1384 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1385   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1386            : "=r" (sh), "=&r" (sl)                                      \
1387            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1388            __CLOBBER_CC)
1389 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1390   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1391            : "=r" (sh), "=&r" (sl)                                      \
1392            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1393            __CLOBBER_CC)
1394 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1395    doesn't define anything to indicate that to us; it only sets __sparcv8. */
1396 #if defined (__sparc_v9__) || defined (__sparcv9)
1397 /* Perhaps we should use floating-point operations here?  */
1398 #if 0
1399 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1400    Perhaps we simply need to explicitly zero-extend the inputs?  */
1401 #define umul_ppmm(w1, w0, u, v) \
1402   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1403            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1404 #else
1405 /* Use v8 umul until above bug is fixed.  */
1406 #define umul_ppmm(w1, w0, u, v) \
1407   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1408 #endif
1409 /* Use a plain v8 divide for v9.  */
1410 #define udiv_qrnnd(q, r, n1, n0, d) \
1411   do {                                                                  \
1412     USItype __q;                                                        \
1413     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1414              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1415     (r) = (n0) - __q * (d);                                             \
1416     (q) = __q;                                                          \
1417   } while (0)
1418 #else
1419 #if defined (__sparc_v8__)   /* gcc normal */                           \
1420   || defined (__sparcv8)     /* gcc solaris */                          \
1421   || HAVE_HOST_CPU_supersparc
1422 /* Don't match immediate range because: 1) it is not often useful,
1423    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1424    while we want to match a 13 bit interval, sign extended to 32 bits,
1425    but INTERPRETED AS UNSIGNED.  */
1426 #define umul_ppmm(w1, w0, u, v) \
1427   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1428 #define UMUL_TIME 5
1429
1430 #if HAVE_HOST_CPU_supersparc
1431 #define UDIV_TIME 60            /* SuperSPARC timing */
1432 #else
1433 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1434    dividends and will trap to the kernel for the rest. */
1435 #define udiv_qrnnd(q, r, n1, n0, d) \
1436   do {                                                                  \
1437     USItype __q;                                                        \
1438     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1439              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1440     (r) = (n0) - __q * (d);                                             \
1441     (q) = __q;                                                          \
1442   } while (0)
1443 #define UDIV_TIME 25
1444 #endif /* HAVE_HOST_CPU_supersparc */
1445
1446 #else /* ! __sparc_v8__ */
1447 #if defined (__sparclite__)
1448 /* This has hardware multiply but not divide.  It also has two additional
1449    instructions scan (ffs from high bit) and divscc.  */
1450 #define umul_ppmm(w1, w0, u, v) \
1451   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1452 #define UMUL_TIME 5
1453 #define udiv_qrnnd(q, r, n1, n0, d) \
1454   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1455 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1456 "       tst     %%g0\n"                                                 \
1457 "       divscc  %3,%4,%%g1\n"                                           \
1458 "       divscc  %%g1,%4,%%g1\n"                                         \
1459 "       divscc  %%g1,%4,%%g1\n"                                         \
1460 "       divscc  %%g1,%4,%%g1\n"                                         \
1461 "       divscc  %%g1,%4,%%g1\n"                                         \
1462 "       divscc  %%g1,%4,%%g1\n"                                         \
1463 "       divscc  %%g1,%4,%%g1\n"                                         \
1464 "       divscc  %%g1,%4,%%g1\n"                                         \
1465 "       divscc  %%g1,%4,%%g1\n"                                         \
1466 "       divscc  %%g1,%4,%%g1\n"                                         \
1467 "       divscc  %%g1,%4,%%g1\n"                                         \
1468 "       divscc  %%g1,%4,%%g1\n"                                         \
1469 "       divscc  %%g1,%4,%%g1\n"                                         \
1470 "       divscc  %%g1,%4,%%g1\n"                                         \
1471 "       divscc  %%g1,%4,%%g1\n"                                         \
1472 "       divscc  %%g1,%4,%%g1\n"                                         \
1473 "       divscc  %%g1,%4,%%g1\n"                                         \
1474 "       divscc  %%g1,%4,%%g1\n"                                         \
1475 "       divscc  %%g1,%4,%%g1\n"                                         \
1476 "       divscc  %%g1,%4,%%g1\n"                                         \
1477 "       divscc  %%g1,%4,%%g1\n"                                         \
1478 "       divscc  %%g1,%4,%%g1\n"                                         \
1479 "       divscc  %%g1,%4,%%g1\n"                                         \
1480 "       divscc  %%g1,%4,%%g1\n"                                         \
1481 "       divscc  %%g1,%4,%%g1\n"                                         \
1482 "       divscc  %%g1,%4,%%g1\n"                                         \
1483 "       divscc  %%g1,%4,%%g1\n"                                         \
1484 "       divscc  %%g1,%4,%%g1\n"                                         \
1485 "       divscc  %%g1,%4,%%g1\n"                                         \
1486 "       divscc  %%g1,%4,%%g1\n"                                         \
1487 "       divscc  %%g1,%4,%%g1\n"                                         \
1488 "       divscc  %%g1,%4,%0\n"                                           \
1489 "       rd      %%y,%1\n"                                               \
1490 "       bl,a 1f\n"                                                      \
1491 "       add     %1,%4,%1\n"                                             \
1492 "1:     ! End of inline udiv_qrnnd"                                     \
1493            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1494            : "%g1" __AND_CLOBBER_CC)
1495 #define UDIV_TIME 37
1496 #define count_leading_zeros(count, x) \
1497   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1498 /* Early sparclites return 63 for an argument of 0, but they warn that future
1499    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1500    undefined.  */
1501 #endif /* __sparclite__ */
1502 #endif /* __sparc_v8__ */
1503 #endif /* __sparc_v9__ */
1504 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1505 #ifndef umul_ppmm
1506 #define umul_ppmm(w1, w0, u, v) \
1507   __asm__ ("! Inlined umul_ppmm\n"                                      \
1508 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1509 "       sra     %3,31,%%g2      ! Don't move this insn\n"               \
1510 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
1511 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1512 "       mulscc  %%g1,%3,%%g1\n"                                         \
1513 "       mulscc  %%g1,%3,%%g1\n"                                         \
1514 "       mulscc  %%g1,%3,%%g1\n"                                         \
1515 "       mulscc  %%g1,%3,%%g1\n"                                         \
1516 "       mulscc  %%g1,%3,%%g1\n"                                         \
1517 "       mulscc  %%g1,%3,%%g1\n"                                         \
1518 "       mulscc  %%g1,%3,%%g1\n"                                         \
1519 "       mulscc  %%g1,%3,%%g1\n"                                         \
1520 "       mulscc  %%g1,%3,%%g1\n"                                         \
1521 "       mulscc  %%g1,%3,%%g1\n"                                         \
1522 "       mulscc  %%g1,%3,%%g1\n"                                         \
1523 "       mulscc  %%g1,%3,%%g1\n"                                         \
1524 "       mulscc  %%g1,%3,%%g1\n"                                         \
1525 "       mulscc  %%g1,%3,%%g1\n"                                         \
1526 "       mulscc  %%g1,%3,%%g1\n"                                         \
1527 "       mulscc  %%g1,%3,%%g1\n"                                         \
1528 "       mulscc  %%g1,%3,%%g1\n"                                         \
1529 "       mulscc  %%g1,%3,%%g1\n"                                         \
1530 "       mulscc  %%g1,%3,%%g1\n"                                         \
1531 "       mulscc  %%g1,%3,%%g1\n"                                         \
1532 "       mulscc  %%g1,%3,%%g1\n"                                         \
1533 "       mulscc  %%g1,%3,%%g1\n"                                         \
1534 "       mulscc  %%g1,%3,%%g1\n"                                         \
1535 "       mulscc  %%g1,%3,%%g1\n"                                         \
1536 "       mulscc  %%g1,%3,%%g1\n"                                         \
1537 "       mulscc  %%g1,%3,%%g1\n"                                         \
1538 "       mulscc  %%g1,%3,%%g1\n"                                         \
1539 "       mulscc  %%g1,%3,%%g1\n"                                         \
1540 "       mulscc  %%g1,%3,%%g1\n"                                         \
1541 "       mulscc  %%g1,%3,%%g1\n"                                         \
1542 "       mulscc  %%g1,%3,%%g1\n"                                         \
1543 "       mulscc  %%g1,%3,%%g1\n"                                         \
1544 "       mulscc  %%g1,0,%%g1\n"                                          \
1545 "       add     %%g1,%%g2,%0\n"                                         \
1546 "       rd      %%y,%1"                                                 \
1547            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1548            : "%g1", "%g2" __AND_CLOBBER_CC)
1549 #define UMUL_TIME 39            /* 39 instructions */
1550 #endif
1551 #ifndef udiv_qrnnd
1552 #ifndef LONGLONG_STANDALONE
1553 #define udiv_qrnnd(q, r, n1, n0, d) \
1554   do { UWtype __r;                                                      \
1555     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
1556     (r) = __r;                                                          \
1557   } while (0)
1558 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1559 #ifndef UDIV_TIME
1560 #define UDIV_TIME 140
1561 #endif
1562 #endif /* LONGLONG_STANDALONE */
1563 #endif /* udiv_qrnnd */
1564 #endif /* __sparc__ */
1565
1566 #if defined (__sparc__) && W_TYPE_SIZE == 64
1567 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1568   __asm__ (                                                             \
1569        "addcc   %r4,%5,%1\n"                                            \
1570       " addccc  %r6,%7,%%g0\n"                                          \
1571       " addc    %r2,%3,%0"                                              \
1572           : "=r" (sh), "=&r" (sl)                                       \
1573           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
1574             "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
1575            __CLOBBER_CC)
1576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1577   __asm__ (                                                             \
1578        "subcc   %r4,%5,%1\n"                                            \
1579       " subccc  %r6,%7,%%g0\n"                                          \
1580       " subc    %r2,%3,%0"                                              \
1581           : "=r" (sh), "=&r" (sl)                                       \
1582           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),         \
1583             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
1584            __CLOBBER_CC)
1585 #endif
1586
1587 #if defined (__vax__) && W_TYPE_SIZE == 32
1588 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1589   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
1590            : "=g" (sh), "=&g" (sl)                                      \
1591            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1592              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1593 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1594   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
1595            : "=g" (sh), "=&g" (sl)                                      \
1596            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1597              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1598 #define smul_ppmm(xh, xl, m0, m1) \
1599   do {                                                                  \
1600     union {UDItype __ll;                                                \
1601            struct {USItype __l, __h;} __i;                              \
1602           } __x;                                                        \
1603     USItype __m0 = (m0), __m1 = (m1);                                   \
1604     __asm__ ("emul %1,%2,$0,%0"                                         \
1605              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1606     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1607   } while (0)
1608 #define sdiv_qrnnd(q, r, n1, n0, d) \
1609   do {                                                                  \
1610     union {DItype __ll;                                                 \
1611            struct {SItype __l, __h;} __i;                               \
1612           } __x;                                                        \
1613     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
1614     __asm__ ("ediv %3,%2,%0,%1"                                         \
1615              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1616   } while (0)
1617 #if 0
1618 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1619    8800 maybe). */
1620 #define count_trailing_zeros(count,x)                                   \
1621   do {                                                                  \
1622     __asm__ ("ffs 0, 31, %1, %0"                                        \
1623              : "=g" (count)                                             \
1624              : "g" ((USItype) (x)));                                    \
1625   } while (0)
1626 #endif
1627 #endif /* __vax__ */
1628
1629 #if defined (__z8000__) && W_TYPE_SIZE == 16
1630 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1631   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
1632            : "=r" (sh), "=&r" (sl)                                      \
1633            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
1634              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1635 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1636   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
1637            : "=r" (sh), "=&r" (sl)                                      \
1638            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
1639              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1640 #define umul_ppmm(xh, xl, m0, m1) \
1641   do {                                                                  \
1642     union {long int __ll;                                               \
1643            struct {unsigned int __h, __l;} __i;                         \
1644           } __x;                                                        \
1645     unsigned int __m0 = (m0), __m1 = (m1);                              \
1646     __asm__ ("mult      %S0,%H3"                                        \
1647              : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
1648              : "%1" (__m0), "rQR" (__m1));                              \
1649     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1650     (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
1651              + (((signed int) __m1 >> 15) & __m0));                     \
1652   } while (0)
1653 #endif /* __z8000__ */
1654
1655 #endif /* __GNUC__ */
1656
1657 #endif /* NO_ASM */
1658
1659
1660 #if !defined (umul_ppmm) && defined (__umulsidi3)
1661 #define umul_ppmm(ph, pl, m0, m1) \
1662   do {                                                                  \
1663     UDWtype __ll = __umulsidi3 (m0, m1);                                \
1664     ph = (UWtype) (__ll >> W_TYPE_SIZE);                                \
1665     pl = (UWtype) __ll;                                                 \
1666   } while (0)
1667 #endif
1668
1669 #if !defined (__umulsidi3)
1670 #define __umulsidi3(u, v) \
1671   ({UWtype __hi, __lo;                                                  \
1672     umul_ppmm (__hi, __lo, u, v);                                       \
1673     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1674 #endif
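/* Illustrative only: each fallback above defines one operation in terms of
   the other, whichever is native.  E.g. with a hypothetical 32-bit UWtype,

     UDWtype __p = __umulsidi3 (0x10000, 0x10000);

   leaves __p == (UDWtype) 1 << 32, i.e. high word 1 and low word 0, no
   matter which way the pair is resolved.  */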
1675
1676
1677 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1678    forms have "reversed" arguments, meaning the pointer is last, which
1679    sometimes allows better parameter passing, in particular on 64-bit
1680    hppa. */
1681
1682 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1683 extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
1684
1685 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1686   && ! defined (LONGLONG_STANDALONE)
1687 #define umul_ppmm(wh, wl, u, v)                                               \
1688   do {                                                                        \
1689     UWtype __umul_ppmm__p0;                                                   \
1690     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
1691     (wl) = __umul_ppmm__p0;                                                   \
1692   } while (0)
1693 #endif
1694
1695 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1696 extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
1697
1698 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1699   && ! defined (LONGLONG_STANDALONE)
1700 #define umul_ppmm(wh, wl, u, v)                                               \
1701   do {                                                                        \
1702     UWtype __umul_ppmm__p0;                                                   \
1703     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
1704     (wl) = __umul_ppmm__p0;                                                   \
1705   } while (0)
1706 #endif
1707
1708 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1709 extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1710
1711 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1712   && ! defined (LONGLONG_STANDALONE)
1713 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1714   do {                                                                  \
1715     UWtype __udiv_qrnnd__r;                                             \
1716     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                             \
1717                           (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
1718     (r) = __udiv_qrnnd__r;                                              \
1719   } while (0)
1720 #endif
1721
1722 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1723 extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
1724
1725 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1726   && ! defined (LONGLONG_STANDALONE)
1727 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1728   do {                                                                  \
1729     UWtype __udiv_qrnnd__r;                                             \
1730     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
1731                             &__udiv_qrnnd__r);                          \
1732     (r) = __udiv_qrnnd__r;                                              \
1733   } while (0)
1734 #endif
1735
1736
1737 /* If this machine has no inline assembler, use C macros.  */
1738
1739 #if !defined (add_ssaaaa)
1740 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1741   do {                                                                  \
1742     UWtype __x;                                                         \
1743     __x = (al) + (bl);                                                  \
1744     (sh) = (ah) + (bh) + (__x < (al));                                  \
1745     (sl) = __x;                                                         \
1746   } while (0)
1747 #endif
1748
1749 #if !defined (sub_ddmmss)
1750 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1751   do {                                                                  \
1752     UWtype __x;                                                         \
1753     __x = (al) - (bl);                                                  \
1754     (sh) = (ah) - (bh) - ((al) < (bl));                                 \
1755     (sl) = __x;                                                         \
1756   } while (0)
1757 #endif
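/* Illustrative only: with the generic macros above, a double-word sum is
   formed limb by limb, the carry coming from the unsigned compare
   __x < (al).  E.g. with a hypothetical 32-bit UWtype,

     UWtype sh, sl;
     add_ssaaaa (sh, sl, 0x00000001, 0xFFFFFFFF, 0x00000000, 0x00000001);

   leaves sh == 2 and sl == 0: the low limbs wrap, and the resulting carry
   is added into the high limbs.  sub_ddmmss borrows analogously via
   (al) < (bl).  */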
1758
1759 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1760    smul_ppmm.  */
1761 #if !defined (umul_ppmm) && defined (smul_ppmm)
1762 #define umul_ppmm(w1, w0, u, v)                                         \
1763   do {                                                                  \
1764     UWtype __w1;                                                        \
1765     UWtype __xm0 = (u), __xm1 = (v);                                    \
1766     smul_ppmm (__w1, w0, __xm0, __xm1);                                 \
1767     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
1768                 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
1769   } while (0)
1770 #endif
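/* Why those masked additions are the right correction: reading a W-bit
   word u as signed subtracts 2^W exactly when its top bit is set, i.e.
   u_signed = u - 2^W * (u >> (W-1)).  Expanding u * v in terms of the
   signed product modulo 2^(2W) shows the unsigned high word exceeds the
   signed one by v when u's top bit is set and by u when v's top bit is
   set; -(__xm0 >> (W_TYPE_SIZE - 1)) is an all-ones mask in exactly that
   case, so each term adds the other factor conditionally.  */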
1771
1772 /* If we still don't have umul_ppmm, define it using plain C.
1773
1774    For reference, when this code is used for squaring (ie. u and v identical
1775    expressions), gcc recognises __x1 and __x2 are the same and generates 3
1776    multiplies, not 4.  The subsequent additions could be optimized a bit,
1777    but the only place GMP currently uses such a square is mpn_sqr_basecase,
1778    and chips obliged to use this generic C umul will have plenty of worse
1779    performance problems than a couple of extra instructions on the diagonal
1780    of sqr_basecase.  */
1781
1782 #if !defined (umul_ppmm)
1783 #define umul_ppmm(w1, w0, u, v)                                         \
1784   do {                                                                  \
1785     UWtype __x0, __x1, __x2, __x3;                                      \
1786     UHWtype __ul, __vl, __uh, __vh;                                     \
1787     UWtype __u = (u), __v = (v);                                        \
1788                                                                         \
1789     __ul = __ll_lowpart (__u);                                          \
1790     __uh = __ll_highpart (__u);                                         \
1791     __vl = __ll_lowpart (__v);                                          \
1792     __vh = __ll_highpart (__v);                                         \
1793                                                                         \
1794     __x0 = (UWtype) __ul * __vl;                                        \
1795     __x1 = (UWtype) __ul * __vh;                                        \
1796     __x2 = (UWtype) __uh * __vl;                                        \
1797     __x3 = (UWtype) __uh * __vh;                                        \
1798                                                                         \
1799     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
1800     __x1 += __x2;               /* but this indeed can */               \
1801     if (__x1 < __x2)            /* did we get it? */                    \
1802       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
1803                                                                         \
1804     (w1) = __x3 + __ll_highpart (__x1);                                 \
1805     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);               \
1806   } while (0)
1807 #endif
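/* Illustrative only (a hypothetical helper, not part of this file): on a
   machine where UDWtype really is twice W_TYPE_SIZE, the generic scheme
   above can be sanity-checked against a native double-width product.  */
#if 0
static int
__check_umul_ppmm (UWtype u, UWtype v)
{
  UWtype __w1, __w0;
  UDWtype __ref = (UDWtype) u * v;   /* reference product */
  umul_ppmm (__w1, __w0, u, v);      /* half-word schoolbook version */
  return __w1 == (UWtype) (__ref >> W_TYPE_SIZE) && __w0 == (UWtype) __ref;
}
#endif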
1808
1809 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
1810    exist in one form or another).  */
1811 #if !defined (smul_ppmm)
1812 #define smul_ppmm(w1, w0, u, v)                                         \
1813   do {                                                                  \
1814     UWtype __w1;                                                        \
1815     UWtype __xm0 = (u), __xm1 = (v);                                    \
1816     umul_ppmm (__w1, w0, __xm0, __xm1);                                 \
1817     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
1818                 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
1819   } while (0)
1820 #endif
1821
1822 /* Define this unconditionally, so it can be used for debugging.  */
1823 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
1824   do {                                                                  \
1825     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
1826                                                                         \
1827     ASSERT ((d) != 0);                                                  \
1828     ASSERT ((n1) < (d));                                                \
1829                                                                         \
1830     __d1 = __ll_highpart (d);                                           \
1831     __d0 = __ll_lowpart (d);                                            \
1832                                                                         \
1833     __q1 = (n1) / __d1;                                                 \
1834     __r1 = (n1) - __q1 * __d1;                                          \
1835     __m = __q1 * __d0;                                                  \
1836     __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
1837     if (__r1 < __m)                                                     \
1838       {                                                                 \
1839         __q1--, __r1 += (d);                                            \
1840         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
1841           if (__r1 < __m)                                               \
1842             __q1--, __r1 += (d);                                        \
1843       }                                                                 \
1844     __r1 -= __m;                                                        \
1845                                                                         \
1846     __q0 = __r1 / __d1;                                                 \
1847     __r0 = __r1  - __q0 * __d1;                                         \
1848     __m = __q0 * __d0;                                                  \
1849     __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
1850     if (__r0 < __m)                                                     \
1851       {                                                                 \
1852         __q0--, __r0 += (d);                                            \
1853         if (__r0 >= (d))                                                \
1854           if (__r0 < __m)                                               \
1855             __q0--, __r0 += (d);                                        \
1856       }                                                                 \
1857     __r0 -= __m;                                                        \
1858                                                                         \
1859     (q) = __q1 * __ll_B | __q0;                                         \
1860     (r) = __r0;                                                         \
1861   } while (0)
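/* __udiv_qrnnd_c is the classic schoolbook scheme: it develops the quotient
   one half-word at a time, estimating each half-quotient from the divisor's
   high half __d1 and then applying at most two corrections.  Because the
   estimate uses only __d1, the divisor must be normalized (high bit set),
   which is why the fallback below sets UDIV_NEEDS_NORMALIZATION to 1.  */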
1862
1863 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
1864    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
1865 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
1866 #define udiv_qrnnd(q, r, nh, nl, d) \
1867   do {                                                                  \
1868     UWtype __r;                                                         \
1869     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                         \
1870     (r) = __r;                                                          \
1871   } while (0)
1872 #endif
1873
1874 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
1875 #if !defined (udiv_qrnnd)
1876 #define UDIV_NEEDS_NORMALIZATION 1
1877 #define udiv_qrnnd __udiv_qrnnd_c
1878 #endif
1879
1880 #if !defined (count_leading_zeros)
1881 #define count_leading_zeros(count, x) \
1882   do {                                                                  \
1883     UWtype __xr = (x);                                                  \
1884     UWtype __a;                                                         \
1885                                                                         \
1886     if (W_TYPE_SIZE == 32)                                              \
1887       {                                                                 \
1888         __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
1889           ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
1890           : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
1891           : 3*__BITS4 + 1);                                             \
1892       }                                                                 \
1893     else                                                                \
1894       {                                                                 \
1895         for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
1896           if (((__xr >> __a) & 0xff) != 0)                              \
1897             break;                                                      \
1898         ++__a;                                                          \
1899       }                                                                 \
1900                                                                         \
1901     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];           \
1902   } while (0)
1903 /* This version gives a well-defined value for zero. */
1904 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
1905 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1906 #endif
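/* For the formula above to come out right, __clz_tab[i] must hold the bit
   length of i plus one, with __clz_tab[0] == 1 (an inference from this
   code).  E.g. on a 32-bit word, x == 0x80000000 gives __a == 25,
   __xr >> __a == 64, and 32 + 1 - 25 - 8 == 0 leading zeros; x == 0 gives
   32 + 1 - 1 - 1 == 31 == COUNT_LEADING_ZEROS_0.  */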
1907
1908 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
1909 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
1910 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1911 #endif
1912
1913 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1914 extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
1915 #endif
1916
1917 #if !defined (count_trailing_zeros)
1918 /* Define count_trailing_zeros using count_leading_zeros.  The latter might be
1919    defined in asm, but if it is not, the C version above is good enough.  */
1920 #define count_trailing_zeros(count, x) \
1921   do {                                                                  \
1922     UWtype __ctz_x = (x);                                               \
1923     UWtype __ctz_c;                                                     \
1924     ASSERT (__ctz_x != 0);                                              \
1925     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
1926     (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
1927   } while (0)
1928 #endif
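/* The x & -x step isolates the lowest set bit: in two's complement,
   negation flips every bit above it.  E.g. with a 32-bit UWtype and
   x == 0x68, x & -x == 0x08 == 1 << 3; count_leading_zeros on that gives
   28, and 32 - 1 - 28 == 3 trailing zeros, as expected.  */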
1929
1930 #ifndef UDIV_NEEDS_NORMALIZATION
1931 #define UDIV_NEEDS_NORMALIZATION 0
1932 #endif
1933
1934 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
1935    hence that the latter should always be used.  */
1936 #ifndef UDIV_PREINV_ALWAYS
1937 #define UDIV_PREINV_ALWAYS 0
1938 #endif
1939
1940 /* Give defaults for UMUL_TIME and UDIV_TIME.  */
1941 #ifndef UMUL_TIME
1942 #define UMUL_TIME 1
1943 #endif
1944
1945 #ifndef UDIV_TIME
1946 #define UDIV_TIME UMUL_TIME
1947 #endif