/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code) and
   Torbjorn Granlund (mpn_sqrtrem1).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
/* See "Karatsuba Square Root", reference in gmp.texi.  */

#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
static const unsigned short invsqrttab[384] =
{
  0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
  0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
  0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
  0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
  0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
  0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
  0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
  0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
  0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
  0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
  0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
  0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
  0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
  0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
  0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
  0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
  0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
  0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
  0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
  0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
  0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
  0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
  0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
  0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
  0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
  0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
  0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
  0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
  0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
  0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
  0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
  0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
  0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
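
/* MAGIC biases the last fixed-point correction downward; any constant in
   the ranges quoted above appears to keep the approximation at or below
   the exact root, so the final adjustment at the end of mpn_sqrtrem1
   needs to add at most 1.  */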
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
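  /* Each fixed-point step below can be read as a scaled variant of the
     classical Newton step for f(x) = 1/x^2 - a, namely
        x' = x * (3 - a*x^2) / 2,
     which roughly doubles the number of correct bits per iteration.  */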

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
  x0 = invsqrttab[abits - 0x80];          /* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16 + 2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32 - 8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32 - 8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16 - 8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16 - 8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2 * x0 <= a0 - 1)
    {
      x2 += 2 * x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
#define Prec (GMP_NUMB_BITS >> 1)
/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
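/* As a concrete illustration, with hypothetical 4-bit limbs (B = 16,
   chosen only for readability): {np, 2} = {15, 15} is 255, whose square
   root is sp[0] = 15 with remainder 255 - 225 = 30 = 1*B + 14, returned
   as cc = 1 and rp[0] = 14.  */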
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  qhl = 0;
  rp0 = rp[0];
  while (rp0 >= sp0)    /* reduce rp0 below sp0, counting multiples in qhl */
    {
      qhl++;
      rp0 -= sp0;
    }
  /* now rp0 < sp0 < 2^Prec */
  rp0 = (rp0 << Prec) + (np0 >> Prec);
  u = 2 * sp0;
  q = rp0 / u;
  u = rp0 - q * u;
  q += (qhl & 1) << (Prec - 1);
  qhl >>= 1;    /* if qhl=1, then necessarily q=0, since qhl*2^Prec + q <= 2^Prec */
  /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl*2^Prec + q) * (2sp0) + u */
  sp0 = ((sp0 + qhl) << Prec) + q;
  cc = u >> Prec;
  rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
  /* subtract q * q or qhl*2^(2*Prec) from rp */
  q2 = q * q;
  cc -= (rp0 < q2) + qhl;
  rp0 -= q2;
  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
  if (cc < 0)           /* negative remainder: decrease the root by one */
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B = 2^GMP_NUMB_BITS.  */
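/* Outline, following the SqrtRem algorithm of the paper cited above (a
   reading aid, not a specification of this exact code): write the input
   as N = A*b^2 + a1*b + a0 with b = B^l, where A has 2h limbs and
   n = l + h.  Then
     (s', r') = SqrtRem (A)                 -- recursive call on the high part
     (q, u)   = DivRem (r'*b + a1, 2*s')    -- one division by twice the root
     s = s'*b + q
     r = u*b + a0 - q*q
     if r < 0:  r = r + 2*s - 1,  s = s - 1 -- at most one correction
   */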
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
{
  mp_limb_t q;    /* carry out of {sp, n} */
  int c, b;       /* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;                                /* n = l + h, h >= l */
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);   /* high part of the root */
      if (q != 0)
        mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
      q += mpn_divrem (sp, 0, np + l, n, sp + l, h);    /* divide by twice the root */
      c = sp[0] & 1;
      mpn_rshift (sp, sp, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      q >>= 1;
      if (c != 0)
        c = mpn_add_n (np + l, np + l, sp + l, h);
      mpn_sqr_n (np + n, sp, l);                /* square of the low part */
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
      q = mpn_add_1 (sp + l, sp + l, h, q);

      if (c < 0)        /* negative remainder: decrease the root by one */
        {
          c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
          c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
          q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
        }
    }

  return c;
}
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn >= 0);
  ASSERT_MPN (np, nn);

  /* If OP is zero, both results are zero.  */
  if (nn == 0)
    return 0;

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
    {
      mp_limb_t r;
      sp[0] = mpn_sqrtrem1 (&r, high);
      if (rp != NULL)
        rp[0] = r;
      return r != 0;
    }
  count_leading_zeros (c, high);
  c -= GMP_NAIL_BITS;

  c = c / 2;            /* we have to shift left by 2c bits to normalize {np, nn} */
  tn = (nn + 1) / 2;    /* 2*tn is the smallest even integer >= nn */

  TMP_MARK;
  if (nn % 2 != 0 || c > 0)
    {
      tp = TMP_ALLOC_LIMBS (2 * tn);
      tp[0] = 0;        /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
        mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
      else
        MPN_COPY (tp + 2 * tn - nn, np, nn);
      rl = mpn_dc_sqrtrem (sp, tp, tn);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2*tn - nn)*GMP_NUMB_BITS/2,
         thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0 = S mod 2^k.  */
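      /* Hence floor(sqrt(N)) = (S - s0) / 2^k, and the remainder of N is
         (R + 2*S*s0 - s0^2) / 2^(2k); the code below adds 2*s0*S to R,
         subtracts s0^2, then shifts S right by k bits and R right by
         2k bits.  */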
      c += (nn % 2) * GMP_NUMB_BITS / 2;            /* c now represents k */
      s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);   /* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);   /* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);         /* R = R - s0^2 */
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
        rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
        tn++;
      else
        {
          tp += c / GMP_NUMB_BITS;
          c = c % GMP_NUMB_BITS;
        }
      if (c != 0)
        mpn_rshift (rp, tp, tn, c);
      else
        MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp == NULL)
        rp = TMP_ALLOC_LIMBS (nn);
      if (rp != np)
        MPN_COPY (rp, np, nn);
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
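
/* Usage sketch (illustrative only, excluded from the build): exercises
   the public mpn_sqrtrem entry point on a 2-limb operand; the limb
   values are arbitrary choices for the example.  */
#if 0
#include <stdio.h>

int
main (void)
{
  mp_limb_t np[2] = { 123, 456 };  /* N = 456*B + 123, least significant limb first */
  mp_limb_t sp[1];                 /* root needs (nn + 1) / 2 = 1 limb */
  mp_limb_t rp[2];                 /* remainder needs at most nn = 2 limbs */
  mp_size_t rn;

  rn = mpn_sqrtrem (sp, rp, np, 2);  /* sp <- floor(sqrt(N)), {rp, rn} <- N - sp[0]^2 */
  printf ("root = %llu, remainder limbs = %ld\n",
          (unsigned long long) sp[0], (long) rn);
  return 0;
}
#endif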