contrib/gmp/mpn/generic/sqrmod_bnm1.c

   1 /* sqrmod_bnm1.c -- squaring mod B^n-1.
   2
   3    Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   4    Marco Bodrato.
   5
   6    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   7    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   8    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   9
  10 Copyright 2009, 2010 Free Software Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of the GNU Lesser General Public License as published by
  16 the Free Software Foundation; either version 3 of the License, or (at your
  17 option) any later version.
  18
  19 The GNU MP Library is distributed in the hope that it will be useful, but
  20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  22 License for more details.
  23
  24 You should have received a copy of the GNU Lesser General Public License
  25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  26
  27
  28 #include "gmp.h"
  29 #include "gmp-impl.h"
  30 #include "longlong.h"
  31
  32 /* Input is {ap,rn}; output is {rp,rn}, computation is
  33    mod B^rn - 1, and values are semi-normalised; zero is represented
  34    as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
  35    tp==rp is allowed. */
  36 static void
  37 mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
  38 {
  39   mp_limb_t cy;
  40
  41   ASSERT (0 < rn);
  42
  43   mpn_sqr (tp, ap, rn);
  44   cy = mpn_add_n (rp, tp, tp + rn, rn);
  45   /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
  46    * be no overflow when adding in the carry. */
  47   MPN_INCR_U (rp, rn, cy);
  48 }
  49
  50
  51 /* Input is {ap,rn+1}; output is {rp,rn+1}, in
  52    semi-normalised representation, computation is mod B^rn + 1. Needs
  53    a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
  54    Output is normalised. */
  55 static void
  56 mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
  57 {
  58   mp_limb_t cy;
  59
  60   ASSERT (0 < rn);
  61
  62   mpn_sqr (tp, ap, rn + 1);
  63   ASSERT (tp[2*rn+1] == 0);
  64   ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  65   cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  66   rp[rn] = 0;
  67   MPN_INCR_U (rp, rn+1, cy );
  68 }
  69
  70
  71 /* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
  72  *
  73  * The result is expected to be ZERO if and only if the operand
  74  * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
  75  * B^rn-1.
  76  * It should not be a problem if sqrmod_bnm1 is used to
  77  * compute the full square with an <= 2*rn, because this condition
  78  * implies (B^an-1)^2 < (B^rn-1) .
  79  *
  80  * Requires rn/4 < an <= rn
  81  * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
  82  *
  83  * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
  84  */
  85 void
  86 mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
  87 {
  88   ASSERT (0 < an);
  89   ASSERT (an <= rn);
  90
  91   if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
  92     {
  93       if (UNLIKELY (an < rn))
  94         {
  95           if (UNLIKELY (2*an <= rn))
  96             {
  97               mpn_sqr (rp, ap, an);
  98             }
  99           else
 100             {
 101               mp_limb_t cy;
 102               mpn_sqr (tp, ap, an);
 103               cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
 104               MPN_INCR_U (rp, rn, cy);
 105             }
 106         }
 107       else
 108         mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
 109     }
 110   else
 111     {
 112       mp_size_t n;
 113       mp_limb_t cy;
 114       mp_limb_t hi;
 115
 116       n = rn >> 1;
 117
 118       ASSERT (2*an > n);
 119
 120       /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
 121          and crt together as
 122
 123          x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
 124       */
 125
 126 #define a0 ap
 127 #define a1 (ap + n)
 128
 129 #define xp  tp  /* 2n + 2 */
 130       /* am1  maybe in {xp, n} */
 131 #define sp1 (tp + 2*n + 2)
 132       /* ap1  maybe in {sp1, n + 1} */
 133
 134       {
 135         mp_srcptr am1;
 136         mp_size_t anm;
 137         mp_ptr so;
 138
 139         if (LIKELY (an > n))
 140           {
 141             so = xp + n;
 142             am1 = xp;
 143             cy = mpn_add (xp, a0, n, a1, an - n);
 144             MPN_INCR_U (xp, n, cy);
 145             anm = n;
 146           }
 147         else
 148           {
 149             so = xp;
 150             am1 = a0;
 151             anm = an;
 152           }
 153
 154         mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
 155       }
 156
 157       {
 158         int       k;
 159         mp_srcptr ap1;
 160         mp_size_t anp;
 161
 162         if (LIKELY (an > n)) {
 163           ap1 = sp1;
 164           cy = mpn_sub (sp1, a0, n, a1, an - n);
 165           sp1[n] = 0;
 166           MPN_INCR_U (sp1, n + 1, cy);
 167           anp = n + ap1[n];
 168         } else {
 169           ap1 = a0;
 170           anp = an;
 171         }
 172
 173         if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
 174           k=0;
 175         else
 176           {
 177             int mask;
 178             k = mpn_fft_best_k (n, 1);
 179             mask = (1<<k) -1;
 180             while (n & mask) {k--; mask >>=1;};
 181           }
 182         if (k >= FFT_FIRST_K)
 183           xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
 184         else if (UNLIKELY (ap1 == a0))
 185           {
 186             ASSERT (anp <= n);
 187             ASSERT (2*anp > n);
 188             mpn_sqr (xp, a0, an);
 189             anp = 2*an - n;
 190             cy = mpn_sub (xp, xp, n, xp + n, anp);
 191             xp[n] = 0;
 192             MPN_INCR_U (xp, n+1, cy);
 193           }
 194         else
 195           mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
 196       }
 197
 198       /* Here the CRT recomposition begins.
 199
 200          xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
 201          Division by 2 is a bitwise rotation.
 202
 203          Assumes xp normalised mod (B^n+1).
 204
 205          The residue class [0] is represented by [B^n-1]; except when
 206          both input are ZERO.
 207       */
 208
 209 #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
 210 #if HAVE_NATIVE_mpn_rsh1add_nc
 211       cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
 212       hi = cy << (GMP_NUMB_BITS - 1);
 213       cy = 0;
 214       /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
 215          overflows, i.e. a further increment will not overflow again. */
 216 #else /* ! _nc */
 217       cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
 218       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
 219       cy >>= 1;
 220       /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
 221          the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
 222 #endif
 223 #if GMP_NAIL_BITS == 0
 224       add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
 225 #else
 226       cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
 227       rp[n-1] ^= hi;
 228 #endif
 229 #else /* ! HAVE_NATIVE_mpn_rsh1add_n */
 230 #if HAVE_NATIVE_mpn_add_nc
 231       cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
 232 #else /* ! _nc */
 233       cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
 234 #endif
 235       cy += (rp[0]&1);
 236       mpn_rshift(rp, rp, n, 1);
 237       ASSERT (cy <= 2);
 238       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
 239       cy >>= 1;
 240       /* We can have cy != 0 only if hi = 0... */
 241       ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
 242       rp[n-1] |= hi;
 243       /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
 244 #endif
 245       ASSERT (cy <= 1);
 246       /* Next increment can not overflow, read the previous comments about cy. */
 247       ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
 248       MPN_INCR_U(rp, n, cy);
 249
 250       /* Compute the highest half:
 251          ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
 252        */
 253       if (UNLIKELY (2*an < rn))
 254         {
 255           /* Note that in this case, the only way the result can equal
 256              zero mod B^{rn} - 1 is if the input is zero, and
 257              then the output of both the recursive calls and this CRT
 258              reconstruction is zero, not B^{rn} - 1. */
 259           cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
 260
 261           /* FIXME: This subtraction of the high parts is not really
 262              necessary, we do it to get the carry out, and for sanity
 263              checking. */
 264           cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
 265                                    xp + 2*an - n, rn - 2*an, cy);
 266           ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
 267           cy = mpn_sub_1 (rp, rp, 2*an, cy);
 268           ASSERT (cy == (xp + 2*an - n)[0]);
 269         }
 270       else
 271         {
 272           cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
 273           /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
 274              DECR will affect _at most_ the lowest n limbs. */
 275           MPN_DECR_U (rp, 2*n, cy);
 276         }
 277 #undef a0
 278 #undef a1
 279 #undef xp
 280 #undef sp1
 281     }
 282 }
 283
 284 mp_size_t
 285 mpn_sqrmod_bnm1_next_size (mp_size_t n)
 286 {
 287   mp_size_t nh;
 288
 289   if (BELOW_THRESHOLD (n,     SQRMOD_BNM1_THRESHOLD))
 290     return n;
 291   if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
 292     return (n + (2-1)) & (-2);
 293   if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
 294     return (n + (4-1)) & (-4);
 295
 296   nh = (n + 1) >> 1;
 297
 298   if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD))
 299     return (n + (8-1)) & (-8);
 300
 301   return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1));
 302 }