Add OpenSSL 0.9.7e.
[dragonfly.git] / crypto / openssl-0.9.7e / crypto / bn / asm / x86_64-gcc.c
1 /*
2  * x86_64 BIGNUM accelerator version 0.1, December 2002.
3  *
4  * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5  * project.
6  *
7  * Rights for redistribution and usage in source and binary forms are
8  * granted according to the OpenSSL license. Warranty of any kind is
9  * disclaimed.
10  *
11  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12  *    versions, like 1.0...
13  * A. Well, that's because this code is basically a quick-n-dirty
14  *    proof-of-concept hack. As you can see it's implemented with
15  *    inline assembler, which means that you're bound to GCC and that
16  *    there must be a room for fine-tuning.
17  *
18  * Q. Why inline assembler?
19  * A. x86_64 features own ABI I'm not familiar with. Which is why
20  *    I decided to let the compiler take care of subroutine
21  *    prologue/epilogue as well as register allocation.
22  *
23  * Q. How much faster does it get?
24  * A. Unfortunately people sitting on x86_64 hardware are prohibited
25  *    to disclose the performance numbers, so they (SuSE labs to be
26  *    specific) wouldn't tell me. However! Very similar coding technique
27  *    (reaching out for 128-bit result from 64x64-bit multiplication)
28  *    results in >3 times performance improvement on MIPS and I see no
29  *    reason why gain on x86_64 would be so much different:-)
30  */
31
32 #define BN_ULONG unsigned long
33
34 /*
35  * "m"(a), "+m"(r)      is the way to favor DirectPath ยต-code;
36  * "g"(0)               let the compiler to decide where does it
37  *                      want to keep the value of zero;
38  */
39 #define mul_add(r,a,word,carry) do {    \
40         register BN_ULONG high,low;     \
41         asm ("mulq %3"                  \
42                 : "=a"(low),"=d"(high)  \
43                 : "a"(word),"m"(a)      \
44                 : "cc");                \
45         asm ("addq %2,%0; adcq %3,%1"   \
46                 : "+r"(carry),"+d"(high)\
47                 : "a"(low),"g"(0)       \
48                 : "cc");                \
49         asm ("addq %2,%0; adcq %3,%1"   \
50                 : "+m"(r),"+d"(high)    \
51                 : "r"(carry),"g"(0)     \
52                 : "cc");                \
53         carry=high;                     \
54         } while (0)
55
56 #define mul(r,a,word,carry) do {        \
57         register BN_ULONG high,low;     \
58         asm ("mulq %3"                  \
59                 : "=a"(low),"=d"(high)  \
60                 : "a"(word),"g"(a)      \
61                 : "cc");                \
62         asm ("addq %2,%0; adcq %3,%1"   \
63                 : "+r"(carry),"+d"(high)\
64                 : "a"(low),"g"(0)       \
65                 : "cc");                \
66         (r)=carry, carry=high;          \
67         } while (0)
68
69 #define sqr(r0,r1,a)                    \
70         asm ("mulq %2"                  \
71                 : "=a"(r0),"=d"(r1)     \
72                 : "a"(a)                \
73                 : "cc");
74
75 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
76         {
77         BN_ULONG c1=0;
78
79         if (num <= 0) return(c1);
80
81         while (num&~3)
82                 {
83                 mul_add(rp[0],ap[0],w,c1);
84                 mul_add(rp[1],ap[1],w,c1);
85                 mul_add(rp[2],ap[2],w,c1);
86                 mul_add(rp[3],ap[3],w,c1);
87                 ap+=4; rp+=4; num-=4;
88                 }
89         if (num)
90                 {
91                 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
92                 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
93                 mul_add(rp[2],ap[2],w,c1); return c1;
94                 }
95         
96         return(c1);
97         } 
98
99 BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
100         {
101         BN_ULONG c1=0;
102
103         if (num <= 0) return(c1);
104
105         while (num&~3)
106                 {
107                 mul(rp[0],ap[0],w,c1);
108                 mul(rp[1],ap[1],w,c1);
109                 mul(rp[2],ap[2],w,c1);
110                 mul(rp[3],ap[3],w,c1);
111                 ap+=4; rp+=4; num-=4;
112                 }
113         if (num)
114                 {
115                 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
116                 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
117                 mul(rp[2],ap[2],w,c1);
118                 }
119         return(c1);
120         } 
121
122 void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
123         {
124         if (n <= 0) return;
125
126         while (n&~3)
127                 {
128                 sqr(r[0],r[1],a[0]);
129                 sqr(r[2],r[3],a[1]);
130                 sqr(r[4],r[5],a[2]);
131                 sqr(r[6],r[7],a[3]);
132                 a+=4; r+=8; n-=4;
133                 }
134         if (n)
135                 {
136                 sqr(r[0],r[1],a[0]); if (--n == 0) return;
137                 sqr(r[2],r[3],a[1]); if (--n == 0) return;
138                 sqr(r[4],r[5],a[2]);
139                 }
140         }
141
142 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
143 {       BN_ULONG ret,waste;
144
145         asm ("divq      %4"
146                 : "=a"(ret),"=d"(waste)
147                 : "a"(l),"d"(h),"g"(d)
148                 : "cc");
149
150         return ret;
151 }
152
153 BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
154 { BN_ULONG ret,i;
155
156         if (n <= 0) return 0;
157
158         asm (
159         "       subq    %2,%2           \n"
160         ".align 16                      \n"
161         "1:     movq    (%4,%2,8),%0    \n"
162         "       adcq    (%5,%2,8),%0    \n"
163         "       movq    %0,(%3,%2,8)    \n"
164         "       leaq    1(%2),%2        \n"
165         "       loop    1b              \n"
166         "       sbbq    %0,%0           \n"
167                 : "+a"(ret),"+c"(n),"+r"(i)
168                 : "r"(rp),"r"(ap),"r"(bp)
169                 : "cc"
170         );
171
172   return ret&1;
173 }
174
175 #ifndef SIMICS
176 BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
177 { BN_ULONG ret,i;
178
179         if (n <= 0) return 0;
180
181         asm (
182         "       subq    %2,%2           \n"
183         ".align 16                      \n"
184         "1:     movq    (%4,%2,8),%0    \n"
185         "       sbbq    (%5,%2,8),%0    \n"
186         "       movq    %0,(%3,%2,8)    \n"
187         "       leaq    1(%2),%2        \n"
188         "       loop    1b              \n"
189         "       sbbq    %0,%0           \n"
190                 : "+a"(ret),"+c"(n),"+r"(i)
191                 : "r"(rp),"r"(ap),"r"(bp)
192                 : "cc"
193         );
194
195   return ret&1;
196 }
197 #else
198 /* Simics 1.4<7 has buggy sbbq:-( */
199 #define BN_MASK2 0xffffffffffffffffL
200 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
201         {
202         BN_ULONG t1,t2;
203         int c=0;
204
205         if (n <= 0) return((BN_ULONG)0);
206
207         for (;;)
208                 {
209                 t1=a[0]; t2=b[0];
210                 r[0]=(t1-t2-c)&BN_MASK2;
211                 if (t1 != t2) c=(t1 < t2);
212                 if (--n <= 0) break;
213
214                 t1=a[1]; t2=b[1];
215                 r[1]=(t1-t2-c)&BN_MASK2;
216                 if (t1 != t2) c=(t1 < t2);
217                 if (--n <= 0) break;
218
219                 t1=a[2]; t2=b[2];
220                 r[2]=(t1-t2-c)&BN_MASK2;
221                 if (t1 != t2) c=(t1 < t2);
222                 if (--n <= 0) break;
223
224                 t1=a[3]; t2=b[3];
225                 r[3]=(t1-t2-c)&BN_MASK2;
226                 if (t1 != t2) c=(t1 < t2);
227                 if (--n <= 0) break;
228
229                 a+=4;
230                 b+=4;
231                 r+=4;
232                 }
233         return(c);
234         }
235 #endif
236
237 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
238 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
239 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
240 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
241
242 #if 0
243 /* original macros are kept for reference purposes */
244 #define mul_add_c(a,b,c0,c1,c2) {       \
245         BN_ULONG ta=(a),tb=(b);         \
246         t1 = ta * tb;                   \
247         t2 = BN_UMULT_HIGH(ta,tb);      \
248         c0 += t1; t2 += (c0<t1)?1:0;    \
249         c1 += t2; c2 += (c1<t2)?1:0;    \
250         }
251
252 #define mul_add_c2(a,b,c0,c1,c2) {      \
253         BN_ULONG ta=(a),tb=(b),t0;      \
254         t1 = BN_UMULT_HIGH(ta,tb);      \
255         t0 = ta * tb;                   \
256         t2 = t1+t1; c2 += (t2<t1)?1:0;  \
257         t1 = t0+t0; t2 += (t1<t0)?1:0;  \
258         c0 += t1; t2 += (c0<t1)?1:0;    \
259         c1 += t2; c2 += (c1<t2)?1:0;    \
260         }
261 #else
262 #define mul_add_c(a,b,c0,c1,c2) do {    \
263         asm ("mulq %3"                  \
264                 : "=a"(t1),"=d"(t2)     \
265                 : "a"(a),"m"(b)         \
266                 : "cc");                \
267         asm ("addq %2,%0; adcq %3,%1"   \
268                 : "+r"(c0),"+d"(t2)     \
269                 : "a"(t1),"g"(0)        \
270                 : "cc");                \
271         asm ("addq %2,%0; adcq %3,%1"   \
272                 : "+r"(c1),"+r"(c2)     \
273                 : "d"(t2),"g"(0)        \
274                 : "cc");                \
275         } while (0)
276
277 #define sqr_add_c(a,i,c0,c1,c2) do {    \
278         asm ("mulq %2"                  \
279                 : "=a"(t1),"=d"(t2)     \
280                 : "a"(a[i])             \
281                 : "cc");                \
282         asm ("addq %2,%0; adcq %3,%1"   \
283                 : "+r"(c0),"+d"(t2)     \
284                 : "a"(t1),"g"(0)        \
285                 : "cc");                \
286         asm ("addq %2,%0; adcq %3,%1"   \
287                 : "+r"(c1),"+r"(c2)     \
288                 : "d"(t2),"g"(0)        \
289                 : "cc");                \
290         } while (0)
291
292 #define mul_add_c2(a,b,c0,c1,c2) do {   \
293         asm ("mulq %3"                  \
294                 : "=a"(t1),"=d"(t2)     \
295                 : "a"(a),"m"(b)         \
296                 : "cc");                \
297         asm ("addq %0,%0; adcq %2,%1"   \
298                 : "+d"(t2),"+r"(c2)     \
299                 : "g"(0)                \
300                 : "cc");                \
301         asm ("addq %0,%0; adcq %2,%1"   \
302                 : "+a"(t1),"+d"(t2)     \
303                 : "g"(0)                \
304                 : "cc");                \
305         asm ("addq %2,%0; adcq %3,%1"   \
306                 : "+r"(c0),"+d"(t2)     \
307                 : "a"(t1),"g"(0)        \
308                 : "cc");                \
309         asm ("addq %2,%0; adcq %3,%1"   \
310                 : "+r"(c1),"+r"(c2)     \
311                 : "d"(t2),"g"(0)        \
312                 : "cc");                \
313         } while (0)
314 #endif
315
316 #define sqr_add_c2(a,i,j,c0,c1,c2)      \
317         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
318
319 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
320         {
321         BN_ULONG bl,bh;
322         BN_ULONG t1,t2;
323         BN_ULONG c1,c2,c3;
324
325         c1=0;
326         c2=0;
327         c3=0;
328         mul_add_c(a[0],b[0],c1,c2,c3);
329         r[0]=c1;
330         c1=0;
331         mul_add_c(a[0],b[1],c2,c3,c1);
332         mul_add_c(a[1],b[0],c2,c3,c1);
333         r[1]=c2;
334         c2=0;
335         mul_add_c(a[2],b[0],c3,c1,c2);
336         mul_add_c(a[1],b[1],c3,c1,c2);
337         mul_add_c(a[0],b[2],c3,c1,c2);
338         r[2]=c3;
339         c3=0;
340         mul_add_c(a[0],b[3],c1,c2,c3);
341         mul_add_c(a[1],b[2],c1,c2,c3);
342         mul_add_c(a[2],b[1],c1,c2,c3);
343         mul_add_c(a[3],b[0],c1,c2,c3);
344         r[3]=c1;
345         c1=0;
346         mul_add_c(a[4],b[0],c2,c3,c1);
347         mul_add_c(a[3],b[1],c2,c3,c1);
348         mul_add_c(a[2],b[2],c2,c3,c1);
349         mul_add_c(a[1],b[3],c2,c3,c1);
350         mul_add_c(a[0],b[4],c2,c3,c1);
351         r[4]=c2;
352         c2=0;
353         mul_add_c(a[0],b[5],c3,c1,c2);
354         mul_add_c(a[1],b[4],c3,c1,c2);
355         mul_add_c(a[2],b[3],c3,c1,c2);
356         mul_add_c(a[3],b[2],c3,c1,c2);
357         mul_add_c(a[4],b[1],c3,c1,c2);
358         mul_add_c(a[5],b[0],c3,c1,c2);
359         r[5]=c3;
360         c3=0;
361         mul_add_c(a[6],b[0],c1,c2,c3);
362         mul_add_c(a[5],b[1],c1,c2,c3);
363         mul_add_c(a[4],b[2],c1,c2,c3);
364         mul_add_c(a[3],b[3],c1,c2,c3);
365         mul_add_c(a[2],b[4],c1,c2,c3);
366         mul_add_c(a[1],b[5],c1,c2,c3);
367         mul_add_c(a[0],b[6],c1,c2,c3);
368         r[6]=c1;
369         c1=0;
370         mul_add_c(a[0],b[7],c2,c3,c1);
371         mul_add_c(a[1],b[6],c2,c3,c1);
372         mul_add_c(a[2],b[5],c2,c3,c1);
373         mul_add_c(a[3],b[4],c2,c3,c1);
374         mul_add_c(a[4],b[3],c2,c3,c1);
375         mul_add_c(a[5],b[2],c2,c3,c1);
376         mul_add_c(a[6],b[1],c2,c3,c1);
377         mul_add_c(a[7],b[0],c2,c3,c1);
378         r[7]=c2;
379         c2=0;
380         mul_add_c(a[7],b[1],c3,c1,c2);
381         mul_add_c(a[6],b[2],c3,c1,c2);
382         mul_add_c(a[5],b[3],c3,c1,c2);
383         mul_add_c(a[4],b[4],c3,c1,c2);
384         mul_add_c(a[3],b[5],c3,c1,c2);
385         mul_add_c(a[2],b[6],c3,c1,c2);
386         mul_add_c(a[1],b[7],c3,c1,c2);
387         r[8]=c3;
388         c3=0;
389         mul_add_c(a[2],b[7],c1,c2,c3);
390         mul_add_c(a[3],b[6],c1,c2,c3);
391         mul_add_c(a[4],b[5],c1,c2,c3);
392         mul_add_c(a[5],b[4],c1,c2,c3);
393         mul_add_c(a[6],b[3],c1,c2,c3);
394         mul_add_c(a[7],b[2],c1,c2,c3);
395         r[9]=c1;
396         c1=0;
397         mul_add_c(a[7],b[3],c2,c3,c1);
398         mul_add_c(a[6],b[4],c2,c3,c1);
399         mul_add_c(a[5],b[5],c2,c3,c1);
400         mul_add_c(a[4],b[6],c2,c3,c1);
401         mul_add_c(a[3],b[7],c2,c3,c1);
402         r[10]=c2;
403         c2=0;
404         mul_add_c(a[4],b[7],c3,c1,c2);
405         mul_add_c(a[5],b[6],c3,c1,c2);
406         mul_add_c(a[6],b[5],c3,c1,c2);
407         mul_add_c(a[7],b[4],c3,c1,c2);
408         r[11]=c3;
409         c3=0;
410         mul_add_c(a[7],b[5],c1,c2,c3);
411         mul_add_c(a[6],b[6],c1,c2,c3);
412         mul_add_c(a[5],b[7],c1,c2,c3);
413         r[12]=c1;
414         c1=0;
415         mul_add_c(a[6],b[7],c2,c3,c1);
416         mul_add_c(a[7],b[6],c2,c3,c1);
417         r[13]=c2;
418         c2=0;
419         mul_add_c(a[7],b[7],c3,c1,c2);
420         r[14]=c3;
421         r[15]=c1;
422         }
423
424 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
425         {
426         BN_ULONG bl,bh;
427         BN_ULONG t1,t2;
428         BN_ULONG c1,c2,c3;
429
430         c1=0;
431         c2=0;
432         c3=0;
433         mul_add_c(a[0],b[0],c1,c2,c3);
434         r[0]=c1;
435         c1=0;
436         mul_add_c(a[0],b[1],c2,c3,c1);
437         mul_add_c(a[1],b[0],c2,c3,c1);
438         r[1]=c2;
439         c2=0;
440         mul_add_c(a[2],b[0],c3,c1,c2);
441         mul_add_c(a[1],b[1],c3,c1,c2);
442         mul_add_c(a[0],b[2],c3,c1,c2);
443         r[2]=c3;
444         c3=0;
445         mul_add_c(a[0],b[3],c1,c2,c3);
446         mul_add_c(a[1],b[2],c1,c2,c3);
447         mul_add_c(a[2],b[1],c1,c2,c3);
448         mul_add_c(a[3],b[0],c1,c2,c3);
449         r[3]=c1;
450         c1=0;
451         mul_add_c(a[3],b[1],c2,c3,c1);
452         mul_add_c(a[2],b[2],c2,c3,c1);
453         mul_add_c(a[1],b[3],c2,c3,c1);
454         r[4]=c2;
455         c2=0;
456         mul_add_c(a[2],b[3],c3,c1,c2);
457         mul_add_c(a[3],b[2],c3,c1,c2);
458         r[5]=c3;
459         c3=0;
460         mul_add_c(a[3],b[3],c1,c2,c3);
461         r[6]=c1;
462         r[7]=c2;
463         }
464
465 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
466         {
467         BN_ULONG bl,bh;
468         BN_ULONG t1,t2;
469         BN_ULONG c1,c2,c3;
470
471         c1=0;
472         c2=0;
473         c3=0;
474         sqr_add_c(a,0,c1,c2,c3);
475         r[0]=c1;
476         c1=0;
477         sqr_add_c2(a,1,0,c2,c3,c1);
478         r[1]=c2;
479         c2=0;
480         sqr_add_c(a,1,c3,c1,c2);
481         sqr_add_c2(a,2,0,c3,c1,c2);
482         r[2]=c3;
483         c3=0;
484         sqr_add_c2(a,3,0,c1,c2,c3);
485         sqr_add_c2(a,2,1,c1,c2,c3);
486         r[3]=c1;
487         c1=0;
488         sqr_add_c(a,2,c2,c3,c1);
489         sqr_add_c2(a,3,1,c2,c3,c1);
490         sqr_add_c2(a,4,0,c2,c3,c1);
491         r[4]=c2;
492         c2=0;
493         sqr_add_c2(a,5,0,c3,c1,c2);
494         sqr_add_c2(a,4,1,c3,c1,c2);
495         sqr_add_c2(a,3,2,c3,c1,c2);
496         r[5]=c3;
497         c3=0;
498         sqr_add_c(a,3,c1,c2,c3);
499         sqr_add_c2(a,4,2,c1,c2,c3);
500         sqr_add_c2(a,5,1,c1,c2,c3);
501         sqr_add_c2(a,6,0,c1,c2,c3);
502         r[6]=c1;
503         c1=0;
504         sqr_add_c2(a,7,0,c2,c3,c1);
505         sqr_add_c2(a,6,1,c2,c3,c1);
506         sqr_add_c2(a,5,2,c2,c3,c1);
507         sqr_add_c2(a,4,3,c2,c3,c1);
508         r[7]=c2;
509         c2=0;
510         sqr_add_c(a,4,c3,c1,c2);
511         sqr_add_c2(a,5,3,c3,c1,c2);
512         sqr_add_c2(a,6,2,c3,c1,c2);
513         sqr_add_c2(a,7,1,c3,c1,c2);
514         r[8]=c3;
515         c3=0;
516         sqr_add_c2(a,7,2,c1,c2,c3);
517         sqr_add_c2(a,6,3,c1,c2,c3);
518         sqr_add_c2(a,5,4,c1,c2,c3);
519         r[9]=c1;
520         c1=0;
521         sqr_add_c(a,5,c2,c3,c1);
522         sqr_add_c2(a,6,4,c2,c3,c1);
523         sqr_add_c2(a,7,3,c2,c3,c1);
524         r[10]=c2;
525         c2=0;
526         sqr_add_c2(a,7,4,c3,c1,c2);
527         sqr_add_c2(a,6,5,c3,c1,c2);
528         r[11]=c3;
529         c3=0;
530         sqr_add_c(a,6,c1,c2,c3);
531         sqr_add_c2(a,7,5,c1,c2,c3);
532         r[12]=c1;
533         c1=0;
534         sqr_add_c2(a,7,6,c2,c3,c1);
535         r[13]=c2;
536         c2=0;
537         sqr_add_c(a,7,c3,c1,c2);
538         r[14]=c3;
539         r[15]=c1;
540         }
541
542 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
543         {
544         BN_ULONG bl,bh;
545         BN_ULONG t1,t2;
546         BN_ULONG c1,c2,c3;
547
548         c1=0;
549         c2=0;
550         c3=0;
551         sqr_add_c(a,0,c1,c2,c3);
552         r[0]=c1;
553         c1=0;
554         sqr_add_c2(a,1,0,c2,c3,c1);
555         r[1]=c2;
556         c2=0;
557         sqr_add_c(a,1,c3,c1,c2);
558         sqr_add_c2(a,2,0,c3,c1,c2);
559         r[2]=c3;
560         c3=0;
561         sqr_add_c2(a,3,0,c1,c2,c3);
562         sqr_add_c2(a,2,1,c1,c2,c3);
563         r[3]=c1;
564         c1=0;
565         sqr_add_c(a,2,c2,c3,c1);
566         sqr_add_c2(a,3,1,c2,c3,c1);
567         r[4]=c2;
568         c2=0;
569         sqr_add_c2(a,3,2,c3,c1,c2);
570         r[5]=c3;
571         c3=0;
572         sqr_add_c(a,3,c1,c2,c3);
573         r[6]=c1;
574         r[7]=c2;
575         }