#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice (>2x) as fast. The most common case, rsa1024 sign,
# is improved by a respectable 50%. It remains to be seen whether
# loop unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement
# varies from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that
# they are "fallen through" for an input length of 8, which is
# critical for 1024-bit RSA *sign*. The average performance improvement
# in comparison to the *initial* 2005 version of this module is
# ~0%/30%/40%/45% for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks
# respectively.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont(
$rp="%rdi";     # BN_ULONG *rp,
$ap="%rsi";     # const BN_ULONG *ap,
$bp="%rdx";     # const BN_ULONG *bp,
$np="%rcx";     # const BN_ULONG *np,
$n0="%r8";      # const BN_ULONG *n0,
$num="%r9";     # int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

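# For reference, below is a commentary-only C sketch of the word-by-word
# Montgomery multiplication this routine implements. It is not part of
# the build and makes a few assumptions: BN_ULONG is 64 bits wide,
# unsigned __int128 is available, and the name bn_mul_mont_ref is made
# up for illustration. n0 points at -np[0]^-1 mod 2^64, so that each
# reduction step clears the bottom word exactly.
#
#	int bn_mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *np,
#			const BN_ULONG *n0, int num)
#	{
#		BN_ULONG t[num+2];		/* product scratchpad */
#		memset(t, 0, sizeof(t));
#
#		for (int i = 0; i < num; i++) {
#			unsigned __int128 c = 0;
#			/* t += ap[]*bp[i] */
#			for (int j = 0; j < num; j++) {
#				c += (unsigned __int128)ap[j]*bp[i] + t[j];
#				t[j] = (BN_ULONG)c;  c >>= 64;
#			}
#			c += t[num];
#			t[num]   = (BN_ULONG)c;
#			t[num+1] = (BN_ULONG)(c >> 64);
#
#			BN_ULONG m = t[0]*n0[0];	/* mod 2^64 */
#			/* t = (t + m*np)/2^64, low word is discarded */
#			c = (unsigned __int128)np[0]*m + t[0];
#			c >>= 64;
#			for (int j = 1; j < num; j++) {
#				c += (unsigned __int128)np[j]*m + t[j];
#				t[j-1] = (BN_ULONG)c;  c >>= 64;
#			}
#			c += t[num];
#			t[num-1] = (BN_ULONG)c;
#			t[num]   = t[num+1] + (BN_ULONG)(c >> 64);
#		}
#		/* rp = t, minus np if that does not borrow;
#		   see .Lsub/.Lcopy below */
#		return 1;
#	}
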
$code=<<___;
.text

.globl  bn_mul_mont
.type   bn_mul_mont,\@function,6
.align  16
bn_mul_mont:
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
        jb      .Lmul_enter
        cmp     $ap,$bp
        jne     .Lmul4x_enter
        jmp     .Lsqr4x_enter

.align  16
.Lmul_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     ${num}d,${num}d
        lea     2($num),%r10
        mov     %rsp,%r11
        neg     %r10
        lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
        and     \$-1024,%rsp            # minimize TLB usage

        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
.Lmul_body:
        # Some OSes, *cough*-dows, insist on the stack being "wired" to
        # physical memory in a strictly sequential manner, i.e. if a stack
        # allocation spans two pages, then a reference to the farther one
        # can be punished with SEGV. But page walking does good even on
        # other OSes, because it guarantees that a villain thread hits
        # the guard page before it can do damage to an innocent one...
        sub     %rsp,%r11
        and     \$-4096,%r11
.Lmul_page_walk:
        mov     (%rsp,%r11),%r10
        sub     \$4096,%r11
        .byte   0x66,0x2e               # predict non-taken
        jnc     .Lmul_page_walk
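        # In C terms, the walk above is roughly (commentary-only sketch,
        # illustrative names; old_rsp/new_rsp are byte pointers to the
        # stack before and after the alloca):
        #
        #       ptrdiff_t off = (old_rsp - new_rsp) & -4096;
        #       do
        #               (void)*(volatile char *)(new_rsp + off);
        #       while ((off -= 4096) >= 0);
        #
        # touching one word per page, from the farthest page down.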

        mov     $bp,%r12                # reassign $bp
___
                $bp="%r12";
$code.=<<___;
        mov     ($n0),$n0               # pull n0[0] value
        mov     ($bp),$m0               # m0=bp[0]
        mov     ($ap),%rax

        xor     $i,$i                   # i=0
        xor     $j,$j                   # j=0

        mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[0]
        mov     %rax,$lo0
        mov     ($np),%rax

        imulq   $lo0,$m1                # "tp[0]"*n0
        mov     %rdx,$hi0

        mulq    $m1                     # np[0]*m1
        add     %rax,$lo0               # discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$hi1

        lea     1($j),$j                # j++
        jmp     .L1st_enter

.align  16
.L1st:
        add     %rax,$hi1
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        mov     $lo0,$hi0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

.L1st_enter:
        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        lea     1($j),$j                # j++
        mov     %rdx,$lo0

        mulq    $m1                     # np[j]*m1
        cmp     $num,$j
        jne     .L1st

        add     %rax,$hi1
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
        mov     $lo0,$hi0

        xor     %rdx,%rdx
        add     $hi0,$hi1
        adc     \$0,%rdx
        mov     $hi1,-8(%rsp,$num,8)
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit

        lea     1($i),$i                # i++
        jmp     .Louter
.align  16
.Louter:
        mov     ($bp,$i,8),$m0          # m0=bp[i]
        xor     $j,$j                   # j=0
        mov     $n0,$m1
        mov     (%rsp),$lo0
        mulq    $m0                     # ap[0]*bp[i]
        add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
        mov     ($np),%rax
        adc     \$0,%rdx

        imulq   $lo0,$m1                # tp[0]*n0
        mov     %rdx,$hi0

        mulq    $m1                     # np[0]*m1
        add     %rax,$lo0               # discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     8(%rsp),$lo0            # tp[1]
        mov     %rdx,$hi1

        lea     1($j),$j                # j++
        jmp     .Linner_enter

.align  16
.Linner:
        add     %rax,$hi1
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

.Linner_enter:
        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
        mov     %rdx,$hi0
        adc     \$0,$hi0
        lea     1($j),$j                # j++

        mulq    $m1                     # np[j]*m1
        cmp     $num,$j
        jne     .Linner

        add     %rax,$hi1
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

        xor     %rdx,%rdx
        add     $hi0,$hi1
        adc     \$0,%rdx
        add     $lo0,$hi1               # pull upmost overflow bit
        adc     \$0,%rdx
        mov     $hi1,-8(%rsp,$num,8)
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit

        lea     1($i),$i                # i++
        cmp     $num,$i
        jl      .Louter

        xor     $i,$i                   # i=0 and clear CF!
        mov     (%rsp),%rax             # tp[0]
        lea     (%rsp),$ap              # borrow ap for tp
        mov     $num,$j                 # j=num
        jmp     .Lsub
.align  16
.Lsub:  sbb     ($np,$i,8),%rax
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
        mov     8($ap,$i,8),%rax        # tp[i+1]
        lea     1($i),$i                # i++
        dec     $j                      # doesn't affect CF!
        jnz     .Lsub

        sbb     \$0,%rax                # handle upmost overflow bit
        xor     $i,$i
        and     %rax,$ap
        not     %rax
        mov     $rp,$np
        and     %rax,$np
        mov     $num,$j                 # j=num
        or      $np,$ap                 # ap=borrow?tp:rp
.align  16
.Lcopy:                                 # copy or in-place refresh
        mov     ($ap,$i,8),%rax
        mov     $i,(%rsp,$i,8)          # zap temporary vector
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
        lea     1($i),$i
        sub     \$1,$j
        jnz     .Lcopy
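        # The and/not/or sequence above selects the copy source without
        # branching; in C terms (commentary-only sketch, illustrative
        # names; "borrow" is the final borrow out of .Lsub):
        #
        #       uintptr_t mask = 0 - (uintptr_t)borrow;  /* 0 or all-ones */
        #       src = (BN_ULONG *)(((uintptr_t)tp &  mask) |
        #                          ((uintptr_t)rp & ~mask));
        #       for (i = 0; i < num; i++) {
        #               rp[i] = src[i];  /* copy, or refresh in place   */
        #               tp[i] = i;       /* zap the temporary vector;   */
        #       }                        /* any non-secret value will do */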

        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lmul_epilogue:
        ret
.size   bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type   bn_mul4x_mont,\@function,6
.align  16
bn_mul4x_mont:
.Lmul4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     ${num}d,${num}d
        lea     4($num),%r10
        mov     %rsp,%r11
        neg     %r10
        lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
        and     \$-1024,%rsp            # minimize TLB usage

        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
.Lmul4x_body:
        sub     %rsp,%r11
        and     \$-4096,%r11
.Lmul4x_page_walk:
        mov     (%rsp,%r11),%r10
        sub     \$4096,%r11
        .byte   0x2e                    # predict non-taken
        jnc     .Lmul4x_page_walk

        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
___
                $bp="%r12";
$code.=<<___;
        mov     ($n0),$n0               # pull n0[0] value
        mov     ($bp),$m0               # m0=bp[0]
        mov     ($ap),%rax

        xor     $i,$i                   # i=0
        xor     $j,$j                   # j=0

        mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[0]
        mov     %rax,$A[0]
        mov     ($np),%rax

        imulq   $A[0],$m1               # "tp[0]"*n0
        mov     %rdx,$A[1]

        mulq    $m1                     # np[0]*m1
        add     %rax,$A[0]              # discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$N[1]

        mulq    $m0
        add     %rax,$A[1]
        mov     8($np),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[0]

        mulq    $m1
        add     %rax,$N[1]
        mov     16($ap),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]
        lea     4($j),$j                # j++
        adc     \$0,%rdx
        mov     $N[1],(%rsp)
        mov     %rdx,$N[0]
        jmp     .L1st4x
.align  16
.L1st4x:
        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[0]
        mov     -16($np,$j,8),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     -8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[1]
        mov     -8($np,$j,8),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]

        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[0]
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[1]
        mov     8($np,$j,8),%rax
        adc     \$0,%rdx
        lea     4($j),$j                # j++
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     -16($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]
        cmp     $num,$j
        jl      .L1st4x

        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[0]
        mov     -16($np,$j,8),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     -8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$A[1]
        mov     -8($np,$j,8),%rax
        adc     \$0,%rdx
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]

        xor     $N[1],$N[1]
        add     $A[0],$N[0]
        adc     \$0,$N[1]
        mov     $N[0],-8(%rsp,$j,8)
        mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit

        lea     1($i),$i                # i++
.align  4
.Louter4x:
        mov     ($bp,$i,8),$m0          # m0=bp[i]
        xor     $j,$j                   # j=0
        mov     (%rsp),$A[0]
        mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[i]
        add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
        mov     ($np),%rax
        adc     \$0,%rdx

        imulq   $A[0],$m1               # tp[0]*n0
        mov     %rdx,$A[1]

        mulq    $m1                     # np[0]*m1
        add     %rax,$A[0]              # "$N[0]", discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[1]
        mov     8($np),%rax
        adc     \$0,%rdx
        add     8(%rsp),$A[1]           # +tp[1]
        adc     \$0,%rdx
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     16($ap),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
        lea     4($j),$j                # j++
        adc     \$0,%rdx
        mov     $N[1],(%rsp)            # tp[j-1]
        mov     %rdx,$N[0]
        jmp     .Linner4x
.align  16
.Linner4x:
        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[0]
        mov     -16($np,$j,8),%rax
        adc     \$0,%rdx
        add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     -8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]
        adc     \$0,%rdx
        mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[1]
        mov     -8($np,$j,8),%rax
        adc     \$0,%rdx
        add     -8(%rsp,$j,8),$A[1]
        adc     \$0,%rdx
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]
        adc     \$0,%rdx
        mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[0]
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]
        adc     \$0,%rdx
        mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[1]
        mov     8($np,$j,8),%rax
        adc     \$0,%rdx
        add     8(%rsp,$j,8),$A[1]
        adc     \$0,%rdx
        lea     4($j),$j                # j++
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     -16($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[1],$N[1]
        adc     \$0,%rdx
        mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]
        cmp     $num,$j
        jl      .Linner4x

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[0]
        mov     -16($np,$j,8),%rax
        adc     \$0,%rdx
        add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
        adc     \$0,%rdx
        mov     %rdx,$A[1]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[0]
        mov     -8($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $A[0],$N[0]
        adc     \$0,%rdx
        mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[1]

        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$A[1]
        mov     -8($np,$j,8),%rax
        adc     \$0,%rdx
        add     -8(%rsp,$j,8),$A[1]
        adc     \$0,%rdx
        lea     1($i),$i                # i++
        mov     %rdx,$A[0]

        mulq    $m1                     # np[j]*m1
        add     %rax,$N[1]
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $A[1],$N[1]
        adc     \$0,%rdx
        mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
        mov     %rdx,$N[0]

        xor     $N[1],$N[1]
        add     $A[0],$N[0]
        adc     \$0,$N[1]
        add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
        adc     \$0,$N[1]
        mov     $N[0],-8(%rsp,$j,8)
        mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit

        cmp     $num,$i
        jl      .Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
        mov     16(%rsp,$num,8),$rp     # restore $rp
        mov     0(%rsp),@ri[0]          # tp[0]
        pxor    %xmm0,%xmm0
        mov     8(%rsp),@ri[1]          # tp[1]
        shr     \$2,$num                # num/=4
        lea     (%rsp),$ap              # borrow ap for tp
        xor     $i,$i                   # i=0 and clear CF!

        sub     0($np),@ri[0]
        mov     16($ap),@ri[2]          # tp[2]
        mov     24($ap),@ri[3]          # tp[3]
        sbb     8($np),@ri[1]
        lea     -1($num),$j             # j=num/4-1
        jmp     .Lsub4x
.align  16
.Lsub4x:
        mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
        mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
        sbb     16($np,$i,8),@ri[2]
        mov     32($ap,$i,8),@ri[0]     # tp[i+1]
        mov     40($ap,$i,8),@ri[1]
        sbb     24($np,$i,8),@ri[3]
        mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
        mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
        sbb     32($np,$i,8),@ri[0]
        mov     48($ap,$i,8),@ri[2]
        mov     56($ap,$i,8),@ri[3]
        sbb     40($np,$i,8),@ri[1]
        lea     4($i),$i                # i++
        dec     $j                      # doesn't affect CF!
        jnz     .Lsub4x

        mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
        mov     32($ap,$i,8),@ri[0]     # load overflow bit
        sbb     16($np,$i,8),@ri[2]
        mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
        sbb     24($np,$i,8),@ri[3]
        mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]

        sbb     \$0,@ri[0]              # handle upmost overflow bit
        mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
        xor     $i,$i                   # i=0
        and     @ri[0],$ap
        not     @ri[0]
        mov     $rp,$np
        and     @ri[0],$np
        lea     -1($num),$j
        or      $np,$ap                 # ap=borrow?tp:rp

        movdqu  ($ap),%xmm1
        movdqa  %xmm0,(%rsp)
        movdqu  %xmm1,($rp)
        jmp     .Lcopy4x
.align  16
.Lcopy4x:                                       # copy or in-place refresh
        movdqu  16($ap,$i),%xmm2
        movdqu  32($ap,$i),%xmm1
        movdqa  %xmm0,16(%rsp,$i)
        movdqu  %xmm2,16($rp,$i)
        movdqa  %xmm0,32(%rsp,$i)
        movdqu  %xmm1,32($rp,$i)
        lea     32($i),$i
        dec     $j
        jnz     .Lcopy4x

        shl     \$2,$num
        movdqu  16($ap,$i),%xmm2
        movdqa  %xmm0,16(%rsp,$i)
        movdqu  %xmm2,16($rp,$i)
___
}
$code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lmul4x_epilogue:
        ret
.size   bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr4x_mont(
my $rptr="%rdi";        # BN_ULONG *rptr,
my $aptr="%rsi";        # const BN_ULONG *aptr,
my $bptr="%rdx";        # not used
my $nptr="%rcx";        # const BN_ULONG *nptr,
my $n0  ="%r8";         # const BN_ULONG *n0,
my $num ="%r9";         # int num); num has to be divisible by 4
                        # and not less than 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.type   bn_sqr4x_mont,\@function,6
.align  16
bn_sqr4x_mont:
.Lsqr4x_enter:
        mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        shl     \$3,${num}d             # convert $num to bytes
        mov     %rsp,%r11               # put aside %rsp
        neg     $num                    # -$num
        mov     ($n0),$n0               # *n0
        lea     -72(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
        and     \$-1024,%rsp            # minimize TLB usage

        sub     %rsp,%r11
        and     \$-4096,%r11
.Lsqr4x_page_walk:
        mov     (%rsp,%r11),%r10
        sub     \$4096,%r11
        .byte   0x2e                    # predict non-taken
        jnc     .Lsqr4x_page_walk

        mov     $num,%r10
        neg     $num                    # restore $num
        lea     -48(%rax),%r11          # restore saved %rsp
        ##############################################################
        # Stack layout
        #
        # +0    saved $num, used in reduction section
        # +8    &t[2*$num], used in reduction section
        # +32   saved $rptr
        # +40   saved $nptr
        # +48   saved *n0
        # +56   saved %rsp
        # +64   t[2*$num]
        #
        mov     $rptr,32(%rsp)          # save $rptr
        mov     $nptr,40(%rsp)
        mov     $n0,  48(%rsp)
        mov     %r11, 56(%rsp)          # save original %rsp
.Lsqr4x_body:
        ##############################################################
        # Squaring part:
        #
        # a) multiply-n-add everything but a[i]*a[i];
        # b) shift result of a) by 1 to the left and accumulate
        #    a[i]*a[i] products;
        #
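        # In other words (a commentary-only sketch of the identity
        # being used, with n = num and 64-bit digits a[0..n-1]):
        #
        #       (sum a[i]*2^(64*i))^2 = sum     a[i]^2    * 2^(128*i)
        #                             + 2 * sum a[i]*a[j] * 2^(64*(i+j))
        #                                  (i<j)
        #
        # step a) computes the off-diagonal half into t[], and step b)
        # doubles it with a left shift while folding in the squares
        # a[i]*a[i].
        #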
        lea     32(%r10),$i             # $i=-($num-32)
        lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]

        mov     $num,$j                 # $j=$num

                                        # comments apply to $num==8 case
        mov     -32($aptr,$i),$a0       # a[0]
        lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
        mov     -24($aptr,$i),%rax      # a[1]
        lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
        mov     -16($aptr,$i),$ai       # a[2]
        mov     %rax,$a1

        mul     $a0                     # a[1]*a[0]
        mov     %rax,$A0[0]             # a[1]*a[0]
         mov    $ai,%rax                # a[2]
        mov     %rdx,$A0[1]
        mov     $A0[0],-24($tptr,$i)    # t[1]

        xor     $A0[0],$A0[0]
        mul     $a0                     # a[2]*a[0]
        add     %rax,$A0[1]
         mov    $ai,%rax
        adc     %rdx,$A0[0]
        mov     $A0[1],-16($tptr,$i)    # t[2]

        lea     -16($i),$j              # j=-16


         mov    8($aptr,$j),$ai         # a[3]
        mul     $a1                     # a[2]*a[1]
        mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
         mov    $ai,%rax
        mov     %rdx,$A1[1]

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
         lea    16($j),$j
        adc     \$0,$A0[1]
        mul     $a0                     # a[3]*a[0]
        add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],-8($tptr,$j)     # t[3]
        jmp     .Lsqr4x_1st

.align  16
.Lsqr4x_1st:
         mov    ($aptr,$j),$ai          # a[4]
        xor     $A1[0],$A1[0]
        mul     $a1                     # a[3]*a[1]
        add     %rax,$A1[1]             # a[3]*a[1]+t[4]
         mov    $ai,%rax
        adc     %rdx,$A1[0]

        xor     $A0[0],$A0[0]
        add     $A1[1],$A0[1]
        adc     \$0,$A0[0]
        mul     $a0                     # a[4]*a[0]
        add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
         mov    $ai,%rax                # a[3]
        adc     %rdx,$A0[0]
        mov     $A0[1],($tptr,$j)       # t[4]


         mov    8($aptr,$j),$ai         # a[5]
        xor     $A1[1],$A1[1]
        mul     $a1                     # a[4]*a[3]
        add     %rax,$A1[0]             # a[4]*a[3]+t[5]
         mov    $ai,%rax
        adc     %rdx,$A1[1]

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
        adc     \$0,$A0[1]
        mul     $a0                     # a[5]*a[2]
        add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],8($tptr,$j)      # t[5]

         mov    16($aptr,$j),$ai        # a[6]
        xor     $A1[0],$A1[0]
        mul     $a1                     # a[5]*a[3]
        add     %rax,$A1[1]             # a[5]*a[3]+t[6]
         mov    $ai,%rax
        adc     %rdx,$A1[0]

        xor     $A0[0],$A0[0]
        add     $A1[1],$A0[1]
        adc     \$0,$A0[0]
        mul     $a0                     # a[6]*a[2]
        add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
         mov    $ai,%rax                # a[3]
        adc     %rdx,$A0[0]
        mov     $A0[1],16($tptr,$j)     # t[6]


         mov    24($aptr,$j),$ai        # a[7]
        xor     $A1[1],$A1[1]
        mul     $a1                     # a[6]*a[5]
        add     %rax,$A1[0]             # a[6]*a[5]+t[7]
         mov    $ai,%rax
        adc     %rdx,$A1[1]

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
         lea    32($j),$j
        adc     \$0,$A0[1]
        mul     $a0                     # a[7]*a[4]
        add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],-8($tptr,$j)     # t[7]

        cmp     \$0,$j
        jne     .Lsqr4x_1st

        xor     $A1[0],$A1[0]
        add     $A0[1],$A1[1]
        adc     \$0,$A1[0]
        mul     $a1                     # a[7]*a[5]
        add     %rax,$A1[1]
        adc     %rdx,$A1[0]

        mov     $A1[1],($tptr)          # t[8]
        lea     16($i),$i
        mov     $A1[0],8($tptr)         # t[9]
        jmp     .Lsqr4x_outer

.align  16
.Lsqr4x_outer:                          # comments apply to $num==6 case
        mov     -32($aptr,$i),$a0       # a[0]
        lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
        mov     -24($aptr,$i),%rax      # a[1]
        lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
        mov     -16($aptr,$i),$ai       # a[2]
        mov     %rax,$a1

        mov     -24($tptr,$i),$A0[0]    # t[1]
        xor     $A0[1],$A0[1]
        mul     $a0                     # a[1]*a[0]
        add     %rax,$A0[0]             # a[1]*a[0]+t[1]
         mov    $ai,%rax                # a[2]
        adc     %rdx,$A0[1]
        mov     $A0[0],-24($tptr,$i)    # t[1]

        xor     $A0[0],$A0[0]
        add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
        adc     \$0,$A0[0]
        mul     $a0                     # a[2]*a[0]
        add     %rax,$A0[1]
         mov    $ai,%rax
        adc     %rdx,$A0[0]
        mov     $A0[1],-16($tptr,$i)    # t[2]

        lea     -16($i),$j              # j=-16
        xor     $A1[0],$A1[0]


         mov    8($aptr,$j),$ai         # a[3]
        xor     $A1[1],$A1[1]
        add     8($tptr,$j),$A1[0]
        adc     \$0,$A1[1]
        mul     $a1                     # a[2]*a[1]
        add     %rax,$A1[0]             # a[2]*a[1]+t[3]
         mov    $ai,%rax
        adc     %rdx,$A1[1]

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
        adc     \$0,$A0[1]
        mul     $a0                     # a[3]*a[0]
        add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],8($tptr,$j)      # t[3]

        lea     16($j),$j
        jmp     .Lsqr4x_inner

.align  16
.Lsqr4x_inner:
         mov    ($aptr,$j),$ai          # a[4]
        xor     $A1[0],$A1[0]
        add     ($tptr,$j),$A1[1]
        adc     \$0,$A1[0]
        mul     $a1                     # a[3]*a[1]
        add     %rax,$A1[1]             # a[3]*a[1]+t[4]
         mov    $ai,%rax
        adc     %rdx,$A1[0]

        xor     $A0[0],$A0[0]
        add     $A1[1],$A0[1]
        adc     \$0,$A0[0]
        mul     $a0                     # a[4]*a[0]
        add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
         mov    $ai,%rax                # a[3]
        adc     %rdx,$A0[0]
        mov     $A0[1],($tptr,$j)       # t[4]

         mov    8($aptr,$j),$ai         # a[5]
        xor     $A1[1],$A1[1]
        add     8($tptr,$j),$A1[0]
        adc     \$0,$A1[1]
        mul     $a1                     # a[4]*a[3]
        add     %rax,$A1[0]             # a[4]*a[3]+t[5]
         mov    $ai,%rax
        adc     %rdx,$A1[1]

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
        lea     16($j),$j               # j++
        adc     \$0,$A0[1]
        mul     $a0                     # a[5]*a[2]
        add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below

        cmp     \$0,$j
        jne     .Lsqr4x_inner

        xor     $A1[0],$A1[0]
        add     $A0[1],$A1[1]
        adc     \$0,$A1[0]
        mul     $a1                     # a[5]*a[3]
        add     %rax,$A1[1]
        adc     %rdx,$A1[0]

        mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
        mov     $A1[0],8($tptr)         # t[7], "preloaded t[3]" below

        add     \$16,$i
        jnz     .Lsqr4x_outer

                                        # comments apply to $num==4 case
        mov     -32($aptr),$a0          # a[0]
        lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
        mov     -24($aptr),%rax         # a[1]
        lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
        mov     -16($aptr),$ai          # a[2]
        mov     %rax,$a1

        xor     $A0[1],$A0[1]
        mul     $a0                     # a[1]*a[0]
        add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
         mov    $ai,%rax                # a[2]
        adc     %rdx,$A0[1]
        mov     $A0[0],-24($tptr)       # t[1]

        xor     $A0[0],$A0[0]
        add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
        adc     \$0,$A0[0]
        mul     $a0                     # a[2]*a[0]
        add     %rax,$A0[1]
         mov    $ai,%rax
        adc     %rdx,$A0[0]
        mov     $A0[1],-16($tptr)       # t[2]

         mov    -8($aptr),$ai           # a[3]
        mul     $a1                     # a[2]*a[1]
        add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
         mov    $ai,%rax
        adc     \$0,%rdx

        xor     $A0[1],$A0[1]
        add     $A1[0],$A0[0]
         mov    %rdx,$A1[1]
        adc     \$0,$A0[1]
        mul     $a0                     # a[3]*a[0]
        add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
         mov    $ai,%rax
        adc     %rdx,$A0[1]
        mov     $A0[0],-8($tptr)        # t[3]

        xor     $A1[0],$A1[0]
        add     $A0[1],$A1[1]
        adc     \$0,$A1[0]
        mul     $a1                     # a[3]*a[1]
        add     %rax,$A1[1]
         mov    -16($aptr),%rax         # a[2]
        adc     %rdx,$A1[0]

        mov     $A1[1],($tptr)          # t[4]
        mov     $A1[0],8($tptr)         # t[5]

        mul     $ai                     # a[2]*a[3]
___
{
my ($shift,$carry)=($a0,$a1);
my @S=(@A1,$ai,$n0);
$code.=<<___;
         add    \$16,$i
         xor    $shift,$shift
         sub    $num,$i                 # $i=16-$num
         xor    $carry,$carry

        add     $A1[0],%rax             # t[5]
        adc     \$0,%rdx
        mov     %rax,8($tptr)           # t[5]
        mov     %rdx,16($tptr)          # t[6]
        mov     $carry,24($tptr)        # t[7]

         mov    -16($aptr,$i),%rax      # a[0]
        lea     64(%rsp,$num,2),$tptr
         xor    $A0[0],$A0[0]           # t[0]
         mov    -24($tptr,$i,2),$A0[1]  # t[1]

        lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[1]            # | t[2*i]>>63
         mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
        adc     %rax,$S[0]
         mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
        mov     $S[0],-32($tptr,$i,2)
        adc     %rdx,$S[1]

        lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
         mov    $S[1],-24($tptr,$i,2)
         sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[3]            # | t[2*i]>>63
         mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
        adc     %rax,$S[2]
         mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
        mov     $S[2],-16($tptr,$i,2)
        adc     %rdx,$S[3]
        lea     16($i),$i
        mov     $S[3],-40($tptr,$i,2)
        sbb     $carry,$carry           # mov cf,$carry
        jmp     .Lsqr4x_shift_n_add

.align  16
.Lsqr4x_shift_n_add:
        lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[1]            # | t[2*i]>>63
         mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
        adc     %rax,$S[0]
         mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
        mov     $S[0],-32($tptr,$i,2)
        adc     %rdx,$S[1]

        lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
         mov    $S[1],-24($tptr,$i,2)
         sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[3]            # | t[2*i]>>63
         mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
        adc     %rax,$S[2]
         mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
        mov     $S[2],-16($tptr,$i,2)
        adc     %rdx,$S[3]

        lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
         mov    $S[3],-8($tptr,$i,2)
         sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[1]            # | t[2*i]>>63
         mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
        adc     %rax,$S[0]
         mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
        mov     $S[0],0($tptr,$i,2)
        adc     %rdx,$S[1]

        lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
         mov    $S[1],8($tptr,$i,2)
         sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[3]            # | t[2*i]>>63
         mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
        adc     %rax,$S[2]
         mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
        mov     $S[2],16($tptr,$i,2)
        adc     %rdx,$S[3]
        mov     $S[3],24($tptr,$i,2)
        sbb     $carry,$carry           # mov cf,$carry
        add     \$32,$i
        jnz     .Lsqr4x_shift_n_add

        lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[1]            # | t[2*i]>>63
         mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
         mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
        adc     %rax,$S[0]
         mov    -8($aptr),%rax          # a[i+1]        # prefetch
        mov     $S[0],-32($tptr)
        adc     %rdx,$S[1]

        lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
         mov    $S[1],-24($tptr)
         sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
        lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
        or      $A0[0],$S[3]            # | t[2*i]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
        adc     %rax,$S[2]
        adc     %rdx,$S[3]
        mov     $S[2],-16($tptr)
        mov     $S[3],-8($tptr)
___
}
##############################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
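# A commentary-only C sketch of this reduction, operating on the
# 2*num-word square t[] produced above. It processes one word per
# round, whereas the code below is 4x unrolled and modulo-scheduled;
# names are illustrative, BN_ULONG is assumed 64-bit, and t[2*num]
# starts out as zero, playing the role of $topbit:
#
#	for (int i = 0; i < num; i++) {
#		BN_ULONG m = t[i]*n0;		/* mod 2^64 */
#		unsigned __int128 c = 0;
#		for (int j = 0; j < num; j++) {	/* t += m*np << 64*i */
#			c += (unsigned __int128)np[j]*m + t[i+j];
#			t[i+j] = (BN_ULONG)c;	/* t[i] becomes zero */
#			c >>= 64;
#		}
#		for (int j = i+num; c && j <= 2*num; j++) {
#			c += t[j];		/* ripple the carry up */
#			t[j] = (BN_ULONG)c;
#			c >>= 64;
#		}
#	}
#	/* result is t[num..2*num], minus np if that does not borrow */
#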
1231 {
1232 my ($topbit,$nptr)=("%rbp",$aptr);
1233 my ($m0,$m1)=($a0,$a1);
1234 my @Ni=("%rbx","%r9");
1235 $code.=<<___;
1236         mov     40(%rsp),$nptr          # restore $nptr
1237         mov     48(%rsp),$n0            # restore *n0
1238         xor     $j,$j
1239         mov     $num,0(%rsp)            # save $num
1240         sub     $num,$j                 # $j=-$num
1241          mov    64(%rsp),$A0[0]         # t[0]          # modsched #
1242          mov    $n0,$m0                 #               # modsched #
1243         lea     64(%rsp,$num,2),%rax    # end of t[] buffer
1244         lea     64(%rsp,$num),$tptr     # end of t[] window
1245         mov     %rax,8(%rsp)            # save end of t[] buffer
1246         lea     ($nptr,$num),$nptr      # end of n[] buffer
1247         xor     $topbit,$topbit         # $topbit=0
1248
1249         mov     0($nptr,$j),%rax        # n[0]          # modsched #
1250         mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1251          imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
1252          mov    %rax,$Ni[0]             #               # modsched #
1253         jmp     .Lsqr4x_mont_outer
1254
1255 .align  16
1256 .Lsqr4x_mont_outer:
1257         xor     $A0[1],$A0[1]
1258         mul     $m0                     # n[0]*m0
1259         add     %rax,$A0[0]             # n[0]*m0+t[0]
1260          mov    $Ni[1],%rax
1261         adc     %rdx,$A0[1]
1262         mov     $n0,$m1
1263
1264         xor     $A0[0],$A0[0]
1265         add     8($tptr,$j),$A0[1]
1266         adc     \$0,$A0[0]
1267         mul     $m0                     # n[1]*m0
1268         add     %rax,$A0[1]             # n[1]*m0+t[1]
1269          mov    $Ni[0],%rax
1270         adc     %rdx,$A0[0]
1271
1272         imulq   $A0[1],$m1
1273
1274         mov     16($nptr,$j),$Ni[0]     # n[2]
1275         xor     $A1[1],$A1[1]
1276         add     $A0[1],$A1[0]
1277         adc     \$0,$A1[1]
1278         mul     $m1                     # n[0]*m1
1279         add     %rax,$A1[0]             # n[0]*m1+"t[1]"
1280          mov    $Ni[0],%rax
1281         adc     %rdx,$A1[1]
1282         mov     $A1[0],8($tptr,$j)      # "t[1]"
1283
1284         xor     $A0[1],$A0[1]
1285         add     16($tptr,$j),$A0[0]
1286         adc     \$0,$A0[1]
1287         mul     $m0                     # n[2]*m0
1288         add     %rax,$A0[0]             # n[2]*m0+t[2]
1289          mov    $Ni[1],%rax
1290         adc     %rdx,$A0[1]
1291
1292         mov     24($nptr,$j),$Ni[1]     # n[3]
1293         xor     $A1[0],$A1[0]
1294         add     $A0[0],$A1[1]
1295         adc     \$0,$A1[0]
1296         mul     $m1                     # n[1]*m1
1297         add     %rax,$A1[1]             # n[1]*m1+"t[2]"
1298          mov    $Ni[1],%rax
1299         adc     %rdx,$A1[0]
1300         mov     $A1[1],16($tptr,$j)     # "t[2]"
1301
1302         xor     $A0[0],$A0[0]
1303         add     24($tptr,$j),$A0[1]
1304         lea     32($j),$j
1305         adc     \$0,$A0[0]
1306         mul     $m0                     # n[3]*m0
1307         add     %rax,$A0[1]             # n[3]*m0+t[3]
1308          mov    $Ni[0],%rax
1309         adc     %rdx,$A0[0]
1310         jmp     .Lsqr4x_mont_inner
1311
1312 .align  16
1313 .Lsqr4x_mont_inner:
1314         mov     ($nptr,$j),$Ni[0]       # n[4]
1315         xor     $A1[1],$A1[1]
1316         add     $A0[1],$A1[0]
1317         adc     \$0,$A1[1]
1318         mul     $m1                     # n[2]*m1
1319         add     %rax,$A1[0]             # n[2]*m1+"t[3]"
1320          mov    $Ni[0],%rax
1321         adc     %rdx,$A1[1]
1322         mov     $A1[0],-8($tptr,$j)     # "t[3]"
1323
1324         xor     $A0[1],$A0[1]
1325         add     ($tptr,$j),$A0[0]
1326         adc     \$0,$A0[1]
1327         mul     $m0                     # n[4]*m0
1328         add     %rax,$A0[0]             # n[4]*m0+t[4]
1329          mov    $Ni[1],%rax
1330         adc     %rdx,$A0[1]
1331
1332         mov     8($nptr,$j),$Ni[1]      # n[5]
1333         xor     $A1[0],$A1[0]
1334         add     $A0[0],$A1[1]
1335         adc     \$0,$A1[0]
1336         mul     $m1                     # n[3]*m1
1337         add     %rax,$A1[1]             # n[3]*m1+"t[4]"
1338          mov    $Ni[1],%rax
1339         adc     %rdx,$A1[0]
1340         mov     $A1[1],($tptr,$j)       # "t[4]"
1341
1342         xor     $A0[0],$A0[0]
1343         add     8($tptr,$j),$A0[1]
1344         adc     \$0,$A0[0]
1345         mul     $m0                     # n[5]*m0
1346         add     %rax,$A0[1]             # n[5]*m0+t[5]
1347          mov    $Ni[0],%rax
1348         adc     %rdx,$A0[0]
1349
1350
1351         mov     16($nptr,$j),$Ni[0]     # n[6]
1352         xor     $A1[1],$A1[1]
1353         add     $A0[1],$A1[0]
1354         adc     \$0,$A1[1]
1355         mul     $m1                     # n[4]*m1
1356         add     %rax,$A1[0]             # n[4]*m1+"t[5]"
1357          mov    $Ni[0],%rax
1358         adc     %rdx,$A1[1]
1359         mov     $A1[0],8($tptr,$j)      # "t[5]"
1360
1361         xor     $A0[1],$A0[1]
1362         add     16($tptr,$j),$A0[0]
1363         adc     \$0,$A0[1]
1364         mul     $m0                     # n[6]*m0
1365         add     %rax,$A0[0]             # n[6]*m0+t[6]
1366          mov    $Ni[1],%rax
1367         adc     %rdx,$A0[1]
1368
1369         mov     24($nptr,$j),$Ni[1]     # n[7]
1370         xor     $A1[0],$A1[0]
1371         add     $A0[0],$A1[1]
1372         adc     \$0,$A1[0]
1373         mul     $m1                     # n[5]*m1
1374         add     %rax,$A1[1]             # n[5]*m1+"t[6]"
1375          mov    $Ni[1],%rax
1376         adc     %rdx,$A1[0]
1377         mov     $A1[1],16($tptr,$j)     # "t[6]"
1378
1379         xor     $A0[0],$A0[0]
1380         add     24($tptr,$j),$A0[1]
1381         lea     32($j),$j
1382         adc     \$0,$A0[0]
1383         mul     $m0                     # n[7]*m0
1384         add     %rax,$A0[1]             # n[7]*m0+t[7]
1385          mov    $Ni[0],%rax
1386         adc     %rdx,$A0[0]
1387         cmp     \$0,$j
1388         jne     .Lsqr4x_mont_inner
1389
1390          sub    0(%rsp),$j              # $j=-$num      # modsched #
1391          mov    $n0,$m0                 #               # modsched #
1392
1393         xor     $A1[1],$A1[1]
1394         add     $A0[1],$A1[0]
1395         adc     \$0,$A1[1]
1396         mul     $m1                     # n[6]*m1
1397         add     %rax,$A1[0]             # n[6]*m1+"t[7]"
1398         mov     $Ni[1],%rax
1399         adc     %rdx,$A1[1]
1400         mov     $A1[0],-8($tptr)        # "t[7]"
1401
1402         xor     $A0[1],$A0[1]
1403         add     ($tptr),$A0[0]          # +t[8]
1404         adc     \$0,$A0[1]
1405          mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
1406         add     $topbit,$A0[0]
1407         adc     \$0,$A0[1]
1408
1409          imulq  16($tptr,$j),$m0        # m0=t[0]*n0    # modsched #
1410         xor     $A1[0],$A1[0]
1411          mov    8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1412         add     $A0[0],$A1[1]
1413          mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
1414         adc     \$0,$A1[0]
1415         mul     $m1                     # n[7]*m1
1416         add     %rax,$A1[1]             # n[7]*m1+"t[8]"
1417          mov    $Ni[0],%rax             #               # modsched #
1418         adc     %rdx,$A1[0]
1419         mov     $A1[1],($tptr)          # "t[8]"
1420
1421         xor     $topbit,$topbit
1422         add     8($tptr),$A1[0]         # +t[9]
1423         adc     $topbit,$topbit
1424         add     $A0[1],$A1[0]
1425         lea     16($tptr),$tptr         # "t[$num]>>128"
1426         adc     \$0,$topbit
1427         mov     $A1[0],-8($tptr)        # "t[9]"
1428         cmp     8(%rsp),$tptr           # are we done?
1429         jb      .Lsqr4x_mont_outer
1430
1431         mov     0(%rsp),$num            # restore $num
1432         mov     $topbit,($tptr)         # save $topbit
1433 ___
1434 }\f
1435 ##############################################################
1436 # Post-condition, 4x unrolled copy from bn_mul_mont
1437 #
1438 {
1439 my ($tptr,$nptr)=("%rbx",$aptr);
1440 my @ri=("%rax","%rdx","%r10","%r11");
1441 $code.=<<___;
1442         mov     64(%rsp,$num),@ri[0]    # tp[0]
1443         lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
1444         mov     40(%rsp),$nptr          # restore $nptr
1445         shr     \$5,$num                # num/4
1446         mov     8($tptr),@ri[1]         # t[1]
1447         xor     $i,$i                   # i=0 and clear CF!
1448
1449         mov     32(%rsp),$rptr          # restore $rptr
1450         sub     0($nptr),@ri[0]
1451         mov     16($tptr),@ri[2]        # t[2]
1452         mov     24($tptr),@ri[3]        # t[3]
1453         sbb     8($nptr),@ri[1]
1454         lea     -1($num),$j             # j=num/4-1
1455         jmp     .Lsqr4x_sub
1456 .align  16
1457 .Lsqr4x_sub:
1458         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1459         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1460         sbb     16($nptr,$i,8),@ri[2]
1461         mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
1462         mov     40($tptr,$i,8),@ri[1]
1463         sbb     24($nptr,$i,8),@ri[3]
1464         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1465         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1466         sbb     32($nptr,$i,8),@ri[0]
1467         mov     48($tptr,$i,8),@ri[2]
1468         mov     56($tptr,$i,8),@ri[3]
1469         sbb     40($nptr,$i,8),@ri[1]
1470         lea     4($i),$i                # i++
1471         dec     $j                      # doesn't affect CF!
1472         jnz     .Lsqr4x_sub
1473
1474         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1475         mov     32($tptr,$i,8),@ri[0]   # load overflow bit
1476         sbb     16($nptr,$i,8),@ri[2]
1477         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1478         sbb     24($nptr,$i,8),@ri[3]
1479         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1480
1481         sbb     \$0,@ri[0]              # handle upmost overflow bit
1482         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1483         xor     $i,$i                   # i=0
1484         and     @ri[0],$tptr
1485         not     @ri[0]
1486         mov     $rptr,$nptr
1487         and     @ri[0],$nptr
1488         lea     -1($num),$j
1489         or      $nptr,$tptr             # tp=borrow?tp:rp
1490
        pxor    %xmm0,%xmm0
        lea     64(%rsp,$num,8),$nptr
        movdqu  ($tptr),%xmm1
        lea     ($nptr,$num,8),$nptr
        movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
        movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
        movdqu  %xmm1,($rptr)
        jmp     .Lsqr4x_copy
.align  16
.Lsqr4x_copy:                           # copy or in-place refresh
        movdqu  16($tptr,$i),%xmm2
        movdqu  32($tptr,$i),%xmm1
        movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
        movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
        movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
        movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
        movdqu  %xmm2,16($rptr,$i)
        movdqu  %xmm1,32($rptr,$i)
        lea     32($i),$i
        dec     $j
        jnz     .Lsqr4x_copy

        movdqu  16($tptr,$i),%xmm2
        movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
        movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
        movdqu  %xmm2,16($rptr,$i)
___
}
$code.=<<___;
        mov     56(%rsp),%rsi           # restore %rsp
        mov     \$1,%rax
        mov     0(%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lsqr4x_epilogue:
        ret
.size   bn_sqr4x_mont,.-bn_sqr4x_mont
___
}}}
$code.=<<___;
.asciz  "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
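#
# In C pseudo-code the handlers below amount to the following (a sketch
# under the documented Win64 CONTEXT layout, not literal OpenSSL
# source):
#
#	EXCEPTION_DISPOSITION handler(EXCEPTION_RECORD *rec, ULONG64 frame,
#	                              CONTEXT *ctx, DISPATCHER_CONTEXT *disp)
#	{
#		if (ctx->Rip is between the end of the prologue and the
#		    start of the epilogue) {
#			/* frame is live: recover the saved stack pointer
#			 * and the callee-saved registers into *ctx */
#		}
#		RtlVirtualUnwind(...);		/* let the OS keep unwinding */
#		return ExceptionContinueSearch;	/* i.e. 1 */
#	}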
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   mul_handler,\@abi-omnipotent
.align  16
mul_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<end of prologue label
        jb      .Lcommon_seh_tail

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail

        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
        lea     48(%rax),%rax
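        # R9 in the CONTEXT record (offset 192) still holds the num
        # argument, and 8(rsp+num*8) is the tp[num+1] slot where the
        # prologue stashed the caller's stack pointer; the lea then
        # points rax at the return-address slot so the six saved
        # registers can be read back at offsets -8 through -48.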

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        jmp     .Lcommon_seh_tail
.size   mul_handler,.-mul_handler

.type   sqr_handler,\@abi-omnipotent
.align  16
sqr_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        lea     .Lsqr4x_body(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<.Lsqr4x_body
        jb      .Lcommon_seh_tail

        mov     152($context),%rax      # pull context->Rsp

        lea     .Lsqr4x_epilogue(%rip),%r10
        cmp     %r10,%rbx               # context->Rip>=.Lsqr4x_epilogue
        jae     .Lcommon_seh_tail

        mov     56(%rax),%rax           # pull saved stack pointer
        lea     48(%rax),%rax
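        # bn_sqr4x_mont keeps the caller's stack pointer in the fixed
        # slot 56(rsp) (compare the "restore %rsp" load in its
        # epilogue), hence the constant offset here instead of a
        # num-based one.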

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

.Lcommon_seh_tail:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
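        # Per the Win64 calling convention the first four arguments
        # travel in rcx/rdx/r8/r9; arguments five through eight are
        # passed on the stack at 32(rsp) and up, which the four stores
        # above have just laid out.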
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   sqr_handler,.-sqr_handler

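# The .pdata section registers each routine with the Win64 unwinder:
# one RUNTIME_FUNCTION entry of three RVAs per routine, giving its
# start, its end and its UNWIND_INFO record in .xdata below.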
.section        .pdata
.align  4
        .rva    .LSEH_begin_bn_mul_mont
        .rva    .LSEH_end_bn_mul_mont
        .rva    .LSEH_info_bn_mul_mont

        .rva    .LSEH_begin_bn_mul4x_mont
        .rva    .LSEH_end_bn_mul4x_mont
        .rva    .LSEH_info_bn_mul4x_mont

        .rva    .LSEH_begin_bn_sqr4x_mont
        .rva    .LSEH_end_bn_sqr4x_mont
        .rva    .LSEH_info_bn_sqr4x_mont

.section        .xdata
.align  8
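# Each UNWIND_INFO record below opens with .byte 9,0,0,0: unwind
# version 1 with UNW_FLAG_EHANDLER set (1|1<<3) and no prologue size or
# unwind codes, so all work is delegated to the language-specific
# handler whose RVA follows. mul_handler additionally consumes two
# HandlerData RVAs bounding the region with a live frame; sqr_handler
# hard-codes its labels via lea, so no HandlerData is emitted for it.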
.LSEH_info_bn_mul_mont:
        .byte   9,0,0,0
        .rva    mul_handler
        .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
.LSEH_info_bn_mul4x_mont:
        .byte   9,0,0,0
        .rva    mul_handler
        .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
.LSEH_info_bn_sqr4x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
___
}

print $code;
close STDOUT;