Update LibreSSL from version 2.4.3 => 2.4.4
[dragonfly.git] / crypto / libressl / crypto / bn / mont5-macosx-x86_64.s
1 .text   
2
3 .globl  _bn_mul_mont_gather5
4
5 .p2align        6
# _bn_mul_mont_gather5 -- entry point; only selects an implementation.
#   %r9d = num (word count).  All other argument registers and the stack
#   are passed through untouched to the selected path.
#   num >= 8 and num % 4 == 0  -> L$mul4x_enter (4-way unrolled path)
#   anything else              -> L$mul_enter   (one word per iteration)
6 _bn_mul_mont_gather5:
7         cmpl    $8,%r9d                 # fewer than 8 words (unsigned compare)?
8         jb      L$mul_enter             #   yes: generic path
9         testl   $3,%r9d                 # low two bits clear <=> num % 4 == 0
10         jz      L$mul4x_enter           #   yes: unrolled path
11         jmp     L$mul_enter             # otherwise: generic path
12
# L$mul_enter -- word-at-a-time Montgomery multiplication path.
# Reached from _bn_mul_mont_gather5 when num is not a multiple of 4 or is
# below 8.
#
# Registers on entry (System V AMD64 argument order):
#   %rdi     result words, written by L$sub / L$copy
#   %rsi     a[]: multiplicand words, read as (%rsi,j,8)
#   %rdx     table from which one multiplier word per pass is gathered
#   %rcx     n[]: words multiplied by m in every pass and subtracted at
#            the end (the modulus, in Montgomery terms)
#   %r8      pointer to the word that is multiplied with the low result
#            word to obtain m (presumably OpenSSL's n0 -- confirm in the
#            C prototype)
#   %r9d     num, the word count
#   8(%rsp)  first stack-passed argument: index of the table column to use
# Returns 1 in %rax.  Callee-saved registers are pushed here and reloaded
# in the epilogue.
#
# Scratch area tp[] lives on the stack, num+2 qwords, 1024-byte aligned:
#   tp[0..num-1] running result, tp[num] top carry, tp[num+1] caller's %rsp.
13 .p2align        4
14 L$mul_enter:
15         movl    %r9d,%r9d  # zero-extend num to 64 bits
16         movl    8(%rsp),%r10d  # table index, fetched before the pushes move %rsp
17         pushq   %rbx
18         pushq   %rbp
19         pushq   %r12
20         pushq   %r13
21         pushq   %r14
22         pushq   %r15
23         movq    %rsp,%rax  # remember %rsp (points at the six saved registers)
24         leaq    2(%r9),%r11
25         negq    %r11
26         leaq    (%rsp,%r11,8),%rsp  # reserve num+2 qwords
27         andq    $-1024,%rsp  # round the scratch base down to a 1 KiB boundary
28
29         movq    %rax,8(%rsp,%r9,8)  # tp[num+1] = saved %rsp
30 L$mul_body:
# Split the index: low 3 bits pick the qword inside a 64-byte line, the next
# 2 bits pick which of four masks (%xmm4..%xmm7) is all-ones.
31         movq    %rdx,%r12
32         movq    %r10,%r11
33         shrq    $3,%r10
34         andq    $7,%r11
35         notq    %r10
36         leaq    L$magic_masks(%rip),%rax
37         andq    $3,%r10  # r10 = 3 - ((index >> 3) & 3)
38         leaq    96(%r12,%r11,8),%r12  # r12 = table + 8*(index & 7) + 96 (bias for the +-96 displacements)
39         movq    0(%rax,%r10,8),%xmm4
40         movq    8(%rax,%r10,8),%xmm5
41         movq    16(%rax,%r10,8),%xmm6
42         movq    24(%rax,%r10,8),%xmm7
43
# Gather multiplier word 0: all four candidates (64 bytes apart) are loaded,
# masked and OR-ed together, so the selected one survives.
44         movq    -96(%r12),%xmm0
45         movq    -32(%r12),%xmm1
46         pand    %xmm4,%xmm0
47         movq    32(%r12),%xmm2
48         pand    %xmm5,%xmm1
49         movq    96(%r12),%xmm3
50         pand    %xmm6,%xmm2
51         por     %xmm1,%xmm0
52         pand    %xmm7,%xmm3
53         por     %xmm2,%xmm0
54         leaq    256(%r12),%r12  # next table row is 256 bytes further
55         por     %xmm3,%xmm0
56
57         movd    %xmm0,%rbx  # rbx = b[0], the gathered multiplier word
58
59         movq    (%r8),%r8  # replace the pointer by the word it points to
60         movq    (%rsi),%rax  # rax = a[0]
61
62         xorq    %r14,%r14  # i = 0 (outer counter)
63         xorq    %r15,%r15  # j = 0 (inner counter)
64
# Start gathering b[1] into %xmm0; the SSE work is interleaved with the
# integer multiplies below.
65         movq    -96(%r12),%xmm0
66         movq    -32(%r12),%xmm1
67         pand    %xmm4,%xmm0
68         movq    32(%r12),%xmm2
69         pand    %xmm5,%xmm1
70
71         movq    %r8,%rbp
72         mulq    %rbx  # rdx:rax = a[0] * b[0]
73         movq    %rax,%r10
74         movq    (%rcx),%rax  # rax = n[0]
75
76         movq    96(%r12),%xmm3
77         pand    %xmm6,%xmm2
78         por     %xmm1,%xmm0
79         pand    %xmm7,%xmm3
80
81         imulq   %r10,%rbp  # m = low word of the product * word loaded from (%r8)
82         movq    %rdx,%r11
83
84         por     %xmm2,%xmm0
85         leaq    256(%r12),%r12
86         por     %xmm3,%xmm0
87
88         mulq    %rbp  # rdx:rax = n[0] * m
89         addq    %rax,%r10  # low word of the sum is never stored; only its carry is used
90         movq    8(%rsi),%rax
91         adcq    $0,%rdx
92         movq    %rdx,%r13
93
94         leaq    1(%r15),%r15  # j = 1
95         jmp     L$1st_enter
96
# First pass (i = 0): tp[] = (a[] * b[0] + n[] * m) shifted down one word.
# r11/r10 carry the a*b chain, r13 carries the n*m chain.
97 .p2align        4
98 L$1st:
99         addq    %rax,%r13
100         movq    (%rsi,%r15,8),%rax
101         adcq    $0,%rdx
102         addq    %r11,%r13
103         movq    %r10,%r11
104         adcq    $0,%rdx
105         movq    %r13,-16(%rsp,%r15,8)  # tp[j-2]
106         movq    %rdx,%r13
107
108 L$1st_enter:
109         mulq    %rbx  # a[j] * b[0]
110         addq    %rax,%r11
111         movq    (%rcx,%r15,8),%rax
112         adcq    $0,%rdx
113         leaq    1(%r15),%r15  # j++ (lea leaves the flags alone)
114         movq    %rdx,%r10
115
116         mulq    %rbp  # n[j-1] * m
117         cmpq    %r9,%r15
118         jl      L$1st
119
120         movd    %xmm0,%rbx  # rbx = b[1], gathered during the pass
121
122         addq    %rax,%r13
123         movq    (%rsi),%rax  # rax = a[0] for the next pass
124         adcq    $0,%rdx
125         addq    %r11,%r13
126         adcq    $0,%rdx
127         movq    %r13,-16(%rsp,%r15,8)  # tp[num-2] (j == num here)
128         movq    %rdx,%r13
129         movq    %r10,%r11
130
131         xorq    %rdx,%rdx
132         addq    %r11,%r13
133         adcq    $0,%rdx
134         movq    %r13,-8(%rsp,%r9,8)  # tp[num-1]
135         movq    %rdx,(%rsp,%r9,8)  # tp[num] = top carry
136
137         leaq    1(%r14),%r14  # i = 1
138         jmp     L$outer
# Passes i = 1 .. num-1: same as the first pass, but the previous tp[] is
# added in as well.
139 .p2align        4
140 L$outer:
141         xorq    %r15,%r15  # j = 0
142         movq    %r8,%rbp
143         movq    (%rsp),%r10  # r10 = tp[0]
144
# Begin gathering the multiplier word for the following pass.
145         movq    -96(%r12),%xmm0
146         movq    -32(%r12),%xmm1
147         pand    %xmm4,%xmm0
148         movq    32(%r12),%xmm2
149         pand    %xmm5,%xmm1
150
151         mulq    %rbx  # a[0] * b[i]
152         addq    %rax,%r10
153         movq    (%rcx),%rax
154         adcq    $0,%rdx
155
156         movq    96(%r12),%xmm3
157         pand    %xmm6,%xmm2
158         por     %xmm1,%xmm0
159         pand    %xmm7,%xmm3
160
161         imulq   %r10,%rbp  # m for this pass
162         movq    %rdx,%r11
163
164         por     %xmm2,%xmm0
165         leaq    256(%r12),%r12
166         por     %xmm3,%xmm0
167
168         mulq    %rbp  # n[0] * m
169         addq    %rax,%r10  # low word discarded, carry kept
170         movq    8(%rsi),%rax
171         adcq    $0,%rdx
172         movq    8(%rsp),%r10  # r10 = tp[1]
173         movq    %rdx,%r13
174
175         leaq    1(%r15),%r15  # j = 1
176         jmp     L$inner_enter
177
178 .p2align        4
179 L$inner:
180         addq    %rax,%r13
181         movq    (%rsi,%r15,8),%rax
182         adcq    $0,%rdx
183         addq    %r10,%r13
184         movq    (%rsp,%r15,8),%r10  # next tp[] word
185         adcq    $0,%rdx
186         movq    %r13,-16(%rsp,%r15,8)  # tp[j-2]
187         movq    %rdx,%r13
188
189 L$inner_enter:
190         mulq    %rbx  # a[j] * b[i]
191         addq    %rax,%r11
192         movq    (%rcx,%r15,8),%rax
193         adcq    $0,%rdx
194         addq    %r11,%r10  # fold the a*b chain into the tp[] word
195         movq    %rdx,%r11
196         adcq    $0,%r11
197         leaq    1(%r15),%r15  # j++
198
199         mulq    %rbp  # n[j-1] * m
200         cmpq    %r9,%r15
201         jl      L$inner
202
203         movd    %xmm0,%rbx  # multiplier word for the next pass
204
205         addq    %rax,%r13
206         movq    (%rsi),%rax  # rax = a[0] for the next pass
207         adcq    $0,%rdx
208         addq    %r10,%r13
209         movq    (%rsp,%r15,8),%r10  # r10 = tp[num], previous top carry
210         adcq    $0,%rdx
211         movq    %r13,-16(%rsp,%r15,8)  # tp[num-2]
212         movq    %rdx,%r13
213
214         xorq    %rdx,%rdx
215         addq    %r11,%r13
216         adcq    $0,%rdx
217         addq    %r10,%r13
218         adcq    $0,%rdx
219         movq    %r13,-8(%rsp,%r9,8)  # tp[num-1]
220         movq    %rdx,(%rsp,%r9,8)  # tp[num] = new top carry
221
222         leaq    1(%r14),%r14  # i++
223         cmpq    %r9,%r14
224         jl      L$outer
225
# Final step: rp[] = tp[] - n[], then pick tp[] or rp[] without branching.
226         xorq    %r14,%r14  # i = 0; also clears CF for the first sbb
227         movq    (%rsp),%rax
228         leaq    (%rsp),%rsi  # rsi = tp
229         movq    %r9,%r15
230         jmp     L$sub
231 .p2align        4
232 L$sub:  sbbq    (%rcx,%r14,8),%rax
233         movq    %rax,(%rdi,%r14,8)  # rp[i] = tp[i] - n[i] - borrow
234         movq    8(%rsi,%r14,8),%rax
235         leaq    1(%r14),%r14  # lea and dec both leave CF intact for the next sbb
236         decq    %r15
237         jnz     L$sub
238
239         sbbq    $0,%rax  # rax = tp[num] - borrow; expected to be 0 or all-ones
240         xorq    %r14,%r14
241         andq    %rax,%rsi  # tp if the subtraction borrowed, else 0
242         notq    %rax
243         movq    %rdi,%rcx
244         andq    %rax,%rcx  # rp if it did not borrow, else 0
245         movq    %r9,%r15
246         orq     %rcx,%rsi  # rsi = source of the final result
247 .p2align        4
248 L$copy:
249         movq    (%rsi,%r14,8),%rax
250         movq    %r14,(%rsp,%r14,8)  # overwrite tp[i] (with the index value) so the intermediate is not left on the stack
251         movq    %rax,(%rdi,%r14,8)
252         leaq    1(%r14),%r14
253         subq    $1,%r15
254         jnz     L$copy
255
256         movq    8(%rsp,%r9,8),%rsi  # saved %rsp from tp[num+1]
257         movq    $1,%rax  # return value
258         movq    (%rsi),%r15  # reload callee-saved registers in reverse push order
259         movq    8(%rsi),%r14
260         movq    16(%rsi),%r13
261         movq    24(%rsi),%r12
262         movq    32(%rsi),%rbp
263         movq    40(%rsi),%rbx
264         leaq    48(%rsi),%rsp  # drop the scratch area and the six saved registers
265 L$mul_epilogue:
266         .byte   0xf3,0xc3  # encoded "rep ret"
267
268
# bn_mul4x_mont_gather5 / L$mul4x_enter -- Montgomery multiplication with the
# inner loops unrolled four words per iteration.
# Reached from _bn_mul_mont_gather5 only when num >= 8 and num % 4 == 0; the
# loop counters below rely on both conditions.
#
# Registers on entry are the same as for the generic path:
#   %rdi result, %rsi a[], %rdx gather table, %rcx n[], %r8 pointer to the
#   word used to derive m, %r9d num, 8(%rsp) table index.
# Returns 1 in %rax.
#
# Scratch area tp[] on the stack, num+4 qwords, 1024-byte aligned:
#   tp[0..num-1] running result, tp[num] top carry, tp[num+1] caller's %rsp,
#   tp[num+2] saved %rdi (because %rdi doubles as an accumulator here).
269 .p2align        4
270 bn_mul4x_mont_gather5:
271 L$mul4x_enter:
272         movl    %r9d,%r9d  # zero-extend num
273         movl    8(%rsp),%r10d  # table index (first stack-passed argument)
274         pushq   %rbx
275         pushq   %rbp
276         pushq   %r12
277         pushq   %r13
278         pushq   %r14
279         pushq   %r15
280         movq    %rsp,%rax
281         leaq    4(%r9),%r11
282         negq    %r11
283         leaq    (%rsp,%r11,8),%rsp  # reserve num+4 qwords
284         andq    $-1024,%rsp  # 1 KiB alignment; also satisfies the movdqa stores in the copy loop
285
286         movq    %rax,8(%rsp,%r9,8)  # tp[num+1] = saved %rsp
287 L$mul4x_body:
288         movq    %rdi,16(%rsp,%r9,8)  # tp[num+2] = result pointer
# Build the four selection masks and the biased table pointer from the index.
289         movq    %rdx,%r12
290         movq    %r10,%r11
291         shrq    $3,%r10
292         andq    $7,%r11
293         notq    %r10
294         leaq    L$magic_masks(%rip),%rax
295         andq    $3,%r10  # r10 = 3 - ((index >> 3) & 3)
296         leaq    96(%r12,%r11,8),%r12  # r12 = table + 8*(index & 7) + 96
297         movq    0(%rax,%r10,8),%xmm4
298         movq    8(%rax,%r10,8),%xmm5
299         movq    16(%rax,%r10,8),%xmm6
300         movq    24(%rax,%r10,8),%xmm7
301
# Gather multiplier word 0 (all four candidates are read, one is kept).
302         movq    -96(%r12),%xmm0
303         movq    -32(%r12),%xmm1
304         pand    %xmm4,%xmm0
305         movq    32(%r12),%xmm2
306         pand    %xmm5,%xmm1
307         movq    96(%r12),%xmm3
308         pand    %xmm6,%xmm2
309         por     %xmm1,%xmm0
310         pand    %xmm7,%xmm3
311         por     %xmm2,%xmm0
312         leaq    256(%r12),%r12  # advance to the next table row
313         por     %xmm3,%xmm0
314
315         movd    %xmm0,%rbx  # rbx = b[0]
316         movq    (%r8),%r8  # replace the pointer by the word it points to
317         movq    (%rsi),%rax  # rax = a[0]
318
319         xorq    %r14,%r14  # i = 0
320         xorq    %r15,%r15  # j = 0
321
# Start gathering b[1]; finished between the multiplies below.
322         movq    -96(%r12),%xmm0
323         movq    -32(%r12),%xmm1
324         pand    %xmm4,%xmm0
325         movq    32(%r12),%xmm2
326         pand    %xmm5,%xmm1
327
328         movq    %r8,%rbp
329         mulq    %rbx  # a[0] * b[0]
330         movq    %rax,%r10
331         movq    (%rcx),%rax
332
333         movq    96(%r12),%xmm3
334         pand    %xmm6,%xmm2
335         por     %xmm1,%xmm0
336         pand    %xmm7,%xmm3
337
338         imulq   %r10,%rbp  # m = low product word * word loaded from (%r8)
339         movq    %rdx,%r11
340
341         por     %xmm2,%xmm0
342         leaq    256(%r12),%r12
343         por     %xmm3,%xmm0
344
345         mulq    %rbp  # n[0] * m
346         addq    %rax,%r10  # low word discarded, carry kept
347         movq    8(%rsi),%rax
348         adcq    $0,%rdx
349         movq    %rdx,%rdi  # %rdi is an accumulator from here on
350
351         mulq    %rbx  # a[1] * b[0]
352         addq    %rax,%r11
353         movq    8(%rcx),%rax
354         adcq    $0,%rdx
355         movq    %rdx,%r10
356
357         mulq    %rbp  # n[1] * m
358         addq    %rax,%rdi
359         movq    16(%rsi),%rax
360         adcq    $0,%rdx
361         addq    %r11,%rdi
362         leaq    4(%r15),%r15  # j = 4
363         adcq    $0,%rdx
364         movq    %rdi,(%rsp)  # tp[0]
365         movq    %rdx,%r13
366         jmp     L$1st4x
# First pass, four words per iteration.  r10/r11 alternate as the a*b carry
# word, r13/%rdi alternate as the n*m accumulator.
367 .p2align        4
368 L$1st4x:
369         mulq    %rbx
370         addq    %rax,%r10
371         movq    -16(%rcx,%r15,8),%rax
372         adcq    $0,%rdx
373         movq    %rdx,%r11
374
375         mulq    %rbp
376         addq    %rax,%r13
377         movq    -8(%rsi,%r15,8),%rax
378         adcq    $0,%rdx
379         addq    %r10,%r13
380         adcq    $0,%rdx
381         movq    %r13,-24(%rsp,%r15,8)
382         movq    %rdx,%rdi
383
384         mulq    %rbx
385         addq    %rax,%r11
386         movq    -8(%rcx,%r15,8),%rax
387         adcq    $0,%rdx
388         movq    %rdx,%r10
389
390         mulq    %rbp
391         addq    %rax,%rdi
392         movq    (%rsi,%r15,8),%rax
393         adcq    $0,%rdx
394         addq    %r11,%rdi
395         adcq    $0,%rdx
396         movq    %rdi,-16(%rsp,%r15,8)
397         movq    %rdx,%r13
398
399         mulq    %rbx
400         addq    %rax,%r10
401         movq    (%rcx,%r15,8),%rax
402         adcq    $0,%rdx
403         movq    %rdx,%r11
404
405         mulq    %rbp
406         addq    %rax,%r13
407         movq    8(%rsi,%r15,8),%rax
408         adcq    $0,%rdx
409         addq    %r10,%r13
410         adcq    $0,%rdx
411         movq    %r13,-8(%rsp,%r15,8)
412         movq    %rdx,%rdi
413
414         mulq    %rbx
415         addq    %rax,%r11
416         movq    8(%rcx,%r15,8),%rax
417         adcq    $0,%rdx
418         leaq    4(%r15),%r15  # j += 4; displacements below are relative to the new j
419         movq    %rdx,%r10
420
421         mulq    %rbp
422         addq    %rax,%rdi
423         movq    -16(%rsi,%r15,8),%rax
424         adcq    $0,%rdx
425         addq    %r11,%rdi
426         adcq    $0,%rdx
427         movq    %rdi,-32(%rsp,%r15,8)
428         movq    %rdx,%r13
429         cmpq    %r9,%r15
430         jl      L$1st4x
431
# Tail of the first pass: the last two words.  With num a multiple of 4
# (checked by the dispatcher) %r15 equals num here.
432         mulq    %rbx
433         addq    %rax,%r10
434         movq    -16(%rcx,%r15,8),%rax
435         adcq    $0,%rdx
436         movq    %rdx,%r11
437
438         mulq    %rbp
439         addq    %rax,%r13
440         movq    -8(%rsi,%r15,8),%rax
441         adcq    $0,%rdx
442         addq    %r10,%r13
443         adcq    $0,%rdx
444         movq    %r13,-24(%rsp,%r15,8)
445         movq    %rdx,%rdi
446
447         mulq    %rbx
448         addq    %rax,%r11
449         movq    -8(%rcx,%r15,8),%rax
450         adcq    $0,%rdx
451         movq    %rdx,%r10
452
453         mulq    %rbp
454         addq    %rax,%rdi
455         movq    (%rsi),%rax  # rax = a[0] for the next pass
456         adcq    $0,%rdx
457         addq    %r11,%rdi
458         adcq    $0,%rdx
459         movq    %rdi,-16(%rsp,%r15,8)
460         movq    %rdx,%r13
461
462         movd    %xmm0,%rbx  # rbx = b[1]
463
464         xorq    %rdi,%rdi
465         addq    %r10,%r13
466         adcq    $0,%rdi
467         movq    %r13,-8(%rsp,%r15,8)  # tp[num-1]
468         movq    %rdi,(%rsp,%r15,8)  # tp[num] = top carry
469
470         leaq    1(%r14),%r14  # i = 1
# Passes i = 1 .. num-1: as above, with the previous tp[] words added in.
471 .p2align        2
472 L$outer4x:
473         xorq    %r15,%r15  # j = 0
# Begin gathering the multiplier word for the following pass.
474         movq    -96(%r12),%xmm0
475         movq    -32(%r12),%xmm1
476         pand    %xmm4,%xmm0
477         movq    32(%r12),%xmm2
478         pand    %xmm5,%xmm1
479
480         movq    (%rsp),%r10  # r10 = tp[0]
481         movq    %r8,%rbp
482         mulq    %rbx  # a[0] * b[i]
483         addq    %rax,%r10
484         movq    (%rcx),%rax
485         adcq    $0,%rdx
486
487         movq    96(%r12),%xmm3
488         pand    %xmm6,%xmm2
489         por     %xmm1,%xmm0
490         pand    %xmm7,%xmm3
491
492         imulq   %r10,%rbp  # m for this pass
493         movq    %rdx,%r11
494
495         por     %xmm2,%xmm0
496         leaq    256(%r12),%r12
497         por     %xmm3,%xmm0
498
499         mulq    %rbp  # n[0] * m
500         addq    %rax,%r10  # low word discarded, carry kept
501         movq    8(%rsi),%rax
502         adcq    $0,%rdx
503         movq    %rdx,%rdi
504
505         mulq    %rbx  # a[1] * b[i]
506         addq    %rax,%r11
507         movq    8(%rcx),%rax
508         adcq    $0,%rdx
509         addq    8(%rsp),%r11  # + tp[1]
510         adcq    $0,%rdx
511         movq    %rdx,%r10
512
513         mulq    %rbp  # n[1] * m
514         addq    %rax,%rdi
515         movq    16(%rsi),%rax
516         adcq    $0,%rdx
517         addq    %r11,%rdi
518         leaq    4(%r15),%r15  # j = 4
519         adcq    $0,%rdx
520         movq    %rdx,%r13
521         jmp     L$inner4x
# Inner loop, four words per iteration.  Each result word is stored one step
# after it is computed (note the alternating %rdi / %r13 stores).
522 .p2align        4
523 L$inner4x:
524         mulq    %rbx
525         addq    %rax,%r10
526         movq    -16(%rcx,%r15,8),%rax
527         adcq    $0,%rdx
528         addq    -16(%rsp,%r15,8),%r10
529         adcq    $0,%rdx
530         movq    %rdx,%r11
531
532         mulq    %rbp
533         addq    %rax,%r13
534         movq    -8(%rsi,%r15,8),%rax
535         adcq    $0,%rdx
536         addq    %r10,%r13
537         adcq    $0,%rdx
538         movq    %rdi,-32(%rsp,%r15,8)
539         movq    %rdx,%rdi
540
541         mulq    %rbx
542         addq    %rax,%r11
543         movq    -8(%rcx,%r15,8),%rax
544         adcq    $0,%rdx
545         addq    -8(%rsp,%r15,8),%r11
546         adcq    $0,%rdx
547         movq    %rdx,%r10
548
549         mulq    %rbp
550         addq    %rax,%rdi
551         movq    (%rsi,%r15,8),%rax
552         adcq    $0,%rdx
553         addq    %r11,%rdi
554         adcq    $0,%rdx
555         movq    %r13,-24(%rsp,%r15,8)
556         movq    %rdx,%r13
557
558         mulq    %rbx
559         addq    %rax,%r10
560         movq    (%rcx,%r15,8),%rax
561         adcq    $0,%rdx
562         addq    (%rsp,%r15,8),%r10
563         adcq    $0,%rdx
564         movq    %rdx,%r11
565
566         mulq    %rbp
567         addq    %rax,%r13
568         movq    8(%rsi,%r15,8),%rax
569         adcq    $0,%rdx
570         addq    %r10,%r13
571         adcq    $0,%rdx
572         movq    %rdi,-16(%rsp,%r15,8)
573         movq    %rdx,%rdi
574
575         mulq    %rbx
576         addq    %rax,%r11
577         movq    8(%rcx,%r15,8),%rax
578         adcq    $0,%rdx
579         addq    8(%rsp,%r15,8),%r11
580         adcq    $0,%rdx
581         leaq    4(%r15),%r15  # j += 4
582         movq    %rdx,%r10
583
584         mulq    %rbp
585         addq    %rax,%rdi
586         movq    -16(%rsi,%r15,8),%rax
587         adcq    $0,%rdx
588         addq    %r11,%rdi
589         adcq    $0,%rdx
590         movq    %r13,-40(%rsp,%r15,8)
591         movq    %rdx,%r13
592         cmpq    %r9,%r15
593         jl      L$inner4x
594
# Tail of the pass: last two words, then the top carry.
595         mulq    %rbx
596         addq    %rax,%r10
597         movq    -16(%rcx,%r15,8),%rax
598         adcq    $0,%rdx
599         addq    -16(%rsp,%r15,8),%r10
600         adcq    $0,%rdx
601         movq    %rdx,%r11
602
603         mulq    %rbp
604         addq    %rax,%r13
605         movq    -8(%rsi,%r15,8),%rax
606         adcq    $0,%rdx
607         addq    %r10,%r13
608         adcq    $0,%rdx
609         movq    %rdi,-32(%rsp,%r15,8)
610         movq    %rdx,%rdi
611
612         mulq    %rbx
613         addq    %rax,%r11
614         movq    -8(%rcx,%r15,8),%rax
615         adcq    $0,%rdx
616         addq    -8(%rsp,%r15,8),%r11
617         adcq    $0,%rdx
618         leaq    1(%r14),%r14  # i++
619         movq    %rdx,%r10
620
621         mulq    %rbp
622         addq    %rax,%rdi
623         movq    (%rsi),%rax  # rax = a[0] for the next pass
624         adcq    $0,%rdx
625         addq    %r11,%rdi
626         adcq    $0,%rdx
627         movq    %r13,-24(%rsp,%r15,8)
628         movq    %rdx,%r13
629
630         movd    %xmm0,%rbx  # multiplier word for the next pass
631         movq    %rdi,-16(%rsp,%r15,8)
632
633         xorq    %rdi,%rdi
634         addq    %r10,%r13
635         adcq    $0,%rdi
636         addq    (%rsp,%r9,8),%r13  # + previous top carry tp[num]
637         adcq    $0,%rdi
638         movq    %r13,-8(%rsp,%r15,8)  # tp[num-1]
639         movq    %rdi,(%rsp,%r15,8)  # tp[num] = new top carry
640
641         cmpq    %r9,%r14
642         jl      L$outer4x
# Final step: rp[] = tp[] - n[] four words at a time, then a branch-free
# choice between tp[] and rp[].
643         movq    16(%rsp,%r9,8),%rdi  # restore the result pointer from tp[num+2]
644         movq    0(%rsp),%rax
645         pxor    %xmm0,%xmm0  # zero, used later to wipe tp[]
646         movq    8(%rsp),%rdx
647         shrq    $2,%r9  # r9 = num / 4
648         leaq    (%rsp),%rsi  # rsi = tp
649         xorq    %r14,%r14
650
651         subq    0(%rcx),%rax  # starts the borrow chain
652         movq    16(%rsi),%rbx
653         movq    24(%rsi),%rbp
654         sbbq    8(%rcx),%rdx
655         leaq    -1(%r9),%r15  # num/4 - 1 iterations (at least 1, since num >= 8)
656         jmp     L$sub4x
657 .p2align        4
658 L$sub4x:
659         movq    %rax,0(%rdi,%r14,8)
660         movq    %rdx,8(%rdi,%r14,8)
661         sbbq    16(%rcx,%r14,8),%rbx
662         movq    32(%rsi,%r14,8),%rax
663         movq    40(%rsi,%r14,8),%rdx
664         sbbq    24(%rcx,%r14,8),%rbp
665         movq    %rbx,16(%rdi,%r14,8)
666         movq    %rbp,24(%rdi,%r14,8)
667         sbbq    32(%rcx,%r14,8),%rax
668         movq    48(%rsi,%r14,8),%rbx
669         movq    56(%rsi,%r14,8),%rbp
670         sbbq    40(%rcx,%r14,8),%rdx
671         leaq    4(%r14),%r14  # lea and dec keep CF alive across iterations
672         decq    %r15
673         jnz     L$sub4x
674
675         movq    %rax,0(%rdi,%r14,8)
676         movq    32(%rsi,%r14,8),%rax  # rax = tp[num] (r14 == num-4 here)
677         sbbq    16(%rcx,%r14,8),%rbx
678         movq    %rdx,8(%rdi,%r14,8)
679         sbbq    24(%rcx,%r14,8),%rbp
680         movq    %rbx,16(%rdi,%r14,8)
681
682         sbbq    $0,%rax  # rax = tp[num] - borrow; expected to be 0 or all-ones
683         movq    %rbp,24(%rdi,%r14,8)
684         xorq    %r14,%r14  # r14 becomes a byte offset for the copy loop
685         andq    %rax,%rsi  # tp if the subtraction borrowed, else 0
686         notq    %rax
687         movq    %rdi,%rcx
688         andq    %rax,%rcx  # rp if it did not borrow, else 0
689         leaq    -1(%r9),%r15  # num/4 - 1 iterations
690         orq     %rcx,%rsi  # rsi = source of the final result
691
# Copy the chosen source to rp[] 16 bytes at a time and overwrite tp[] with
# zeros.  16 bytes before the loop, 32 per iteration, 16 after = 8*num bytes.
692         movdqu  (%rsi),%xmm1
693         movdqa  %xmm0,(%rsp)
694         movdqu  %xmm1,(%rdi)
695         jmp     L$copy4x
696 .p2align        4
697 L$copy4x:
698         movdqu  16(%rsi,%r14,1),%xmm2
699         movdqu  32(%rsi,%r14,1),%xmm1
700         movdqa  %xmm0,16(%rsp,%r14,1)
701         movdqu  %xmm2,16(%rdi,%r14,1)
702         movdqa  %xmm0,32(%rsp,%r14,1)
703         movdqu  %xmm1,32(%rdi,%r14,1)
704         leaq    32(%r14),%r14
705         decq    %r15
706         jnz     L$copy4x
707
708         shlq    $2,%r9  # back to num, needed to locate the saved %rsp
709         movdqu  16(%rsi,%r14,1),%xmm2
710         movdqa  %xmm0,16(%rsp,%r14,1)
711         movdqu  %xmm2,16(%rdi,%r14,1)
712         movq    8(%rsp,%r9,8),%rsi  # saved %rsp from tp[num+1]
713         movq    $1,%rax  # return value
714         movq    (%rsi),%r15  # reload callee-saved registers in reverse push order
715         movq    8(%rsi),%r14
716         movq    16(%rsi),%r13
717         movq    24(%rsi),%r12
718         movq    32(%rsi),%rbp
719         movq    40(%rsi),%rbx
720         leaq    48(%rsi),%rsp  # drop the scratch area and the six saved registers
721 L$mul4x_epilogue:
722         .byte   0xf3,0xc3  # encoded "rep ret"
723
724 .globl  _bn_scatter5
725
726 .p2align        4
# _bn_scatter5 -- spread num source words across the gather table.
#   %rdi = source words, %rsi = num, %rdx = table base, %rcx = column index.
#   Word k is written to table + 8*index + 256*k.  Nothing is written when
#   num is zero.  Clobbers %rax, %rdx, %rdi, %rsi and the flags.
727 _bn_scatter5:
728         testq   %rsi,%rsi               # nothing to do for num == 0
729         jz      L$scatter_epilogue
730         leaq    (%rdx,%rcx,8),%rdx      # rdx = &table[index] in row 0
731 L$scatter:
732         movq    (%rdi),%rax             # fetch the next source word
733         movq    %rax,(%rdx)             # drop it into the current row
734         addq    $8,%rdi                 # next source word
735         addq    $256,%rdx               # next row, 256 bytes further
736         decq    %rsi
737         jnz     L$scatter
738 L$scatter_epilogue:
739         .byte   0xf3,0xc3               # encoded "rep ret"
740
741
742 .globl  _bn_gather5
743
744 .p2align        4
# _bn_gather5 -- read num words of one table column back into a flat buffer.
#   %rdi = destination words, %rsi = num, %rdx = table base, %rcx = index.
#   Word k is taken from table + 8*index + 256*k (the layout written by
#   _bn_scatter5).  For every word all four candidates that sit 64 bytes
#   apart in the row are loaded; masks derived from bits 3-4 of the index
#   keep exactly one of them, so the loads themselves do not depend on
#   those bits.
#   Returns immediately when num is zero, matching _bn_scatter5; without
#   the guard the down-counting loop would wrap and run ~2^64 times.
#   Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r11, %xmm0-%xmm7 and the flags.
745 _bn_gather5:
        testq   %rsi,%rsi               # num == 0: nothing to gather
        jz      L$gather_epilogue
746         movq    %rcx,%r11
747         shrq    $3,%rcx
748         andq    $7,%r11                 # r11 = index & 7: qword inside a 64-byte line
749         notq    %rcx
750         leaq    L$magic_masks(%rip),%rax
751         andq    $3,%rcx                 # rcx = 3 - ((index >> 3) & 3)
752         leaq    96(%rdx,%r11,8),%rdx    # biased by 96 to suit the +-96 displacements below
753         movq    0(%rax,%rcx,8),%xmm4    # exactly one of xmm4..xmm7 ends up all-ones
754         movq    8(%rax,%rcx,8),%xmm5
755         movq    16(%rax,%rcx,8),%xmm6
756         movq    24(%rax,%rcx,8),%xmm7
757         jmp     L$gather
758 .p2align        4
759 L$gather:
760         movq    -96(%rdx),%xmm0         # candidate for (index >> 3) == 0
761         movq    -32(%rdx),%xmm1         # ... == 1
762         pand    %xmm4,%xmm0
763         movq    32(%rdx),%xmm2          # ... == 2
764         pand    %xmm5,%xmm1
765         movq    96(%rdx),%xmm3          # ... == 3
766         pand    %xmm6,%xmm2
767         por     %xmm1,%xmm0
768         pand    %xmm7,%xmm3
769         por     %xmm2,%xmm0
770         leaq    256(%rdx),%rdx          # next row
771         por     %xmm3,%xmm0             # xmm0 = the selected word
772
773         movq    %xmm0,(%rdi)
774         leaq    8(%rdi),%rdi
775         subq    $1,%rsi
776         jnz     L$gather
L$gather_epilogue:
777         .byte   0xf3,0xc3               # encoded "rep ret"
778 L$SEH_end_bn_gather5:
779
# L$magic_masks -- eight qwords, only the fourth of which is all-ones.
# The gather code loads four consecutive qwords starting at qword
# 3 - ((index >> 3) & 3), so exactly one of %xmm4..%xmm7 receives the
# all-ones qword and the other three receive zero.
# The .byte line is the NUL-terminated ASCII identification string
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
# <appro@openssl.org>".
780 .p2align        6
781 L$magic_masks:
782 .long   0,0, 0,0, 0,0, -1,-1
783 .long   0,0, 0,0, 0,0,  0,0
784 .byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0