libcrypto(3): Set CC variable for Perl scripts.
author     Peter Avalos <pavalos@dragonflybsd.org>
           Wed, 3 Aug 2016 20:22:16 +0000 (13:22 -0700)
committer  Peter Avalos <pavalos@dragonflybsd.org>
           Wed, 3 Aug 2016 20:29:04 +0000 (13:29 -0700)
This lets the Perl scripts detect assembler/compiler capabilities, so they emit code the toolchain can actually assemble (note the AVX paths that appear in the regenerated files below).
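The capability check lives in the OpenSSL perlasm generators themselves: they shell out through $ENV{CC} to ask the assembler its version and scale back their output when the probe fails. A minimal sketch of that probe (paraphrased; the real scripts also recognize clang/LLVM and masm, and the exact gas version thresholds differ per script):

    #!/usr/bin/env perl
    # Sketch only: how a perlasm script decides whether to emit AVX code.
    # Without CC exported (the situation this commit fixes) the probe fails,
    # $avx stays 0, and the generated .s files contain only stub/non-AVX paths.
    my $cc  = $ENV{CC} // "";
    my $avx = 0;
    if ($cc ne "" &&
        `$cc -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1 >= 2.19) + ($1 >= 2.22);   # illustrative gas thresholds
    }
    print "avx level: $avx\n";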

Obtained-from:   FreeBSD

16 files changed:
secure/lib/libcrypto/asm/Makefile
secure/lib/libcrypto/asm/aesni-gcm-x86_64.s
secure/lib/libcrypto/asm/aesni-mb-x86_64.s
secure/lib/libcrypto/asm/aesni-sha1-x86_64.s
secure/lib/libcrypto/asm/aesni-sha256-x86_64.s
secure/lib/libcrypto/asm/ecp_nistz256-x86_64.s
secure/lib/libcrypto/asm/ghash-x86_64.s
secure/lib/libcrypto/asm/rsaz-avx2.s
secure/lib/libcrypto/asm/rsaz-x86_64.s
secure/lib/libcrypto/asm/sha1-mb-x86_64.s
secure/lib/libcrypto/asm/sha1-x86_64.s
secure/lib/libcrypto/asm/sha256-mb-x86_64.s
secure/lib/libcrypto/asm/sha256-x86_64.s
secure/lib/libcrypto/asm/sha512-x86_64.s
secure/lib/libcrypto/asm/x86_64-mont.s
secure/lib/libcrypto/asm/x86_64-mont5.s

diff --git a/secure/lib/libcrypto/asm/Makefile b/secure/lib/libcrypto/asm/Makefile
index be5cfc1..a2cf185 100644
@@ -60,11 +60,11 @@ CLEANFILES+=        ${SRCS:S/.pl$/.s/}
 .SUFFIXES:     .pl
 
 sha{256,512}-x86_64.s: ${OPENSSL_SRC}/crypto/sha/asm/sha512-x86_64.pl
-       perl ${.ALLSRC} elf ${.TARGET}
+       env CC=cc perl ${.ALLSRC} elf ${.TARGET}
        echo ".section .note.GNU-stack,\"\",%progbits" >>${.TARGET}
 
 .pl.s:
-       perl ${.IMPSRC} elf ${PERLFLAGS} > ${.TARGET}
+       env CC=cc perl ${.IMPSRC} elf > ${.TARGET}
        echo ".section .note.GNU-stack,\"\",%progbits" >>${.TARGET}
 
 .include <bsd.prog.mk>
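With the probe succeeding, the regenerated listings that follow gain real AVX implementations plus run-time dispatch on the AVX bit of OPENSSL_ia32cap_P (the testl $268435456 checks visible below). A rough illustration of how a perlasm script appends such a guard only when the probe reported AVX support (names and structure are hypothetical, not the actual script internals):

    #!/usr/bin/env perl
    # Illustration only: the AVX dispatch stub is emitted solely when $avx
    # (from the probe sketched above) is non-zero.
    my $avx  = @ARGV ? $ARGV[0] : 0;
    my $code = "";
    if ($avx) {
        $code .= "\tmovl\tOPENSSL_ia32cap_P+4(%rip),%ecx\n";
        $code .= "\ttestl\t\$268435456,%ecx\t# 1<<28 = AVX feature bit\n";
        $code .= "\tjnz\t_avx_shortcut\n";    # label is hypothetical
    }
    print $code;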
diff --git a/secure/lib/libcrypto/asm/aesni-gcm-x86_64.s b/secure/lib/libcrypto/asm/aesni-gcm-x86_64.s
index 7eaaaa0..ce1254d 100644
 .text  
 
-.globl aesni_gcm_encrypt
-.type  aesni_gcm_encrypt,@function
-aesni_gcm_encrypt:
-       xorl    %eax,%eax
-       .byte   0xf3,0xc3
-.size  aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.type  _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+       vmovdqu 32(%r11),%xmm2
+       subq    $6,%rdx
+       vpxor   %xmm4,%xmm4,%xmm4
+       vmovdqu 0-128(%rcx),%xmm15
+       vpaddb  %xmm2,%xmm1,%xmm10
+       vpaddb  %xmm2,%xmm10,%xmm11
+       vpaddb  %xmm2,%xmm11,%xmm12
+       vpaddb  %xmm2,%xmm12,%xmm13
+       vpaddb  %xmm2,%xmm13,%xmm14
+       vpxor   %xmm15,%xmm1,%xmm9
+       vmovdqu %xmm4,16+8(%rsp)
+       jmp     .Loop6x
+
+.align 32
+.Loop6x:
+       addl    $100663296,%ebx
+       jc      .Lhandle_ctr32
+       vmovdqu 0-32(%r9),%xmm3
+       vpaddb  %xmm2,%xmm14,%xmm1
+       vpxor   %xmm15,%xmm10,%xmm10
+       vpxor   %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+       vmovdqu %xmm1,(%r8)
+       vpclmulqdq      $0x10,%xmm3,%xmm7,%xmm5
+       vpxor   %xmm15,%xmm12,%xmm12
+       vmovups 16-128(%rcx),%xmm2
+       vpclmulqdq      $0x01,%xmm3,%xmm7,%xmm6
+       xorq    %r12,%r12
+       cmpq    %r14,%r15
+
+       vaesenc %xmm2,%xmm9,%xmm9
+       vmovdqu 48+8(%rsp),%xmm0
+       vpxor   %xmm15,%xmm13,%xmm13
+       vpclmulqdq      $0x00,%xmm3,%xmm7,%xmm1
+       vaesenc %xmm2,%xmm10,%xmm10
+       vpxor   %xmm15,%xmm14,%xmm14
+       setnc   %r12b
+       vpclmulqdq      $0x11,%xmm3,%xmm7,%xmm7
+       vaesenc %xmm2,%xmm11,%xmm11
+       vmovdqu 16-32(%r9),%xmm3
+       negq    %r12
+       vaesenc %xmm2,%xmm12,%xmm12
+       vpxor   %xmm5,%xmm6,%xmm6
+       vpclmulqdq      $0x00,%xmm3,%xmm0,%xmm5
+       vpxor   %xmm4,%xmm8,%xmm8
+       vaesenc %xmm2,%xmm13,%xmm13
+       vpxor   %xmm5,%xmm1,%xmm4
+       andq    $0x60,%r12
+       vmovups 32-128(%rcx),%xmm15
+       vpclmulqdq      $0x10,%xmm3,%xmm0,%xmm1
+       vaesenc %xmm2,%xmm14,%xmm14
+
+       vpclmulqdq      $0x01,%xmm3,%xmm0,%xmm2
+       leaq    (%r14,%r12,1),%r14
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   16+8(%rsp),%xmm8,%xmm8
+       vpclmulqdq      $0x11,%xmm3,%xmm0,%xmm3
+       vmovdqu 64+8(%rsp),%xmm0
+       vaesenc %xmm15,%xmm10,%xmm10
+       movbeq  88(%r14),%r13
+       vaesenc %xmm15,%xmm11,%xmm11
+       movbeq  80(%r14),%r12
+       vaesenc %xmm15,%xmm12,%xmm12
+       movq    %r13,32+8(%rsp)
+       vaesenc %xmm15,%xmm13,%xmm13
+       movq    %r12,40+8(%rsp)
+       vmovdqu 48-32(%r9),%xmm5
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vmovups 48-128(%rcx),%xmm15
+       vpxor   %xmm1,%xmm6,%xmm6
+       vpclmulqdq      $0x00,%xmm5,%xmm0,%xmm1
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   %xmm2,%xmm6,%xmm6
+       vpclmulqdq      $0x10,%xmm5,%xmm0,%xmm2
+       vaesenc %xmm15,%xmm10,%xmm10
+       vpxor   %xmm3,%xmm7,%xmm7
+       vpclmulqdq      $0x01,%xmm5,%xmm0,%xmm3
+       vaesenc %xmm15,%xmm11,%xmm11
+       vpclmulqdq      $0x11,%xmm5,%xmm0,%xmm5
+       vmovdqu 80+8(%rsp),%xmm0
+       vaesenc %xmm15,%xmm12,%xmm12
+       vaesenc %xmm15,%xmm13,%xmm13
+       vpxor   %xmm1,%xmm4,%xmm4
+       vmovdqu 64-32(%r9),%xmm1
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vmovups 64-128(%rcx),%xmm15
+       vpxor   %xmm2,%xmm6,%xmm6
+       vpclmulqdq      $0x00,%xmm1,%xmm0,%xmm2
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   %xmm3,%xmm6,%xmm6
+       vpclmulqdq      $0x10,%xmm1,%xmm0,%xmm3
+       vaesenc %xmm15,%xmm10,%xmm10
+       movbeq  72(%r14),%r13
+       vpxor   %xmm5,%xmm7,%xmm7
+       vpclmulqdq      $0x01,%xmm1,%xmm0,%xmm5
+       vaesenc %xmm15,%xmm11,%xmm11
+       movbeq  64(%r14),%r12
+       vpclmulqdq      $0x11,%xmm1,%xmm0,%xmm1
+       vmovdqu 96+8(%rsp),%xmm0
+       vaesenc %xmm15,%xmm12,%xmm12
+       movq    %r13,48+8(%rsp)
+       vaesenc %xmm15,%xmm13,%xmm13
+       movq    %r12,56+8(%rsp)
+       vpxor   %xmm2,%xmm4,%xmm4
+       vmovdqu 96-32(%r9),%xmm2
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vmovups 80-128(%rcx),%xmm15
+       vpxor   %xmm3,%xmm6,%xmm6
+       vpclmulqdq      $0x00,%xmm2,%xmm0,%xmm3
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   %xmm5,%xmm6,%xmm6
+       vpclmulqdq      $0x10,%xmm2,%xmm0,%xmm5
+       vaesenc %xmm15,%xmm10,%xmm10
+       movbeq  56(%r14),%r13
+       vpxor   %xmm1,%xmm7,%xmm7
+       vpclmulqdq      $0x01,%xmm2,%xmm0,%xmm1
+       vpxor   112+8(%rsp),%xmm8,%xmm8
+       vaesenc %xmm15,%xmm11,%xmm11
+       movbeq  48(%r14),%r12
+       vpclmulqdq      $0x11,%xmm2,%xmm0,%xmm2
+       vaesenc %xmm15,%xmm12,%xmm12
+       movq    %r13,64+8(%rsp)
+       vaesenc %xmm15,%xmm13,%xmm13
+       movq    %r12,72+8(%rsp)
+       vpxor   %xmm3,%xmm4,%xmm4
+       vmovdqu 112-32(%r9),%xmm3
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vmovups 96-128(%rcx),%xmm15
+       vpxor   %xmm5,%xmm6,%xmm6
+       vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm5
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   %xmm1,%xmm6,%xmm6
+       vpclmulqdq      $0x01,%xmm3,%xmm8,%xmm1
+       vaesenc %xmm15,%xmm10,%xmm10
+       movbeq  40(%r14),%r13
+       vpxor   %xmm2,%xmm7,%xmm7
+       vpclmulqdq      $0x00,%xmm3,%xmm8,%xmm2
+       vaesenc %xmm15,%xmm11,%xmm11
+       movbeq  32(%r14),%r12
+       vpclmulqdq      $0x11,%xmm3,%xmm8,%xmm8
+       vaesenc %xmm15,%xmm12,%xmm12
+       movq    %r13,80+8(%rsp)
+       vaesenc %xmm15,%xmm13,%xmm13
+       movq    %r12,88+8(%rsp)
+       vpxor   %xmm5,%xmm6,%xmm6
+       vaesenc %xmm15,%xmm14,%xmm14
+       vpxor   %xmm1,%xmm6,%xmm6
+
+       vmovups 112-128(%rcx),%xmm15
+       vpslldq $8,%xmm6,%xmm5
+       vpxor   %xmm2,%xmm4,%xmm4
+       vmovdqu 16(%r11),%xmm3
+
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   %xmm8,%xmm7,%xmm7
+       vaesenc %xmm15,%xmm10,%xmm10
+       vpxor   %xmm5,%xmm4,%xmm4
+       movbeq  24(%r14),%r13
+       vaesenc %xmm15,%xmm11,%xmm11
+       movbeq  16(%r14),%r12
+       vpalignr        $8,%xmm4,%xmm4,%xmm0
+       vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
+       movq    %r13,96+8(%rsp)
+       vaesenc %xmm15,%xmm12,%xmm12
+       movq    %r12,104+8(%rsp)
+       vaesenc %xmm15,%xmm13,%xmm13
+       vmovups 128-128(%rcx),%xmm1
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups 144-128(%rcx),%xmm15
+       vaesenc %xmm1,%xmm10,%xmm10
+       vpsrldq $8,%xmm6,%xmm6
+       vaesenc %xmm1,%xmm11,%xmm11
+       vpxor   %xmm6,%xmm7,%xmm7
+       vaesenc %xmm1,%xmm12,%xmm12
+       vpxor   %xmm0,%xmm4,%xmm4
+       movbeq  8(%r14),%r13
+       vaesenc %xmm1,%xmm13,%xmm13
+       movbeq  0(%r14),%r12
+       vaesenc %xmm1,%xmm14,%xmm14
+       vmovups 160-128(%rcx),%xmm1
+       cmpl    $11,%ebp
+       jb      .Lenc_tail
+
+       vaesenc %xmm15,%xmm9,%xmm9
+       vaesenc %xmm15,%xmm10,%xmm10
+       vaesenc %xmm15,%xmm11,%xmm11
+       vaesenc %xmm15,%xmm12,%xmm12
+       vaesenc %xmm15,%xmm13,%xmm13
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vaesenc %xmm1,%xmm9,%xmm9
+       vaesenc %xmm1,%xmm10,%xmm10
+       vaesenc %xmm1,%xmm11,%xmm11
+       vaesenc %xmm1,%xmm12,%xmm12
+       vaesenc %xmm1,%xmm13,%xmm13
+       vmovups 176-128(%rcx),%xmm15
+       vaesenc %xmm1,%xmm14,%xmm14
+       vmovups 192-128(%rcx),%xmm1
+       je      .Lenc_tail
 
+       vaesenc %xmm15,%xmm9,%xmm9
+       vaesenc %xmm15,%xmm10,%xmm10
+       vaesenc %xmm15,%xmm11,%xmm11
+       vaesenc %xmm15,%xmm12,%xmm12
+       vaesenc %xmm15,%xmm13,%xmm13
+       vaesenc %xmm15,%xmm14,%xmm14
+
+       vaesenc %xmm1,%xmm9,%xmm9
+       vaesenc %xmm1,%xmm10,%xmm10
+       vaesenc %xmm1,%xmm11,%xmm11
+       vaesenc %xmm1,%xmm12,%xmm12
+       vaesenc %xmm1,%xmm13,%xmm13
+       vmovups 208-128(%rcx),%xmm15
+       vaesenc %xmm1,%xmm14,%xmm14
+       vmovups 224-128(%rcx),%xmm1
+       jmp     .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+       vmovdqu (%r11),%xmm0
+       vpshufb %xmm0,%xmm1,%xmm6
+       vmovdqu 48(%r11),%xmm5
+       vpaddd  64(%r11),%xmm6,%xmm10
+       vpaddd  %xmm5,%xmm6,%xmm11
+       vmovdqu 0-32(%r9),%xmm3
+       vpaddd  %xmm5,%xmm10,%xmm12
+       vpshufb %xmm0,%xmm10,%xmm10
+       vpaddd  %xmm5,%xmm11,%xmm13
+       vpshufb %xmm0,%xmm11,%xmm11
+       vpxor   %xmm15,%xmm10,%xmm10
+       vpaddd  %xmm5,%xmm12,%xmm14
+       vpshufb %xmm0,%xmm12,%xmm12
+       vpxor   %xmm15,%xmm11,%xmm11
+       vpaddd  %xmm5,%xmm13,%xmm1
+       vpshufb %xmm0,%xmm13,%xmm13
+       vpshufb %xmm0,%xmm14,%xmm14
+       vpshufb %xmm0,%xmm1,%xmm1
+       jmp     .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+       vaesenc %xmm15,%xmm9,%xmm9
+       vmovdqu %xmm7,16+8(%rsp)
+       vpalignr        $8,%xmm4,%xmm4,%xmm8
+       vaesenc %xmm15,%xmm10,%xmm10
+       vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
+       vpxor   0(%rdi),%xmm1,%xmm2
+       vaesenc %xmm15,%xmm11,%xmm11
+       vpxor   16(%rdi),%xmm1,%xmm0
+       vaesenc %xmm15,%xmm12,%xmm12
+       vpxor   32(%rdi),%xmm1,%xmm5
+       vaesenc %xmm15,%xmm13,%xmm13
+       vpxor   48(%rdi),%xmm1,%xmm6
+       vaesenc %xmm15,%xmm14,%xmm14
+       vpxor   64(%rdi),%xmm1,%xmm7
+       vpxor   80(%rdi),%xmm1,%xmm3
+       vmovdqu (%r8),%xmm1
+
+       vaesenclast     %xmm2,%xmm9,%xmm9
+       vmovdqu 32(%r11),%xmm2
+       vaesenclast     %xmm0,%xmm10,%xmm10
+       vpaddb  %xmm2,%xmm1,%xmm0
+       movq    %r13,112+8(%rsp)
+       leaq    96(%rdi),%rdi
+       vaesenclast     %xmm5,%xmm11,%xmm11
+       vpaddb  %xmm2,%xmm0,%xmm5
+       movq    %r12,120+8(%rsp)
+       leaq    96(%rsi),%rsi
+       vmovdqu 0-128(%rcx),%xmm15
+       vaesenclast     %xmm6,%xmm12,%xmm12
+       vpaddb  %xmm2,%xmm5,%xmm6
+       vaesenclast     %xmm7,%xmm13,%xmm13
+       vpaddb  %xmm2,%xmm6,%xmm7
+       vaesenclast     %xmm3,%xmm14,%xmm14
+       vpaddb  %xmm2,%xmm7,%xmm3
+
+       addq    $0x60,%r10
+       subq    $0x6,%rdx
+       jc      .L6x_done
+
+       vmovups %xmm9,-96(%rsi)
+       vpxor   %xmm15,%xmm1,%xmm9
+       vmovups %xmm10,-80(%rsi)
+       vmovdqa %xmm0,%xmm10
+       vmovups %xmm11,-64(%rsi)
+       vmovdqa %xmm5,%xmm11
+       vmovups %xmm12,-48(%rsi)
+       vmovdqa %xmm6,%xmm12
+       vmovups %xmm13,-32(%rsi)
+       vmovdqa %xmm7,%xmm13
+       vmovups %xmm14,-16(%rsi)
+       vmovdqa %xmm3,%xmm14
+       vmovdqu 32+8(%rsp),%xmm7
+       jmp     .Loop6x
+
+.L6x_done:
+       vpxor   16+8(%rsp),%xmm8,%xmm8
+       vpxor   %xmm4,%xmm8,%xmm8
+
+       .byte   0xf3,0xc3
+.size  _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
 .globl aesni_gcm_decrypt
 .type  aesni_gcm_decrypt,@function
+.align 32
 aesni_gcm_decrypt:
-       xorl    %eax,%eax
+       xorq    %r10,%r10
+       cmpq    $0x60,%rdx
+       jb      .Lgcm_dec_abort
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       vzeroupper
+
+       vmovdqu (%r8),%xmm1
+       addq    $-128,%rsp
+       movl    12(%r8),%ebx
+       leaq    .Lbswap_mask(%rip),%r11
+       leaq    -128(%rcx),%r14
+       movq    $0xf80,%r15
+       vmovdqu (%r9),%xmm8
+       andq    $-128,%rsp
+       vmovdqu (%r11),%xmm0
+       leaq    128(%rcx),%rcx
+       leaq    32+32(%r9),%r9
+       movl    240-128(%rcx),%ebp
+       vpshufb %xmm0,%xmm8,%xmm8
+
+       andq    %r15,%r14
+       andq    %rsp,%r15
+       subq    %r14,%r15
+       jc      .Ldec_no_key_aliasing
+       cmpq    $768,%r15
+       jnc     .Ldec_no_key_aliasing
+       subq    %r15,%rsp
+.Ldec_no_key_aliasing:
+
+       vmovdqu 80(%rdi),%xmm7
+       leaq    (%rdi),%r14
+       vmovdqu 64(%rdi),%xmm4
+       leaq    -192(%rdi,%rdx,1),%r15
+       vmovdqu 48(%rdi),%xmm5
+       shrq    $4,%rdx
+       xorq    %r10,%r10
+       vmovdqu 32(%rdi),%xmm6
+       vpshufb %xmm0,%xmm7,%xmm7
+       vmovdqu 16(%rdi),%xmm2
+       vpshufb %xmm0,%xmm4,%xmm4
+       vmovdqu (%rdi),%xmm3
+       vpshufb %xmm0,%xmm5,%xmm5
+       vmovdqu %xmm4,48(%rsp)
+       vpshufb %xmm0,%xmm6,%xmm6
+       vmovdqu %xmm5,64(%rsp)
+       vpshufb %xmm0,%xmm2,%xmm2
+       vmovdqu %xmm6,80(%rsp)
+       vpshufb %xmm0,%xmm3,%xmm3
+       vmovdqu %xmm2,96(%rsp)
+       vmovdqu %xmm3,112(%rsp)
+
+       call    _aesni_ctr32_ghash_6x
+
+       vmovups %xmm9,-96(%rsi)
+       vmovups %xmm10,-80(%rsi)
+       vmovups %xmm11,-64(%rsi)
+       vmovups %xmm12,-48(%rsi)
+       vmovups %xmm13,-32(%rsi)
+       vmovups %xmm14,-16(%rsi)
+
+       vpshufb (%r11),%xmm8,%xmm8
+       vmovdqu %xmm8,-64(%r9)
+
+       vzeroupper
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Lgcm_dec_abort:
+       movq    %r10,%rax
        .byte   0xf3,0xc3
 .size  aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type  _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+       vmovdqu 0-128(%rcx),%xmm4
+       vmovdqu 32(%r11),%xmm2
+       leaq    -1(%rbp),%r13
+       vmovups 16-128(%rcx),%xmm15
+       leaq    32-128(%rcx),%r12
+       vpxor   %xmm4,%xmm1,%xmm9
+       addl    $100663296,%ebx
+       jc      .Lhandle_ctr32_2
+       vpaddb  %xmm2,%xmm1,%xmm10
+       vpaddb  %xmm2,%xmm10,%xmm11
+       vpxor   %xmm4,%xmm10,%xmm10
+       vpaddb  %xmm2,%xmm11,%xmm12
+       vpxor   %xmm4,%xmm11,%xmm11
+       vpaddb  %xmm2,%xmm12,%xmm13
+       vpxor   %xmm4,%xmm12,%xmm12
+       vpaddb  %xmm2,%xmm13,%xmm14
+       vpxor   %xmm4,%xmm13,%xmm13
+       vpaddb  %xmm2,%xmm14,%xmm1
+       vpxor   %xmm4,%xmm14,%xmm14
+       jmp     .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+       vaesenc %xmm15,%xmm9,%xmm9
+       vaesenc %xmm15,%xmm10,%xmm10
+       vaesenc %xmm15,%xmm11,%xmm11
+       vaesenc %xmm15,%xmm12,%xmm12
+       vaesenc %xmm15,%xmm13,%xmm13
+       vaesenc %xmm15,%xmm14,%xmm14
+       vmovups (%r12),%xmm15
+       leaq    16(%r12),%r12
+       decl    %r13d
+       jnz     .Loop_ctr32
+
+       vmovdqu (%r12),%xmm3
+       vaesenc %xmm15,%xmm9,%xmm9
+       vpxor   0(%rdi),%xmm3,%xmm4
+       vaesenc %xmm15,%xmm10,%xmm10
+       vpxor   16(%rdi),%xmm3,%xmm5
+       vaesenc %xmm15,%xmm11,%xmm11
+       vpxor   32(%rdi),%xmm3,%xmm6
+       vaesenc %xmm15,%xmm12,%xmm12
+       vpxor   48(%rdi),%xmm3,%xmm8
+       vaesenc %xmm15,%xmm13,%xmm13
+       vpxor   64(%rdi),%xmm3,%xmm2
+       vaesenc %xmm15,%xmm14,%xmm14
+       vpxor   80(%rdi),%xmm3,%xmm3
+       leaq    96(%rdi),%rdi
+
+       vaesenclast     %xmm4,%xmm9,%xmm9
+       vaesenclast     %xmm5,%xmm10,%xmm10
+       vaesenclast     %xmm6,%xmm11,%xmm11
+       vaesenclast     %xmm8,%xmm12,%xmm12
+       vaesenclast     %xmm2,%xmm13,%xmm13
+       vaesenclast     %xmm3,%xmm14,%xmm14
+       vmovups %xmm9,0(%rsi)
+       vmovups %xmm10,16(%rsi)
+       vmovups %xmm11,32(%rsi)
+       vmovups %xmm12,48(%rsi)
+       vmovups %xmm13,64(%rsi)
+       vmovups %xmm14,80(%rsi)
+       leaq    96(%rsi),%rsi
+
+       .byte   0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+       vpshufb %xmm0,%xmm1,%xmm6
+       vmovdqu 48(%r11),%xmm5
+       vpaddd  64(%r11),%xmm6,%xmm10
+       vpaddd  %xmm5,%xmm6,%xmm11
+       vpaddd  %xmm5,%xmm10,%xmm12
+       vpshufb %xmm0,%xmm10,%xmm10
+       vpaddd  %xmm5,%xmm11,%xmm13
+       vpshufb %xmm0,%xmm11,%xmm11
+       vpxor   %xmm4,%xmm10,%xmm10
+       vpaddd  %xmm5,%xmm12,%xmm14
+       vpshufb %xmm0,%xmm12,%xmm12
+       vpxor   %xmm4,%xmm11,%xmm11
+       vpaddd  %xmm5,%xmm13,%xmm1
+       vpshufb %xmm0,%xmm13,%xmm13
+       vpxor   %xmm4,%xmm12,%xmm12
+       vpshufb %xmm0,%xmm14,%xmm14
+       vpxor   %xmm4,%xmm13,%xmm13
+       vpshufb %xmm0,%xmm1,%xmm1
+       vpxor   %xmm4,%xmm14,%xmm14
+       jmp     .Loop_ctr32
+.size  _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type  aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+       xorq    %r10,%r10
+       cmpq    $288,%rdx
+       jb      .Lgcm_enc_abort
+
+       leaq    (%rsp),%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       vzeroupper
+
+       vmovdqu (%r8),%xmm1
+       addq    $-128,%rsp
+       movl    12(%r8),%ebx
+       leaq    .Lbswap_mask(%rip),%r11
+       leaq    -128(%rcx),%r14
+       movq    $0xf80,%r15
+       leaq    128(%rcx),%rcx
+       vmovdqu (%r11),%xmm0
+       andq    $-128,%rsp
+       movl    240-128(%rcx),%ebp
+
+       andq    %r15,%r14
+       andq    %rsp,%r15
+       subq    %r14,%r15
+       jc      .Lenc_no_key_aliasing
+       cmpq    $768,%r15
+       jnc     .Lenc_no_key_aliasing
+       subq    %r15,%rsp
+.Lenc_no_key_aliasing:
+
+       leaq    (%rsi),%r14
+       leaq    -192(%rsi,%rdx,1),%r15
+       shrq    $4,%rdx
+
+       call    _aesni_ctr32_6x
+       vpshufb %xmm0,%xmm9,%xmm8
+       vpshufb %xmm0,%xmm10,%xmm2
+       vmovdqu %xmm8,112(%rsp)
+       vpshufb %xmm0,%xmm11,%xmm4
+       vmovdqu %xmm2,96(%rsp)
+       vpshufb %xmm0,%xmm12,%xmm5
+       vmovdqu %xmm4,80(%rsp)
+       vpshufb %xmm0,%xmm13,%xmm6
+       vmovdqu %xmm5,64(%rsp)
+       vpshufb %xmm0,%xmm14,%xmm7
+       vmovdqu %xmm6,48(%rsp)
+
+       call    _aesni_ctr32_6x
+
+       vmovdqu (%r9),%xmm8
+       leaq    32+32(%r9),%r9
+       subq    $12,%rdx
+       movq    $192,%r10
+       vpshufb %xmm0,%xmm8,%xmm8
+
+       call    _aesni_ctr32_ghash_6x
+       vmovdqu 32(%rsp),%xmm7
+       vmovdqu (%r11),%xmm0
+       vmovdqu 0-32(%r9),%xmm3
+       vpunpckhqdq     %xmm7,%xmm7,%xmm1
+       vmovdqu 32-32(%r9),%xmm15
+       vmovups %xmm9,-96(%rsi)
+       vpshufb %xmm0,%xmm9,%xmm9
+       vpxor   %xmm7,%xmm1,%xmm1
+       vmovups %xmm10,-80(%rsi)
+       vpshufb %xmm0,%xmm10,%xmm10
+       vmovups %xmm11,-64(%rsi)
+       vpshufb %xmm0,%xmm11,%xmm11
+       vmovups %xmm12,-48(%rsi)
+       vpshufb %xmm0,%xmm12,%xmm12
+       vmovups %xmm13,-32(%rsi)
+       vpshufb %xmm0,%xmm13,%xmm13
+       vmovups %xmm14,-16(%rsi)
+       vpshufb %xmm0,%xmm14,%xmm14
+       vmovdqu %xmm9,16(%rsp)
+       vmovdqu 48(%rsp),%xmm6
+       vmovdqu 16-32(%r9),%xmm0
+       vpunpckhqdq     %xmm6,%xmm6,%xmm2
+       vpclmulqdq      $0x00,%xmm3,%xmm7,%xmm5
+       vpxor   %xmm6,%xmm2,%xmm2
+       vpclmulqdq      $0x11,%xmm3,%xmm7,%xmm7
+       vpclmulqdq      $0x00,%xmm15,%xmm1,%xmm1
+
+       vmovdqu 64(%rsp),%xmm9
+       vpclmulqdq      $0x00,%xmm0,%xmm6,%xmm4
+       vmovdqu 48-32(%r9),%xmm3
+       vpxor   %xmm5,%xmm4,%xmm4
+       vpunpckhqdq     %xmm9,%xmm9,%xmm5
+       vpclmulqdq      $0x11,%xmm0,%xmm6,%xmm6
+       vpxor   %xmm9,%xmm5,%xmm5
+       vpxor   %xmm7,%xmm6,%xmm6
+       vpclmulqdq      $0x10,%xmm15,%xmm2,%xmm2
+       vmovdqu 80-32(%r9),%xmm15
+       vpxor   %xmm1,%xmm2,%xmm2
+
+       vmovdqu 80(%rsp),%xmm1
+       vpclmulqdq      $0x00,%xmm3,%xmm9,%xmm7
+       vmovdqu 64-32(%r9),%xmm0
+       vpxor   %xmm4,%xmm7,%xmm7
+       vpunpckhqdq     %xmm1,%xmm1,%xmm4
+       vpclmulqdq      $0x11,%xmm3,%xmm9,%xmm9
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpxor   %xmm6,%xmm9,%xmm9
+       vpclmulqdq      $0x00,%xmm15,%xmm5,%xmm5
+       vpxor   %xmm2,%xmm5,%xmm5
+
+       vmovdqu 96(%rsp),%xmm2
+       vpclmulqdq      $0x00,%xmm0,%xmm1,%xmm6
+       vmovdqu 96-32(%r9),%xmm3
+       vpxor   %xmm7,%xmm6,%xmm6
+       vpunpckhqdq     %xmm2,%xmm2,%xmm7
+       vpclmulqdq      $0x11,%xmm0,%xmm1,%xmm1
+       vpxor   %xmm2,%xmm7,%xmm7
+       vpxor   %xmm9,%xmm1,%xmm1
+       vpclmulqdq      $0x10,%xmm15,%xmm4,%xmm4
+       vmovdqu 128-32(%r9),%xmm15
+       vpxor   %xmm5,%xmm4,%xmm4
+
+       vpxor   112(%rsp),%xmm8,%xmm8
+       vpclmulqdq      $0x00,%xmm3,%xmm2,%xmm5
+       vmovdqu 112-32(%r9),%xmm0
+       vpunpckhqdq     %xmm8,%xmm8,%xmm9
+       vpxor   %xmm6,%xmm5,%xmm5
+       vpclmulqdq      $0x11,%xmm3,%xmm2,%xmm2
+       vpxor   %xmm8,%xmm9,%xmm9
+       vpxor   %xmm1,%xmm2,%xmm2
+       vpclmulqdq      $0x00,%xmm15,%xmm7,%xmm7
+       vpxor   %xmm4,%xmm7,%xmm4
+
+       vpclmulqdq      $0x00,%xmm0,%xmm8,%xmm6
+       vmovdqu 0-32(%r9),%xmm3
+       vpunpckhqdq     %xmm14,%xmm14,%xmm1
+       vpclmulqdq      $0x11,%xmm0,%xmm8,%xmm8
+       vpxor   %xmm14,%xmm1,%xmm1
+       vpxor   %xmm5,%xmm6,%xmm5
+       vpclmulqdq      $0x10,%xmm15,%xmm9,%xmm9
+       vmovdqu 32-32(%r9),%xmm15
+       vpxor   %xmm2,%xmm8,%xmm7
+       vpxor   %xmm4,%xmm9,%xmm6
+
+       vmovdqu 16-32(%r9),%xmm0
+       vpxor   %xmm5,%xmm7,%xmm9
+       vpclmulqdq      $0x00,%xmm3,%xmm14,%xmm4
+       vpxor   %xmm9,%xmm6,%xmm6
+       vpunpckhqdq     %xmm13,%xmm13,%xmm2
+       vpclmulqdq      $0x11,%xmm3,%xmm14,%xmm14
+       vpxor   %xmm13,%xmm2,%xmm2
+       vpslldq $8,%xmm6,%xmm9
+       vpclmulqdq      $0x00,%xmm15,%xmm1,%xmm1
+       vpxor   %xmm9,%xmm5,%xmm8
+       vpsrldq $8,%xmm6,%xmm6
+       vpxor   %xmm6,%xmm7,%xmm7
+
+       vpclmulqdq      $0x00,%xmm0,%xmm13,%xmm5
+       vmovdqu 48-32(%r9),%xmm3
+       vpxor   %xmm4,%xmm5,%xmm5
+       vpunpckhqdq     %xmm12,%xmm12,%xmm9
+       vpclmulqdq      $0x11,%xmm0,%xmm13,%xmm13
+       vpxor   %xmm12,%xmm9,%xmm9
+       vpxor   %xmm14,%xmm13,%xmm13
+       vpalignr        $8,%xmm8,%xmm8,%xmm14
+       vpclmulqdq      $0x10,%xmm15,%xmm2,%xmm2
+       vmovdqu 80-32(%r9),%xmm15
+       vpxor   %xmm1,%xmm2,%xmm2
+
+       vpclmulqdq      $0x00,%xmm3,%xmm12,%xmm4
+       vmovdqu 64-32(%r9),%xmm0
+       vpxor   %xmm5,%xmm4,%xmm4
+       vpunpckhqdq     %xmm11,%xmm11,%xmm1
+       vpclmulqdq      $0x11,%xmm3,%xmm12,%xmm12
+       vpxor   %xmm11,%xmm1,%xmm1
+       vpxor   %xmm13,%xmm12,%xmm12
+       vxorps  16(%rsp),%xmm7,%xmm7
+       vpclmulqdq      $0x00,%xmm15,%xmm9,%xmm9
+       vpxor   %xmm2,%xmm9,%xmm9
+
+       vpclmulqdq      $0x10,16(%r11),%xmm8,%xmm8
+       vxorps  %xmm14,%xmm8,%xmm8
+
+       vpclmulqdq      $0x00,%xmm0,%xmm11,%xmm5
+       vmovdqu 96-32(%r9),%xmm3
+       vpxor   %xmm4,%xmm5,%xmm5
+       vpunpckhqdq     %xmm10,%xmm10,%xmm2
+       vpclmulqdq      $0x11,%xmm0,%xmm11,%xmm11
+       vpxor   %xmm10,%xmm2,%xmm2
+       vpalignr        $8,%xmm8,%xmm8,%xmm14
+       vpxor   %xmm12,%xmm11,%xmm11
+       vpclmulqdq      $0x10,%xmm15,%xmm1,%xmm1
+       vmovdqu 128-32(%r9),%xmm15
+       vpxor   %xmm9,%xmm1,%xmm1
+
+       vxorps  %xmm7,%xmm14,%xmm14
+       vpclmulqdq      $0x10,16(%r11),%xmm8,%xmm8
+       vxorps  %xmm14,%xmm8,%xmm8
+
+       vpclmulqdq      $0x00,%xmm3,%xmm10,%xmm4
+       vmovdqu 112-32(%r9),%xmm0
+       vpxor   %xmm5,%xmm4,%xmm4
+       vpunpckhqdq     %xmm8,%xmm8,%xmm9
+       vpclmulqdq      $0x11,%xmm3,%xmm10,%xmm10
+       vpxor   %xmm8,%xmm9,%xmm9
+       vpxor   %xmm11,%xmm10,%xmm10
+       vpclmulqdq      $0x00,%xmm15,%xmm2,%xmm2
+       vpxor   %xmm1,%xmm2,%xmm2
+
+       vpclmulqdq      $0x00,%xmm0,%xmm8,%xmm5
+       vpclmulqdq      $0x11,%xmm0,%xmm8,%xmm7
+       vpxor   %xmm4,%xmm5,%xmm5
+       vpclmulqdq      $0x10,%xmm15,%xmm9,%xmm6
+       vpxor   %xmm10,%xmm7,%xmm7
+       vpxor   %xmm2,%xmm6,%xmm6
+
+       vpxor   %xmm5,%xmm7,%xmm4
+       vpxor   %xmm4,%xmm6,%xmm6
+       vpslldq $8,%xmm6,%xmm1
+       vmovdqu 16(%r11),%xmm3
+       vpsrldq $8,%xmm6,%xmm6
+       vpxor   %xmm1,%xmm5,%xmm8
+       vpxor   %xmm6,%xmm7,%xmm7
+
+       vpalignr        $8,%xmm8,%xmm8,%xmm2
+       vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm8
+       vpxor   %xmm2,%xmm8,%xmm8
+
+       vpalignr        $8,%xmm8,%xmm8,%xmm2
+       vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm8
+       vpxor   %xmm7,%xmm2,%xmm2
+       vpxor   %xmm2,%xmm8,%xmm8
+       vpshufb (%r11),%xmm8,%xmm8
+       vmovdqu %xmm8,-64(%r9)
+
+       vzeroupper
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Lgcm_enc_abort:
+       movq    %r10,%rax
+       .byte   0xf3,0xc3
+.size  aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask:
+.byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte  2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte  65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
 .section .note.GNU-stack,"",%progbits
diff --git a/secure/lib/libcrypto/asm/aesni-mb-x86_64.s b/secure/lib/libcrypto/asm/aesni-mb-x86_64.s
index 7c8d2e6..d5373dc 100644
@@ -6,6 +6,14 @@
 .type  aesni_multi_cbc_encrypt,@function
 .align 32
 aesni_multi_cbc_encrypt:
+       cmpl    $2,%edx
+       jb      .Lenc_non_avx
+       movl    OPENSSL_ia32cap_P+4(%rip),%ecx
+       testl   $268435456,%ecx
+       jnz     _avx_cbc_enc_shortcut
+       jmp     .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
        movq    %rsp,%rax
        pushq   %rbx
        pushq   %rbp
@@ -262,6 +270,14 @@ aesni_multi_cbc_encrypt:
 .type  aesni_multi_cbc_decrypt,@function
 .align 32
 aesni_multi_cbc_decrypt:
+       cmpl    $2,%edx
+       jb      .Ldec_non_avx
+       movl    OPENSSL_ia32cap_P+4(%rip),%ecx
+       testl   $268435456,%ecx
+       jnz     _avx_cbc_dec_shortcut
+       jmp     .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
        movq    %rsp,%rax
        pushq   %rbx
        pushq   %rbp
@@ -504,4 +520,917 @@ aesni_multi_cbc_decrypt:
 .Ldec4x_epilogue:
        .byte   0xf3,0xc3
 .size  aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.type  aesni_multi_cbc_encrypt_avx,@function
+.align 32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+       movq    %rsp,%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+
+
+
+
+
+
+
+       subq    $192,%rsp
+       andq    $-128,%rsp
+       movq    %rax,16(%rsp)
+
+.Lenc8x_body:
+       vzeroupper
+       vmovdqu (%rsi),%xmm15
+       leaq    120(%rsi),%rsi
+       leaq    160(%rdi),%rdi
+       shrl    $1,%edx
+
+.Lenc8x_loop_grande:
+
+       xorl    %edx,%edx
+       movl    -144(%rdi),%ecx
+       movq    -160(%rdi),%r8
+       cmpl    %edx,%ecx
+       movq    -152(%rdi),%rbx
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -136(%rdi),%xmm2
+       movl    %ecx,32(%rsp)
+       cmovleq %rsp,%r8
+       subq    %r8,%rbx
+       movq    %rbx,64(%rsp)
+       movl    -104(%rdi),%ecx
+       movq    -120(%rdi),%r9
+       cmpl    %edx,%ecx
+       movq    -112(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -96(%rdi),%xmm3
+       movl    %ecx,36(%rsp)
+       cmovleq %rsp,%r9
+       subq    %r9,%rbp
+       movq    %rbp,72(%rsp)
+       movl    -64(%rdi),%ecx
+       movq    -80(%rdi),%r10
+       cmpl    %edx,%ecx
+       movq    -72(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -56(%rdi),%xmm4
+       movl    %ecx,40(%rsp)
+       cmovleq %rsp,%r10
+       subq    %r10,%rbp
+       movq    %rbp,80(%rsp)
+       movl    -24(%rdi),%ecx
+       movq    -40(%rdi),%r11
+       cmpl    %edx,%ecx
+       movq    -32(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -16(%rdi),%xmm5
+       movl    %ecx,44(%rsp)
+       cmovleq %rsp,%r11
+       subq    %r11,%rbp
+       movq    %rbp,88(%rsp)
+       movl    16(%rdi),%ecx
+       movq    0(%rdi),%r12
+       cmpl    %edx,%ecx
+       movq    8(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 24(%rdi),%xmm6
+       movl    %ecx,48(%rsp)
+       cmovleq %rsp,%r12
+       subq    %r12,%rbp
+       movq    %rbp,96(%rsp)
+       movl    56(%rdi),%ecx
+       movq    40(%rdi),%r13
+       cmpl    %edx,%ecx
+       movq    48(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 64(%rdi),%xmm7
+       movl    %ecx,52(%rsp)
+       cmovleq %rsp,%r13
+       subq    %r13,%rbp
+       movq    %rbp,104(%rsp)
+       movl    96(%rdi),%ecx
+       movq    80(%rdi),%r14
+       cmpl    %edx,%ecx
+       movq    88(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 104(%rdi),%xmm8
+       movl    %ecx,56(%rsp)
+       cmovleq %rsp,%r14
+       subq    %r14,%rbp
+       movq    %rbp,112(%rsp)
+       movl    136(%rdi),%ecx
+       movq    120(%rdi),%r15
+       cmpl    %edx,%ecx
+       movq    128(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 144(%rdi),%xmm9
+       movl    %ecx,60(%rsp)
+       cmovleq %rsp,%r15
+       subq    %r15,%rbp
+       movq    %rbp,120(%rsp)
+       testl   %edx,%edx
+       jz      .Lenc8x_done
+
+       vmovups 16-120(%rsi),%xmm1
+       vmovups 32-120(%rsi),%xmm0
+       movl    240-120(%rsi),%eax
+
+       vpxor   (%r8),%xmm15,%xmm10
+       leaq    128(%rsp),%rbp
+       vpxor   (%r9),%xmm15,%xmm11
+       vpxor   (%r10),%xmm15,%xmm12
+       vpxor   (%r11),%xmm15,%xmm13
+       vpxor   %xmm10,%xmm2,%xmm2
+       vpxor   (%r12),%xmm15,%xmm10
+       vpxor   %xmm11,%xmm3,%xmm3
+       vpxor   (%r13),%xmm15,%xmm11
+       vpxor   %xmm12,%xmm4,%xmm4
+       vpxor   (%r14),%xmm15,%xmm12
+       vpxor   %xmm13,%xmm5,%xmm5
+       vpxor   (%r15),%xmm15,%xmm13
+       vpxor   %xmm10,%xmm6,%xmm6
+       movl    $1,%ecx
+       vpxor   %xmm11,%xmm7,%xmm7
+       vpxor   %xmm12,%xmm8,%xmm8
+       vpxor   %xmm13,%xmm9,%xmm9
+       jmp     .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+       vaesenc %xmm1,%xmm2,%xmm2
+       cmpl    32+0(%rsp),%ecx
+       vaesenc %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r8)
+       vaesenc %xmm1,%xmm4,%xmm4
+       vaesenc %xmm1,%xmm5,%xmm5
+       leaq    (%r8,%rbx,1),%rbx
+       cmovgeq %rsp,%r8
+       vaesenc %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm1,%xmm7,%xmm7
+       subq    %r8,%rbx
+       vaesenc %xmm1,%xmm8,%xmm8
+       vpxor   16(%r8),%xmm15,%xmm10
+       movq    %rbx,64+0(%rsp)
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups -72(%rsi),%xmm1
+       leaq    16(%r8,%rbx,1),%r8
+       vmovdqu %xmm10,0(%rbp)
+       vaesenc %xmm0,%xmm2,%xmm2
+       cmpl    32+4(%rsp),%ecx
+       movq    64+8(%rsp),%rbx
+       vaesenc %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r9)
+       vaesenc %xmm0,%xmm4,%xmm4
+       vaesenc %xmm0,%xmm5,%xmm5
+       leaq    (%r9,%rbx,1),%rbx
+       cmovgeq %rsp,%r9
+       vaesenc %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm0,%xmm7,%xmm7
+       subq    %r9,%rbx
+       vaesenc %xmm0,%xmm8,%xmm8
+       vpxor   16(%r9),%xmm15,%xmm11
+       movq    %rbx,64+8(%rsp)
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups -56(%rsi),%xmm0
+       leaq    16(%r9,%rbx,1),%r9
+       vmovdqu %xmm11,16(%rbp)
+       vaesenc %xmm1,%xmm2,%xmm2
+       cmpl    32+8(%rsp),%ecx
+       movq    64+16(%rsp),%rbx
+       vaesenc %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r10)
+       vaesenc %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r8)
+       vaesenc %xmm1,%xmm5,%xmm5
+       leaq    (%r10,%rbx,1),%rbx
+       cmovgeq %rsp,%r10
+       vaesenc %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm1,%xmm7,%xmm7
+       subq    %r10,%rbx
+       vaesenc %xmm1,%xmm8,%xmm8
+       vpxor   16(%r10),%xmm15,%xmm12
+       movq    %rbx,64+16(%rsp)
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups -40(%rsi),%xmm1
+       leaq    16(%r10,%rbx,1),%r10
+       vmovdqu %xmm12,32(%rbp)
+       vaesenc %xmm0,%xmm2,%xmm2
+       cmpl    32+12(%rsp),%ecx
+       movq    64+24(%rsp),%rbx
+       vaesenc %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r11)
+       vaesenc %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r9)
+       vaesenc %xmm0,%xmm5,%xmm5
+       leaq    (%r11,%rbx,1),%rbx
+       cmovgeq %rsp,%r11
+       vaesenc %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm0,%xmm7,%xmm7
+       subq    %r11,%rbx
+       vaesenc %xmm0,%xmm8,%xmm8
+       vpxor   16(%r11),%xmm15,%xmm13
+       movq    %rbx,64+24(%rsp)
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups -24(%rsi),%xmm0
+       leaq    16(%r11,%rbx,1),%r11
+       vmovdqu %xmm13,48(%rbp)
+       vaesenc %xmm1,%xmm2,%xmm2
+       cmpl    32+16(%rsp),%ecx
+       movq    64+32(%rsp),%rbx
+       vaesenc %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r12)
+       vaesenc %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r10)
+       vaesenc %xmm1,%xmm5,%xmm5
+       leaq    (%r12,%rbx,1),%rbx
+       cmovgeq %rsp,%r12
+       vaesenc %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm1,%xmm7,%xmm7
+       subq    %r12,%rbx
+       vaesenc %xmm1,%xmm8,%xmm8
+       vpxor   16(%r12),%xmm15,%xmm10
+       movq    %rbx,64+32(%rsp)
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups -8(%rsi),%xmm1
+       leaq    16(%r12,%rbx,1),%r12
+       vaesenc %xmm0,%xmm2,%xmm2
+       cmpl    32+20(%rsp),%ecx
+       movq    64+40(%rsp),%rbx
+       vaesenc %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r13)
+       vaesenc %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r11)
+       vaesenc %xmm0,%xmm5,%xmm5
+       leaq    (%rbx,%r13,1),%rbx
+       cmovgeq %rsp,%r13
+       vaesenc %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm0,%xmm7,%xmm7
+       subq    %r13,%rbx
+       vaesenc %xmm0,%xmm8,%xmm8
+       vpxor   16(%r13),%xmm15,%xmm11
+       movq    %rbx,64+40(%rsp)
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups 8(%rsi),%xmm0
+       leaq    16(%r13,%rbx,1),%r13
+       vaesenc %xmm1,%xmm2,%xmm2
+       cmpl    32+24(%rsp),%ecx
+       movq    64+48(%rsp),%rbx
+       vaesenc %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r14)
+       vaesenc %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r12)
+       vaesenc %xmm1,%xmm5,%xmm5
+       leaq    (%r14,%rbx,1),%rbx
+       cmovgeq %rsp,%r14
+       vaesenc %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm1,%xmm7,%xmm7
+       subq    %r14,%rbx
+       vaesenc %xmm1,%xmm8,%xmm8
+       vpxor   16(%r14),%xmm15,%xmm12
+       movq    %rbx,64+48(%rsp)
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups 24(%rsi),%xmm1
+       leaq    16(%r14,%rbx,1),%r14
+       vaesenc %xmm0,%xmm2,%xmm2
+       cmpl    32+28(%rsp),%ecx
+       movq    64+56(%rsp),%rbx
+       vaesenc %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r15)
+       vaesenc %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r13)
+       vaesenc %xmm0,%xmm5,%xmm5
+       leaq    (%r15,%rbx,1),%rbx
+       cmovgeq %rsp,%r15
+       vaesenc %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesenc %xmm0,%xmm7,%xmm7
+       subq    %r15,%rbx
+       vaesenc %xmm0,%xmm8,%xmm8
+       vpxor   16(%r15),%xmm15,%xmm13
+       movq    %rbx,64+56(%rsp)
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups 40(%rsi),%xmm0
+       leaq    16(%r15,%rbx,1),%r15
+       vmovdqu 32(%rsp),%xmm14
+       prefetcht0      15(%r14)
+       prefetcht0      15(%r15)
+       cmpl    $11,%eax
+       jb      .Lenc8x_tail
+
+       vaesenc %xmm1,%xmm2,%xmm2
+       vaesenc %xmm1,%xmm3,%xmm3
+       vaesenc %xmm1,%xmm4,%xmm4
+       vaesenc %xmm1,%xmm5,%xmm5
+       vaesenc %xmm1,%xmm6,%xmm6
+       vaesenc %xmm1,%xmm7,%xmm7
+       vaesenc %xmm1,%xmm8,%xmm8
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups 176-120(%rsi),%xmm1
+
+       vaesenc %xmm0,%xmm2,%xmm2
+       vaesenc %xmm0,%xmm3,%xmm3
+       vaesenc %xmm0,%xmm4,%xmm4
+       vaesenc %xmm0,%xmm5,%xmm5
+       vaesenc %xmm0,%xmm6,%xmm6
+       vaesenc %xmm0,%xmm7,%xmm7
+       vaesenc %xmm0,%xmm8,%xmm8
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups 192-120(%rsi),%xmm0
+       je      .Lenc8x_tail
+
+       vaesenc %xmm1,%xmm2,%xmm2
+       vaesenc %xmm1,%xmm3,%xmm3
+       vaesenc %xmm1,%xmm4,%xmm4
+       vaesenc %xmm1,%xmm5,%xmm5
+       vaesenc %xmm1,%xmm6,%xmm6
+       vaesenc %xmm1,%xmm7,%xmm7
+       vaesenc %xmm1,%xmm8,%xmm8
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups 208-120(%rsi),%xmm1
+
+       vaesenc %xmm0,%xmm2,%xmm2
+       vaesenc %xmm0,%xmm3,%xmm3
+       vaesenc %xmm0,%xmm4,%xmm4
+       vaesenc %xmm0,%xmm5,%xmm5
+       vaesenc %xmm0,%xmm6,%xmm6
+       vaesenc %xmm0,%xmm7,%xmm7
+       vaesenc %xmm0,%xmm8,%xmm8
+       vaesenc %xmm0,%xmm9,%xmm9
+       vmovups 224-120(%rsi),%xmm0
+
+.Lenc8x_tail:
+       vaesenc %xmm1,%xmm2,%xmm2
+       vpxor   %xmm15,%xmm15,%xmm15
+       vaesenc %xmm1,%xmm3,%xmm3
+       vaesenc %xmm1,%xmm4,%xmm4
+       vpcmpgtd        %xmm15,%xmm14,%xmm15
+       vaesenc %xmm1,%xmm5,%xmm5
+       vaesenc %xmm1,%xmm6,%xmm6
+       vpaddd  %xmm14,%xmm15,%xmm15
+       vmovdqu 48(%rsp),%xmm14
+       vaesenc %xmm1,%xmm7,%xmm7
+       movq    64(%rsp),%rbx
+       vaesenc %xmm1,%xmm8,%xmm8
+       vaesenc %xmm1,%xmm9,%xmm9
+       vmovups 16-120(%rsi),%xmm1
+
+       vaesenclast     %xmm0,%xmm2,%xmm2
+       vmovdqa %xmm15,32(%rsp)
+       vpxor   %xmm15,%xmm15,%xmm15
+       vaesenclast     %xmm0,%xmm3,%xmm3
+       vaesenclast     %xmm0,%xmm4,%xmm4
+       vpcmpgtd        %xmm15,%xmm14,%xmm15
+       vaesenclast     %xmm0,%xmm5,%xmm5
+       vaesenclast     %xmm0,%xmm6,%xmm6
+       vpaddd  %xmm15,%xmm14,%xmm14
+       vmovdqu -120(%rsi),%xmm15
+       vaesenclast     %xmm0,%xmm7,%xmm7
+       vaesenclast     %xmm0,%xmm8,%xmm8
+       vmovdqa %xmm14,48(%rsp)
+       vaesenclast     %xmm0,%xmm9,%xmm9
+       vmovups 32-120(%rsi),%xmm0
+
+       vmovups %xmm2,-16(%r8)
+       subq    %rbx,%r8
+       vpxor   0(%rbp),%xmm2,%xmm2
+       vmovups %xmm3,-16(%r9)
+       subq    72(%rsp),%r9
+       vpxor   16(%rbp),%xmm3,%xmm3
+       vmovups %xmm4,-16(%r10)
+       subq    80(%rsp),%r10
+       vpxor   32(%rbp),%xmm4,%xmm4
+       vmovups %xmm5,-16(%r11)
+       subq    88(%rsp),%r11
+       vpxor   48(%rbp),%xmm5,%xmm5
+       vmovups %xmm6,-16(%r12)
+       subq    96(%rsp),%r12
+       vpxor   %xmm10,%xmm6,%xmm6
+       vmovups %xmm7,-16(%r13)
+       subq    104(%rsp),%r13
+       vpxor   %xmm11,%xmm7,%xmm7
+       vmovups %xmm8,-16(%r14)
+       subq    112(%rsp),%r14
+       vpxor   %xmm12,%xmm8,%xmm8
+       vmovups %xmm9,-16(%r15)
+       subq    120(%rsp),%r15
+       vpxor   %xmm13,%xmm9,%xmm9
+
+       decl    %edx
+       jnz     .Loop_enc8x
+
+       movq    16(%rsp),%rax
+
+
+
+
+
+.Lenc8x_done:
+       vzeroupper
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Lenc8x_epilogue:
+       .byte   0xf3,0xc3
+.size  aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type  aesni_multi_cbc_decrypt_avx,@function
+.align 32
+aesni_multi_cbc_decrypt_avx:
+_avx_cbc_dec_shortcut:
+       movq    %rsp,%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+
+
+
+
+
+
+
+
+       subq    $256,%rsp
+       andq    $-256,%rsp
+       subq    $192,%rsp
+       movq    %rax,16(%rsp)
+
+.Ldec8x_body:
+       vzeroupper
+       vmovdqu (%rsi),%xmm15
+       leaq    120(%rsi),%rsi
+       leaq    160(%rdi),%rdi
+       shrl    $1,%edx
+
+.Ldec8x_loop_grande:
+
+       xorl    %edx,%edx
+       movl    -144(%rdi),%ecx
+       movq    -160(%rdi),%r8
+       cmpl    %edx,%ecx
+       movq    -152(%rdi),%rbx
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -136(%rdi),%xmm2
+       movl    %ecx,32(%rsp)
+       cmovleq %rsp,%r8
+       subq    %r8,%rbx
+       movq    %rbx,64(%rsp)
+       vmovdqu %xmm2,192(%rsp)
+       movl    -104(%rdi),%ecx
+       movq    -120(%rdi),%r9
+       cmpl    %edx,%ecx
+       movq    -112(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -96(%rdi),%xmm3
+       movl    %ecx,36(%rsp)
+       cmovleq %rsp,%r9
+       subq    %r9,%rbp
+       movq    %rbp,72(%rsp)
+       vmovdqu %xmm3,208(%rsp)
+       movl    -64(%rdi),%ecx
+       movq    -80(%rdi),%r10
+       cmpl    %edx,%ecx
+       movq    -72(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -56(%rdi),%xmm4
+       movl    %ecx,40(%rsp)
+       cmovleq %rsp,%r10
+       subq    %r10,%rbp
+       movq    %rbp,80(%rsp)
+       vmovdqu %xmm4,224(%rsp)
+       movl    -24(%rdi),%ecx
+       movq    -40(%rdi),%r11
+       cmpl    %edx,%ecx
+       movq    -32(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu -16(%rdi),%xmm5
+       movl    %ecx,44(%rsp)
+       cmovleq %rsp,%r11
+       subq    %r11,%rbp
+       movq    %rbp,88(%rsp)
+       vmovdqu %xmm5,240(%rsp)
+       movl    16(%rdi),%ecx
+       movq    0(%rdi),%r12
+       cmpl    %edx,%ecx
+       movq    8(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 24(%rdi),%xmm6
+       movl    %ecx,48(%rsp)
+       cmovleq %rsp,%r12
+       subq    %r12,%rbp
+       movq    %rbp,96(%rsp)
+       vmovdqu %xmm6,256(%rsp)
+       movl    56(%rdi),%ecx
+       movq    40(%rdi),%r13
+       cmpl    %edx,%ecx
+       movq    48(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 64(%rdi),%xmm7
+       movl    %ecx,52(%rsp)
+       cmovleq %rsp,%r13
+       subq    %r13,%rbp
+       movq    %rbp,104(%rsp)
+       vmovdqu %xmm7,272(%rsp)
+       movl    96(%rdi),%ecx
+       movq    80(%rdi),%r14
+       cmpl    %edx,%ecx
+       movq    88(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 104(%rdi),%xmm8
+       movl    %ecx,56(%rsp)
+       cmovleq %rsp,%r14
+       subq    %r14,%rbp
+       movq    %rbp,112(%rsp)
+       vmovdqu %xmm8,288(%rsp)
+       movl    136(%rdi),%ecx
+       movq    120(%rdi),%r15
+       cmpl    %edx,%ecx
+       movq    128(%rdi),%rbp
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       vmovdqu 144(%rdi),%xmm9
+       movl    %ecx,60(%rsp)
+       cmovleq %rsp,%r15
+       subq    %r15,%rbp
+       movq    %rbp,120(%rsp)
+       vmovdqu %xmm9,304(%rsp)
+       testl   %edx,%edx
+       jz      .Ldec8x_done
+
+       vmovups 16-120(%rsi),%xmm1
+       vmovups 32-120(%rsi),%xmm0
+       movl    240-120(%rsi),%eax
+       leaq    192+128(%rsp),%rbp
+
+       vmovdqu (%r8),%xmm2
+       vmovdqu (%r9),%xmm3
+       vmovdqu (%r10),%xmm4
+       vmovdqu (%r11),%xmm5
+       vmovdqu (%r12),%xmm6
+       vmovdqu (%r13),%xmm7
+       vmovdqu (%r14),%xmm8
+       vmovdqu (%r15),%xmm9
+       vmovdqu %xmm2,0(%rbp)
+       vpxor   %xmm15,%xmm2,%xmm2
+       vmovdqu %xmm3,16(%rbp)
+       vpxor   %xmm15,%xmm3,%xmm3
+       vmovdqu %xmm4,32(%rbp)
+       vpxor   %xmm15,%xmm4,%xmm4
+       vmovdqu %xmm5,48(%rbp)
+       vpxor   %xmm15,%xmm5,%xmm5
+       vmovdqu %xmm6,64(%rbp)
+       vpxor   %xmm15,%xmm6,%xmm6
+       vmovdqu %xmm7,80(%rbp)
+       vpxor   %xmm15,%xmm7,%xmm7
+       vmovdqu %xmm8,96(%rbp)
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu %xmm9,112(%rbp)
+       vpxor   %xmm15,%xmm9,%xmm9
+       xorq    $0x80,%rbp
+       movl    $1,%ecx
+       jmp     .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+       vaesdec %xmm1,%xmm2,%xmm2
+       cmpl    32+0(%rsp),%ecx
+       vaesdec %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r8)
+       vaesdec %xmm1,%xmm4,%xmm4
+       vaesdec %xmm1,%xmm5,%xmm5
+       leaq    (%r8,%rbx,1),%rbx
+       cmovgeq %rsp,%r8
+       vaesdec %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm1,%xmm7,%xmm7
+       subq    %r8,%rbx
+       vaesdec %xmm1,%xmm8,%xmm8
+       vmovdqu 16(%r8),%xmm10
+       movq    %rbx,64+0(%rsp)
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups -72(%rsi),%xmm1
+       leaq    16(%r8,%rbx,1),%r8
+       vmovdqu %xmm10,128(%rsp)
+       vaesdec %xmm0,%xmm2,%xmm2
+       cmpl    32+4(%rsp),%ecx
+       movq    64+8(%rsp),%rbx
+       vaesdec %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r9)
+       vaesdec %xmm0,%xmm4,%xmm4
+       vaesdec %xmm0,%xmm5,%xmm5
+       leaq    (%r9,%rbx,1),%rbx
+       cmovgeq %rsp,%r9
+       vaesdec %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm0,%xmm7,%xmm7
+       subq    %r9,%rbx
+       vaesdec %xmm0,%xmm8,%xmm8
+       vmovdqu 16(%r9),%xmm11
+       movq    %rbx,64+8(%rsp)
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups -56(%rsi),%xmm0
+       leaq    16(%r9,%rbx,1),%r9
+       vmovdqu %xmm11,144(%rsp)
+       vaesdec %xmm1,%xmm2,%xmm2
+       cmpl    32+8(%rsp),%ecx
+       movq    64+16(%rsp),%rbx
+       vaesdec %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r10)
+       vaesdec %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r8)
+       vaesdec %xmm1,%xmm5,%xmm5
+       leaq    (%r10,%rbx,1),%rbx
+       cmovgeq %rsp,%r10
+       vaesdec %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm1,%xmm7,%xmm7
+       subq    %r10,%rbx
+       vaesdec %xmm1,%xmm8,%xmm8
+       vmovdqu 16(%r10),%xmm12
+       movq    %rbx,64+16(%rsp)
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups -40(%rsi),%xmm1
+       leaq    16(%r10,%rbx,1),%r10
+       vmovdqu %xmm12,160(%rsp)
+       vaesdec %xmm0,%xmm2,%xmm2
+       cmpl    32+12(%rsp),%ecx
+       movq    64+24(%rsp),%rbx
+       vaesdec %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r11)
+       vaesdec %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r9)
+       vaesdec %xmm0,%xmm5,%xmm5
+       leaq    (%r11,%rbx,1),%rbx
+       cmovgeq %rsp,%r11
+       vaesdec %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm0,%xmm7,%xmm7
+       subq    %r11,%rbx
+       vaesdec %xmm0,%xmm8,%xmm8
+       vmovdqu 16(%r11),%xmm13
+       movq    %rbx,64+24(%rsp)
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups -24(%rsi),%xmm0
+       leaq    16(%r11,%rbx,1),%r11
+       vmovdqu %xmm13,176(%rsp)
+       vaesdec %xmm1,%xmm2,%xmm2
+       cmpl    32+16(%rsp),%ecx
+       movq    64+32(%rsp),%rbx
+       vaesdec %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r12)
+       vaesdec %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r10)
+       vaesdec %xmm1,%xmm5,%xmm5
+       leaq    (%r12,%rbx,1),%rbx
+       cmovgeq %rsp,%r12
+       vaesdec %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm1,%xmm7,%xmm7
+       subq    %r12,%rbx
+       vaesdec %xmm1,%xmm8,%xmm8
+       vmovdqu 16(%r12),%xmm10
+       movq    %rbx,64+32(%rsp)
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups -8(%rsi),%xmm1
+       leaq    16(%r12,%rbx,1),%r12
+       vaesdec %xmm0,%xmm2,%xmm2
+       cmpl    32+20(%rsp),%ecx
+       movq    64+40(%rsp),%rbx
+       vaesdec %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r13)
+       vaesdec %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r11)
+       vaesdec %xmm0,%xmm5,%xmm5
+       leaq    (%rbx,%r13,1),%rbx
+       cmovgeq %rsp,%r13
+       vaesdec %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm0,%xmm7,%xmm7
+       subq    %r13,%rbx
+       vaesdec %xmm0,%xmm8,%xmm8
+       vmovdqu 16(%r13),%xmm11
+       movq    %rbx,64+40(%rsp)
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups 8(%rsi),%xmm0
+       leaq    16(%r13,%rbx,1),%r13
+       vaesdec %xmm1,%xmm2,%xmm2
+       cmpl    32+24(%rsp),%ecx
+       movq    64+48(%rsp),%rbx
+       vaesdec %xmm1,%xmm3,%xmm3
+       prefetcht0      31(%r14)
+       vaesdec %xmm1,%xmm4,%xmm4
+       prefetcht0      15(%r12)
+       vaesdec %xmm1,%xmm5,%xmm5
+       leaq    (%r14,%rbx,1),%rbx
+       cmovgeq %rsp,%r14
+       vaesdec %xmm1,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm1,%xmm7,%xmm7
+       subq    %r14,%rbx
+       vaesdec %xmm1,%xmm8,%xmm8
+       vmovdqu 16(%r14),%xmm12
+       movq    %rbx,64+48(%rsp)
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups 24(%rsi),%xmm1
+       leaq    16(%r14,%rbx,1),%r14
+       vaesdec %xmm0,%xmm2,%xmm2
+       cmpl    32+28(%rsp),%ecx
+       movq    64+56(%rsp),%rbx
+       vaesdec %xmm0,%xmm3,%xmm3
+       prefetcht0      31(%r15)
+       vaesdec %xmm0,%xmm4,%xmm4
+       prefetcht0      15(%r13)
+       vaesdec %xmm0,%xmm5,%xmm5
+       leaq    (%r15,%rbx,1),%rbx
+       cmovgeq %rsp,%r15
+       vaesdec %xmm0,%xmm6,%xmm6
+       cmovgq  %rsp,%rbx
+       vaesdec %xmm0,%xmm7,%xmm7
+       subq    %r15,%rbx
+       vaesdec %xmm0,%xmm8,%xmm8
+       vmovdqu 16(%r15),%xmm13
+       movq    %rbx,64+56(%rsp)
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups 40(%rsi),%xmm0
+       leaq    16(%r15,%rbx,1),%r15
+       vmovdqu 32(%rsp),%xmm14
+       prefetcht0      15(%r14)
+       prefetcht0      15(%r15)
+       cmpl    $11,%eax
+       jb      .Ldec8x_tail
+
+       vaesdec %xmm1,%xmm2,%xmm2
+       vaesdec %xmm1,%xmm3,%xmm3
+       vaesdec %xmm1,%xmm4,%xmm4
+       vaesdec %xmm1,%xmm5,%xmm5
+       vaesdec %xmm1,%xmm6,%xmm6
+       vaesdec %xmm1,%xmm7,%xmm7
+       vaesdec %xmm1,%xmm8,%xmm8
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups 176-120(%rsi),%xmm1
+
+       vaesdec %xmm0,%xmm2,%xmm2
+       vaesdec %xmm0,%xmm3,%xmm3
+       vaesdec %xmm0,%xmm4,%xmm4
+       vaesdec %xmm0,%xmm5,%xmm5
+       vaesdec %xmm0,%xmm6,%xmm6
+       vaesdec %xmm0,%xmm7,%xmm7
+       vaesdec %xmm0,%xmm8,%xmm8
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups 192-120(%rsi),%xmm0
+       je      .Ldec8x_tail
+
+       vaesdec %xmm1,%xmm2,%xmm2
+       vaesdec %xmm1,%xmm3,%xmm3
+       vaesdec %xmm1,%xmm4,%xmm4
+       vaesdec %xmm1,%xmm5,%xmm5
+       vaesdec %xmm1,%xmm6,%xmm6
+       vaesdec %xmm1,%xmm7,%xmm7
+       vaesdec %xmm1,%xmm8,%xmm8
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups 208-120(%rsi),%xmm1
+
+       vaesdec %xmm0,%xmm2,%xmm2
+       vaesdec %xmm0,%xmm3,%xmm3
+       vaesdec %xmm0,%xmm4,%xmm4
+       vaesdec %xmm0,%xmm5,%xmm5
+       vaesdec %xmm0,%xmm6,%xmm6
+       vaesdec %xmm0,%xmm7,%xmm7
+       vaesdec %xmm0,%xmm8,%xmm8
+       vaesdec %xmm0,%xmm9,%xmm9
+       vmovups 224-120(%rsi),%xmm0
+
+.Ldec8x_tail:
+       vaesdec %xmm1,%xmm2,%xmm2
+       vpxor   %xmm15,%xmm15,%xmm15
+       vaesdec %xmm1,%xmm3,%xmm3
+       vaesdec %xmm1,%xmm4,%xmm4
+       vpcmpgtd        %xmm15,%xmm14,%xmm15
+       vaesdec %xmm1,%xmm5,%xmm5
+       vaesdec %xmm1,%xmm6,%xmm6
+       vpaddd  %xmm14,%xmm15,%xmm15
+       vmovdqu 48(%rsp),%xmm14
+       vaesdec %xmm1,%xmm7,%xmm7
+       movq    64(%rsp),%rbx
+       vaesdec %xmm1,%xmm8,%xmm8
+       vaesdec %xmm1,%xmm9,%xmm9
+       vmovups 16-120(%rsi),%xmm1
+
+       vaesdeclast     %xmm0,%xmm2,%xmm2
+       vmovdqa %xmm15,32(%rsp)
+       vpxor   %xmm15,%xmm15,%xmm15
+       vaesdeclast     %xmm0,%xmm3,%xmm3
+       vpxor   0(%rbp),%xmm2,%xmm2
+       vaesdeclast     %xmm0,%xmm4,%xmm4
+       vpxor   16(%rbp),%xmm3,%xmm3
+       vpcmpgtd        %xmm15,%xmm14,%xmm15
+       vaesdeclast     %xmm0,%xmm5,%xmm5
+       vpxor   32(%rbp),%xmm4,%xmm4
+       vaesdeclast     %xmm0,%xmm6,%xmm6
+       vpxor   48(%rbp),%xmm5,%xmm5
+       vpaddd  %xmm15,%xmm14,%xmm14
+       vmovdqu -120(%rsi),%xmm15
+       vaesdeclast     %xmm0,%xmm7,%xmm7
+       vpxor   64(%rbp),%xmm6,%xmm6
+       vaesdeclast     %xmm0,%xmm8,%xmm8
+       vpxor   80(%rbp),%xmm7,%xmm7
+       vmovdqa %xmm14,48(%rsp)
+       vaesdeclast     %xmm0,%xmm9,%xmm9
+       vpxor   96(%rbp),%xmm8,%xmm8
+       vmovups 32-120(%rsi),%xmm0
+
+       vmovups %xmm2,-16(%r8)
+       subq    %rbx,%r8
+       vmovdqu 128+0(%rsp),%xmm2
+       vpxor   112(%rbp),%xmm9,%xmm9
+       vmovups %xmm3,-16(%r9)
+       subq    72(%rsp),%r9
+       vmovdqu %xmm2,0(%rbp)
+       vpxor   %xmm15,%xmm2,%xmm2
+       vmovdqu 128+16(%rsp),%xmm3
+       vmovups %xmm4,-16(%r10)
+       subq    80(%rsp),%r10
+       vmovdqu %xmm3,16(%rbp)
+       vpxor   %xmm15,%xmm3,%xmm3
+       vmovdqu 128+32(%rsp),%xmm4
+       vmovups %xmm5,-16(%r11)
+       subq    88(%rsp),%r11
+       vmovdqu %xmm4,32(%rbp)
+       vpxor   %xmm15,%xmm4,%xmm4
+       vmovdqu 128+48(%rsp),%xmm5
+       vmovups %xmm6,-16(%r12)
+       subq    96(%rsp),%r12
+       vmovdqu %xmm5,48(%rbp)
+       vpxor   %xmm15,%xmm5,%xmm5
+       vmovdqu %xmm10,64(%rbp)
+       vpxor   %xmm10,%xmm15,%xmm6
+       vmovups %xmm7,-16(%r13)
+       subq    104(%rsp),%r13
+       vmovdqu %xmm11,80(%rbp)
+       vpxor   %xmm11,%xmm15,%xmm7
+       vmovups %xmm8,-16(%r14)
+       subq    112(%rsp),%r14
+       vmovdqu %xmm12,96(%rbp)
+       vpxor   %xmm12,%xmm15,%xmm8
+       vmovups %xmm9,-16(%r15)
+       subq    120(%rsp),%r15
+       vmovdqu %xmm13,112(%rbp)
+       vpxor   %xmm13,%xmm15,%xmm9
+
+       xorq    $128,%rbp
+       decl    %edx
+       jnz     .Loop_dec8x
+
+       movq    16(%rsp),%rax
+
+
+
+
+
+.Ldec8x_done:
+       vzeroupper
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Ldec8x_epilogue:
+       .byte   0xf3,0xc3
+.size  aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
 .section .note.GNU-stack,"",%progbits
diff --git a/secure/lib/libcrypto/asm/aesni-sha1-x86_64.s b/secure/lib/libcrypto/asm/aesni-sha1-x86_64.s
index 53e360a..85ac116 100644
@@ -10,6 +10,11 @@ aesni_cbc_sha1_enc:
        movq    OPENSSL_ia32cap_P+4(%rip),%r11
        btq     $61,%r11
        jc      aesni_cbc_sha1_enc_shaext
+       andl    $268435456,%r11d
+       andl    $1073741824,%r10d
+       orl     %r11d,%r10d
+       cmpl    $1342177280,%r10d
+       je      aesni_cbc_sha1_enc_avx
        jmp     aesni_cbc_sha1_enc_ssse3
        .byte   0xf3,0xc3
 .size  aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
@@ -1367,6 +1372,1304 @@ aesni_cbc_sha1_enc_ssse3:
 .Lepilogue_ssse3:
        .byte   0xf3,0xc3
 .size  aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.type  aesni_cbc_sha1_enc_avx,@function
+.align 32
+aesni_cbc_sha1_enc_avx:
+       movq    8(%rsp),%r10
+
+
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -104(%rsp),%rsp
+
+
+       vzeroall
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       leaq    112(%rcx),%r15
+       vmovdqu (%r8),%xmm12
+       movq    %r8,88(%rsp)
+       shlq    $6,%r14
+       subq    %r12,%r13
+       movl    240-112(%r15),%r8d
+       addq    %r10,%r14
+
+       leaq    K_XX_XX(%rip),%r11
+       movl    0(%r9),%eax
+       movl    4(%r9),%ebx
+       movl    8(%r9),%ecx
+       movl    12(%r9),%edx
+       movl    %ebx,%esi
+       movl    16(%r9),%ebp
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       andl    %edi,%esi
+
+       vmovdqa 64(%r11),%xmm6
+       vmovdqa 0(%r11),%xmm10
+       vmovdqu 0(%r10),%xmm0
+       vmovdqu 16(%r10),%xmm1
+       vmovdqu 32(%r10),%xmm2
+       vmovdqu 48(%r10),%xmm3
+       vpshufb %xmm6,%xmm0,%xmm0
+       addq    $64,%r10
+       vpshufb %xmm6,%xmm1,%xmm1
+       vpshufb %xmm6,%xmm2,%xmm2
+       vpshufb %xmm6,%xmm3,%xmm3
+       vpaddd  %xmm10,%xmm0,%xmm4
+       vpaddd  %xmm10,%xmm1,%xmm5
+       vpaddd  %xmm10,%xmm2,%xmm6
+       vmovdqa %xmm4,0(%rsp)
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       jmp     .Loop_avx
+.align 32
+.Loop_avx:
+       shrdl   $2,%ebx,%ebx
+       vmovdqu 0(%r12),%xmm13
+       vpxor   %xmm15,%xmm13,%xmm13
+       vpxor   %xmm13,%xmm12,%xmm12
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -80(%r15),%xmm15
+       xorl    %edx,%esi
+       vpalignr        $8,%xmm0,%xmm1,%xmm4
+       movl    %eax,%edi
+       addl    0(%rsp),%ebp
+       vpaddd  %xmm3,%xmm10,%xmm9
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpsrldq $4,%xmm3,%xmm8
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       vpxor   %xmm0,%xmm4,%xmm4
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm2,%xmm8,%xmm8
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    4(%rsp),%edx
+       vpxor   %xmm8,%xmm4,%xmm4
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -64(%r15),%xmm14
+       andl    %eax,%esi
+       vpsrld  $31,%xmm4,%xmm8
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpslldq $12,%xmm4,%xmm9
+       vpaddd  %xmm4,%xmm4,%xmm4
+       movl    %edx,%edi
+       addl    8(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpor    %xmm8,%xmm4,%xmm4
+       vpsrld  $30,%xmm9,%xmm8
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm9,%xmm9
+       vpxor   %xmm8,%xmm4,%xmm4
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    12(%rsp),%ebx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -48(%r15),%xmm15
+       vpxor   %xmm9,%xmm4,%xmm4
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpalignr        $8,%xmm1,%xmm2,%xmm5
+       movl    %ebx,%edi
+       addl    16(%rsp),%eax
+       vpaddd  %xmm4,%xmm10,%xmm9
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpsrldq $4,%xmm4,%xmm8
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       vpxor   %xmm1,%xmm5,%xmm5
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm3,%xmm8,%xmm8
+       shrdl   $7,%ebx,%ebx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -32(%r15),%xmm14
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    20(%rsp),%ebp
+       vpxor   %xmm8,%xmm5,%xmm5
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       vpsrld  $31,%xmm5,%xmm8
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       vpslldq $12,%xmm5,%xmm9
+       vpaddd  %xmm5,%xmm5,%xmm5
+       movl    %ebp,%edi
+       addl    24(%rsp),%edx
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpor    %xmm8,%xmm5,%xmm5
+       vpsrld  $30,%xmm9,%xmm8
+       addl    %esi,%edx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -16(%r15),%xmm15
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm9,%xmm9
+       vpxor   %xmm8,%xmm5,%xmm5
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    28(%rsp),%ecx
+       vpxor   %xmm9,%xmm5,%xmm5
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vmovdqa 16(%r11),%xmm10
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       vpalignr        $8,%xmm2,%xmm3,%xmm6
+       movl    %ecx,%edi
+       addl    32(%rsp),%ebx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 0(%r15),%xmm14
+       vpaddd  %xmm5,%xmm10,%xmm9
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vpsrldq $4,%xmm5,%xmm8
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       vpxor   %xmm2,%xmm6,%xmm6
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm4,%xmm8,%xmm8
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       addl    36(%rsp),%eax
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%eax
+       andl    %ecx,%esi
+       vpsrld  $31,%xmm6,%xmm8
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       shrdl   $7,%ebx,%ebx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 16(%r15),%xmm15
+       xorl    %edx,%esi
+       vpslldq $12,%xmm6,%xmm9
+       vpaddd  %xmm6,%xmm6,%xmm6
+       movl    %eax,%edi
+       addl    40(%rsp),%ebp
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       vpor    %xmm8,%xmm6,%xmm6
+       vpsrld  $30,%xmm9,%xmm8
+       addl    %esi,%ebp
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm9,%xmm9
+       vpxor   %xmm8,%xmm6,%xmm6
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    44(%rsp),%edx
+       vpxor   %xmm9,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 32(%r15),%xmm14
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%esi
+       vpalignr        $8,%xmm3,%xmm4,%xmm7
+       movl    %edx,%edi
+       addl    48(%rsp),%ecx
+       vpaddd  %xmm6,%xmm10,%xmm9
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpsrldq $4,%xmm6,%xmm8
+       addl    %esi,%ecx
+       andl    %ebp,%edi
+       vpxor   %xmm3,%xmm7,%xmm7
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm5,%xmm8,%xmm8
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    52(%rsp),%ebx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 48(%r15),%xmm15
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%ebx
+       andl    %edx,%esi
+       vpsrld  $31,%xmm7,%xmm8
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       shrdl   $7,%ecx,%ecx
+       xorl    %ebp,%esi
+       vpslldq $12,%xmm7,%xmm9
+       vpaddd  %xmm7,%xmm7,%xmm7
+       movl    %ebx,%edi
+       addl    56(%rsp),%eax
+       xorl    %edx,%ecx
+       shldl   $5,%ebx,%ebx
+       vpor    %xmm8,%xmm7,%xmm7
+       vpsrld  $30,%xmm9,%xmm8
+       addl    %esi,%eax
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm9,%xmm9
+       vpxor   %xmm8,%xmm7,%xmm7
+       shrdl   $7,%ebx,%ebx
+       cmpl    $11,%r8d
+       jb      .Lvaesenclast6
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 64(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 80(%r15),%xmm15
+       je      .Lvaesenclast6
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 96(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 112(%r15),%xmm15
+.Lvaesenclast6:
+       vaesenclast     %xmm15,%xmm12,%xmm12
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    60(%rsp),%ebp
+       vpxor   %xmm9,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       shrdl   $7,%eax,%eax
+       xorl    %ecx,%esi
+       movl    %ebp,%edi
+       addl    0(%rsp),%edx
+       vpxor   %xmm1,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm7,%xmm10,%xmm9
+       addl    %esi,%edx
+       vmovdqu 16(%r12),%xmm13
+       vpxor   %xmm15,%xmm13,%xmm13
+       vmovups %xmm12,0(%r12,%r13,1)
+       vpxor   %xmm13,%xmm12,%xmm12
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -80(%r15),%xmm15
+       andl    %eax,%edi
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       shrdl   $7,%ebp,%ebp
+       xorl    %ebx,%edi
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       movl    %edx,%esi
+       addl    4(%rsp),%ecx
+       xorl    %eax,%ebp
+       shldl   $5,%edx,%edx
+       vpslld  $2,%xmm0,%xmm0
+       addl    %edi,%ecx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       shrdl   $7,%edx,%edx
+       xorl    %eax,%esi
+       movl    %ecx,%edi
+       addl    8(%rsp),%ebx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -64(%r15),%xmm14
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %ebp,%edx
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    12(%rsp),%eax
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -48(%r15),%xmm15
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm2,%xmm1,%xmm1
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm0,%xmm10,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm1,%xmm1
+       addl    20(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm1,%xmm1
+       addl    24(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -32(%r15),%xmm14
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm1,%xmm1
+       addl    28(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       vpxor   %xmm3,%xmm2,%xmm2
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       vpaddd  %xmm1,%xmm10,%xmm9
+       vmovdqa 32(%r11),%xmm10
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpxor   %xmm8,%xmm2,%xmm2
+       addl    36(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -16(%r15),%xmm15
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpslld  $2,%xmm2,%xmm2
+       addl    40(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpor    %xmm8,%xmm2,%xmm2
+       addl    44(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 0(%r15),%xmm14
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       vpaddd  %xmm2,%xmm10,%xmm9
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 16(%r15),%xmm15
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       vpxor   %xmm0,%xmm4,%xmm4
+       addl    0(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpxor   %xmm5,%xmm4,%xmm4
+       addl    %esi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 32(%r15),%xmm14
+       xorl    %eax,%edi
+       vpaddd  %xmm3,%xmm10,%xmm9
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpxor   %xmm8,%xmm4,%xmm4
+       addl    4(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       vpsrld  $30,%xmm4,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpslld  $2,%xmm4,%xmm4
+       addl    8(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vpor    %xmm8,%xmm4,%xmm4
+       addl    12(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 48(%r15),%xmm15
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       vpxor   %xmm1,%xmm5,%xmm5
+       addl    16(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpxor   %xmm6,%xmm5,%xmm5
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       vpaddd  %xmm4,%xmm10,%xmm9
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpxor   %xmm8,%xmm5,%xmm5
+       addl    20(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       vpsrld  $30,%xmm5,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       addl    %edi,%ecx
+       cmpl    $11,%r8d
+       jb      .Lvaesenclast7
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 64(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 80(%r15),%xmm15
+       je      .Lvaesenclast7
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 96(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 112(%r15),%xmm15
+.Lvaesenclast7:
+       vaesenclast     %xmm15,%xmm12,%xmm12
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpslld  $2,%xmm5,%xmm5
+       addl    24(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vpor    %xmm8,%xmm5,%xmm5
+       addl    28(%rsp),%eax
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       vpxor   %xmm2,%xmm6,%xmm6
+       addl    32(%rsp),%ebp
+       vmovdqu 32(%r12),%xmm13
+       vpxor   %xmm15,%xmm13,%xmm13
+       vmovups %xmm12,16(%r13,%r12,1)
+       vpxor   %xmm13,%xmm12,%xmm12
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -80(%r15),%xmm15
+       andl    %ecx,%esi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       vpaddd  %xmm5,%xmm10,%xmm9
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       vpxor   %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    36(%rsp),%edx
+       vpsrld  $30,%xmm6,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       vpslld  $2,%xmm6,%xmm6
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -64(%r15),%xmm14
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    40(%rsp),%ecx
+       andl    %eax,%esi
+       vpor    %xmm8,%xmm6,%xmm6
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    44(%rsp),%ebx
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -48(%r15),%xmm15
+       movl    %ecx,%esi
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       vpxor   %xmm3,%xmm7,%xmm7
+       addl    48(%rsp),%eax
+       andl    %edx,%esi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       vpxor   %xmm0,%xmm7,%xmm7
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       vpaddd  %xmm6,%xmm10,%xmm9
+       vmovdqa 48(%r11),%xmm10
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vpxor   %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    52(%rsp),%ebp
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -32(%r15),%xmm14
+       vpsrld  $30,%xmm7,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       vpslld  $2,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    56(%rsp),%edx
+       andl    %ebx,%esi
+       vpor    %xmm8,%xmm7,%xmm7
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -16(%r15),%xmm15
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    60(%rsp),%ecx
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       vpxor   %xmm4,%xmm0,%xmm0
+       addl    0(%rsp),%ebx
+       andl    %ebp,%esi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 0(%r15),%xmm14
+       vpxor   %xmm1,%xmm0,%xmm0
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       vpaddd  %xmm7,%xmm10,%xmm9
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       vpxor   %xmm8,%xmm0,%xmm0
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    4(%rsp),%eax
+       vpsrld  $30,%xmm0,%xmm8
+       vmovdqa %xmm9,48(%rsp)
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       vpslld  $2,%xmm0,%xmm0
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %ecx,%esi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    8(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 16(%r15),%xmm15
+       andl    %ecx,%esi
+       vpor    %xmm8,%xmm0,%xmm0
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%edi
+       xorl    %ecx,%esi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ebx,%edi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       addl    12(%rsp),%edx
+       andl    %ebx,%edi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       movl    %ebp,%esi
+       xorl    %ebx,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 32(%r15),%xmm14
+       xorl    %eax,%esi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       vpxor   %xmm5,%xmm1,%xmm1
+       addl    16(%rsp),%ecx
+       andl    %eax,%esi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       vpxor   %xmm2,%xmm1,%xmm1
+       movl    %edx,%edi
+       xorl    %eax,%esi
+       vpaddd  %xmm0,%xmm10,%xmm9
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vpxor   %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    20(%rsp),%ebx
+       vpsrld  $30,%xmm1,%xmm8
+       vmovdqa %xmm9,0(%rsp)
+       andl    %ebp,%edi
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 48(%r15),%xmm15
+       movl    %ecx,%esi
+       vpslld  $2,%xmm1,%xmm1
+       xorl    %ebp,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %edx,%esi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    24(%rsp),%eax
+       andl    %edx,%esi
+       vpor    %xmm8,%xmm1,%xmm1
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%edi
+       xorl    %edx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
+       addl    %ebx,%eax
+       addl    28(%rsp),%ebp
+       cmpl    $11,%r8d
+       jb      .Lvaesenclast8
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 64(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 80(%r15),%xmm15
+       je      .Lvaesenclast8
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 96(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 112(%r15),%xmm15
+.Lvaesenclast8:
+       vaesenclast     %xmm15,%xmm12,%xmm12
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       shrdl   $7,%ebx,%ebx
+       movl    %eax,%esi
+       xorl    %ecx,%edi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ebx,%esi
+       xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       vpxor   %xmm6,%xmm2,%xmm2
+       addl    32(%rsp),%edx
+       andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       shrdl   $7,%eax,%eax
+       vpxor   %xmm3,%xmm2,%xmm2
+       movl    %ebp,%edi
+       xorl    %ebx,%esi
+       vpaddd  %xmm1,%xmm10,%xmm9
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       vmovdqu 48(%r12),%xmm13
+       vpxor   %xmm15,%xmm13,%xmm13
+       vmovups %xmm12,32(%r13,%r12,1)
+       vpxor   %xmm13,%xmm12,%xmm12
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -80(%r15),%xmm15
+       vpxor   %xmm8,%xmm2,%xmm2
+       xorl    %eax,%edi
+       xorl    %ebx,%eax
+       addl    %ebp,%edx
+       addl    36(%rsp),%ecx
+       vpsrld  $30,%xmm2,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       andl    %eax,%edi
+       xorl    %ebx,%eax
+       shrdl   $7,%ebp,%ebp
+       movl    %edx,%esi
+       vpslld  $2,%xmm2,%xmm2
+       xorl    %eax,%edi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       xorl    %ebp,%esi
+       xorl    %eax,%ebp
+       addl    %edx,%ecx
+       addl    40(%rsp),%ebx
+       andl    %ebp,%esi
+       vpor    %xmm8,%xmm2,%xmm2
+       xorl    %eax,%ebp
+       shrdl   $7,%edx,%edx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -64(%r15),%xmm14
+       movl    %ecx,%edi
+       xorl    %ebp,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %edx,%edi
+       xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    44(%rsp),%eax
+       andl    %edx,%edi
+       xorl    %ebp,%edx
+       shrdl   $7,%ecx,%ecx
+       movl    %ebx,%esi
+       xorl    %edx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       addl    %ebx,%eax
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       vpxor   %xmm7,%xmm3,%xmm3
+       addl    48(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -48(%r15),%xmm15
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       vpxor   %xmm4,%xmm3,%xmm3
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       vpaddd  %xmm2,%xmm10,%xmm9
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       vpxor   %xmm8,%xmm3,%xmm3
+       addl    52(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       vpsrld  $30,%xmm3,%xmm8
+       vmovdqa %xmm9,32(%rsp)
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vpslld  $2,%xmm3,%xmm3
+       addl    56(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups -32(%r15),%xmm14
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vpor    %xmm8,%xmm3,%xmm3
+       addl    60(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    0(%rsp),%eax
+       vpaddd  %xmm3,%xmm10,%xmm9
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       vmovdqa %xmm9,48(%rsp)
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    4(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups -16(%r15),%xmm15
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    8(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    12(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 0(%r15),%xmm14
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       cmpq    %r14,%r10
+       je      .Ldone_avx
+       vmovdqa 64(%r11),%xmm9
+       vmovdqa 0(%r11),%xmm10
+       vmovdqu 0(%r10),%xmm0
+       vmovdqu 16(%r10),%xmm1
+       vmovdqu 32(%r10),%xmm2
+       vmovdqu 48(%r10),%xmm3
+       vpshufb %xmm9,%xmm0,%xmm0
+       addq    $64,%r10
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       vpshufb %xmm9,%xmm1,%xmm1
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       vpaddd  %xmm10,%xmm0,%xmm8
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       vmovdqa %xmm8,0(%rsp)
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 16(%r15),%xmm15
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       vpshufb %xmm9,%xmm2,%xmm2
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       vpaddd  %xmm10,%xmm1,%xmm8
+       addl    %esi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 32(%r15),%xmm14
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       vmovdqa %xmm8,16(%rsp)
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 48(%r15),%xmm15
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       vpshufb %xmm9,%xmm3,%xmm3
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       vpaddd  %xmm10,%xmm2,%xmm8
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       vmovdqa %xmm8,32(%rsp)
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       cmpl    $11,%r8d
+       jb      .Lvaesenclast9
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 64(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 80(%r15),%xmm15
+       je      .Lvaesenclast9
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 96(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 112(%r15),%xmm15
+.Lvaesenclast9:
+       vaesenclast     %xmm15,%xmm12,%xmm12
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vmovups %xmm12,48(%r13,%r12,1)
+       leaq    64(%r12),%r12
+
+       addl    0(%r9),%eax
+       addl    4(%r9),%esi
+       addl    8(%r9),%ecx
+       addl    12(%r9),%edx
+       movl    %eax,0(%r9)
+       addl    16(%r9),%ebp
+       movl    %esi,4(%r9)
+       movl    %esi,%ebx
+       movl    %ecx,8(%r9)
+       movl    %ecx,%edi
+       movl    %edx,12(%r9)
+       xorl    %edx,%edi
+       movl    %ebp,16(%r9)
+       andl    %edi,%esi
+       jmp     .Loop_avx
+
+.Ldone_avx:
+       addl    16(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       xorl    %edx,%esi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 16(%r15),%xmm15
+       xorl    %ecx,%esi
+       movl    %eax,%edi
+       shldl   $5,%eax,%eax
+       addl    %esi,%ebp
+       xorl    %ecx,%edi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
+       movl    %ebp,%esi
+       shldl   $5,%ebp,%ebp
+       addl    %edi,%edx
+       xorl    %ebx,%esi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
+       movl    %edx,%edi
+       shldl   $5,%edx,%edx
+       addl    %esi,%ecx
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 32(%r15),%xmm14
+       xorl    %eax,%edi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
+       movl    %ecx,%esi
+       shldl   $5,%ecx,%ecx
+       addl    %edi,%ebx
+       xorl    %ebp,%esi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
+       movl    %ebx,%edi
+       shldl   $5,%ebx,%ebx
+       addl    %esi,%eax
+       xorl    %edx,%edi
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 48(%r15),%xmm15
+       xorl    %ecx,%edi
+       movl    %eax,%esi
+       shldl   $5,%eax,%eax
+       addl    %edi,%ebp
+       xorl    %ecx,%esi
+       shrdl   $7,%ebx,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
+       movl    %ebp,%edi
+       shldl   $5,%ebp,%ebp
+       addl    %esi,%edx
+       xorl    %ebx,%edi
+       shrdl   $7,%eax,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
+       movl    %edx,%esi
+       shldl   $5,%edx,%edx
+       addl    %edi,%ecx
+       cmpl    $11,%r8d
+       jb      .Lvaesenclast10
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 64(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 80(%r15),%xmm15
+       je      .Lvaesenclast10
+       vaesenc %xmm15,%xmm12,%xmm12
+       vmovups 96(%r15),%xmm14
+       vaesenc %xmm14,%xmm12,%xmm12
+       vmovups 112(%r15),%xmm15
+.Lvaesenclast10:
+       vaesenclast     %xmm15,%xmm12,%xmm12
+       vmovups -112(%r15),%xmm15
+       vmovups 16-112(%r15),%xmm14
+       xorl    %eax,%esi
+       shrdl   $7,%ebp,%ebp
+       addl    %edx,%ecx
+       addl    56(%rsp),%ebx
+       xorl    %ebp,%esi
+       movl    %ecx,%edi
+       shldl   $5,%ecx,%ecx
+       addl    %esi,%ebx
+       xorl    %ebp,%edi
+       shrdl   $7,%edx,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
+       movl    %ebx,%esi
+       shldl   $5,%ebx,%ebx
+       addl    %edi,%eax
+       shrdl   $7,%ecx,%ecx
+       addl    %ebx,%eax
+       vmovups %xmm12,48(%r13,%r12,1)
+       movq    88(%rsp),%r8
+
+       addl    0(%r9),%eax
+       addl    4(%r9),%esi
+       addl    8(%r9),%ecx
+       movl    %eax,0(%r9)
+       addl    12(%r9),%edx
+       movl    %esi,4(%r9)
+       addl    16(%r9),%ebp
+       movl    %ecx,8(%r9)
+       movl    %edx,12(%r9)
+       movl    %ebp,16(%r9)
+       vmovups %xmm12,(%r8)
+       vzeroall
+       leaq    104(%rsp),%rsi
+       movq    0(%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+.Lepilogue_avx:
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
 .align 64
 K_XX_XX:
 .long  0x5a827999,0x5a827999,0x5a827999,0x5a827999
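The roughly 1300 lines added by this hunk are the AVX build of the stitched AES-CBC+SHA-1 routine, aesni_cbc_sha1_enc_avx.  Each .Loop_avx iteration consumes one 64-byte SHA-1 message block and, interleaved with the SHA-1 rounds, pushes four 16-byte CBC blocks through vaesenc (input read at 0/16/32/48(%r12), ciphertext written back through the output offset kept in %r13); the cmpl $11,%r8d ladders at the .LvaesenclastN labels pick how many trailing vaesenc to issue so that 128-, 192- and 256-bit key schedules all end on the proper vaesenclast round key.  The hunks that follow touch the pre-existing SHA-extensions path only to renumber its local labels (.Laesenclast6-9 become .Laesenclast11-14), apparently because the new AVX body claimed .Lvaesenclast6-10 from the generator's shared label counter.  A rough, purely illustrative C model of what one loop iteration computes (the real code fuses the two workloads at instruction granularity and hashes and encrypts through separate pointers):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical callback types standing in for the inlined primitives. */
typedef void (*sha1_block_fn)(void *sha_ctx, const uint8_t block[64]);
typedef void (*aes_enc_fn)(const void *aes_key, const uint8_t in[16],
                           uint8_t out[16]);

static void stitched_cbc_sha1_model(uint8_t *out, const uint8_t *in,
                                    size_t blocks64, const void *aes_key,
                                    uint8_t iv[16], void *sha_ctx,
                                    const uint8_t *hash_in,
                                    sha1_block_fn sha1_block, aes_enc_fn aes_enc)
{
	for (size_t i = 0; i < blocks64; i++) {
		sha1_block(sha_ctx, hash_in + 64 * i);      /* one SHA-1 compression */
		for (int j = 0; j < 4; j++) {               /* four AES-CBC blocks   */
			uint8_t x[16];
			for (int k = 0; k < 16; k++)
				x[k] = (uint8_t)(in[64 * i + 16 * j + k] ^ iv[k]);
			aes_enc(aes_key, x, iv);            /* ciphertext is next IV */
			memcpy(out + 64 * i + 16 * j, iv, 16);
		}
	}
}

The point of the stitching is throughput rather than semantics: the SHA-1 integer work and the vaesenc stream keep different execution units busy within the same loop.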
@@ -1456,17 +2759,17 @@ aesni_cbc_sha1_enc_shaext:
        pxor    %xmm3,%xmm5
 .byte  15,56,201,243
        cmpl    $11,%r11d
-       jb      .Laesenclast6
+       jb      .Laesenclast11
        movups  64(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  80(%rcx),%xmm1
 .byte  102,15,56,220,208
-       je      .Laesenclast6
+       je      .Laesenclast11
        movups  96(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  112(%rcx),%xmm1
 .byte  102,15,56,220,208
-.Laesenclast6:
+.Laesenclast11:
 .byte  102,15,56,221,209
        movups  16-112(%rcx),%xmm0
        movdqa  %xmm8,%xmm10
@@ -1522,17 +2825,17 @@ aesni_cbc_sha1_enc_shaext:
        pxor    %xmm4,%xmm6
 .byte  15,56,201,220
        cmpl    $11,%r11d
-       jb      .Laesenclast7
+       jb      .Laesenclast12
        movups  64(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  80(%rcx),%xmm1
 .byte  102,15,56,220,208
-       je      .Laesenclast7
+       je      .Laesenclast12
        movups  96(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  112(%rcx),%xmm1
 .byte  102,15,56,220,208
-.Laesenclast7:
+.Laesenclast12:
 .byte  102,15,56,221,209
        movups  16-112(%rcx),%xmm0
        movdqa  %xmm8,%xmm9
@@ -1588,17 +2891,17 @@ aesni_cbc_sha1_enc_shaext:
        pxor    %xmm5,%xmm3
 .byte  15,56,201,229
        cmpl    $11,%r11d
-       jb      .Laesenclast8
+       jb      .Laesenclast13
        movups  64(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  80(%rcx),%xmm1
 .byte  102,15,56,220,208
-       je      .Laesenclast8
+       je      .Laesenclast13
        movups  96(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  112(%rcx),%xmm1
 .byte  102,15,56,220,208
-.Laesenclast8:
+.Laesenclast13:
 .byte  102,15,56,221,209
        movups  16-112(%rcx),%xmm0
        movdqa  %xmm8,%xmm10
@@ -1652,17 +2955,17 @@ aesni_cbc_sha1_enc_shaext:
        movups  48(%rcx),%xmm1
 .byte  102,15,56,220,208
        cmpl    $11,%r11d
-       jb      .Laesenclast9
+       jb      .Laesenclast14
        movups  64(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  80(%rcx),%xmm1
 .byte  102,15,56,220,208
-       je      .Laesenclast9
+       je      .Laesenclast14
        movups  96(%rcx),%xmm0
 .byte  102,15,56,220,209
        movups  112(%rcx),%xmm1
 .byte  102,15,56,220,208
-.Laesenclast9:
+.Laesenclast14:
 .byte  102,15,56,221,209
        movups  16-112(%rcx),%xmm0
        decq    %rdx
index 26f0e10..fe1f932 100644 (file)
@@ -5,6 +5,25 @@
 .type  aesni_cbc_sha256_enc,@function
 .align 16
 aesni_cbc_sha256_enc:
+       leaq    OPENSSL_ia32cap_P(%rip),%r11
+       movl    $1,%eax
+       cmpq    $0,%rdi
+       je      .Lprobe
+       movl    0(%r11),%eax
+       movq    4(%r11),%r10
+       btq     $61,%r10
+       jc      aesni_cbc_sha256_enc_shaext
+       movq    %r10,%r11
+       shrq    $32,%r11
+
+       testl   $2048,%r10d
+       jnz     aesni_cbc_sha256_enc_xop
+       andl    $296,%r11d
+       cmpl    $296,%r11d
+       je      aesni_cbc_sha256_enc_avx2
+       andl    $268435456,%r10d
+       jnz     aesni_cbc_sha256_enc_avx
+       ud2
        xorl    %eax,%eax
        cmpq    $0,%rdi
        je      .Lprobe
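aesni_cbc_sha256_enc gets the matching front end: called with a NULL first argument it now returns 1 instead of 0, which appears to be the probe upstream OpenSSL uses to report that the stitched SHA-256 code is available; otherwise it dispatches on OPENSSL_ia32cap_P in the order SHA extensions, XOP, AVX2 (together with BMI1/BMI2), then plain AVX, with ud2 as the should-never-get-here trap, while the original probe-only body remains below as context.  A hedged C rendering of the dispatch order, where cap_w1/cap_w2 are OPENSSL_ia32cap_P words 1 and 2 and the bit names are the usual OpenSSL assignments rather than anything spelled out in the diff:

#include <stdint.h>

enum sha256_path { PATH_SHAEXT, PATH_XOP, PATH_AVX2, PATH_AVX, PATH_NONE };

/* Illustration only -- mirrors the branch order of the added assembly. */
static enum sha256_path pick_sha256_path(uint32_t cap_w1, uint32_t cap_w2)
{
	if (cap_w2 & (1u << 29))          /* btq $61 on the w1:w2 quadword    */
		return PATH_SHAEXT;
	if (cap_w1 & (1u << 11))          /* testl $2048,%r10d (XOP)          */
		return PATH_XOP;
	if ((cap_w2 & 296u) == 296u)      /* andl/cmpl $296: AVX2+BMI1+BMI2   */
		return PATH_AVX2;
	if (cap_w1 & (1u << 28))          /* andl $268435456,%r10d (AVX)      */
		return PATH_AVX;
	return PATH_NONE;                 /* the assembly reaches ud2 instead */
}

The large addition that follows (aesni_cbc_sha256_enc_xop onward) supplies the corresponding SIMD bodies, starting with the XOP variant.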
@@ -55,4 +74,4282 @@ K256:
 .long  0,0,0,0,   0,0,0,0
 .byte  65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
+.type  aesni_cbc_sha256_enc_xop,@function
+.align 64
+aesni_cbc_sha256_enc_xop:
+.Lxop_shortcut:
+       movq    8(%rsp),%r10
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       movq    %rsp,%r11
+       subq    $128,%rsp
+       andq    $-64,%rsp
+
+       shlq    $6,%rdx
+       subq    %rdi,%rsi
+       subq    %rdi,%r10
+       addq    %rdi,%rdx
+
+
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+
+       movq    %r8,64+32(%rsp)
+       movq    %r9,64+40(%rsp)
+       movq    %r10,64+48(%rsp)
+       movq    %r11,64+56(%rsp)
+.Lprologue_xop:
+       vzeroall
+
+       movq    %rdi,%r12
+       leaq    128(%rcx),%rdi
+       leaq    K256+544(%rip),%r13
+       movl    240-128(%rdi),%r14d
+       movq    %r9,%r15
+       movq    %r10,%rsi
+       vmovdqu (%r8),%xmm8
+       subq    $9,%r14
+
+       movl    0(%r15),%eax
+       movl    4(%r15),%ebx
+       movl    8(%r15),%ecx
+       movl    12(%r15),%edx
+       movl    16(%r15),%r8d
+       movl    20(%r15),%r9d
+       movl    24(%r15),%r10d
+       movl    28(%r15),%r11d
+
+       vmovdqa 0(%r13,%r14,8),%xmm14
+       vmovdqa 16(%r13,%r14,8),%xmm13
+       vmovdqa 32(%r13,%r14,8),%xmm12
+       vmovdqu 0-128(%rdi),%xmm10
+       jmp     .Lloop_xop
+.align 16
+.Lloop_xop:
+       vmovdqa K256+512(%rip),%xmm7
+       vmovdqu 0(%rsi,%r12,1),%xmm0
+       vmovdqu 16(%rsi,%r12,1),%xmm1
+       vmovdqu 32(%rsi,%r12,1),%xmm2
+       vmovdqu 48(%rsi,%r12,1),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0
+       leaq    K256(%rip),%rbp
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7
+       vmovdqa %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%esi
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%esi
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+       subq    $-32*4,%rbp
+       vmovdqu (%r12),%xmm9
+       movq    %r12,64+0(%rsp)
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+.byte  143,232,120,194,236,14
+       rorl    $9,%r14d
+       xorl    %r10d,%r12d
+       vpsrld  $3,%xmm4,%xmm4
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       andl    %r8d,%r12d
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+.byte  143,232,120,194,245,11
+       rorl    $11,%r14d
+       xorl    %r10d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+.byte  143,232,120,194,251,13
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       vpsrld  $10,%xmm3,%xmm6
+       rorl    $2,%r14d
+       addl    %esi,%r11d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+.byte  143,232,120,194,239,2
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %edx,%r13d
+       vpsrldq $8,%xmm7,%xmm7
+       addl    4(%rsp),%r10d
+       movl    %r11d,%esi
+       rorl    $11,%r14d
+       xorl    %r9d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       xorl    %eax,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+.byte  143,232,120,194,248,13
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       vpsrld  $10,%xmm0,%xmm6
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+.byte  143,232,120,194,239,2
+       rorl    $2,%r14d
+       addl    %r15d,%r10d
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm5,%xmm7,%xmm7
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r12d
+       vpslldq $8,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r12d
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       rorl    $2,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%esi
+       rorl    $11,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       rorl    $2,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+.byte  143,232,120,194,236,14
+       rorl    $9,%r14d
+       xorl    %ecx,%r12d
+       vpsrld  $3,%xmm4,%xmm4
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       andl    %eax,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+.byte  143,232,120,194,245,11
+       rorl    $11,%r14d
+       xorl    %ecx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+.byte  143,232,120,194,248,13
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       vpsrld  $10,%xmm0,%xmm6
+       rorl    $2,%r14d
+       addl    %esi,%edx
+       vpaddd  %xmm4,%xmm1,%xmm1
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+.byte  143,232,120,194,239,2
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       vpsrldq $8,%xmm7,%xmm7
+       addl    20(%rsp),%ecx
+       movl    %edx,%esi
+       rorl    $11,%r14d
+       xorl    %ebx,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       xorl    %r8d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+.byte  143,232,120,194,249,13
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       vpsrld  $10,%xmm1,%xmm6
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+.byte  143,232,120,194,239,2
+       rorl    $2,%r14d
+       addl    %r15d,%ecx
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm5,%xmm7,%xmm7
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r12d
+       vpslldq $8,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r12d
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       rorl    $2,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%esi
+       rorl    $11,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       rorl    $2,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+.byte  143,232,120,194,236,14
+       rorl    $9,%r14d
+       xorl    %r10d,%r12d
+       vpsrld  $3,%xmm4,%xmm4
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       andl    %r8d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+.byte  143,232,120,194,245,11
+       rorl    $11,%r14d
+       xorl    %r10d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+.byte  143,232,120,194,249,13
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       vpsrld  $10,%xmm1,%xmm6
+       rorl    $2,%r14d
+       addl    %esi,%r11d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+.byte  143,232,120,194,239,2
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %edx,%r13d
+       vpsrldq $8,%xmm7,%xmm7
+       addl    36(%rsp),%r10d
+       movl    %r11d,%esi
+       rorl    $11,%r14d
+       xorl    %r9d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       xorl    %eax,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+.byte  143,232,120,194,250,13
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       vpsrld  $10,%xmm2,%xmm6
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+.byte  143,232,120,194,239,2
+       rorl    $2,%r14d
+       addl    %r15d,%r10d
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm5,%xmm7,%xmm7
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r12d
+       vpslldq $8,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r12d
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       rorl    $2,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%esi
+       rorl    $11,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       rorl    $2,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+.byte  143,232,120,194,236,14
+       rorl    $9,%r14d
+       xorl    %ecx,%r12d
+       vpsrld  $3,%xmm4,%xmm4
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       andl    %eax,%r12d
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+.byte  143,232,120,194,245,11
+       rorl    $11,%r14d
+       xorl    %ecx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+.byte  143,232,120,194,250,13
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       vpsrld  $10,%xmm2,%xmm6
+       rorl    $2,%r14d
+       addl    %esi,%edx
+       vpaddd  %xmm4,%xmm3,%xmm3
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+.byte  143,232,120,194,239,2
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       vpsrldq $8,%xmm7,%xmm7
+       addl    52(%rsp),%ecx
+       movl    %edx,%esi
+       rorl    $11,%r14d
+       xorl    %ebx,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       xorl    %r8d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+.byte  143,232,120,194,251,13
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       vpsrld  $10,%xmm3,%xmm6
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+.byte  143,232,120,194,239,2
+       rorl    $2,%r14d
+       addl    %r15d,%ecx
+       vpxor   %xmm6,%xmm7,%xmm7
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm5,%xmm7,%xmm7
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r12d
+       vpslldq $8,%xmm7,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r12d
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       rorl    $2,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%esi
+       rorl    $11,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       rorl    $2,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       movq    64+0(%rsp),%r12
+       vpand   %xmm14,%xmm11,%xmm11
+       movq    64+8(%rsp),%r15
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r15,%r12,1)
+       leaq    16(%r12),%r12
+       cmpb    $0,131(%rbp)
+       jne     .Lxop_00_47
+       vmovdqu (%r12),%xmm9
+       movq    %r12,64+0(%rsp)
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       rorl    $2,%r14d
+       addl    %esi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%esi
+       rorl    $11,%r14d
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       rorl    $2,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r12d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       rorl    $2,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%esi
+       rorl    $11,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       rorl    $2,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       rorl    $2,%r14d
+       addl    %esi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%esi
+       rorl    $11,%r14d
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       rorl    $2,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r12d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       rorl    $2,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%esi
+       rorl    $11,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       rorl    $2,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       rorl    $2,%r14d
+       addl    %esi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%esi
+       rorl    $11,%r14d
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       rorl    $2,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r12d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       rorl    $2,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%esi
+       rorl    $11,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       rorl    $2,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       rorl    $2,%r14d
+       addl    %esi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%esi
+       rorl    $11,%r14d
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       rorl    $2,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r12d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       rorl    $2,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%esi
+       rorl    $11,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       rorl    $6,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       rorl    $2,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%r12
+       movq    64+8(%rsp),%r13
+       movq    64+40(%rsp),%r15
+       movq    64+48(%rsp),%rsi
+
+       vpand   %xmm14,%xmm11,%xmm11
+       movl    %r14d,%eax
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r12,%r13,1)
+       leaq    16(%r12),%r12
+
+       addl    0(%r15),%eax
+       addl    4(%r15),%ebx
+       addl    8(%r15),%ecx
+       addl    12(%r15),%edx
+       addl    16(%r15),%r8d
+       addl    20(%r15),%r9d
+       addl    24(%r15),%r10d
+       addl    28(%r15),%r11d
+
+       cmpq    64+16(%rsp),%r12
+
+       movl    %eax,0(%r15)
+       movl    %ebx,4(%r15)
+       movl    %ecx,8(%r15)
+       movl    %edx,12(%r15)
+       movl    %r8d,16(%r15)
+       movl    %r9d,20(%r15)
+       movl    %r10d,24(%r15)
+       movl    %r11d,28(%r15)
+
+       jb      .Lloop_xop
+
+       movq    64+32(%rsp),%r8
+       movq    64+56(%rsp),%rsi
+       vmovdqu %xmm8,(%r8)
+       vzeroall
+       movq    (%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+.Lepilogue_xop:
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
+.type  aesni_cbc_sha256_enc_avx,@function
+.align 64
+aesni_cbc_sha256_enc_avx:
+.Lavx_shortcut:
+       movq    8(%rsp),%r10
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       movq    %rsp,%r11
+       subq    $128,%rsp
+       andq    $-64,%rsp
+
+       shlq    $6,%rdx
+       subq    %rdi,%rsi
+       subq    %rdi,%r10
+       addq    %rdi,%rdx
+
+
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+
+       movq    %r8,64+32(%rsp)
+       movq    %r9,64+40(%rsp)
+       movq    %r10,64+48(%rsp)
+       movq    %r11,64+56(%rsp)
+.Lprologue_avx:
+       vzeroall
+
+       movq    %rdi,%r12
+       leaq    128(%rcx),%rdi
+       leaq    K256+544(%rip),%r13
+       movl    240-128(%rdi),%r14d
+       movq    %r9,%r15
+       movq    %r10,%rsi
+       vmovdqu (%r8),%xmm8
+       subq    $9,%r14
+
+       movl    0(%r15),%eax
+       movl    4(%r15),%ebx
+       movl    8(%r15),%ecx
+       movl    12(%r15),%edx
+       movl    16(%r15),%r8d
+       movl    20(%r15),%r9d
+       movl    24(%r15),%r10d
+       movl    28(%r15),%r11d
+
+       vmovdqa 0(%r13,%r14,8),%xmm14
+       vmovdqa 16(%r13,%r14,8),%xmm13
+       vmovdqa 32(%r13,%r14,8),%xmm12
+       vmovdqu 0-128(%rdi),%xmm10
+       jmp     .Lloop_avx
+.align 16
+.Lloop_avx:
+       vmovdqa K256+512(%rip),%xmm7
+       vmovdqu 0(%rsi,%r12,1),%xmm0
+       vmovdqu 16(%rsi,%r12,1),%xmm1
+       vmovdqu 32(%rsi,%r12,1),%xmm2
+       vmovdqu 48(%rsi,%r12,1),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0
+       leaq    K256(%rip),%rbp
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7
+       vmovdqa %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%esi
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%esi
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+       subq    $-32*4,%rbp
+       vmovdqu (%r12),%xmm9
+       movq    %r12,64+0(%rsp)
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       xorl    %r8d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       vpshufd $250,%xmm3,%xmm7
+       addl    %r11d,%edx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       vpslld  $11,%xmm5,%xmm5
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    4(%rsp),%r10d
+       movl    %r11d,%esi
+       shrdl   $11,%r14d,%r14d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       shrdl   $9,%r14d,%r14d
+       vpshufd $132,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpsrldq $8,%xmm6,%xmm6
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       movl    %r10d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%r12d
+       vpshufd $80,%xmm0,%xmm7
+       xorl    %r11d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r9d
+       vpsrld  $10,%xmm7,%xmm6
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpsrlq  $17,%xmm7,%xmm7
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpsrlq  $2,%xmm7,%xmm7
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %ebx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r12d
+       vpshufd $232,%xmm6,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpslldq $8,%xmm6,%xmm6
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%esi
+       vpaddd  %xmm6,%xmm0,%xmm0
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       xorl    %eax,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       vpshufd $250,%xmm0,%xmm7
+       addl    %edx,%r11d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       vpslld  $11,%xmm5,%xmm5
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    20(%rsp),%ecx
+       movl    %edx,%esi
+       shrdl   $11,%r14d,%r14d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm1,%xmm1
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       shrdl   $9,%r14d,%r14d
+       vpshufd $132,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpsrldq $8,%xmm6,%xmm6
+       andl    %r10d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       vpaddd  %xmm6,%xmm1,%xmm1
+       movl    %ecx,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%r12d
+       vpshufd $80,%xmm1,%xmm7
+       xorl    %edx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ebx
+       vpsrld  $10,%xmm7,%xmm6
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpsrlq  $17,%xmm7,%xmm7
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpsrlq  $2,%xmm7,%xmm7
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r9d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r12d
+       vpshufd $232,%xmm6,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpslldq $8,%xmm6,%xmm6
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%esi
+       vpaddd  %xmm6,%xmm1,%xmm1
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       xorl    %r8d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       vpshufd $250,%xmm1,%xmm7
+       addl    %r11d,%edx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       vpslld  $11,%xmm5,%xmm5
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    36(%rsp),%r10d
+       movl    %r11d,%esi
+       shrdl   $11,%r14d,%r14d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       shrdl   $9,%r14d,%r14d
+       vpshufd $132,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpsrldq $8,%xmm6,%xmm6
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       movl    %r10d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%r12d
+       vpshufd $80,%xmm2,%xmm7
+       xorl    %r11d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r9d
+       vpsrld  $10,%xmm7,%xmm6
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpsrlq  $17,%xmm7,%xmm7
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpsrlq  $2,%xmm7,%xmm7
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %ebx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r12d
+       vpshufd $232,%xmm6,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpslldq $8,%xmm6,%xmm6
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%esi
+       vpaddd  %xmm6,%xmm2,%xmm2
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       xorl    %eax,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       vpshufd $250,%xmm2,%xmm7
+       addl    %edx,%r11d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       vpslld  $11,%xmm5,%xmm5
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    52(%rsp),%ecx
+       movl    %edx,%esi
+       shrdl   $11,%r14d,%r14d
+       vpxor   %xmm5,%xmm4,%xmm4
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm3,%xmm3
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       shrdl   $9,%r14d,%r14d
+       vpshufd $132,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpsrldq $8,%xmm6,%xmm6
+       andl    %r10d,%r12d
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       vpaddd  %xmm6,%xmm3,%xmm3
+       movl    %ecx,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%r12d
+       vpshufd $80,%xmm3,%xmm7
+       xorl    %edx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ebx
+       vpsrld  $10,%xmm7,%xmm6
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpsrlq  $17,%xmm7,%xmm7
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpsrlq  $2,%xmm7,%xmm7
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r9d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r12d
+       vpshufd $232,%xmm6,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpslldq $8,%xmm6,%xmm6
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%esi
+       vpaddd  %xmm6,%xmm3,%xmm3
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       movq    64+0(%rsp),%r12
+       vpand   %xmm14,%xmm11,%xmm11
+       movq    64+8(%rsp),%r15
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r15,%r12,1)
+       leaq    16(%r12),%r12
+       cmpb    $0,131(%rbp)
+       jne     .Lavx_00_47
+       vmovdqu (%r12),%xmm9
+       movq    %r12,64+0(%rsp)
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%r12d
+       xorl    %r11d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%r12d
+       xorl    %edx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       xorl    %r8d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%r12d
+       xorl    %ebx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r11d
+       andl    %r15d,%esi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%esi
+       addl    %r11d,%edx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       xorl    %edx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r12d
+       xorl    %eax,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r10d
+       andl    %esi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       addl    %r10d,%ecx
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       xorl    %ecx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%r12d
+       xorl    %r11d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r9d
+       andl    %r15d,%esi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%esi
+       addl    %r9d,%ebx
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       xorl    %ebx,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r12d
+       xorl    %r10d,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%r8d
+       andl    %esi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       addl    %r8d,%eax
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       xorl    %eax,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%r12d
+       xorl    %r9d,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%edx
+       andl    %r15d,%esi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%esi
+       addl    %edx,%r11d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       xorl    %r11d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r12d
+       xorl    %r8d,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ecx
+       andl    %esi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       addl    %ecx,%r10d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       xorl    %r10d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%r12d
+       xorl    %edx,%r15d
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%ebx
+       andl    %r15d,%esi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%esi
+       addl    %ebx,%r9d
+       shrdl   $2,%r14d,%r14d
+       addl    %esi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       xorl    %r9d,%r13d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%esi
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r12d
+       xorl    %ecx,%esi
+       shrdl   $6,%r13d,%r13d
+       addl    %r12d,%eax
+       andl    %esi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       addl    %eax,%r8d
+       shrdl   $2,%r14d,%r14d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%r12
+       movq    64+8(%rsp),%r13
+       movq    64+40(%rsp),%r15
+       movq    64+48(%rsp),%rsi
+
+       vpand   %xmm14,%xmm11,%xmm11
+       movl    %r14d,%eax
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r12,%r13,1)
+       leaq    16(%r12),%r12
+
+       addl    0(%r15),%eax
+       addl    4(%r15),%ebx
+       addl    8(%r15),%ecx
+       addl    12(%r15),%edx
+       addl    16(%r15),%r8d
+       addl    20(%r15),%r9d
+       addl    24(%r15),%r10d
+       addl    28(%r15),%r11d
+
+       cmpq    64+16(%rsp),%r12
+
+       movl    %eax,0(%r15)
+       movl    %ebx,4(%r15)
+       movl    %ecx,8(%r15)
+       movl    %edx,12(%r15)
+       movl    %r8d,16(%r15)
+       movl    %r9d,20(%r15)
+       movl    %r10d,24(%r15)
+       movl    %r11d,28(%r15)
+       jb      .Lloop_avx
+
+       movq    64+32(%rsp),%r8
+       movq    64+56(%rsp),%rsi
+       vmovdqu %xmm8,(%r8)
+       vzeroall
+       movq    (%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+.Lepilogue_avx:
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
+.type  aesni_cbc_sha256_enc_avx2,@function
+.align 64
+aesni_cbc_sha256_enc_avx2:
+.Lavx2_shortcut:
+       movq    8(%rsp),%r10
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       movq    %rsp,%r11
+       subq    $576,%rsp
+       andq    $-1024,%rsp
+       addq    $448,%rsp
+
+       shlq    $6,%rdx
+       subq    %rdi,%rsi
+       subq    %rdi,%r10
+       addq    %rdi,%rdx
+
+
+
+       movq    %rdx,64+16(%rsp)
+
+       movq    %r8,64+32(%rsp)
+       movq    %r9,64+40(%rsp)
+       movq    %r10,64+48(%rsp)
+       movq    %r11,64+56(%rsp)
+.Lprologue_avx2:
+       vzeroall
+
+       movq    %rdi,%r13
+       vpinsrq $1,%rsi,%xmm15,%xmm15
+       leaq    128(%rcx),%rdi
+       leaq    K256+544(%rip),%r12
+       movl    240-128(%rdi),%r14d
+       movq    %r9,%r15
+       movq    %r10,%rsi
+       vmovdqu (%r8),%xmm8
+       leaq    -9(%r14),%r14
+
+       vmovdqa 0(%r12,%r14,8),%xmm14
+       vmovdqa 16(%r12,%r14,8),%xmm13
+       vmovdqa 32(%r12,%r14,8),%xmm12
+
+       subq    $-64,%r13
+       movl    0(%r15),%eax
+       leaq    (%rsi,%r13,1),%r12
+       movl    4(%r15),%ebx
+       cmpq    %rdx,%r13
+       movl    8(%r15),%ecx
+       cmoveq  %rsp,%r12
+       movl    12(%r15),%edx
+       movl    16(%r15),%r8d
+       movl    20(%r15),%r9d
+       movl    24(%r15),%r10d
+       movl    28(%r15),%r11d
+       vmovdqu 0-128(%rdi),%xmm10
+       jmp     .Loop_avx2
+.align 16
+.Loop_avx2:
+       vmovdqa K256+512(%rip),%ymm7
+       vmovdqu -64+0(%rsi,%r13,1),%xmm0
+       vmovdqu -64+16(%rsi,%r13,1),%xmm1
+       vmovdqu -64+32(%rsi,%r13,1),%xmm2
+       vmovdqu -64+48(%rsi,%r13,1),%xmm3
+
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm7,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm7,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+
+       leaq    K256(%rip),%rbp
+       vpshufb %ymm7,%ymm2,%ymm2
+       leaq    -64(%r13),%r13
+       vpaddd  0(%rbp),%ymm0,%ymm4
+       vpshufb %ymm7,%ymm3,%ymm3
+       vpaddd  32(%rbp),%ymm1,%ymm5
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       vpaddd  96(%rbp),%ymm3,%ymm7
+       vmovdqa %ymm4,0(%rsp)
+       xorl    %r14d,%r14d
+       vmovdqa %ymm5,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       movl    %ebx,%esi
+       vmovdqa %ymm6,0(%rsp)
+       xorl    %ecx,%esi
+       vmovdqa %ymm7,32(%rsp)
+       movl    %r9d,%r12d
+       subq    $-32*4,%rbp
+       jmp     .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+       vmovdqu (%r13),%xmm9
+       vpinsrq $0,%r13,%xmm15,%xmm15
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm0,%ymm1,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm2,%ymm3,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm0,%ymm0
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%esi
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       vpshufd $250,%ymm3,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %esi,%r15d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm0,%ymm0
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufd $132,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpsrldq $8,%ymm6,%ymm6
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpshufd $80,%ymm0,%ymm7
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       vpsrlq  $17,%ymm7,%ymm7
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpsrlq  $2,%ymm7,%ymm7
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpxor   %ymm7,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       vpshufd $232,%ymm6,%ymm6
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       vpslldq $8,%ymm6,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       vpaddd  %ymm6,%ymm0,%ymm0
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       vpaddd  0(%rbp),%ymm0,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm1,%ymm2,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm3,%ymm0,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm1,%ymm1
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       vpshufd $250,%ymm0,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm1,%ymm1
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufd $132,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpsrldq $8,%ymm6,%ymm6
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpshufd $80,%ymm1,%ymm7
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       vpsrlq  $17,%ymm7,%ymm7
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpsrlq  $2,%ymm7,%ymm7
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpxor   %ymm7,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       vpshufd $232,%ymm6,%ymm6
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       vpslldq $8,%ymm6,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       vpaddd  32(%rbp),%ymm1,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    -64(%rsp),%rsp
+       vpalignr        $4,%ymm2,%ymm3,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm0,%ymm1,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm2,%ymm2
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       vpshufd $250,%ymm1,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm2,%ymm2
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufd $132,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpsrldq $8,%ymm6,%ymm6
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpshufd $80,%ymm2,%ymm7
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       vpsrlq  $17,%ymm7,%ymm7
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpsrlq  $2,%ymm7,%ymm7
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpxor   %ymm7,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       vpshufd $232,%ymm6,%ymm6
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       vpslldq $8,%ymm6,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       vpaddd  %ymm6,%ymm2,%ymm2
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm3,%ymm0,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm1,%ymm2,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm3,%ymm3
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%esi
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       vpshufd $250,%ymm2,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm3,%ymm3
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufd $132,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpsrldq $8,%ymm6,%ymm6
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpshufd $80,%ymm3,%ymm7
+       andl    %r15d,%esi
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       vpsrld  $10,%ymm7,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       vpsrlq  $17,%ymm7,%ymm7
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpxor   %ymm7,%ymm6,%ymm6
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpsrlq  $2,%ymm7,%ymm7
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpxor   %ymm7,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       vpshufd $232,%ymm6,%ymm6
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       vpslldq $8,%ymm6,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       andl    %esi,%r15d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       vpaddd  96(%rbp),%ymm3,%ymm6
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       vmovq   %xmm15,%r13
+       vpextrq $1,%xmm15,%r15
+       vpand   %xmm14,%xmm11,%xmm11
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r15,%r13,1)
+       leaq    16(%r13),%r13
+       leaq    128(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jne     .Lavx2_00_47
+       vmovdqu (%r13),%xmm9
+       vpinsrq $0,%r13,%xmm15,%xmm15
+       addl    0+64(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%esi
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+64(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %esi,%r15d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+64(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+64(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+64(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       addl    36+64(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+64(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+64(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       addl    0(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%esi
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       addl    36(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%esi
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %esi,%r15d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vpextrq $1,%xmm15,%r12
+       vmovq   %xmm15,%r13
+       movq    552(%rsp),%r15
+       addl    %r14d,%eax
+       leaq    448(%rsp),%rbp
+
+       vpand   %xmm14,%xmm11,%xmm11
+       vpor    %xmm11,%xmm8,%xmm8
+       vmovdqu %xmm8,(%r12,%r13,1)
+       leaq    16(%r13),%r13
+
+       addl    0(%r15),%eax
+       addl    4(%r15),%ebx
+       addl    8(%r15),%ecx
+       addl    12(%r15),%edx
+       addl    16(%r15),%r8d
+       addl    20(%r15),%r9d
+       addl    24(%r15),%r10d
+       addl    28(%r15),%r11d
+
+       movl    %eax,0(%r15)
+       movl    %ebx,4(%r15)
+       movl    %ecx,8(%r15)
+       movl    %edx,12(%r15)
+       movl    %r8d,16(%r15)
+       movl    %r9d,20(%r15)
+       movl    %r10d,24(%r15)
+       movl    %r11d,28(%r15)
+
+       cmpq    80(%rbp),%r13
+       je      .Ldone_avx2
+
+       xorl    %r14d,%r14d
+       movl    %ebx,%esi
+       movl    %r9d,%r12d
+       xorl    %ecx,%esi
+       jmp     .Lower_avx2
+.align 16
+.Lower_avx2:
+       vmovdqu (%r13),%xmm9
+       vpinsrq $0,%r13,%xmm15,%xmm15
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%esi
+       vpxor   %xmm10,%xmm9,%xmm9
+       vmovdqu 16-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %esi,%r15d
+       vpxor   %xmm8,%xmm9,%xmm9
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 32-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 48-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 64-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 80-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 96-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 112-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       leaq    -64(%rbp),%rbp
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 128-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ebx,%esi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rsi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%esi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%esi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%esi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %esi,%r15d
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 144-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%esi
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 160-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r11d,%esi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rsi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%esi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%esi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%esi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 176-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%esi
+       vpand   %xmm12,%xmm11,%xmm8
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 192-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r9d,%esi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rsi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%esi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%esi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%esi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %esi,%r15d
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 208-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%esi
+       vpand   %xmm13,%xmm11,%xmm11
+       vaesenc %xmm10,%xmm9,%xmm9
+       vmovdqu 224-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %edx,%esi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rsi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%esi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %esi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%esi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%esi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %esi,%r15d
+       vpor    %xmm11,%xmm8,%xmm8
+       vaesenclast     %xmm10,%xmm9,%xmm11
+       vmovdqu 0-128(%rdi),%xmm10
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovq   %xmm15,%r13
+       vpextrq $1,%xmm15,%r15
+       vpand   %xmm14,%xmm11,%xmm11
+       vpor    %xmm11,%xmm8,%xmm8
+       leaq    -64(%rbp),%rbp
+       vmovdqu %xmm8,(%r15,%r13,1)
+       leaq    16(%r13),%r13
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    552(%rsp),%r15
+       leaq    64(%r13),%r13
+       movq    560(%rsp),%rsi
+       addl    %r14d,%eax
+       leaq    448(%rsp),%rsp
+
+       addl    0(%r15),%eax
+       addl    4(%r15),%ebx
+       addl    8(%r15),%ecx
+       addl    12(%r15),%edx
+       addl    16(%r15),%r8d
+       addl    20(%r15),%r9d
+       addl    24(%r15),%r10d
+       leaq    (%rsi,%r13,1),%r12
+       addl    28(%r15),%r11d
+
+       cmpq    64+16(%rsp),%r13
+
+       movl    %eax,0(%r15)
+       cmoveq  %rsp,%r12
+       movl    %ebx,4(%r15)
+       movl    %ecx,8(%r15)
+       movl    %edx,12(%r15)
+       movl    %r8d,16(%r15)
+       movl    %r9d,20(%r15)
+       movl    %r10d,24(%r15)
+       movl    %r11d,28(%r15)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.Ldone_avx2:
+       leaq    (%rbp),%rsp
+       movq    64+32(%rsp),%r8
+       movq    64+56(%rsp),%rsi
+       vmovdqu %xmm8,(%r8)
+       vzeroall
+       movq    (%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+.Lepilogue_avx2:
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
+.type  aesni_cbc_sha256_enc_shaext,@function
+.align 32
+aesni_cbc_sha256_enc_shaext:
+       movq    8(%rsp),%r10
+       leaq    K256+128(%rip),%rax
+       movdqu  (%r9),%xmm1
+       movdqu  16(%r9),%xmm2
+       movdqa  512-128(%rax),%xmm3
+
+       movl    240(%rcx),%r11d
+       subq    %rdi,%rsi
+       movups  (%rcx),%xmm15
+       movups  16(%rcx),%xmm4
+       leaq    112(%rcx),%rcx
+
+       pshufd  $0x1b,%xmm1,%xmm0
+       pshufd  $0xb1,%xmm1,%xmm1
+       pshufd  $0x1b,%xmm2,%xmm2
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,202,8
+       punpcklqdq      %xmm0,%xmm2
+
+       jmp     .Loop_shaext
+
+.align 16
+.Loop_shaext:
+       movdqu  (%r10),%xmm10
+       movdqu  16(%r10),%xmm11
+       movdqu  32(%r10),%xmm12
+.byte  102,68,15,56,0,211
+       movdqu  48(%r10),%xmm13
+
+       movdqa  0-128(%rax),%xmm0
+       paddd   %xmm10,%xmm0
+.byte  102,68,15,56,0,219
+       movdqa  %xmm2,%xmm9
+       movdqa  %xmm1,%xmm8
+       movups  0(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       xorps   %xmm14,%xmm6
+       movups  -80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movups  -64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,202
+
+       movdqa  32-128(%rax),%xmm0
+       paddd   %xmm11,%xmm0
+.byte  102,68,15,56,0,227
+       leaq    64(%r10),%r10
+       movups  -48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movups  -32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,202
+
+       movdqa  64-128(%rax),%xmm0
+       paddd   %xmm12,%xmm0
+.byte  102,68,15,56,0,235
+.byte  69,15,56,204,211
+       movups  -16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm13,%xmm3
+.byte  102,65,15,58,15,220,4
+       paddd   %xmm3,%xmm10
+       movups  0(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,202
+
+       movdqa  96-128(%rax),%xmm0
+       paddd   %xmm13,%xmm0
+.byte  69,15,56,205,213
+.byte  69,15,56,204,220
+       movups  16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movups  32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movdqa  %xmm10,%xmm3
+.byte  102,65,15,58,15,221,4
+       paddd   %xmm3,%xmm11
+.byte  15,56,203,202
+       movdqa  128-128(%rax),%xmm0
+       paddd   %xmm10,%xmm0
+.byte  69,15,56,205,218
+.byte  69,15,56,204,229
+       movups  48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm11,%xmm3
+.byte  102,65,15,58,15,218,4
+       paddd   %xmm3,%xmm12
+       cmpl    $11,%r11d
+       jb      .Laesenclast1
+       movups  64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       je      .Laesenclast1
+       movups  96(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  112(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.Laesenclast1:
+       aesenclast      %xmm5,%xmm6
+       movups  16-112(%rcx),%xmm4
+       nop
+.byte  15,56,203,202
+       movups  16(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm6,0(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm6
+       movups  -80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       movdqa  160-128(%rax),%xmm0
+       paddd   %xmm11,%xmm0
+.byte  69,15,56,205,227
+.byte  69,15,56,204,234
+       movups  -64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm12,%xmm3
+.byte  102,65,15,58,15,219,4
+       paddd   %xmm3,%xmm13
+       movups  -48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  192-128(%rax),%xmm0
+       paddd   %xmm12,%xmm0
+.byte  69,15,56,205,236
+.byte  69,15,56,204,211
+       movups  -32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm13,%xmm3
+.byte  102,65,15,58,15,220,4
+       paddd   %xmm3,%xmm10
+       movups  -16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  224-128(%rax),%xmm0
+       paddd   %xmm13,%xmm0
+.byte  69,15,56,205,213
+.byte  69,15,56,204,220
+       movups  0(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,65,15,58,15,221,4
+       paddd   %xmm3,%xmm11
+       movups  16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  256-128(%rax),%xmm0
+       paddd   %xmm10,%xmm0
+.byte  69,15,56,205,218
+.byte  69,15,56,204,229
+       movups  32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm11,%xmm3
+.byte  102,65,15,58,15,218,4
+       paddd   %xmm3,%xmm12
+       movups  48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       cmpl    $11,%r11d
+       jb      .Laesenclast2
+       movups  64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       je      .Laesenclast2
+       movups  96(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  112(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.Laesenclast2:
+       aesenclast      %xmm5,%xmm6
+       movups  16-112(%rcx),%xmm4
+       nop
+.byte  15,56,203,202
+       movups  32(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm6,16(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm6
+       movups  -80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       movdqa  288-128(%rax),%xmm0
+       paddd   %xmm11,%xmm0
+.byte  69,15,56,205,227
+.byte  69,15,56,204,234
+       movups  -64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm12,%xmm3
+.byte  102,65,15,58,15,219,4
+       paddd   %xmm3,%xmm13
+       movups  -48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  320-128(%rax),%xmm0
+       paddd   %xmm12,%xmm0
+.byte  69,15,56,205,236
+.byte  69,15,56,204,211
+       movups  -32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm13,%xmm3
+.byte  102,65,15,58,15,220,4
+       paddd   %xmm3,%xmm10
+       movups  -16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  352-128(%rax),%xmm0
+       paddd   %xmm13,%xmm0
+.byte  69,15,56,205,213
+.byte  69,15,56,204,220
+       movups  0(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,65,15,58,15,221,4
+       paddd   %xmm3,%xmm11
+       movups  16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  384-128(%rax),%xmm0
+       paddd   %xmm10,%xmm0
+.byte  69,15,56,205,218
+.byte  69,15,56,204,229
+       movups  32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm11,%xmm3
+.byte  102,65,15,58,15,218,4
+       paddd   %xmm3,%xmm12
+       movups  48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+       movdqa  416-128(%rax),%xmm0
+       paddd   %xmm11,%xmm0
+.byte  69,15,56,205,227
+.byte  69,15,56,204,234
+       cmpl    $11,%r11d
+       jb      .Laesenclast3
+       movups  64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       je      .Laesenclast3
+       movups  96(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  112(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.Laesenclast3:
+       aesenclast      %xmm5,%xmm6
+       movups  16-112(%rcx),%xmm4
+       nop
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm12,%xmm3
+.byte  102,65,15,58,15,219,4
+       paddd   %xmm3,%xmm13
+       movups  48(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm6,32(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm6
+       movups  -80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       movups  -64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,202
+
+       movdqa  448-128(%rax),%xmm0
+       paddd   %xmm12,%xmm0
+.byte  69,15,56,205,236
+       movdqa  %xmm7,%xmm3
+       movups  -48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movups  -32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,202
+
+       movdqa  480-128(%rax),%xmm0
+       paddd   %xmm13,%xmm0
+       movups  -16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       movups  0(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movups  16(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.byte  15,56,203,202
+
+       movups  32(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  48(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       cmpl    $11,%r11d
+       jb      .Laesenclast4
+       movups  64(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  80(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+       je      .Laesenclast4
+       movups  96(%rcx),%xmm4
+       aesenc  %xmm5,%xmm6
+       movups  112(%rcx),%xmm5
+       aesenc  %xmm4,%xmm6
+.Laesenclast4:
+       aesenclast      %xmm5,%xmm6
+       movups  16-112(%rcx),%xmm4
+       nop
+
+       paddd   %xmm9,%xmm2
+       paddd   %xmm8,%xmm1
+
+       decq    %rdx
+       movups  %xmm6,48(%rsi,%rdi,1)
+       leaq    64(%rdi),%rdi
+       jnz     .Loop_shaext
+
+       pshufd  $0xb1,%xmm2,%xmm2
+       pshufd  $0x1b,%xmm1,%xmm3
+       pshufd  $0xb1,%xmm1,%xmm1
+       punpckhqdq      %xmm2,%xmm1
+.byte  102,15,58,15,211,8
+
+       movups  %xmm6,(%r8)
+       movdqu  %xmm1,(%r9)
+       movdqu  %xmm2,16(%r9)
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
 .section .note.GNU-stack,"",%progbits
secure/lib/libcrypto/asm/ecp_nistz256-x86_64.s
index f5af284..23d3f3f 100644 (file)
@@ -332,6 +332,8 @@ ecp_nistz256_neg:
 .type  ecp_nistz256_to_mont,@function
 .align 32
 ecp_nistz256_to_mont:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
        leaq    .LRR(%rip),%rdx
        jmp     .Lmul_mont
 .size  ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
@@ -346,6 +348,8 @@ ecp_nistz256_to_mont:
 .type  ecp_nistz256_mul_mont,@function
 .align 32
 ecp_nistz256_mul_mont:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
 .Lmul_mont:
        pushq   %rbp
        pushq   %rbx
@@ -353,6 +357,8 @@ ecp_nistz256_mul_mont:
        pushq   %r13
        pushq   %r14
        pushq   %r15
+       cmpl    $0x80100,%ecx
+       je      .Lmul_montx
        movq    %rdx,%rbx
        movq    0(%rdx),%rax
        movq    0(%rsi),%r9
@@ -361,6 +367,19 @@ ecp_nistz256_mul_mont:
        movq    24(%rsi),%r12
 
        call    __ecp_nistz256_mul_montq
+       jmp     .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+       movq    %rdx,%rbx
+       movq    0(%rdx),%rdx
+       movq    0(%rsi),%r9
+       movq    8(%rsi),%r10
+       movq    16(%rsi),%r11
+       movq    24(%rsi),%r12
+       leaq    -128(%rsi),%rsi
+
+       call    __ecp_nistz256_mul_montx
 .Lmul_mont_done:
        popq    %r15
        popq    %r14
@@ -598,18 +617,33 @@ __ecp_nistz256_mul_montq:
 .type  ecp_nistz256_sqr_mont,@function
 .align 32
 ecp_nistz256_sqr_mont:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
+       cmpl    $0x80100,%ecx
+       je      .Lsqr_montx
        movq    0(%rsi),%rax
        movq    8(%rsi),%r14
        movq    16(%rsi),%r15
        movq    24(%rsi),%r8
 
        call    __ecp_nistz256_sqr_montq
+       jmp     .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+       movq    0(%rsi),%rdx
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r15
+       movq    24(%rsi),%r8
+       leaq    -128(%rsi),%rsi
+
+       call    __ecp_nistz256_sqr_montx
 .Lsqr_mont_done:
        popq    %r15
        popq    %r14
@@ -781,6 +815,304 @@ __ecp_nistz256_sqr_montq:
 
        .byte   0xf3,0xc3
 .size  __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type  __ecp_nistz256_mul_montx,@function
+.align 32
+__ecp_nistz256_mul_montx:
+
+
+       mulxq   %r9,%r8,%r9
+       mulxq   %r10,%rcx,%r10
+       movq    $32,%r14
+       xorq    %r13,%r13
+       mulxq   %r11,%rbp,%r11
+       movq    .Lpoly+24(%rip),%r15
+       adcq    %rcx,%r9
+       mulxq   %r12,%rcx,%r12
+       movq    %r8,%rdx
+       adcq    %rbp,%r10
+       shlxq   %r14,%r8,%rbp
+       adcq    %rcx,%r11
+       shrxq   %r14,%r8,%rcx
+       adcq    $0,%r12
+
+
+
+       addq    %rbp,%r9
+       adcq    %rcx,%r10
+
+       mulxq   %r15,%rcx,%rbp
+       movq    8(%rbx),%rdx
+       adcq    %rcx,%r11
+       adcq    %rbp,%r12
+       adcq    $0,%r13
+       xorq    %r8,%r8
+
+
+
+       mulxq   0+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r9
+       adoxq   %rbp,%r10
+
+       mulxq   8+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r10
+       adoxq   %rbp,%r11
+
+       mulxq   16+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r11
+       adoxq   %rbp,%r12
+
+       mulxq   24+128(%rsi),%rcx,%rbp
+       movq    %r9,%rdx
+       adcxq   %rcx,%r12
+       shlxq   %r14,%r9,%rcx
+       adoxq   %rbp,%r13
+       shrxq   %r14,%r9,%rbp
+
+       adcxq   %r8,%r13
+       adoxq   %r8,%r8
+       adcq    $0,%r8
+
+
+
+       addq    %rcx,%r10
+       adcq    %rbp,%r11
+
+       mulxq   %r15,%rcx,%rbp
+       movq    16(%rbx),%rdx
+       adcq    %rcx,%r12
+       adcq    %rbp,%r13
+       adcq    $0,%r8
+       xorq    %r9,%r9
+
+
+
+       mulxq   0+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r10
+       adoxq   %rbp,%r11
+
+       mulxq   8+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r11
+       adoxq   %rbp,%r12
+
+       mulxq   16+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r12
+       adoxq   %rbp,%r13
+
+       mulxq   24+128(%rsi),%rcx,%rbp
+       movq    %r10,%rdx
+       adcxq   %rcx,%r13
+       shlxq   %r14,%r10,%rcx
+       adoxq   %rbp,%r8
+       shrxq   %r14,%r10,%rbp
+
+       adcxq   %r9,%r8
+       adoxq   %r9,%r9
+       adcq    $0,%r9
+
+
+
+       addq    %rcx,%r11
+       adcq    %rbp,%r12
+
+       mulxq   %r15,%rcx,%rbp
+       movq    24(%rbx),%rdx
+       adcq    %rcx,%r13
+       adcq    %rbp,%r8
+       adcq    $0,%r9
+       xorq    %r10,%r10
+
+
+
+       mulxq   0+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r11
+       adoxq   %rbp,%r12
+
+       mulxq   8+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r12
+       adoxq   %rbp,%r13
+
+       mulxq   16+128(%rsi),%rcx,%rbp
+       adcxq   %rcx,%r13
+       adoxq   %rbp,%r8
+
+       mulxq   24+128(%rsi),%rcx,%rbp
+       movq    %r11,%rdx
+       adcxq   %rcx,%r8
+       shlxq   %r14,%r11,%rcx
+       adoxq   %rbp,%r9
+       shrxq   %r14,%r11,%rbp
+
+       adcxq   %r10,%r9
+       adoxq   %r10,%r10
+       adcq    $0,%r10
+
+
+
+       addq    %rcx,%r12
+       adcq    %rbp,%r13
+
+       mulxq   %r15,%rcx,%rbp
+       movq    %r12,%rbx
+       movq    .Lpoly+8(%rip),%r14
+       adcq    %rcx,%r8
+       movq    %r13,%rdx
+       adcq    %rbp,%r9
+       adcq    $0,%r10
+
+
+
+       xorl    %eax,%eax
+       movq    %r8,%rcx
+       sbbq    $-1,%r12
+       sbbq    %r14,%r13
+       sbbq    $0,%r8
+       movq    %r9,%rbp
+       sbbq    %r15,%r9
+       sbbq    $0,%r10
+
+       cmovcq  %rbx,%r12
+       cmovcq  %rdx,%r13
+       movq    %r12,0(%rdi)
+       cmovcq  %rcx,%r8
+       movq    %r13,8(%rdi)
+       cmovcq  %rbp,%r9
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type  __ecp_nistz256_sqr_montx,@function
+.align 32
+__ecp_nistz256_sqr_montx:
+       mulxq   %r14,%r9,%r10
+       mulxq   %r15,%rcx,%r11
+       xorl    %eax,%eax
+       adcq    %rcx,%r10
+       mulxq   %r8,%rbp,%r12
+       movq    %r14,%rdx
+       adcq    %rbp,%r11
+       adcq    $0,%r12
+       xorq    %r13,%r13
+
+
+       mulxq   %r15,%rcx,%rbp
+       adcxq   %rcx,%r11
+       adoxq   %rbp,%r12
+
+       mulxq   %r8,%rcx,%rbp
+       movq    %r15,%rdx
+       adcxq   %rcx,%r12
+       adoxq   %rbp,%r13
+       adcq    $0,%r13
+
+
+       mulxq   %r8,%rcx,%r14
+       movq    0+128(%rsi),%rdx
+       xorq    %r15,%r15
+       adcxq   %r9,%r9
+       adoxq   %rcx,%r13
+       adcxq   %r10,%r10
+       adoxq   %r15,%r14
+
+       mulxq   %rdx,%r8,%rbp
+       movq    8+128(%rsi),%rdx
+       adcxq   %r11,%r11
+       adoxq   %rbp,%r9
+       adcxq   %r12,%r12
+       mulxq   %rdx,%rcx,%rax
+       movq    16+128(%rsi),%rdx
+       adcxq   %r13,%r13
+       adoxq   %rcx,%r10
+       adcxq   %r14,%r14
+.byte  0x67
+       mulxq   %rdx,%rcx,%rbp
+       movq    24+128(%rsi),%rdx
+       adoxq   %rax,%r11
+       adcxq   %r15,%r15
+       adoxq   %rcx,%r12
+       movq    $32,%rsi
+       adoxq   %rbp,%r13
+.byte  0x67,0x67
+       mulxq   %rdx,%rcx,%rax
+       movq    %r8,%rdx
+       adoxq   %rcx,%r14
+       shlxq   %rsi,%r8,%rcx
+       adoxq   %rax,%r15
+       shrxq   %rsi,%r8,%rax
+       movq    .Lpoly+24(%rip),%rbp
+
+
+       addq    %rcx,%r9
+       adcq    %rax,%r10
+
+       mulxq   %rbp,%rcx,%r8
+       movq    %r9,%rdx
+       adcq    %rcx,%r11
+       shlxq   %rsi,%r9,%rcx
+       adcq    $0,%r8
+       shrxq   %rsi,%r9,%rax
+
+
+       addq    %rcx,%r10
+       adcq    %rax,%r11
+
+       mulxq   %rbp,%rcx,%r9
+       movq    %r10,%rdx
+       adcq    %rcx,%r8
+       shlxq   %rsi,%r10,%rcx
+       adcq    $0,%r9
+       shrxq   %rsi,%r10,%rax
+
+
+       addq    %rcx,%r11
+       adcq    %rax,%r8
+
+       mulxq   %rbp,%rcx,%r10
+       movq    %r11,%rdx
+       adcq    %rcx,%r9
+       shlxq   %rsi,%r11,%rcx
+       adcq    $0,%r10
+       shrxq   %rsi,%r11,%rax
+
+
+       addq    %rcx,%r8
+       adcq    %rax,%r9
+
+       mulxq   %rbp,%rcx,%r11
+       adcq    %rcx,%r10
+       adcq    $0,%r11
+
+       xorq    %rdx,%rdx
+       adcq    %r8,%r12
+       movq    .Lpoly+8(%rip),%rsi
+       adcq    %r9,%r13
+       movq    %r12,%r8
+       adcq    %r10,%r14
+       adcq    %r11,%r15
+       movq    %r13,%r9
+       adcq    $0,%rdx
+
+       xorl    %eax,%eax
+       sbbq    $-1,%r12
+       movq    %r14,%r10
+       sbbq    %rsi,%r13
+       sbbq    $0,%r14
+       movq    %r15,%r11
+       sbbq    %rbp,%r15
+       sbbq    $0,%rdx
+
+       cmovcq  %r8,%r12
+       cmovcq  %r9,%r13
+       movq    %r12,0(%rdi)
+       cmovcq  %r10,%r14
+       movq    %r13,8(%rdi)
+       cmovcq  %r11,%r15
+       movq    %r14,16(%rdi)
+       movq    %r15,24(%rdi)
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
 
 
 
@@ -883,6 +1215,9 @@ ecp_nistz256_from_mont:
 .type  ecp_nistz256_select_w5,@function
 .align 32
 ecp_nistz256_select_w5:
+       movl    OPENSSL_ia32cap_P+8(%rip),%eax
+       testl   $32,%eax
+       jnz     .Lavx2_select_w5
        movdqa  .LOne(%rip),%xmm0
        movd    %edx,%xmm1
 
@@ -942,6 +1277,9 @@ ecp_nistz256_select_w5:
 .type  ecp_nistz256_select_w7,@function
 .align 32
 ecp_nistz256_select_w7:
+       movl    OPENSSL_ia32cap_P+8(%rip),%eax
+       testl   $32,%eax
+       jnz     .Lavx2_select_w7
        movdqa  .LOne(%rip),%xmm8
        movd    %edx,%xmm1
 
@@ -983,11 +1321,141 @@ ecp_nistz256_select_w7:
        movdqu  %xmm5,48(%rdi)
        .byte   0xf3,0xc3
 .size  ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
+.type  ecp_nistz256_avx2_select_w5,@function
+.align 32
+ecp_nistz256_avx2_select_w5:
+.Lavx2_select_w5:
+       vzeroupper
+       vmovdqa .LTwo(%rip),%ymm0
+
+       vpxor   %ymm2,%ymm2,%ymm2
+       vpxor   %ymm3,%ymm3,%ymm3
+       vpxor   %ymm4,%ymm4,%ymm4
+
+       vmovdqa .LOne(%rip),%ymm5
+       vmovdqa .LTwo(%rip),%ymm10
+
+       vmovd   %edx,%xmm1
+       vpermd  %ymm1,%ymm2,%ymm1
+
+       movq    $8,%rax
+.Lselect_loop_avx2_w5:
+
+       vmovdqa 0(%rsi),%ymm6
+       vmovdqa 32(%rsi),%ymm7
+       vmovdqa 64(%rsi),%ymm8
+
+       vmovdqa 96(%rsi),%ymm11
+       vmovdqa 128(%rsi),%ymm12
+       vmovdqa 160(%rsi),%ymm13
+
+       vpcmpeqd        %ymm1,%ymm5,%ymm9
+       vpcmpeqd        %ymm1,%ymm10,%ymm14
+
+       vpaddd  %ymm0,%ymm5,%ymm5
+       vpaddd  %ymm0,%ymm10,%ymm10
+       leaq    192(%rsi),%rsi
+
+       vpand   %ymm9,%ymm6,%ymm6
+       vpand   %ymm9,%ymm7,%ymm7
+       vpand   %ymm9,%ymm8,%ymm8
+       vpand   %ymm14,%ymm11,%ymm11
+       vpand   %ymm14,%ymm12,%ymm12
+       vpand   %ymm14,%ymm13,%ymm13
+
+       vpxor   %ymm6,%ymm2,%ymm2
+       vpxor   %ymm7,%ymm3,%ymm3
+       vpxor   %ymm8,%ymm4,%ymm4
+       vpxor   %ymm11,%ymm2,%ymm2
+       vpxor   %ymm12,%ymm3,%ymm3
+       vpxor   %ymm13,%ymm4,%ymm4
+
+       decq    %rax
+       jnz     .Lselect_loop_avx2_w5
+
+       vmovdqu %ymm2,0(%rdi)
+       vmovdqu %ymm3,32(%rdi)
+       vmovdqu %ymm4,64(%rdi)
+       vzeroupper
+       .byte   0xf3,0xc3
+.size  ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
 .globl ecp_nistz256_avx2_select_w7
 .type  ecp_nistz256_avx2_select_w7,@function
 .align 32
 ecp_nistz256_avx2_select_w7:
-.byte  0x0f,0x0b
+.Lavx2_select_w7:
+       vzeroupper
+       vmovdqa .LThree(%rip),%ymm0
+
+       vpxor   %ymm2,%ymm2,%ymm2
+       vpxor   %ymm3,%ymm3,%ymm3
+
+       vmovdqa .LOne(%rip),%ymm4
+       vmovdqa .LTwo(%rip),%ymm8
+       vmovdqa .LThree(%rip),%ymm12
+
+       vmovd   %edx,%xmm1
+       vpermd  %ymm1,%ymm2,%ymm1
+
+
+       movq    $21,%rax
+.Lselect_loop_avx2_w7:
+
+       vmovdqa 0(%rsi),%ymm5
+       vmovdqa 32(%rsi),%ymm6
+
+       vmovdqa 64(%rsi),%ymm9
+       vmovdqa 96(%rsi),%ymm10
+
+       vmovdqa 128(%rsi),%ymm13
+       vmovdqa 160(%rsi),%ymm14
+
+       vpcmpeqd        %ymm1,%ymm4,%ymm7
+       vpcmpeqd        %ymm1,%ymm8,%ymm11
+       vpcmpeqd        %ymm1,%ymm12,%ymm15
+
+       vpaddd  %ymm0,%ymm4,%ymm4
+       vpaddd  %ymm0,%ymm8,%ymm8
+       vpaddd  %ymm0,%ymm12,%ymm12
+       leaq    192(%rsi),%rsi
+
+       vpand   %ymm7,%ymm5,%ymm5
+       vpand   %ymm7,%ymm6,%ymm6
+       vpand   %ymm11,%ymm9,%ymm9
+       vpand   %ymm11,%ymm10,%ymm10
+       vpand   %ymm15,%ymm13,%ymm13
+       vpand   %ymm15,%ymm14,%ymm14
+
+       vpxor   %ymm5,%ymm2,%ymm2
+       vpxor   %ymm6,%ymm3,%ymm3
+       vpxor   %ymm9,%ymm2,%ymm2
+       vpxor   %ymm10,%ymm3,%ymm3
+       vpxor   %ymm13,%ymm2,%ymm2
+       vpxor   %ymm14,%ymm3,%ymm3
+
+       decq    %rax
+       jnz     .Lselect_loop_avx2_w7
+
+
+       vmovdqa 0(%rsi),%ymm5
+       vmovdqa 32(%rsi),%ymm6
+
+       vpcmpeqd        %ymm1,%ymm4,%ymm7
+
+       vpand   %ymm7,%ymm5,%ymm5
+       vpand   %ymm7,%ymm6,%ymm6
+
+       vpxor   %ymm5,%ymm2,%ymm2
+       vpxor   %ymm6,%ymm3,%ymm3
+
+       vmovdqu %ymm2,0(%rdi)
+       vmovdqu %ymm3,32(%rdi)
+       vzeroupper
        .byte   0xf3,0xc3
 .size  ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
 .type  __ecp_nistz256_add_toq,@function
@@ -1113,6 +1581,10 @@ __ecp_nistz256_mul_by_2q:
 .type  ecp_nistz256_point_double,@function
 .align 32
 ecp_nistz256_point_double:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
+       cmpl    $0x80100,%ecx
+       je      .Lpoint_doublex
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
@@ -1315,6 +1787,10 @@ ecp_nistz256_point_double:
 .type  ecp_nistz256_point_add,@function
 .align 32
 ecp_nistz256_point_add:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
+       cmpl    $0x80100,%ecx
+       je      .Lpoint_addx
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
@@ -1712,6 +2188,10 @@ ecp_nistz256_point_add:
 .type  ecp_nistz256_point_add_affine,@function
 .align 32
 ecp_nistz256_point_add_affine:
+       movl    $0x80100,%ecx
+       andl    OPENSSL_ia32cap_P+8(%rip),%ecx
+       cmpl    $0x80100,%ecx
+       je      .Lpoint_add_affinex
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
@@ -2011,4 +2491,1033 @@ ecp_nistz256_point_add_affine:
        popq    %rbp
        .byte   0xf3,0xc3
 .size  ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type  __ecp_nistz256_add_tox,@function
+.align 32
+__ecp_nistz256_add_tox:
+       xorq    %r11,%r11
+       adcq    0(%rbx),%r12
+       adcq    8(%rbx),%r13
+       movq    %r12,%rax
+       adcq    16(%rbx),%r8
+       adcq    24(%rbx),%r9
+       movq    %r13,%rbp
+       adcq    $0,%r11
+
+       xorq    %r10,%r10
+       sbbq    $-1,%r12
+       movq    %r8,%rcx
+       sbbq    %r14,%r13
+       sbbq    $0,%r8
+       movq    %r9,%r10
+       sbbq    %r15,%r9
+
+       btq     $0,%r11
+       cmovncq %rax,%r12
+       cmovncq %rbp,%r13
+       movq    %r12,0(%rdi)
+       cmovncq %rcx,%r8
+       movq    %r13,8(%rdi)
+       cmovncq %r10,%r9
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type  __ecp_nistz256_sub_fromx,@function
+.align 32
+__ecp_nistz256_sub_fromx:
+       xorq    %r11,%r11
+       sbbq    0(%rbx),%r12
+       sbbq    8(%rbx),%r13
+       movq    %r12,%rax
+       sbbq    16(%rbx),%r8
+       sbbq    24(%rbx),%r9
+       movq    %r13,%rbp
+       sbbq    $0,%r11
+
+       xorq    %r10,%r10
+       adcq    $-1,%r12
+       movq    %r8,%rcx
+       adcq    %r14,%r13
+       adcq    $0,%r8
+       movq    %r9,%r10
+       adcq    %r15,%r9
+
+       btq     $0,%r11
+       cmovncq %rax,%r12
+       cmovncq %rbp,%r13
+       movq    %r12,0(%rdi)
+       cmovncq %rcx,%r8
+       movq    %r13,8(%rdi)
+       cmovncq %r10,%r9
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type  __ecp_nistz256_subx,@function
+.align 32
+__ecp_nistz256_subx:
+       xorq    %r11,%r11
+       sbbq    %r12,%rax
+       sbbq    %r13,%rbp
+       movq    %rax,%r12
+       sbbq    %r8,%rcx
+       sbbq    %r9,%r10
+       movq    %rbp,%r13
+       sbbq    $0,%r11
+
+       xorq    %r9,%r9
+       adcq    $-1,%rax
+       movq    %rcx,%r8
+       adcq    %r14,%rbp
+       adcq    $0,%rcx
+       movq    %r10,%r9
+       adcq    %r15,%r10
+
+       btq     $0,%r11
+       cmovcq  %rax,%r12
+       cmovcq  %rbp,%r13
+       cmovcq  %rcx,%r8
+       cmovcq  %r10,%r9
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type  __ecp_nistz256_mul_by_2x,@function
+.align 32
+__ecp_nistz256_mul_by_2x:
+       xorq    %r11,%r11
+       adcq    %r12,%r12
+       adcq    %r13,%r13
+       movq    %r12,%rax
+       adcq    %r8,%r8
+       adcq    %r9,%r9
+       movq    %r13,%rbp
+       adcq    $0,%r11
+
+       xorq    %r10,%r10
+       sbbq    $-1,%r12
+       movq    %r8,%rcx
+       sbbq    %r14,%r13
+       sbbq    $0,%r8
+       movq    %r9,%r10
+       sbbq    %r15,%r9
+
+       btq     $0,%r11
+       cmovncq %rax,%r12
+       cmovncq %rbp,%r13
+       movq    %r12,0(%rdi)
+       cmovncq %rcx,%r8
+       movq    %r13,8(%rdi)
+       cmovncq %r10,%r9
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+
+       .byte   0xf3,0xc3
+.size  __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type  ecp_nistz256_point_doublex,@function
+.align 32
+ecp_nistz256_point_doublex:
+.Lpoint_doublex:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       subq    $160+8,%rsp
+
+.Lpoint_double_shortcutx:
+       movdqu  0(%rsi),%xmm0
+       movq    %rsi,%rbx
+       movdqu  16(%rsi),%xmm1
+       movq    32+0(%rsi),%r12
+       movq    32+8(%rsi),%r13
+       movq    32+16(%rsi),%r8
+       movq    32+24(%rsi),%r9
+       movq    .Lpoly+8(%rip),%r14
+       movq    .Lpoly+24(%rip),%r15
+       movdqa  %xmm0,96(%rsp)
+       movdqa  %xmm1,96+16(%rsp)
+       leaq    32(%rdi),%r10
+       leaq    64(%rdi),%r11
+.byte  102,72,15,110,199
+.byte  102,73,15,110,202
+.byte  102,73,15,110,211
+
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_mul_by_2x
+
+       movq    64+0(%rsi),%rdx
+       movq    64+8(%rsi),%r14
+       movq    64+16(%rsi),%r15
+       movq    64+24(%rsi),%r8
+       leaq    64-128(%rsi),%rsi
+       leaq    64(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    0+0(%rsp),%rdx
+       movq    8+0(%rsp),%r14
+       leaq    -128+0(%rsp),%rsi
+       movq    16+0(%rsp),%r15
+       movq    24+0(%rsp),%r8
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    32(%rbx),%rdx
+       movq    64+0(%rbx),%r9
+       movq    64+8(%rbx),%r10
+       movq    64+16(%rbx),%r11
+       movq    64+24(%rbx),%r12
+       leaq    64-128(%rbx),%rsi
+       leaq    32(%rbx),%rbx
+.byte  102,72,15,126,215
+       call    __ecp_nistz256_mul_montx
+       call    __ecp_nistz256_mul_by_2x
+
+       movq    96+0(%rsp),%r12
+       movq    96+8(%rsp),%r13
+       leaq    64(%rsp),%rbx
+       movq    96+16(%rsp),%r8
+       movq    96+24(%rsp),%r9
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_add_tox
+
+       movq    96+0(%rsp),%r12
+       movq    96+8(%rsp),%r13
+       leaq    64(%rsp),%rbx
+       movq    96+16(%rsp),%r8
+       movq    96+24(%rsp),%r9
+       leaq    64(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       movq    0+0(%rsp),%rdx
+       movq    8+0(%rsp),%r14
+       leaq    -128+0(%rsp),%rsi
+       movq    16+0(%rsp),%r15
+       movq    24+0(%rsp),%r8
+.byte  102,72,15,126,207
+       call    __ecp_nistz256_sqr_montx
+       xorq    %r9,%r9
+       movq    %r12,%rax
+       addq    $-1,%r12
+       movq    %r13,%r10
+       adcq    %rsi,%r13
+       movq    %r14,%rcx
+       adcq    $0,%r14
+       movq    %r15,%r8
+       adcq    %rbp,%r15
+       adcq    $0,%r9
+       xorq    %rsi,%rsi
+       testq   $1,%rax
+
+       cmovzq  %rax,%r12
+       cmovzq  %r10,%r13
+       cmovzq  %rcx,%r14
+       cmovzq  %r8,%r15
+       cmovzq  %rsi,%r9
+
+       movq    %r13,%rax
+       shrq    $1,%r12
+       shlq    $63,%rax
+       movq    %r14,%r10
+       shrq    $1,%r13
+       orq     %rax,%r12
+       shlq    $63,%r10
+       movq    %r15,%rcx
+       shrq    $1,%r14
+       orq     %r10,%r13
+       shlq    $63,%rcx
+       movq    %r12,0(%rdi)
+       shrq    $1,%r15
+       movq    %r13,8(%rdi)
+       shlq    $63,%r9
+       orq     %rcx,%r14
+       orq     %r9,%r15
+       movq    %r14,16(%rdi)
+       movq    %r15,24(%rdi)
+       movq    64(%rsp),%rdx
+       leaq    64(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    128(%rsp),%rdi
+       call    __ecp_nistz256_mul_by_2x
+
+       leaq    32(%rsp),%rbx
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_add_tox
+
+       movq    96(%rsp),%rdx
+       leaq    96(%rsp),%rbx
+       movq    0+0(%rsp),%r9
+       movq    8+0(%rsp),%r10
+       leaq    -128+0(%rsp),%rsi
+       movq    16+0(%rsp),%r11
+       movq    24+0(%rsp),%r12
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    128(%rsp),%rdi
+       call    __ecp_nistz256_mul_by_2x
+
+       movq    0+32(%rsp),%rdx
+       movq    8+32(%rsp),%r14
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r15
+       movq    24+32(%rsp),%r8
+.byte  102,72,15,126,199
+       call    __ecp_nistz256_sqr_montx
+
+       leaq    128(%rsp),%rbx
+       movq    %r14,%r8
+       movq    %r15,%r9
+       movq    %rsi,%r14
+       movq    %rbp,%r15
+       call    __ecp_nistz256_sub_fromx
+
+       movq    0+0(%rsp),%rax
+       movq    0+8(%rsp),%rbp
+       movq    0+16(%rsp),%rcx
+       movq    0+24(%rsp),%r10
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_subx
+
+       movq    32(%rsp),%rdx
+       leaq    32(%rsp),%rbx
+       movq    %r12,%r14
+       xorl    %ecx,%ecx
+       movq    %r12,0+0(%rsp)
+       movq    %r13,%r10
+       movq    %r13,0+8(%rsp)
+       cmovzq  %r8,%r11
+       movq    %r8,0+16(%rsp)
+       leaq    0-128(%rsp),%rsi
+       cmovzq  %r9,%r12
+       movq    %r9,0+24(%rsp)
+       movq    %r14,%r9
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+.byte  102,72,15,126,203
+.byte  102,72,15,126,207
+       call    __ecp_nistz256_sub_fromx
+
+       addq    $160+8,%rsp
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbx
+       popq    %rbp
+       .byte   0xf3,0xc3
+.size  ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type  ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.Lpoint_addx:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       subq    $576+8,%rsp
+
+       movdqu  0(%rsi),%xmm0
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm3
+       movdqu  64(%rsi),%xmm4
+       movdqu  80(%rsi),%xmm5
+       movq    %rsi,%rbx
+       movq    %rdx,%rsi
+       movdqa  %xmm0,384(%rsp)
+       movdqa  %xmm1,384+16(%rsp)
+       por     %xmm0,%xmm1
+       movdqa  %xmm2,416(%rsp)
+       movdqa  %xmm3,416+16(%rsp)
+       por     %xmm2,%xmm3
+       movdqa  %xmm4,448(%rsp)
+       movdqa  %xmm5,448+16(%rsp)
+       por     %xmm1,%xmm3
+
+       movdqu  0(%rsi),%xmm0
+       pshufd  $0xb1,%xmm3,%xmm5
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+       por     %xmm3,%xmm5
+       movdqu  48(%rsi),%xmm3
+       movq    64+0(%rsi),%rdx
+       movq    64+8(%rsi),%r14
+       movq    64+16(%rsi),%r15
+       movq    64+24(%rsi),%r8
+       movdqa  %xmm0,480(%rsp)
+       pshufd  $0x1e,%xmm5,%xmm4
+       movdqa  %xmm1,480+16(%rsp)
+       por     %xmm0,%xmm1
+.byte  102,72,15,110,199
+       movdqa  %xmm2,512(%rsp)
+       movdqa  %xmm3,512+16(%rsp)
+       por     %xmm2,%xmm3
+       por     %xmm4,%xmm5
+       pxor    %xmm4,%xmm4
+       por     %xmm1,%xmm3
+
+       leaq    64-128(%rsi),%rsi
+       movq    %rdx,544+0(%rsp)
+       movq    %r14,544+8(%rsp)
+       movq    %r15,544+16(%rsp)
+       movq    %r8,544+24(%rsp)
+       leaq    96(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       pcmpeqd %xmm4,%xmm5
+       pshufd  $0xb1,%xmm3,%xmm4
+       por     %xmm3,%xmm4
+       pshufd  $0,%xmm5,%xmm5
+       pshufd  $0x1e,%xmm4,%xmm3
+       por     %xmm3,%xmm4
+       pxor    %xmm3,%xmm3
+       pcmpeqd %xmm3,%xmm4
+       pshufd  $0,%xmm4,%xmm4
+       movq    64+0(%rbx),%rdx
+       movq    64+8(%rbx),%r14
+       movq    64+16(%rbx),%r15
+       movq    64+24(%rbx),%r8
+.byte  102,72,15,110,203
+
+       leaq    64-128(%rbx),%rsi
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    544(%rsp),%rdx
+       leaq    544(%rsp),%rbx
+       movq    0+96(%rsp),%r9
+       movq    8+96(%rsp),%r10
+       leaq    -128+96(%rsp),%rsi
+       movq    16+96(%rsp),%r11
+       movq    24+96(%rsp),%r12
+       leaq    224(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    448(%rsp),%rdx
+       leaq    448(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    256(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    416(%rsp),%rdx
+       leaq    416(%rsp),%rbx
+       movq    0+224(%rsp),%r9
+       movq    8+224(%rsp),%r10
+       leaq    -128+224(%rsp),%rsi
+       movq    16+224(%rsp),%r11
+       movq    24+224(%rsp),%r12
+       leaq    224(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    512(%rsp),%rdx
+       leaq    512(%rsp),%rbx
+       movq    0+256(%rsp),%r9
+       movq    8+256(%rsp),%r10
+       leaq    -128+256(%rsp),%rsi
+       movq    16+256(%rsp),%r11
+       movq    24+256(%rsp),%r12
+       leaq    256(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    224(%rsp),%rbx
+       leaq    64(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       orq     %r13,%r12
+       movdqa  %xmm4,%xmm2
+       orq     %r8,%r12
+       orq     %r9,%r12
+       por     %xmm5,%xmm2
+.byte  102,73,15,110,220
+
+       movq    384(%rsp),%rdx
+       leaq    384(%rsp),%rbx
+       movq    0+96(%rsp),%r9
+       movq    8+96(%rsp),%r10
+       leaq    -128+96(%rsp),%rsi
+       movq    16+96(%rsp),%r11
+       movq    24+96(%rsp),%r12
+       leaq    160(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    480(%rsp),%rdx
+       leaq    480(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    192(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    160(%rsp),%rbx
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       orq     %r13,%r12
+       orq     %r8,%r12
+       orq     %r9,%r12
+
+.byte  0x3e
+       jnz     .Ladd_proceedx
+.byte  102,73,15,126,208
+.byte  102,73,15,126,217
+       testq   %r8,%r8
+       jnz     .Ladd_proceedx
+       testq   %r9,%r9
+       jz      .Ladd_doublex
+
+.byte  102,72,15,126,199
+       pxor    %xmm0,%xmm0
+       movdqu  %xmm0,0(%rdi)
+       movdqu  %xmm0,16(%rdi)
+       movdqu  %xmm0,32(%rdi)
+       movdqu  %xmm0,48(%rdi)
+       movdqu  %xmm0,64(%rdi)
+       movdqu  %xmm0,80(%rdi)
+       jmp     .Ladd_donex
+
+.align 32
+.Ladd_doublex:
+.byte  102,72,15,126,206
+.byte  102,72,15,126,199
+       addq    $416,%rsp
+       jmp     .Lpoint_double_shortcutx
+
+.align 32
+.Ladd_proceedx:
+       movq    0+64(%rsp),%rdx
+       movq    8+64(%rsp),%r14
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r15
+       movq    24+64(%rsp),%r8
+       leaq    96(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    448(%rsp),%rdx
+       leaq    448(%rsp),%rbx
+       movq    0+0(%rsp),%r9
+       movq    8+0(%rsp),%r10
+       leaq    -128+0(%rsp),%rsi
+       movq    16+0(%rsp),%r11
+       movq    24+0(%rsp),%r12
+       leaq    352(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    0+0(%rsp),%rdx
+       movq    8+0(%rsp),%r14
+       leaq    -128+0(%rsp),%rsi
+       movq    16+0(%rsp),%r15
+       movq    24+0(%rsp),%r8
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    544(%rsp),%rdx
+       leaq    544(%rsp),%rbx
+       movq    0+352(%rsp),%r9
+       movq    8+352(%rsp),%r10
+       leaq    -128+352(%rsp),%rsi
+       movq    16+352(%rsp),%r11
+       movq    24+352(%rsp),%r12
+       leaq    352(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    0(%rsp),%rdx
+       leaq    0(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    128(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    160(%rsp),%rdx
+       leaq    160(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    192(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+
+
+
+       addq    %r12,%r12
+       leaq    96(%rsp),%rsi
+       adcq    %r13,%r13
+       movq    %r12,%rax
+       adcq    %r8,%r8
+       adcq    %r9,%r9
+       movq    %r13,%rbp
+       sbbq    %r11,%r11
+
+       subq    $-1,%r12
+       movq    %r8,%rcx
+       sbbq    %r14,%r13
+       sbbq    $0,%r8
+       movq    %r9,%r10
+       sbbq    %r15,%r9
+       testq   %r11,%r11
+
+       cmovzq  %rax,%r12
+       movq    0(%rsi),%rax
+       cmovzq  %rbp,%r13
+       movq    8(%rsi),%rbp
+       cmovzq  %rcx,%r8
+       movq    16(%rsi),%rcx
+       cmovzq  %r10,%r9
+       movq    24(%rsi),%r10
+
+       call    __ecp_nistz256_subx
+
+       leaq    128(%rsp),%rbx
+       leaq    288(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       movq    192+0(%rsp),%rax
+       movq    192+8(%rsp),%rbp
+       movq    192+16(%rsp),%rcx
+       movq    192+24(%rsp),%r10
+       leaq    320(%rsp),%rdi
+
+       call    __ecp_nistz256_subx
+
+       movq    %r12,0(%rdi)
+       movq    %r13,8(%rdi)
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+       movq    128(%rsp),%rdx
+       leaq    128(%rsp),%rbx
+       movq    0+224(%rsp),%r9
+       movq    8+224(%rsp),%r10
+       leaq    -128+224(%rsp),%rsi
+       movq    16+224(%rsp),%r11
+       movq    24+224(%rsp),%r12
+       leaq    256(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    320(%rsp),%rdx
+       leaq    320(%rsp),%rbx
+       movq    0+64(%rsp),%r9
+       movq    8+64(%rsp),%r10
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r11
+       movq    24+64(%rsp),%r12
+       leaq    320(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    256(%rsp),%rbx
+       leaq    320(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+.byte  102,72,15,126,199
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   352(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   352+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    544(%rsp),%xmm2
+       pand    544+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    448(%rsp),%xmm2
+       pand    448+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,64(%rdi)
+       movdqu  %xmm3,80(%rdi)
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   288(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   288+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    480(%rsp),%xmm2
+       pand    480+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    384(%rsp),%xmm2
+       pand    384+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,0(%rdi)
+       movdqu  %xmm3,16(%rdi)
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   320(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   320+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    512(%rsp),%xmm2
+       pand    512+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    416(%rsp),%xmm2
+       pand    416+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm3,48(%rdi)
+
+.Ladd_donex:
+       addq    $576+8,%rsp
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbx
+       popq    %rbp
+       .byte   0xf3,0xc3
+.size  ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type  ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex:
+.Lpoint_add_affinex:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       subq    $480+8,%rsp
+
+       movdqu  0(%rsi),%xmm0
+       movq    %rdx,%rbx
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm3
+       movdqu  64(%rsi),%xmm4
+       movdqu  80(%rsi),%xmm5
+       movq    64+0(%rsi),%rdx
+       movq    64+8(%rsi),%r14
+       movq    64+16(%rsi),%r15
+       movq    64+24(%rsi),%r8
+       movdqa  %xmm0,320(%rsp)
+       movdqa  %xmm1,320+16(%rsp)
+       por     %xmm0,%xmm1
+       movdqa  %xmm2,352(%rsp)
+       movdqa  %xmm3,352+16(%rsp)
+       por     %xmm2,%xmm3
+       movdqa  %xmm4,384(%rsp)
+       movdqa  %xmm5,384+16(%rsp)
+       por     %xmm1,%xmm3
+
+       movdqu  0(%rbx),%xmm0
+       pshufd  $0xb1,%xmm3,%xmm5
+       movdqu  16(%rbx),%xmm1
+       movdqu  32(%rbx),%xmm2
+       por     %xmm3,%xmm5
+       movdqu  48(%rbx),%xmm3
+       movdqa  %xmm0,416(%rsp)
+       pshufd  $0x1e,%xmm5,%xmm4
+       movdqa  %xmm1,416+16(%rsp)
+       por     %xmm0,%xmm1
+.byte  102,72,15,110,199
+       movdqa  %xmm2,448(%rsp)
+       movdqa  %xmm3,448+16(%rsp)
+       por     %xmm2,%xmm3
+       por     %xmm4,%xmm5
+       pxor    %xmm4,%xmm4
+       por     %xmm1,%xmm3
+
+       leaq    64-128(%rsi),%rsi
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       pcmpeqd %xmm4,%xmm5
+       pshufd  $0xb1,%xmm3,%xmm4
+       movq    0(%rbx),%rdx
+
+       movq    %r12,%r9
+       por     %xmm3,%xmm4
+       pshufd  $0,%xmm5,%xmm5
+       pshufd  $0x1e,%xmm4,%xmm3
+       movq    %r13,%r10
+       por     %xmm3,%xmm4
+       pxor    %xmm3,%xmm3
+       movq    %r14,%r11
+       pcmpeqd %xmm3,%xmm4
+       pshufd  $0,%xmm4,%xmm4
+
+       leaq    32-128(%rsp),%rsi
+       movq    %r15,%r12
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    320(%rsp),%rbx
+       leaq    64(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       movq    384(%rsp),%rdx
+       leaq    384(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    384(%rsp),%rdx
+       leaq    384(%rsp),%rbx
+       movq    0+64(%rsp),%r9
+       movq    8+64(%rsp),%r10
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r11
+       movq    24+64(%rsp),%r12
+       leaq    288(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    448(%rsp),%rdx
+       leaq    448(%rsp),%rbx
+       movq    0+32(%rsp),%r9
+       movq    8+32(%rsp),%r10
+       leaq    -128+32(%rsp),%rsi
+       movq    16+32(%rsp),%r11
+       movq    24+32(%rsp),%r12
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    352(%rsp),%rbx
+       leaq    96(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       movq    0+64(%rsp),%rdx
+       movq    8+64(%rsp),%r14
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r15
+       movq    24+64(%rsp),%r8
+       leaq    128(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    0+96(%rsp),%rdx
+       movq    8+96(%rsp),%r14
+       leaq    -128+96(%rsp),%rsi
+       movq    16+96(%rsp),%r15
+       movq    24+96(%rsp),%r8
+       leaq    192(%rsp),%rdi
+       call    __ecp_nistz256_sqr_montx
+
+       movq    128(%rsp),%rdx
+       leaq    128(%rsp),%rbx
+       movq    0+64(%rsp),%r9
+       movq    8+64(%rsp),%r10
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r11
+       movq    24+64(%rsp),%r12
+       leaq    160(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    320(%rsp),%rdx
+       leaq    320(%rsp),%rbx
+       movq    0+128(%rsp),%r9
+       movq    8+128(%rsp),%r10
+       leaq    -128+128(%rsp),%rsi
+       movq    16+128(%rsp),%r11
+       movq    24+128(%rsp),%r12
+       leaq    0(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+
+
+
+       addq    %r12,%r12
+       leaq    192(%rsp),%rsi
+       adcq    %r13,%r13
+       movq    %r12,%rax
+       adcq    %r8,%r8
+       adcq    %r9,%r9
+       movq    %r13,%rbp
+       sbbq    %r11,%r11
+
+       subq    $-1,%r12
+       movq    %r8,%rcx
+       sbbq    %r14,%r13
+       sbbq    $0,%r8
+       movq    %r9,%r10
+       sbbq    %r15,%r9
+       testq   %r11,%r11
+
+       cmovzq  %rax,%r12
+       movq    0(%rsi),%rax
+       cmovzq  %rbp,%r13
+       movq    8(%rsi),%rbp
+       cmovzq  %rcx,%r8
+       movq    16(%rsi),%rcx
+       cmovzq  %r10,%r9
+       movq    24(%rsi),%r10
+
+       call    __ecp_nistz256_subx
+
+       leaq    160(%rsp),%rbx
+       leaq    224(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+       movq    0+0(%rsp),%rax
+       movq    0+8(%rsp),%rbp
+       movq    0+16(%rsp),%rcx
+       movq    0+24(%rsp),%r10
+       leaq    64(%rsp),%rdi
+
+       call    __ecp_nistz256_subx
+
+       movq    %r12,0(%rdi)
+       movq    %r13,8(%rdi)
+       movq    %r8,16(%rdi)
+       movq    %r9,24(%rdi)
+       movq    352(%rsp),%rdx
+       leaq    352(%rsp),%rbx
+       movq    0+160(%rsp),%r9
+       movq    8+160(%rsp),%r10
+       leaq    -128+160(%rsp),%rsi
+       movq    16+160(%rsp),%r11
+       movq    24+160(%rsp),%r12
+       leaq    32(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       movq    96(%rsp),%rdx
+       leaq    96(%rsp),%rbx
+       movq    0+64(%rsp),%r9
+       movq    8+64(%rsp),%r10
+       leaq    -128+64(%rsp),%rsi
+       movq    16+64(%rsp),%r11
+       movq    24+64(%rsp),%r12
+       leaq    64(%rsp),%rdi
+       call    __ecp_nistz256_mul_montx
+
+       leaq    32(%rsp),%rbx
+       leaq    256(%rsp),%rdi
+       call    __ecp_nistz256_sub_fromx
+
+.byte  102,72,15,126,199
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   288(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   288+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    .LONE_mont(%rip),%xmm2
+       pand    .LONE_mont+16(%rip),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    384(%rsp),%xmm2
+       pand    384+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,64(%rdi)
+       movdqu  %xmm3,80(%rdi)
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   224(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   224+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    416(%rsp),%xmm2
+       pand    416+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    320(%rsp),%xmm2
+       pand    320+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,0(%rdi)
+       movdqu  %xmm3,16(%rdi)
+
+       movdqa  %xmm5,%xmm0
+       movdqa  %xmm5,%xmm1
+       pandn   256(%rsp),%xmm0
+       movdqa  %xmm5,%xmm2
+       pandn   256+16(%rsp),%xmm1
+       movdqa  %xmm5,%xmm3
+       pand    448(%rsp),%xmm2
+       pand    448+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+
+       movdqa  %xmm4,%xmm0
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm2,%xmm0
+       movdqa  %xmm4,%xmm2
+       pandn   %xmm3,%xmm1
+       movdqa  %xmm4,%xmm3
+       pand    352(%rsp),%xmm2
+       pand    352+16(%rsp),%xmm3
+       por     %xmm0,%xmm2
+       por     %xmm1,%xmm3
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm3,48(%rdi)
+
+       addq    $480+8,%rsp
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbx
+       popq    %rbp
+       .byte   0xf3,0xc3
+.size  ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
 .section .note.GNU-stack,"",%progbits
index 9b552d1..0ae4ff7 100644 (file)
@@ -1249,7 +1249,108 @@ gcm_ghash_clmul:
 .type  gcm_init_avx,@function
 .align 32
 gcm_init_avx:
-       jmp     .L_init_clmul
+       vzeroupper
+
+       vmovdqu (%rsi),%xmm2
+       vpshufd $78,%xmm2,%xmm2
+
+
+       vpshufd $255,%xmm2,%xmm4
+       vpsrlq  $63,%xmm2,%xmm3
+       vpsllq  $1,%xmm2,%xmm2
+       vpxor   %xmm5,%xmm5,%xmm5
+       vpcmpgtd        %xmm4,%xmm5,%xmm5
+       vpslldq $8,%xmm3,%xmm3
+       vpor    %xmm3,%xmm2,%xmm2
+
+
+       vpand   .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+       vpxor   %xmm5,%xmm2,%xmm2
+
+       vpunpckhqdq     %xmm2,%xmm2,%xmm6
+       vmovdqa %xmm2,%xmm0
+       vpxor   %xmm2,%xmm6,%xmm6
+       movq    $4,%r10
+       jmp     .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+       vpalignr        $8,%xmm3,%xmm4,%xmm5
+       vmovdqu %xmm5,-16(%rdi)
+       vpunpckhqdq     %xmm0,%xmm0,%xmm3
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x11,%xmm2,%xmm0,%xmm1
+       vpclmulqdq      $0x00,%xmm2,%xmm0,%xmm0
+       vpclmulqdq      $0x00,%xmm6,%xmm3,%xmm3
+       vpxor   %xmm0,%xmm1,%xmm4
+       vpxor   %xmm4,%xmm3,%xmm3
+
+       vpslldq $8,%xmm3,%xmm4
+       vpsrldq $8,%xmm3,%xmm3
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpxor   %xmm3,%xmm1,%xmm1
+       vpsllq  $57,%xmm0,%xmm3
+       vpsllq  $62,%xmm0,%xmm4
+       vpxor   %xmm3,%xmm4,%xmm4
+       vpsllq  $63,%xmm0,%xmm3
+       vpxor   %xmm3,%xmm4,%xmm4
+       vpslldq $8,%xmm4,%xmm3
+       vpsrldq $8,%xmm4,%xmm4
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpxor   %xmm4,%xmm1,%xmm1
+
+       vpsrlq  $1,%xmm0,%xmm4
+       vpxor   %xmm0,%xmm1,%xmm1
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpsrlq  $5,%xmm4,%xmm4
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpsrlq  $1,%xmm0,%xmm0
+       vpxor   %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+       vmovdqa %xmm0,%xmm5
+       vpunpckhqdq     %xmm0,%xmm0,%xmm3
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x11,%xmm2,%xmm0,%xmm1
+       vpclmulqdq      $0x00,%xmm2,%xmm0,%xmm0
+       vpclmulqdq      $0x00,%xmm6,%xmm3,%xmm3
+       vpxor   %xmm0,%xmm1,%xmm4
+       vpxor   %xmm4,%xmm3,%xmm3
+
+       vpslldq $8,%xmm3,%xmm4
+       vpsrldq $8,%xmm3,%xmm3
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpxor   %xmm3,%xmm1,%xmm1
+       vpsllq  $57,%xmm0,%xmm3
+       vpsllq  $62,%xmm0,%xmm4
+       vpxor   %xmm3,%xmm4,%xmm4
+       vpsllq  $63,%xmm0,%xmm3
+       vpxor   %xmm3,%xmm4,%xmm4
+       vpslldq $8,%xmm4,%xmm3
+       vpsrldq $8,%xmm4,%xmm4
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpxor   %xmm4,%xmm1,%xmm1
+
+       vpsrlq  $1,%xmm0,%xmm4
+       vpxor   %xmm0,%xmm1,%xmm1
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpsrlq  $5,%xmm4,%xmm4
+       vpxor   %xmm4,%xmm0,%xmm0
+       vpsrlq  $1,%xmm0,%xmm0
+       vpxor   %xmm1,%xmm0,%xmm0
+       vpshufd $78,%xmm5,%xmm3
+       vpshufd $78,%xmm0,%xmm4
+       vpxor   %xmm5,%xmm3,%xmm3
+       vmovdqu %xmm5,0(%rdi)
+       vpxor   %xmm0,%xmm4,%xmm4
+       vmovdqu %xmm0,16(%rdi)
+       leaq    48(%rdi),%rdi
+       subq    $1,%r10
+       jnz     .Linit_loop_avx
+
+       vpalignr        $8,%xmm4,%xmm3,%xmm5
+       vmovdqu %xmm5,-16(%rdi)
+
+       vzeroupper
+       .byte   0xf3,0xc3
 .size  gcm_init_avx,.-gcm_init_avx
 .globl gcm_gmult_avx
 .type  gcm_gmult_avx,@function
@@ -1261,7 +1362,377 @@ gcm_gmult_avx:
 .type  gcm_ghash_avx,@function
 .align 32
 gcm_ghash_avx:
-       jmp     .L_ghash_clmul
+       vzeroupper
+
+       vmovdqu (%rdi),%xmm10
+       leaq    .L0x1c2_polynomial(%rip),%r10
+       leaq    64(%rsi),%rsi
+       vmovdqu .Lbswap_mask(%rip),%xmm13
+       vpshufb %xmm13,%xmm10,%xmm10
+       cmpq    $0x80,%rcx
+       jb      .Lshort_avx
+       subq    $0x80,%rcx
+
+       vmovdqu 112(%rdx),%xmm14
+       vmovdqu 0-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm14
+       vmovdqu 32-64(%rsi),%xmm7
+
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vmovdqu 96(%rdx),%xmm15
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpxor   %xmm14,%xmm9,%xmm9
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 16-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vmovdqu 80(%rdx),%xmm14
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm15,%xmm8,%xmm8
+
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vmovdqu 48-64(%rsi),%xmm6
+       vpxor   %xmm14,%xmm9,%xmm9
+       vmovdqu 64(%rdx),%xmm15
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 80-64(%rsi),%xmm7
+
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 64-64(%rsi),%xmm6
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm15,%xmm8,%xmm8
+
+       vmovdqu 48(%rdx),%xmm14
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpxor   %xmm4,%xmm1,%xmm1
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vmovdqu 96-64(%rsi),%xmm6
+       vpxor   %xmm5,%xmm2,%xmm2
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 128-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+
+       vmovdqu 32(%rdx),%xmm15
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 112-64(%rsi),%xmm6
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm15,%xmm8,%xmm8
+
+       vmovdqu 16(%rdx),%xmm14
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpxor   %xmm4,%xmm1,%xmm1
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vmovdqu 144-64(%rsi),%xmm6
+       vpxor   %xmm5,%xmm2,%xmm2
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 176-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+
+       vmovdqu (%rdx),%xmm15
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 160-64(%rsi),%xmm6
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x10,%xmm7,%xmm9,%xmm2
+
+       leaq    128(%rdx),%rdx
+       cmpq    $0x80,%rcx
+       jb      .Ltail_avx
+
+       vpxor   %xmm10,%xmm15,%xmm15
+       subq    $0x80,%rcx
+       jmp     .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vmovdqu 112(%rdx),%xmm14
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpxor   %xmm15,%xmm8,%xmm8
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm10
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm11
+       vmovdqu 0-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm12
+       vmovdqu 32-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+
+       vmovdqu 96(%rdx),%xmm15
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpxor   %xmm3,%xmm10,%xmm10
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vxorps  %xmm4,%xmm11,%xmm11
+       vmovdqu 16-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm5,%xmm12,%xmm12
+       vxorps  %xmm15,%xmm8,%xmm8
+
+       vmovdqu 80(%rdx),%xmm14
+       vpxor   %xmm10,%xmm12,%xmm12
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpxor   %xmm11,%xmm12,%xmm12
+       vpslldq $8,%xmm12,%xmm9
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vpsrldq $8,%xmm12,%xmm12
+       vpxor   %xmm9,%xmm10,%xmm10
+       vmovdqu 48-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm14
+       vxorps  %xmm12,%xmm11,%xmm11
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 80-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+       vpxor   %xmm2,%xmm5,%xmm5
+
+       vmovdqu 64(%rdx),%xmm15
+       vpalignr        $8,%xmm10,%xmm10,%xmm12
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 64-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm4,%xmm1,%xmm1
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vxorps  %xmm15,%xmm8,%xmm8
+       vpxor   %xmm5,%xmm2,%xmm2
+
+       vmovdqu 48(%rdx),%xmm14
+       vpclmulqdq      $0x10,(%r10),%xmm10,%xmm10
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vmovdqu 96-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 128-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+       vpxor   %xmm2,%xmm5,%xmm5
+
+       vmovdqu 32(%rdx),%xmm15
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpxor   %xmm3,%xmm0,%xmm0
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 112-64(%rsi),%xmm6
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm4,%xmm1,%xmm1
+       vpclmulqdq      $0x00,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm15,%xmm8,%xmm8
+       vpxor   %xmm5,%xmm2,%xmm2
+       vxorps  %xmm12,%xmm10,%xmm10
+
+       vmovdqu 16(%rdx),%xmm14
+       vpalignr        $8,%xmm10,%xmm10,%xmm12
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm3
+       vpshufb %xmm13,%xmm14,%xmm14
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm4
+       vmovdqu 144-64(%rsi),%xmm6
+       vpclmulqdq      $0x10,(%r10),%xmm10,%xmm10
+       vxorps  %xmm11,%xmm12,%xmm12
+       vpunpckhqdq     %xmm14,%xmm14,%xmm9
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x10,%xmm7,%xmm8,%xmm5
+       vmovdqu 176-64(%rsi),%xmm7
+       vpxor   %xmm14,%xmm9,%xmm9
+       vpxor   %xmm2,%xmm5,%xmm5
+
+       vmovdqu (%rdx),%xmm15
+       vpclmulqdq      $0x00,%xmm6,%xmm14,%xmm0
+       vpshufb %xmm13,%xmm15,%xmm15
+       vpclmulqdq      $0x11,%xmm6,%xmm14,%xmm1
+       vmovdqu 160-64(%rsi),%xmm6
+       vpxor   %xmm12,%xmm15,%xmm15
+       vpclmulqdq      $0x10,%xmm7,%xmm9,%xmm2
+       vpxor   %xmm10,%xmm15,%xmm15
+
+       leaq    128(%rdx),%rdx
+       subq    $0x80,%rcx
+       jnc     .Loop8x_avx
+
+       addq    $0x80,%rcx
+       jmp     .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+       vmovdqu -16(%rdx,%rcx,1),%xmm14
+       leaq    (%rdx,%rcx,1),%rdx
+       vmovdqu 0-64(%rsi),%xmm6
+       vmovdqu 32-64(%rsi),%xmm7
+       vpshufb %xmm13,%xmm14,%xmm15
+
+       vmovdqa %xmm0,%xmm3
+       vmovdqa %xmm1,%xmm4
+       vmovdqa %xmm2,%xmm5
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -32(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 16-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vpsrldq $8,%xmm7,%xmm7
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -48(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 48-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vmovdqu 80-64(%rsi),%xmm7
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -64(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 64-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vpsrldq $8,%xmm7,%xmm7
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -80(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 96-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vmovdqu 128-64(%rsi),%xmm7
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -96(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 112-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vpsrldq $8,%xmm7,%xmm7
+       subq    $0x10,%rcx
+       jz      .Ltail_avx
+
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vmovdqu -112(%rdx),%xmm14
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vmovdqu 144-64(%rsi),%xmm6
+       vpshufb %xmm13,%xmm14,%xmm15
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+       vmovq   184-64(%rsi),%xmm7
+       subq    $0x10,%rcx
+       jmp     .Ltail_avx
+
+.align 32
+.Ltail_avx:
+       vpxor   %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+       vpunpckhqdq     %xmm15,%xmm15,%xmm8
+       vpxor   %xmm0,%xmm3,%xmm3
+       vpclmulqdq      $0x00,%xmm6,%xmm15,%xmm0
+       vpxor   %xmm15,%xmm8,%xmm8
+       vpxor   %xmm1,%xmm4,%xmm4
+       vpclmulqdq      $0x11,%xmm6,%xmm15,%xmm1
+       vpxor   %xmm2,%xmm5,%xmm5
+       vpclmulqdq      $0x00,%xmm7,%xmm8,%xmm2
+
+       vmovdqu (%r10),%xmm12
+
+       vpxor   %xmm0,%xmm3,%xmm10
+       vpxor   %xmm1,%xmm4,%xmm11
+       vpxor   %xmm2,%xmm5,%xmm5
+
+       vpxor   %xmm10,%xmm5,%xmm5
+       vpxor   %xmm11,%xmm5,%xmm5
+       vpslldq $8,%xmm5,%xmm9
+       vpsrldq $8,%xmm5,%xmm5
+       vpxor   %xmm9,%xmm10,%xmm10
+       vpxor   %xmm5,%xmm11,%xmm11
+
+       vpclmulqdq      $0x10,%xmm12,%xmm10,%xmm9
+       vpalignr        $8,%xmm10,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm10,%xmm10
+
+       vpclmulqdq      $0x10,%xmm12,%xmm10,%xmm9
+       vpalignr        $8,%xmm10,%xmm10,%xmm10
+       vpxor   %xmm11,%xmm10,%xmm10
+       vpxor   %xmm9,%xmm10,%xmm10
+
+       cmpq    $0,%rcx
+       jne     .Lshort_avx
+
+       vpshufb %xmm13,%xmm10,%xmm10
+       vmovdqu %xmm10,(%rdi)
+       vzeroupper
+       .byte   0xf3,0xc3
 .size  gcm_ghash_avx,.-gcm_ghash_avx
 .align 64
 .Lbswap_mask:
index fe8142e..476bf18 100644 (file)
 .text  
 
-.globl rsaz_avx2_eligible
-.type  rsaz_avx2_eligible,@function
-rsaz_avx2_eligible:
-       xorl    %eax,%eax
-       .byte   0xf3,0xc3
-.size  rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
 .globl rsaz_1024_sqr_avx2
-.globl rsaz_1024_mul_avx2
-.globl rsaz_1024_norm2red_avx2
-.globl rsaz_1024_red2norm_avx2
-.globl rsaz_1024_scatter5_avx2
-.globl rsaz_1024_gather5_avx2
 .type  rsaz_1024_sqr_avx2,@function
+.align 64
 rsaz_1024_sqr_avx2:
+       leaq    (%rsp),%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       vzeroupper
+       movq    %rax,%rbp
+       movq    %rdx,%r13
+       subq    $832,%rsp
+       movq    %r13,%r15
+       subq    $-128,%rdi
+       subq    $-128,%rsi
+       subq    $-128,%r13
+
+       andq    $4095,%r15
+       addq    $320,%r15
+       shrq    $12,%r15
+       vpxor   %ymm9,%ymm9,%ymm9
+       jz      .Lsqr_1024_no_n_copy
+
+
+
+
+
+       subq    $320,%rsp
+       vmovdqu 0-128(%r13),%ymm0
+       andq    $-2048,%rsp
+       vmovdqu 32-128(%r13),%ymm1
+       vmovdqu 64-128(%r13),%ymm2
+       vmovdqu 96-128(%r13),%ymm3
+       vmovdqu 128-128(%r13),%ymm4
+       vmovdqu 160-128(%r13),%ymm5
+       vmovdqu 192-128(%r13),%ymm6
+       vmovdqu 224-128(%r13),%ymm7
+       vmovdqu 256-128(%r13),%ymm8
+       leaq    832+128(%rsp),%r13
+       vmovdqu %ymm0,0-128(%r13)
+       vmovdqu %ymm1,32-128(%r13)
+       vmovdqu %ymm2,64-128(%r13)
+       vmovdqu %ymm3,96-128(%r13)
+       vmovdqu %ymm4,128-128(%r13)
+       vmovdqu %ymm5,160-128(%r13)
+       vmovdqu %ymm6,192-128(%r13)
+       vmovdqu %ymm7,224-128(%r13)
+       vmovdqu %ymm8,256-128(%r13)
+       vmovdqu %ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+       andq    $-1024,%rsp
+
+       vmovdqu 32-128(%rsi),%ymm1
+       vmovdqu 64-128(%rsi),%ymm2
+       vmovdqu 96-128(%rsi),%ymm3
+       vmovdqu 128-128(%rsi),%ymm4
+       vmovdqu 160-128(%rsi),%ymm5
+       vmovdqu 192-128(%rsi),%ymm6
+       vmovdqu 224-128(%rsi),%ymm7
+       vmovdqu 256-128(%rsi),%ymm8
+
+       leaq    192(%rsp),%rbx
+       vpbroadcastq    .Land_mask(%rip),%ymm15
+       jmp     .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+       leaq    576+128(%rsp),%r9
+       leaq    448(%rsp),%r12
+
+
+
+
+       vpaddq  %ymm1,%ymm1,%ymm1
+       vpbroadcastq    0-128(%rsi),%ymm10
+       vpaddq  %ymm2,%ymm2,%ymm2
+       vmovdqa %ymm1,0-128(%r9)
+       vpaddq  %ymm3,%ymm3,%ymm3
+       vmovdqa %ymm2,32-128(%r9)
+       vpaddq  %ymm4,%ymm4,%ymm4
+       vmovdqa %ymm3,64-128(%r9)
+       vpaddq  %ymm5,%ymm5,%ymm5
+       vmovdqa %ymm4,96-128(%r9)
+       vpaddq  %ymm6,%ymm6,%ymm6
+       vmovdqa %ymm5,128-128(%r9)
+       vpaddq  %ymm7,%ymm7,%ymm7
+       vmovdqa %ymm6,160-128(%r9)
+       vpaddq  %ymm8,%ymm8,%ymm8
+       vmovdqa %ymm7,192-128(%r9)
+       vpxor   %ymm9,%ymm9,%ymm9
+       vmovdqa %ymm8,224-128(%r9)
+
+       vpmuludq        0-128(%rsi),%ymm10,%ymm0
+       vpbroadcastq    32-128(%rsi),%ymm11
+       vmovdqu %ymm9,288-192(%rbx)
+       vpmuludq        %ymm10,%ymm1,%ymm1
+       vmovdqu %ymm9,320-448(%r12)
+       vpmuludq        %ymm10,%ymm2,%ymm2
+       vmovdqu %ymm9,352-448(%r12)
+       vpmuludq        %ymm10,%ymm3,%ymm3
+       vmovdqu %ymm9,384-448(%r12)
+       vpmuludq        %ymm10,%ymm4,%ymm4
+       vmovdqu %ymm9,416-448(%r12)
+       vpmuludq        %ymm10,%ymm5,%ymm5
+       vmovdqu %ymm9,448-448(%r12)
+       vpmuludq        %ymm10,%ymm6,%ymm6
+       vmovdqu %ymm9,480-448(%r12)
+       vpmuludq        %ymm10,%ymm7,%ymm7
+       vmovdqu %ymm9,512-448(%r12)
+       vpmuludq        %ymm10,%ymm8,%ymm8
+       vpbroadcastq    64-128(%rsi),%ymm10
+       vmovdqu %ymm9,544-448(%r12)
+
+       movq    %rsi,%r15
+       movl    $4,%r14d
+       jmp     .Lsqr_entry_1024
+.align 32
+.LOOP_SQR_1024:
+       vpbroadcastq    32-128(%r15),%ymm11
+       vpmuludq        0-128(%rsi),%ymm10,%ymm0
+       vpaddq  0-192(%rbx),%ymm0,%ymm0
+       vpmuludq        0-128(%r9),%ymm10,%ymm1
+       vpaddq  32-192(%rbx),%ymm1,%ymm1
+       vpmuludq        32-128(%r9),%ymm10,%ymm2
+       vpaddq  64-192(%rbx),%ymm2,%ymm2
+       vpmuludq        64-128(%r9),%ymm10,%ymm3
+       vpaddq  96-192(%rbx),%ymm3,%ymm3
+       vpmuludq        96-128(%r9),%ymm10,%ymm4
+       vpaddq  128-192(%rbx),%ymm4,%ymm4
+       vpmuludq        128-128(%r9),%ymm10,%ymm5
+       vpaddq  160-192(%rbx),%ymm5,%ymm5
+       vpmuludq        160-128(%r9),%ymm10,%ymm6
+       vpaddq  192-192(%rbx),%ymm6,%ymm6
+       vpmuludq        192-128(%r9),%ymm10,%ymm7
+       vpaddq  224-192(%rbx),%ymm7,%ymm7
+       vpmuludq        224-128(%r9),%ymm10,%ymm8
+       vpbroadcastq    64-128(%r15),%ymm10
+       vpaddq  256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+       vmovdqu %ymm0,0-192(%rbx)
+       vmovdqu %ymm1,32-192(%rbx)
+
+       vpmuludq        32-128(%rsi),%ymm11,%ymm12
+       vpaddq  %ymm12,%ymm2,%ymm2
+       vpmuludq        32-128(%r9),%ymm11,%ymm14
+       vpaddq  %ymm14,%ymm3,%ymm3
+       vpmuludq        64-128(%r9),%ymm11,%ymm13
+       vpaddq  %ymm13,%ymm4,%ymm4
+       vpmuludq        96-128(%r9),%ymm11,%ymm12
+       vpaddq  %ymm12,%ymm5,%ymm5
+       vpmuludq        128-128(%r9),%ymm11,%ymm14
+       vpaddq  %ymm14,%ymm6,%ymm6
+       vpmuludq        160-128(%r9),%ymm11,%ymm13
+       vpaddq  %ymm13,%ymm7,%ymm7
+       vpmuludq        192-128(%r9),%ymm11,%ymm12
+       vpaddq  %ymm12,%ymm8,%ymm8
+       vpmuludq        224-128(%r9),%ymm11,%ymm0
+       vpbroadcastq    96-128(%r15),%ymm11
+       vpaddq  288-192(%rbx),%ymm0,%ymm0
+
+       vmovdqu %ymm2,64-192(%rbx)
+       vmovdqu %ymm3,96-192(%rbx)
+
+       vpmuludq        64-128(%rsi),%ymm10,%ymm13
+       vpaddq  %ymm13,%ymm4,%ymm4
+       vpmuludq        64-128(%r9),%ymm10,%ymm12
+       vpaddq  %ymm12,%ymm5,%ymm5
+       vpmuludq        96-128(%r9),%ymm10,%ymm14
+       vpaddq  %ymm14,%ymm6,%ymm6
+       vpmuludq        128-128(%r9),%ymm10,%ymm13
+       vpaddq  %ymm13,%ymm7,%ymm7
+       vpmuludq        160-128(%r9),%ymm10,%ymm12
+       vpaddq  %ymm12,%ymm8,%ymm8
+       vpmuludq        192-128(%r9),%ymm10,%ymm14
+       vpaddq  %ymm14,%ymm0,%ymm0
+       vpmuludq        224-128(%r9),%ymm10,%ymm1
+       vpbroadcastq    128-128(%r15),%ymm10
+       vpaddq  320-448(%r12),%ymm1,%ymm1
+
+       vmovdqu %ymm4,128-192(%rbx)
+       vmov