Update files for OpenSSL-1.0.2h import.
Author:     Peter Avalos <pavalos@dragonflybsd.org>
AuthorDate: Wed, 3 Aug 2016 09:52:16 +0000 (02:52 -0700)
Commit:     Peter Avalos <pavalos@dragonflybsd.org>
CommitDate: Wed, 3 Aug 2016 10:38:03 +0000 (03:38 -0700)
Make the jump to 1.0.2, because support for 1.0.1 ends at the end of
this year.

401 files changed:
secure/lib/libcrypto/Makefile
secure/lib/libcrypto/Makefile.inc
secure/lib/libcrypto/Makefile.man
secure/lib/libcrypto/asm/Makefile
secure/lib/libcrypto/asm/aes-x86_64.s
secure/lib/libcrypto/asm/aesni-gcm-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/aesni-mb-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/aesni-sha1-x86_64.s
secure/lib/libcrypto/asm/aesni-sha256-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/aesni-x86_64.s
secure/lib/libcrypto/asm/bsaes-x86_64.s
secure/lib/libcrypto/asm/cmll-x86_64.s
secure/lib/libcrypto/asm/ecp_nistz256-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/ghash-x86_64.s
secure/lib/libcrypto/asm/md5-x86_64.s
secure/lib/libcrypto/asm/modexp512-x86_64.s [deleted file]
secure/lib/libcrypto/asm/rc4-x86_64.s
secure/lib/libcrypto/asm/rsaz-avx2.s [new file with mode: 0644]
secure/lib/libcrypto/asm/rsaz-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/sha1-mb-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/sha1-x86_64.s
secure/lib/libcrypto/asm/sha256-mb-x86_64.s [new file with mode: 0644]
secure/lib/libcrypto/asm/sha256-x86_64.s
secure/lib/libcrypto/asm/sha512-x86_64.s
secure/lib/libcrypto/asm/vpaes-x86_64.s
secure/lib/libcrypto/asm/wp-x86_64.s
secure/lib/libcrypto/asm/x86_64-gf2m.s
secure/lib/libcrypto/asm/x86_64-mont.s
secure/lib/libcrypto/asm/x86_64-mont5.s
secure/lib/libcrypto/asm/x86_64cpuid.s
secure/lib/libcrypto/man/ASN1_OBJECT_new.3
secure/lib/libcrypto/man/ASN1_STRING_length.3
secure/lib/libcrypto/man/ASN1_STRING_new.3
secure/lib/libcrypto/man/ASN1_STRING_print_ex.3
secure/lib/libcrypto/man/ASN1_TIME_set.3 [new file with mode: 0644]
secure/lib/libcrypto/man/ASN1_generate_nconf.3
secure/lib/libcrypto/man/BIO_ctrl.3
secure/lib/libcrypto/man/BIO_f_base64.3
secure/lib/libcrypto/man/BIO_f_buffer.3
secure/lib/libcrypto/man/BIO_f_cipher.3
secure/lib/libcrypto/man/BIO_f_md.3
secure/lib/libcrypto/man/BIO_f_null.3
secure/lib/libcrypto/man/BIO_f_ssl.3
secure/lib/libcrypto/man/BIO_find_type.3
secure/lib/libcrypto/man/BIO_new.3
secure/lib/libcrypto/man/BIO_new_CMS.3
secure/lib/libcrypto/man/BIO_push.3
secure/lib/libcrypto/man/BIO_read.3
secure/lib/libcrypto/man/BIO_s_accept.3
secure/lib/libcrypto/man/BIO_s_bio.3
secure/lib/libcrypto/man/BIO_s_connect.3
secure/lib/libcrypto/man/BIO_s_fd.3
secure/lib/libcrypto/man/BIO_s_file.3
secure/lib/libcrypto/man/BIO_s_mem.3
secure/lib/libcrypto/man/BIO_s_null.3
secure/lib/libcrypto/man/BIO_s_socket.3
secure/lib/libcrypto/man/BIO_set_callback.3
secure/lib/libcrypto/man/BIO_should_retry.3
secure/lib/libcrypto/man/BN_BLINDING_new.3
secure/lib/libcrypto/man/BN_CTX_new.3
secure/lib/libcrypto/man/BN_CTX_start.3
secure/lib/libcrypto/man/BN_add.3
secure/lib/libcrypto/man/BN_add_word.3
secure/lib/libcrypto/man/BN_bn2bin.3
secure/lib/libcrypto/man/BN_cmp.3
secure/lib/libcrypto/man/BN_copy.3
secure/lib/libcrypto/man/BN_generate_prime.3
secure/lib/libcrypto/man/BN_mod_inverse.3
secure/lib/libcrypto/man/BN_mod_mul_montgomery.3
secure/lib/libcrypto/man/BN_mod_mul_reciprocal.3
secure/lib/libcrypto/man/BN_new.3
secure/lib/libcrypto/man/BN_num_bytes.3
secure/lib/libcrypto/man/BN_rand.3
secure/lib/libcrypto/man/BN_set_bit.3
secure/lib/libcrypto/man/BN_swap.3
secure/lib/libcrypto/man/BN_zero.3
secure/lib/libcrypto/man/CMS_add0_cert.3
secure/lib/libcrypto/man/CMS_add1_recipient_cert.3
secure/lib/libcrypto/man/CMS_add1_signer.3
secure/lib/libcrypto/man/CMS_compress.3
secure/lib/libcrypto/man/CMS_decrypt.3
secure/lib/libcrypto/man/CMS_encrypt.3
secure/lib/libcrypto/man/CMS_final.3
secure/lib/libcrypto/man/CMS_get0_RecipientInfos.3
secure/lib/libcrypto/man/CMS_get0_SignerInfos.3
secure/lib/libcrypto/man/CMS_get0_type.3
secure/lib/libcrypto/man/CMS_get1_ReceiptRequest.3
secure/lib/libcrypto/man/CMS_sign.3
secure/lib/libcrypto/man/CMS_sign_receipt.3
secure/lib/libcrypto/man/CMS_uncompress.3
secure/lib/libcrypto/man/CMS_verify.3
secure/lib/libcrypto/man/CMS_verify_receipt.3
secure/lib/libcrypto/man/CONF_modules_free.3
secure/lib/libcrypto/man/CONF_modules_load_file.3
secure/lib/libcrypto/man/CRYPTO_set_ex_data.3
secure/lib/libcrypto/man/DH_generate_key.3
secure/lib/libcrypto/man/DH_generate_parameters.3
secure/lib/libcrypto/man/DH_get_ex_new_index.3
secure/lib/libcrypto/man/DH_new.3
secure/lib/libcrypto/man/DH_set_method.3
secure/lib/libcrypto/man/DH_size.3
secure/lib/libcrypto/man/DSA_SIG_new.3
secure/lib/libcrypto/man/DSA_do_sign.3
secure/lib/libcrypto/man/DSA_dup_DH.3
secure/lib/libcrypto/man/DSA_generate_key.3
secure/lib/libcrypto/man/DSA_generate_parameters.3
secure/lib/libcrypto/man/DSA_get_ex_new_index.3
secure/lib/libcrypto/man/DSA_new.3
secure/lib/libcrypto/man/DSA_set_method.3
secure/lib/libcrypto/man/DSA_sign.3
secure/lib/libcrypto/man/DSA_size.3
secure/lib/libcrypto/man/EC_GFp_simple_method.3 [copied from secure/lib/libcrypto/man/d2i_ECPrivateKey.3 with 59% similarity]
secure/lib/libcrypto/man/EC_GROUP_copy.3 [new file with mode: 0644]
secure/lib/libcrypto/man/EC_GROUP_new.3 [new file with mode: 0644]
secure/lib/libcrypto/man/EC_KEY_new.3 [new file with mode: 0644]
secure/lib/libcrypto/man/EC_POINT_add.3 [copied from secure/lib/libcrypto/man/OpenSSL_add_all_algorithms.3 with 52% similarity]
secure/lib/libcrypto/man/EC_POINT_new.3 [new file with mode: 0644]
secure/lib/libcrypto/man/ERR_GET_LIB.3
secure/lib/libcrypto/man/ERR_clear_error.3
secure/lib/libcrypto/man/ERR_error_string.3
secure/lib/libcrypto/man/ERR_get_error.3
secure/lib/libcrypto/man/ERR_load_crypto_strings.3
secure/lib/libcrypto/man/ERR_load_strings.3
secure/lib/libcrypto/man/ERR_print_errors.3
secure/lib/libcrypto/man/ERR_put_error.3
secure/lib/libcrypto/man/ERR_remove_state.3
secure/lib/libcrypto/man/ERR_set_mark.3
secure/lib/libcrypto/man/EVP_BytesToKey.3
secure/lib/libcrypto/man/EVP_DigestInit.3
secure/lib/libcrypto/man/EVP_DigestSignInit.3
secure/lib/libcrypto/man/EVP_DigestVerifyInit.3
secure/lib/libcrypto/man/EVP_EncodeInit.3
secure/lib/libcrypto/man/EVP_EncryptInit.3
secure/lib/libcrypto/man/EVP_OpenInit.3
secure/lib/libcrypto/man/EVP_PKEY_CTX_ctrl.3
secure/lib/libcrypto/man/EVP_PKEY_CTX_new.3
secure/lib/libcrypto/man/EVP_PKEY_cmp.3
secure/lib/libcrypto/man/EVP_PKEY_decrypt.3
secure/lib/libcrypto/man/EVP_PKEY_derive.3
secure/lib/libcrypto/man/EVP_PKEY_encrypt.3
secure/lib/libcrypto/man/EVP_PKEY_get_default_digest.3
secure/lib/libcrypto/man/EVP_PKEY_keygen.3
secure/lib/libcrypto/man/EVP_PKEY_new.3
secure/lib/libcrypto/man/EVP_PKEY_print_private.3
secure/lib/libcrypto/man/EVP_PKEY_set1_RSA.3
secure/lib/libcrypto/man/EVP_PKEY_sign.3
secure/lib/libcrypto/man/EVP_PKEY_verify.3
secure/lib/libcrypto/man/EVP_PKEY_verify_recover.3
secure/lib/libcrypto/man/EVP_SealInit.3
secure/lib/libcrypto/man/EVP_SignInit.3
secure/lib/libcrypto/man/EVP_VerifyInit.3
secure/lib/libcrypto/man/OBJ_nid2obj.3
secure/lib/libcrypto/man/OPENSSL_Applink.3
secure/lib/libcrypto/man/OPENSSL_VERSION_NUMBER.3
secure/lib/libcrypto/man/OPENSSL_config.3
secure/lib/libcrypto/man/OPENSSL_ia32cap.3
secure/lib/libcrypto/man/OPENSSL_instrument_bus.3 [copied from secure/lib/libcrypto/man/OPENSSL_load_builtin_modules.3 with 70% similarity]
secure/lib/libcrypto/man/OPENSSL_load_builtin_modules.3
secure/lib/libcrypto/man/OpenSSL_add_all_algorithms.3
secure/lib/libcrypto/man/PEM_write_bio_CMS_stream.3
secure/lib/libcrypto/man/PEM_write_bio_PKCS7_stream.3
secure/lib/libcrypto/man/PKCS12_create.3
secure/lib/libcrypto/man/PKCS12_parse.3
secure/lib/libcrypto/man/PKCS7_decrypt.3
secure/lib/libcrypto/man/PKCS7_encrypt.3
secure/lib/libcrypto/man/PKCS7_sign.3
secure/lib/libcrypto/man/PKCS7_sign_add_signer.3
secure/lib/libcrypto/man/PKCS7_verify.3
secure/lib/libcrypto/man/RAND_add.3
secure/lib/libcrypto/man/RAND_bytes.3
secure/lib/libcrypto/man/RAND_cleanup.3
secure/lib/libcrypto/man/RAND_egd.3
secure/lib/libcrypto/man/RAND_load_file.3
secure/lib/libcrypto/man/RAND_set_rand_method.3
secure/lib/libcrypto/man/RSA_blinding_on.3
secure/lib/libcrypto/man/RSA_check_key.3
secure/lib/libcrypto/man/RSA_generate_key.3
secure/lib/libcrypto/man/RSA_get_ex_new_index.3
secure/lib/libcrypto/man/RSA_new.3
secure/lib/libcrypto/man/RSA_padding_add_PKCS1_type_1.3
secure/lib/libcrypto/man/RSA_print.3
secure/lib/libcrypto/man/RSA_private_encrypt.3
secure/lib/libcrypto/man/RSA_public_encrypt.3
secure/lib/libcrypto/man/RSA_set_method.3
secure/lib/libcrypto/man/RSA_sign.3
secure/lib/libcrypto/man/RSA_sign_ASN1_OCTET_STRING.3
secure/lib/libcrypto/man/RSA_size.3
secure/lib/libcrypto/man/SMIME_read_CMS.3
secure/lib/libcrypto/man/SMIME_read_PKCS7.3
secure/lib/libcrypto/man/SMIME_write_CMS.3
secure/lib/libcrypto/man/SMIME_write_PKCS7.3
secure/lib/libcrypto/man/SSLeay_version.3 [copied from secure/lib/libcrypto/man/OPENSSL_VERSION_NUMBER.3 with 72% similarity]
secure/lib/libcrypto/man/X509_NAME_ENTRY_get_object.3
secure/lib/libcrypto/man/X509_NAME_add_entry_by_txt.3
secure/lib/libcrypto/man/X509_NAME_get_index_by_NID.3
secure/lib/libcrypto/man/X509_NAME_print_ex.3
secure/lib/libcrypto/man/X509_STORE_CTX_get_error.3
secure/lib/libcrypto/man/X509_STORE_CTX_get_ex_new_index.3
secure/lib/libcrypto/man/X509_STORE_CTX_new.3
secure/lib/libcrypto/man/X509_STORE_CTX_set_verify_cb.3
secure/lib/libcrypto/man/X509_STORE_set_verify_cb_func.3
secure/lib/libcrypto/man/X509_VERIFY_PARAM_set_flags.3
secure/lib/libcrypto/man/X509_check_host.3 [new file with mode: 0644]
secure/lib/libcrypto/man/X509_new.3
secure/lib/libcrypto/man/X509_verify_cert.3
secure/lib/libcrypto/man/bio.3
secure/lib/libcrypto/man/blowfish.3
secure/lib/libcrypto/man/bn.3
secure/lib/libcrypto/man/bn_internal.3
secure/lib/libcrypto/man/buffer.3
secure/lib/libcrypto/man/crypto.3
secure/lib/libcrypto/man/d2i_ASN1_OBJECT.3
secure/lib/libcrypto/man/d2i_CMS_ContentInfo.3
secure/lib/libcrypto/man/d2i_DHparams.3
secure/lib/libcrypto/man/d2i_DSAPublicKey.3
secure/lib/libcrypto/man/d2i_ECPKParameters.3 [copied from secure/lib/libssl/man/SSL_connect.3 with 50% similarity]
secure/lib/libcrypto/man/d2i_ECPrivateKey.3
secure/lib/libcrypto/man/d2i_PKCS8PrivateKey.3
secure/lib/libcrypto/man/d2i_RSAPublicKey.3
secure/lib/libcrypto/man/d2i_X509.3
secure/lib/libcrypto/man/d2i_X509_ALGOR.3
secure/lib/libcrypto/man/d2i_X509_CRL.3
secure/lib/libcrypto/man/d2i_X509_NAME.3
secure/lib/libcrypto/man/d2i_X509_REQ.3
secure/lib/libcrypto/man/d2i_X509_SIG.3
secure/lib/libcrypto/man/des.3
secure/lib/libcrypto/man/des_modes.7
secure/lib/libcrypto/man/dh.3
secure/lib/libcrypto/man/dsa.3
secure/lib/libcrypto/man/ec.3 [new file with mode: 0644]
secure/lib/libcrypto/man/ecdsa.3
secure/lib/libcrypto/man/engine.3
secure/lib/libcrypto/man/err.3
secure/lib/libcrypto/man/evp.3
secure/lib/libcrypto/man/hmac.3
secure/lib/libcrypto/man/i2d_CMS_bio_stream.3
secure/lib/libcrypto/man/i2d_PKCS7_bio_stream.3
secure/lib/libcrypto/man/lh_stats.3
secure/lib/libcrypto/man/lhash.3
secure/lib/libcrypto/man/md5.3
secure/lib/libcrypto/man/mdc2.3
secure/lib/libcrypto/man/pem.3
secure/lib/libcrypto/man/rand.3
secure/lib/libcrypto/man/rc4.3
secure/lib/libcrypto/man/ripemd.3
secure/lib/libcrypto/man/rsa.3
secure/lib/libcrypto/man/sha.3
secure/lib/libcrypto/man/threads.3
secure/lib/libcrypto/man/ui.3
secure/lib/libcrypto/man/ui_compat.3
secure/lib/libcrypto/man/x509.3
secure/lib/libcrypto/opensslconf-x86_64.h
secure/lib/libssl/Makefile
secure/lib/libssl/Makefile.man
secure/lib/libssl/man/SSL_CIPHER_get_name.3
secure/lib/libssl/man/SSL_COMP_add_compression_method.3
secure/lib/libssl/man/SSL_CONF_CTX_new.3 [copied from secure/lib/libcrypto/man/DSA_SIG_new.3 with 81% similarity]
secure/lib/libssl/man/SSL_CONF_CTX_set1_prefix.3 [copied from secure/lib/libcrypto/man/BN_CTX_start.3 with 73% similarity]
secure/lib/libssl/man/SSL_CONF_CTX_set_flags.3 [copied from secure/lib/libcrypto/man/ASN1_OBJECT_new.3 with 68% similarity]
secure/lib/libssl/man/SSL_CONF_CTX_set_ssl_ctx.3 [copied from secure/lib/libcrypto/man/i2d_CMS_bio_stream.3 with 76% similarity]
secure/lib/libssl/man/SSL_CONF_cmd.3 [new file with mode: 0644]
secure/lib/libssl/man/SSL_CONF_cmd_argv.3 [copied from secure/lib/libcrypto/man/ERR_load_crypto_strings.3 with 79% similarity]
secure/lib/libssl/man/SSL_CTX_add1_chain_cert.3 [new file with mode: 0644]
secure/lib/libssl/man/SSL_CTX_add_extra_chain_cert.3
secure/lib/libssl/man/SSL_CTX_add_session.3
secure/lib/libssl/man/SSL_CTX_ctrl.3
secure/lib/libssl/man/SSL_CTX_flush_sessions.3
secure/lib/libssl/man/SSL_CTX_free.3
secure/lib/libssl/man/SSL_CTX_get0_param.3 [copied from secure/lib/libcrypto/man/PEM_write_bio_PKCS7_stream.3 with 72% similarity]
secure/lib/libssl/man/SSL_CTX_get_ex_new_index.3
secure/lib/libssl/man/SSL_CTX_get_verify_mode.3
secure/lib/libssl/man/SSL_CTX_load_verify_locations.3
secure/lib/libssl/man/SSL_CTX_new.3
secure/lib/libssl/man/SSL_CTX_sess_number.3
secure/lib/libssl/man/SSL_CTX_sess_set_cache_size.3
secure/lib/libssl/man/SSL_CTX_sess_set_get_cb.3
secure/lib/libssl/man/SSL_CTX_sessions.3
secure/lib/libssl/man/SSL_CTX_set1_curves.3 [copied from secure/lib/libcrypto/man/ASN1_STRING_length.3 with 51% similarity]
secure/lib/libssl/man/SSL_CTX_set1_verify_cert_store.3 [copied from secure/lib/libssl/man/SSL_CTX_set_max_cert_list.3 with 55% similarity]
secure/lib/libssl/man/SSL_CTX_set_alpn_select_cb.3 [new file with mode: 0644]
secure/lib/libssl/man/SSL_CTX_set_cert_cb.3 [copied from secure/lib/libssl/man/SSL_CTX_set_quiet_shutdown.3 with 60% similarity]
secure/lib/libssl/man/SSL_CTX_set_cert_store.3
secure/lib/libssl/man/SSL_CTX_set_cert_verify_callback.3
secure/lib/libssl/man/SSL_CTX_set_cipher_list.3
secure/lib/libssl/man/SSL_CTX_set_client_CA_list.3
secure/lib/libssl/man/SSL_CTX_set_client_cert_cb.3
secure/lib/libssl/man/SSL_CTX_set_custom_cli_ext.3 [new file with mode: 0644]
secure/lib/libssl/man/SSL_CTX_set_default_passwd_cb.3
secure/lib/libssl/man/SSL_CTX_set_generate_session_id.3
secure/lib/libssl/man/SSL_CTX_set_info_callback.3
secure/lib/libssl/man/SSL_CTX_set_max_cert_list.3
secure/lib/libssl/man/SSL_CTX_set_mode.3
secure/lib/libssl/man/SSL_CTX_set_msg_callback.3
secure/lib/libssl/man/SSL_CTX_set_options.3
secure/lib/libssl/man/SSL_CTX_set_psk_client_callback.3
secure/lib/libssl/man/SSL_CTX_set_quiet_shutdown.3
secure/lib/libssl/man/SSL_CTX_set_read_ahead.3
secure/lib/libssl/man/SSL_CTX_set_session_cache_mode.3
secure/lib/libssl/man/SSL_CTX_set_session_id_context.3
secure/lib/libssl/man/SSL_CTX_set_ssl_version.3
secure/lib/libssl/man/SSL_CTX_set_timeout.3
secure/lib/libssl/man/SSL_CTX_set_tlsext_status_cb.3
secure/lib/libssl/man/SSL_CTX_set_tlsext_ticket_key_cb.3
secure/lib/libssl/man/SSL_CTX_set_tmp_dh_callback.3
secure/lib/libssl/man/SSL_CTX_set_tmp_rsa_callback.3
secure/lib/libssl/man/SSL_CTX_set_verify.3
secure/lib/libssl/man/SSL_CTX_use_certificate.3
secure/lib/libssl/man/SSL_CTX_use_psk_identity_hint.3
secure/lib/libssl/man/SSL_CTX_use_serverinfo.3 [copied from secure/lib/libssl/man/SSL_library_init.3 with 69% similarity]
secure/lib/libssl/man/SSL_SESSION_free.3
secure/lib/libssl/man/SSL_SESSION_get_ex_new_index.3
secure/lib/libssl/man/SSL_SESSION_get_time.3
secure/lib/libssl/man/SSL_accept.3
secure/lib/libssl/man/SSL_alert_type_string.3
secure/lib/libssl/man/SSL_check_chain.3 [copied from secure/lib/libcrypto/man/OPENSSL_config.3 with 57% similarity]
secure/lib/libssl/man/SSL_clear.3
secure/lib/libssl/man/SSL_connect.3
secure/lib/libssl/man/SSL_do_handshake.3
secure/lib/libssl/man/SSL_free.3
secure/lib/libssl/man/SSL_get_SSL_CTX.3
secure/lib/libssl/man/SSL_get_ciphers.3
secure/lib/libssl/man/SSL_get_client_CA_list.3
secure/lib/libssl/man/SSL_get_current_cipher.3
secure/lib/libssl/man/SSL_get_default_timeout.3
secure/lib/libssl/man/SSL_get_error.3
secure/lib/libssl/man/SSL_get_ex_data_X509_STORE_CTX_idx.3
secure/lib/libssl/man/SSL_get_ex_new_index.3
secure/lib/libssl/man/SSL_get_fd.3
secure/lib/libssl/man/SSL_get_peer_cert_chain.3
secure/lib/libssl/man/SSL_get_peer_certificate.3
secure/lib/libssl/man/SSL_get_psk_identity.3
secure/lib/libssl/man/SSL_get_rbio.3
secure/lib/libssl/man/SSL_get_session.3
secure/lib/libssl/man/SSL_get_verify_result.3
secure/lib/libssl/man/SSL_get_version.3
secure/lib/libssl/man/SSL_library_init.3
secure/lib/libssl/man/SSL_load_client_CA_file.3
secure/lib/libssl/man/SSL_new.3
secure/lib/libssl/man/SSL_pending.3
secure/lib/libssl/man/SSL_read.3
secure/lib/libssl/man/SSL_rstate_string.3
secure/lib/libssl/man/SSL_session_reused.3
secure/lib/libssl/man/SSL_set_bio.3
secure/lib/libssl/man/SSL_set_connect_state.3
secure/lib/libssl/man/SSL_set_fd.3
secure/lib/libssl/man/SSL_set_session.3
secure/lib/libssl/man/SSL_set_shutdown.3
secure/lib/libssl/man/SSL_set_verify_result.3
secure/lib/libssl/man/SSL_shutdown.3
secure/lib/libssl/man/SSL_state_string.3
secure/lib/libssl/man/SSL_want.3
secure/lib/libssl/man/SSL_write.3
secure/lib/libssl/man/d2i_SSL_SESSION.3
secure/lib/libssl/man/ssl.3
secure/usr.bin/openssl/man/CA.pl.1
secure/usr.bin/openssl/man/asn1parse.1
secure/usr.bin/openssl/man/c_rehash.1
secure/usr.bin/openssl/man/ca.1
secure/usr.bin/openssl/man/ciphers.1
secure/usr.bin/openssl/man/cms.1
secure/usr.bin/openssl/man/config.5
secure/usr.bin/openssl/man/crl.1
secure/usr.bin/openssl/man/crl2pkcs7.1
secure/usr.bin/openssl/man/dgst.1
secure/usr.bin/openssl/man/dhparam.1
secure/usr.bin/openssl/man/dsa.1
secure/usr.bin/openssl/man/dsaparam.1
secure/usr.bin/openssl/man/ec.1
secure/usr.bin/openssl/man/ecparam.1
secure/usr.bin/openssl/man/enc.1
secure/usr.bin/openssl/man/errstr.1
secure/usr.bin/openssl/man/gendsa.1
secure/usr.bin/openssl/man/genpkey.1
secure/usr.bin/openssl/man/genrsa.1
secure/usr.bin/openssl/man/nseq.1
secure/usr.bin/openssl/man/ocsp.1
secure/usr.bin/openssl/man/openssl.1
secure/usr.bin/openssl/man/passwd.1
secure/usr.bin/openssl/man/pkcs12.1
secure/usr.bin/openssl/man/pkcs7.1
secure/usr.bin/openssl/man/pkcs8.1
secure/usr.bin/openssl/man/pkey.1
secure/usr.bin/openssl/man/pkeyparam.1
secure/usr.bin/openssl/man/pkeyutl.1
secure/usr.bin/openssl/man/rand.1
secure/usr.bin/openssl/man/req.1
secure/usr.bin/openssl/man/rsa.1
secure/usr.bin/openssl/man/rsautl.1
secure/usr.bin/openssl/man/s_client.1
secure/usr.bin/openssl/man/s_server.1
secure/usr.bin/openssl/man/s_time.1
secure/usr.bin/openssl/man/sess_id.1
secure/usr.bin/openssl/man/smime.1
secure/usr.bin/openssl/man/speed.1
secure/usr.bin/openssl/man/spkac.1
secure/usr.bin/openssl/man/ts.1
secure/usr.bin/openssl/man/tsget.1
secure/usr.bin/openssl/man/verify.1
secure/usr.bin/openssl/man/version.1
secure/usr.bin/openssl/man/x509.1
secure/usr.bin/openssl/man/x509v3_config.5

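Several of the new manual pages above document interfaces that first appear in OpenSSL 1.0.2. As a hedged illustration of the hostname-checking API behind the new X509_check_host.3 page (the wrapper below is hypothetical; a real caller would obtain the certificate from, e.g., SSL_get_peer_certificate()):

    #include <openssl/x509.h>
    #include <openssl/x509v3.h>

    /* Sketch: does the peer certificate match the expected hostname?
     * X509_check_host() is new in 1.0.2; chklen 0 means strlen(host),
     * flags 0 selects the default matching rules. */
    static int matches_host(X509 *cert, const char *host)
    {
        int rc = X509_check_host(cert, host, 0, 0, NULL);
        return rc == 1;    /* 1 match, 0 mismatch, <0 error */
    }
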
diff --git a/secure/lib/libcrypto/Makefile b/secure/lib/libcrypto/Makefile
index 1e6a3f8..6eba835 100644
@@ -3,13 +3,11 @@
 LIB=           crypto
 SHLIB_MAJOR=   5
 SHLIBDIR?=     /lib
-WARNS?=                2
+WARNS?=                1
 
 SUBDIR=        engines
 
-.if ${MACHINE_ARCH} == "x86_64"
 LDFLAGS+=-Wl,-Bsymbolic
-.endif
 
 .if exists(Makefile.man)
 .include "Makefile.man"
@@ -20,9 +18,7 @@ LDFLAGS+=-Wl,-Bsymbolic
 # base sources
 SRCS=  cryptlib.c mem.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c \
        uid.c o_time.c o_dir.c o_fips.c o_init.c fips_ers.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= x86_64cpuid.s
-.endif
 INCS=  ../e_os2.h
 INCS+= crypto.h opensslv.h opensslconf.h ebcdic.h symhacks.h ossl_typ.h
 SRCS+= buildinf.h
@@ -48,9 +44,8 @@ opensslconf.h: opensslconf-${MACHINE_ARCH}.h
 
 # aes
 SRCS+= aes_misc.c aes_ecb.c aes_cfb.c aes_ofb.c aes_ctr.c aes_ige.c aes_wrap.c
-.if ${MACHINE_ARCH} == "x86_64"
-SRCS+= aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s
-.endif
+SRCS+= aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s \
+       aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s
 INCS+= aes.h
 
 # asn1
@@ -71,9 +66,7 @@ INCS+=        asn1.h asn1_mac.h asn1t.h
 
 # bf
 SRCS+= bf_skey.c bf_ecb.c bf_cfb64.c bf_ofb64.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= bf_enc.c
-.endif
 INCS+= blowfish.h
 
 # bio
@@ -91,9 +84,7 @@ SRCS+=        bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
        bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c \
        bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
        bn_depr.c bn_const.c bn_x931p.c
-.if ${MACHINE_ARCH} == "x86_64"
-SRCS+= x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s modexp512-x86_64.s
-.endif
+SRCS+= x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s
 INCS+= bn.h
 
 # buffer
@@ -102,9 +93,7 @@ INCS+=       buffer.h
 
 # camellia
 SRCS+= cmll_ecb.c cmll_ofb.c cmll_cfb.c cmll_ctr.c cmll_utl.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= cmll-x86_64.s cmll_misc.c
-.endif
 INCS+= camellia.h
 
 # cast
@@ -118,7 +107,7 @@ INCS+=      cmac.h
 # cms
 SRCS+= cms_lib.c cms_asn1.c cms_att.c cms_io.c cms_smime.c cms_err.c \
        cms_sd.c cms_dd.c cms_cd.c cms_env.c cms_enc.c cms_ess.c \
-       cms_pwri.c
+       cms_pwri.c cms_kari.c
 INCS+= cms.h
 
 # comp
@@ -138,14 +127,12 @@ SRCS+=    set_key.c  ecb_enc.c  cbc_enc.c \
        ofb_enc.c  str2key.c  pcbc_enc.c qud_cksm.c rand_key.c \
        fcrypt.c xcbc_enc.c rpc_enc.c  cbc_cksm.c \
        ede_cbcm_enc.c des_old.c des_old2.c read2pwd.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= des_enc.c fcrypt_b.c
-.endif
 INCS+= des.h des_old.h
 
 # dh
 SRCS+= dh_asn1.c dh_gen.c dh_key.c dh_lib.c dh_check.c dh_err.c dh_depr.c \
-       dh_ameth.c dh_pmeth.c dh_prn.c
+       dh_ameth.c dh_pmeth.c dh_prn.c dh_rfc5114.c dh_kdf.c
 INCS+= dh.h
 
 # dsa
@@ -163,10 +150,11 @@ SRCS+=    ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c \
        ec2_smpl.c ec2_mult.c ec_ameth.c ec_pmeth.c eck_prn.c \
        ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \
        ecp_oct.c ec2_oct.c ec_oct.c
+SRCS+= ecp_nistz256.c ecp_nistz256-x86_64.s
 INCS+= ec.h
 
 # ecdh
-SRCS+= ech_lib.c ech_ossl.c ech_key.c ech_err.c
+SRCS+= ech_lib.c ech_ossl.c ech_key.c ech_err.c ech_kdf.c
 INCS+= ecdh.h
 
 # ecdsa
@@ -179,7 +167,7 @@ SRCS+=      eng_err.c eng_lib.c eng_list.c eng_init.c eng_ctrl.c \
        tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \
        tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \
        eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \
-       eng_rsax.c eng_rdrand.c
+       eng_rdrand.c
 INCS+= engine.h
 
 # err
@@ -197,8 +185,8 @@ SRCS+=      encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \
        bio_md.c bio_b64.c bio_enc.c evp_err.c e_null.c \
        c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
        evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
-       e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c evp_fips.c \
-       e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c
+       e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \
+       e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c
 INCS+= evp.h
 
 # hmac
@@ -230,9 +218,7 @@ INCS+=      md4.h
 
 # md5
 SRCS+= md5_dgst.c md5_one.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= md5-x86_64.s
-.endif
 INCS+= md5.h
 
 # mdc2
@@ -241,10 +227,8 @@ INCS+=     mdc2.h
 
 # modes
 SRCS+= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
-       ccm128.c xts128.c
-.if ${MACHINE_ARCH} == "x86_64"
-SRCS+= ghash-x86_64.s
-.endif
+       ccm128.c xts128.c wrap128.c
+SRCS+= ghash-x86_64.s aesni-gcm-x86_64.s
 INCS+= modes.h
 
 # objects
@@ -287,9 +271,7 @@ INCS+=      rc2.h
 
 # rc4
 SRCS+= rc4_utl.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= rc4-x86_64.s rc4-md5-x86_64.s
-.endif
 INCS+= rc4.h
 
 # rc5
@@ -314,9 +296,8 @@ INCS+=      seed.h
 
 # sha
 SRCS+= sha_dgst.c sha1dgst.c sha_one.c sha1_one.c sha256.c sha512.c
-.if ${MACHINE_ARCH} == "x86_64"
-SRCS+= sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s
-.endif
+SRCS+= sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s \
+       sha1-mb-x86_64.s sha256-mb-x86_64.s
 INCS+= sha.h
 
 # srp
@@ -347,9 +328,7 @@ INCS+=      ui.h ui_compat.h
 
 # whrlpool
 SRCS+= wp_dgst.c
-.if ${MACHINE_ARCH} == "x86_64"
 SRCS+= wp-x86_64.s
-.endif
 INCS+= whrlpool.h
 
 # x509
@@ -367,7 +346,7 @@ SRCS+=      v3_bcons.c v3_bitst.c v3_conf.c v3_extku.c v3_ia5.c v3_lib.c \
        v3_int.c v3_enum.c v3_sxnet.c v3_cpols.c v3_crld.c v3_purp.c v3_info.c \
        v3_ocsp.c v3_akeya.c v3_pmaps.c v3_pcons.c v3_ncons.c v3_pcia.c v3_pci.c \
        pcy_cache.c pcy_node.c pcy_data.c pcy_map.c pcy_tree.c pcy_lib.c \
-       v3_asid.c v3_addr.c
+       v3_asid.c v3_addr.c v3_scts.c
 INCS+= x509v3.h
 
 .include <bsd.lib.mk>
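
The modes section above now builds aesni-gcm-x86_64.s, the stitched AES-NI/GHASH implementation, which EVP selects at run time on capable CPUs. A minimal sketch of reaching that path through the 1.0.2 EVP interface (the helper name and fixed sizes are illustrative, and error handling is collapsed into a single flag):

    #include <openssl/evp.h>

    /* Sketch: one-shot AES-256-GCM seal; ct must hold ptlen bytes and
     * tag 16 bytes.  Returns 1 on success, 0 on failure. */
    static int gcm_seal(const unsigned char key[32], const unsigned char iv[12],
                        const unsigned char *pt, int ptlen,
                        unsigned char *ct, unsigned char tag[16])
    {
        EVP_CIPHER_CTX ctx;         /* public struct in the 1.0.2 API */
        int len, ok;

        EVP_CIPHER_CTX_init(&ctx);
        ok = EVP_EncryptInit_ex(&ctx, EVP_aes_256_gcm(), NULL, key, iv) &&
             EVP_EncryptUpdate(&ctx, ct, &len, pt, ptlen) &&
             EVP_EncryptFinal_ex(&ctx, ct + len, &len) &&
             EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG, 16, tag);
        EVP_CIPHER_CTX_cleanup(&ctx);
        return ok;
    }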
diff --git a/secure/lib/libcrypto/Makefile.inc b/secure/lib/libcrypto/Makefile.inc
index a47275d..176e386 100644
@@ -1,17 +1,14 @@
-OSSLVERSION=   1.0.1t
+OSSLVERSION=   1.0.2h
 OSSLDATE=      2016-05-03
 LCRYPTO_SRC=   ${.CURDIR}/../../../crypto/openssl
 LCRYPTO_DOC=   ${LCRYPTO_SRC}/doc
 
-CFLAGS+=       -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN -DTERMIOS
 CFLAGS+=       -DOPENSSL_THREADS
+CFLAGS+=       -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN
 CFLAGS+=       -DOPENSSL_IA32_SSE2 -DOPENSSL_BN_ASM_MONT \
-               -DOPENSSL_BN_ASM_GF2m -DSHA1_ASM \
-               -DSHA256_ASM -DSHA512_ASM -DMD5_ASM -DAES_ASM -DVPAES_ASM \
-               -DWHIRLPOOL_ASM -DGHASH_ASM
-.if ${MACHINE_ARCH} == "x86_64"
-CFLAGS+=       -DOPENSSL_BN_ASM_MONT5 -DBSAES_ASM
-.endif
+               -DOPENSSL_BN_ASM_MONT5 -DOPENSSL_BN_ASM_GF2m -DSHA1_ASM \
+               -DSHA256_ASM -DSHA512_ASM -DMD5_ASM -DAES_ASM -DVPAES_ASM \
+               -DBSAES_ASM -DWHIRLPOOL_ASM -DGHASH_ASM -DECP_NISTZ256_ASM
 CFLAGS+=       -I${LCRYPTO_SRC} -I${LCRYPTO_SRC}/crypto \
                -I${LCRYPTO_SRC}/crypto/asn1 -I${LCRYPTO_SRC}/crypto/evp \
                -I${LCRYPTO_SRC}/crypto/modes \
diff --git a/secure/lib/libcrypto/Makefile.man b/secure/lib/libcrypto/Makefile.man
index 32e6ed7..1c1dfed 100644
@@ -8,11 +8,19 @@ MLINKS+= ASN1_STRING_length.3 ASN1_STRING_set.3
 MLINKS+= ASN1_STRING_length.3 ASN1_STRING_length_set.3
 MLINKS+= ASN1_STRING_length.3 ASN1_STRING_type.3
 MLINKS+= ASN1_STRING_length.3 ASN1_STRING_data.3
+MLINKS+= ASN1_STRING_length.3 ASN1_STRING_to_UTF8.3
 MAN+= ASN1_STRING_new.3
 MLINKS+= ASN1_STRING_new.3 ASN1_STRING_type_new.3
 MLINKS+= ASN1_STRING_new.3 ASN1_STRING_free.3
 MAN+= ASN1_STRING_print_ex.3
 MLINKS+= ASN1_STRING_print_ex.3 ASN1_STRING_print_ex_fp.3
+MLINKS+= ASN1_STRING_print_ex.3 ASN1_STRING_print.3
+MAN+= ASN1_TIME_set.3
+MLINKS+= ASN1_TIME_set.3 ASN1_TIME_adj.3
+MLINKS+= ASN1_TIME_set.3 ASN1_TIME_check.3
+MLINKS+= ASN1_TIME_set.3 ASN1_TIME_set_string.3
+MLINKS+= ASN1_TIME_set.3 ASN1_TIME_print.3
+MLINKS+= ASN1_TIME_set.3 ASN1_TIME_diff.3
 MAN+= ASN1_generate_nconf.3
 MLINKS+= ASN1_generate_nconf.3 ASN1_generate_v3.3
 MAN+= BIO_ctrl.3
@@ -57,6 +65,7 @@ MLINKS+= BIO_f_ssl.3 BIO_ssl_copy_session_id.3
 MLINKS+= BIO_f_ssl.3 BIO_ssl_shutdown.3
 MAN+= BIO_find_type.3
 MLINKS+= BIO_find_type.3 BIO_next.3
+MLINKS+= BIO_find_type.3 BIO_method_type.3
 MAN+= BIO_new.3
 MLINKS+= BIO_new.3 BIO_set.3
 MLINKS+= BIO_new.3 BIO_free.3
@@ -72,6 +81,7 @@ MLINKS+= BIO_read.3 BIO_puts.3
 MAN+= BIO_s_accept.3
 MLINKS+= BIO_s_accept.3 BIO_set_accept_port.3
 MLINKS+= BIO_s_accept.3 BIO_get_accept_port.3
+MLINKS+= BIO_s_accept.3 BIO_new_accept.3
 MLINKS+= BIO_s_accept.3 BIO_set_nbio_accept.3
 MLINKS+= BIO_s_accept.3 BIO_set_accept_bios.3
 MLINKS+= BIO_s_accept.3 BIO_set_bind_mode.3
@@ -90,6 +100,7 @@ MLINKS+= BIO_s_bio.3 BIO_get_read_request.3
 MLINKS+= BIO_s_bio.3 BIO_ctrl_get_read_request.3
 MLINKS+= BIO_s_bio.3 BIO_ctrl_reset_read_request.3
 MAN+= BIO_s_connect.3
+MLINKS+= BIO_s_connect.3 BIO_new_connect.3
 MLINKS+= BIO_s_connect.3 BIO_set_conn_hostname.3
 MLINKS+= BIO_s_connect.3 BIO_set_conn_port.3
 MLINKS+= BIO_s_connect.3 BIO_set_conn_ip.3
@@ -143,6 +154,7 @@ MLINKS+= BN_BLINDING_new.3 BN_BLINDING_convert_ex.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_invert_ex.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_get_thread_id.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_set_thread_id.3
+MLINKS+= BN_BLINDING_new.3 BN_BLINDING_thread_id.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_get_flags.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_set_flags.3
 MLINKS+= BN_BLINDING_new.3 BN_BLINDING_create_param.3
@@ -190,6 +202,12 @@ MLINKS+= BN_cmp.3 BN_is_odd.3
 MAN+= BN_copy.3
 MLINKS+= BN_copy.3 BN_dup.3
 MAN+= BN_generate_prime.3
+MLINKS+= BN_generate_prime.3 BN_generate_prime_ex.3
+MLINKS+= BN_generate_prime.3 BN_is_prime_ex.3
+MLINKS+= BN_generate_prime.3 BN_is_prime_fasttest_ex.3
+MLINKS+= BN_generate_prime.3 BN_GENCB_call.3
+MLINKS+= BN_generate_prime.3 BN_GENCB_set_old.3
+MLINKS+= BN_generate_prime.3 BN_GENCB_set.3
 MLINKS+= BN_generate_prime.3 BN_is_prime.3
 MLINKS+= BN_generate_prime.3 BN_is_prime_fasttest.3
 MAN+= BN_mod_inverse.3
@@ -217,6 +235,8 @@ MLINKS+= BN_num_bytes.3 BN_num_bits.3
 MLINKS+= BN_num_bytes.3 BN_num_bits_word.3
 MAN+= BN_rand.3
 MLINKS+= BN_rand.3 BN_pseudo_rand.3
+MLINKS+= BN_rand.3 BN_rand_range.3
+MLINKS+= BN_rand.3 BN_pseudo_rand_range.3
 MAN+= BN_set_bit.3
 MLINKS+= BN_set_bit.3 BN_clear_bit.3
 MLINKS+= BN_set_bit.3 BN_is_bit_set.3
@@ -235,6 +255,7 @@ MAN+= CMS_add0_cert.3
 MLINKS+= CMS_add0_cert.3 CMS_add1_cert.3
 MLINKS+= CMS_add0_cert.3 CMS_get1_certs.3
 MLINKS+= CMS_add0_cert.3 CMS_add0_crl.3
+MLINKS+= CMS_add0_cert.3 CMS_add1_crl.3
 MLINKS+= CMS_add0_cert.3 CMS_get1_crls.3
 MAN+= CMS_add1_recipient_cert.3
 MLINKS+= CMS_add1_recipient_cert.3 CMS_add0_recipient_key.3
@@ -253,10 +274,12 @@ MLINKS+= CMS_get0_RecipientInfos.3 CMS_RecipientInfo_kekri_get0_id.3
 MLINKS+= CMS_get0_RecipientInfos.3 CMS_RecipientInfo_kekri_id_cmp.3
 MLINKS+= CMS_get0_RecipientInfos.3 CMS_RecipientInfo_set0_key.3
 MLINKS+= CMS_get0_RecipientInfos.3 CMS_RecipientInfo_decrypt.3
+MLINKS+= CMS_get0_RecipientInfos.3 CMS_RecipientInfo_encrypt.3
 MAN+= CMS_get0_SignerInfos.3
 MLINKS+= CMS_get0_SignerInfos.3 CMS_SignerInfo_get0_signer_id.3
+MLINKS+= CMS_get0_SignerInfos.3 CMS_SignerInfo_get0_signature.3
 MLINKS+= CMS_get0_SignerInfos.3 CMS_SignerInfo_cert_cmp.3
-MLINKS+= CMS_get0_SignerInfos.3 CMS_set1_signer_certs.3
+MLINKS+= CMS_get0_SignerInfos.3 CMS_set1_signer_cert.3
 MAN+= CMS_get0_type.3
 MLINKS+= CMS_get0_type.3 CMS_set1_eContentType.3
 MLINKS+= CMS_get0_type.3 CMS_get0_eContentType.3
@@ -269,6 +292,7 @@ MAN+= CMS_sign.3
 MAN+= CMS_sign_receipt.3
 MAN+= CMS_uncompress.3
 MAN+= CMS_verify.3
+MLINKS+= CMS_verify.3 CMS_get0_signers.3
 MAN+= CMS_verify_receipt.3
 MAN+= CONF_modules_free.3
 MLINKS+= CONF_modules_free.3 CONF_modules_finish.3
@@ -280,6 +304,7 @@ MLINKS+= CRYPTO_set_ex_data.3 CRYPTO_get_ex_data.3
 MAN+= DH_generate_key.3
 MLINKS+= DH_generate_key.3 DH_compute_key.3
 MAN+= DH_generate_parameters.3
+MLINKS+= DH_generate_parameters.3 DH_generate_parameters_ex.3
 MLINKS+= DH_generate_parameters.3 DH_check.3
 MAN+= DH_get_ex_new_index.3
 MLINKS+= DH_get_ex_new_index.3 DH_set_ex_data.3
@@ -299,6 +324,7 @@ MLINKS+= DSA_do_sign.3 DSA_do_verify.3
 MAN+= DSA_dup_DH.3
 MAN+= DSA_generate_key.3
 MAN+= DSA_generate_parameters.3
+MLINKS+= DSA_generate_parameters.3 DSA_generate_parameters_ex.3
 MAN+= DSA_get_ex_new_index.3
 MLINKS+= DSA_get_ex_new_index.3 DSA_set_ex_data.3
 MLINKS+= DSA_get_ex_new_index.3 DSA_get_ex_data.3
@@ -313,6 +339,107 @@ MAN+= DSA_sign.3
 MLINKS+= DSA_sign.3 DSA_sign_setup.3
 MLINKS+= DSA_sign.3 DSA_verify.3
 MAN+= DSA_size.3
+MAN+= EC_GFp_simple_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GFp_mont_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GFp_nist_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GFp_nistp224_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GFp_nistp256_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GFp_nistp521_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_GF2m_simple_method.3
+MLINKS+= EC_GFp_simple_method.3 EC_METHOD_get_field_type.3
+MAN+= EC_GROUP_copy.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_dup.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_method_of.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_set_generator.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get0_generator.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_order.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_cofactor.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_set_curve_name.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_curve_name.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_set_asn1_flag.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_asn1_flag.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_set_point_conversion_form.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_point_conversion_form.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get0_seed.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_seed_len.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_set_seed.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_degree.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_check.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_check_discriminant.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_cmp.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_basis_type.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_trinomial_basis.3
+MLINKS+= EC_GROUP_copy.3 EC_GROUP_get_pentanomial_basis.3
+MAN+= EC_GROUP_new.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_free.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_clear_free.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_new_curve_GFp.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_new_curve_GF2m.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_new_by_curve_name.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_set_curve_GFp.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_get_curve_GFp.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_set_curve_GF2m.3
+MLINKS+= EC_GROUP_new.3 EC_GROUP_get_curve_GF2m.3
+MLINKS+= EC_GROUP_new.3 EC_get_builtin_curves.3
+MAN+= EC_KEY_new.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get_flags.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_flags.3
+MLINKS+= EC_KEY_new.3 EC_KEY_clear_flags.3
+MLINKS+= EC_KEY_new.3 EC_KEY_new_by_curve_name.3
+MLINKS+= EC_KEY_new.3 EC_KEY_free.3
+MLINKS+= EC_KEY_new.3 EC_KEY_copy.3
+MLINKS+= EC_KEY_new.3 EC_KEY_dup.3
+MLINKS+= EC_KEY_new.3 EC_KEY_up_ref.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get0_group.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_group.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get0_private_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_private_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get0_public_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_public_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get_enc_flags.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_enc_flags.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get_conv_form.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_conv_form.3
+MLINKS+= EC_KEY_new.3 EC_KEY_get_key_method_data.3
+MLINKS+= EC_KEY_new.3 EC_KEY_insert_key_method_data.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_asn1_flag.3
+MLINKS+= EC_KEY_new.3 EC_KEY_precompute_mult.3
+MLINKS+= EC_KEY_new.3 EC_KEY_generate_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_check_key.3
+MLINKS+= EC_KEY_new.3 EC_KEY_set_public_key_affine_coordinates.3
+MAN+= EC_POINT_add.3
+MLINKS+= EC_POINT_add.3 EC_POINT_dbl.3
+MLINKS+= EC_POINT_add.3 EC_POINT_invert.3
+MLINKS+= EC_POINT_add.3 EC_POINT_is_at_infinity.3
+MLINKS+= EC_POINT_add.3 EC_POINT_is_on_curve.3
+MLINKS+= EC_POINT_add.3 EC_POINT_cmp.3
+MLINKS+= EC_POINT_add.3 EC_POINT_make_affine.3
+MLINKS+= EC_POINT_add.3 EC_POINTs_make_affine.3
+MLINKS+= EC_POINT_add.3 EC_POINTs_mul.3
+MLINKS+= EC_POINT_add.3 EC_POINT_mul.3
+MLINKS+= EC_POINT_add.3 EC_GROUP_precompute_mult.3
+MLINKS+= EC_POINT_add.3 EC_GROUP_have_precompute_mult.3
+MAN+= EC_POINT_new.3
+MLINKS+= EC_POINT_new.3 EC_POINT_free.3
+MLINKS+= EC_POINT_new.3 EC_POINT_clear_free.3
+MLINKS+= EC_POINT_new.3 EC_POINT_copy.3
+MLINKS+= EC_POINT_new.3 EC_POINT_dup.3
+MLINKS+= EC_POINT_new.3 EC_POINT_method_of.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_to_infinity.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_Jprojective_coordinates_GFp.3
+MLINKS+= EC_POINT_new.3 EC_POINT_get_Jprojective_coordinates_GFp.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_affine_coordinates_GFp.3
+MLINKS+= EC_POINT_new.3 EC_POINT_get_affine_coordinates_GFp.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_compressed_coordinates_GFp.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_affine_coordinates_GF2m.3
+MLINKS+= EC_POINT_new.3 EC_POINT_get_affine_coordinates_GF2m.3
+MLINKS+= EC_POINT_new.3 EC_POINT_set_compressed_coordinates_GF2m.3
+MLINKS+= EC_POINT_new.3 EC_POINT_point2oct.3
+MLINKS+= EC_POINT_new.3 EC_POINT_oct2point.3
+MLINKS+= EC_POINT_new.3 EC_POINT_point2bn.3
+MLINKS+= EC_POINT_new.3 EC_POINT_bn2point.3
+MLINKS+= EC_POINT_new.3 EC_POINT_point2hex.3
+MLINKS+= EC_POINT_new.3 EC_POINT_hex2point.3
 MAN+= ERR_GET_LIB.3
 MLINKS+= ERR_GET_LIB.3 ERR_GET_FUNC.3
 MLINKS+= ERR_GET_LIB.3 ERR_GET_REASON.3
@@ -342,6 +469,7 @@ MLINKS+= ERR_print_errors.3 ERR_print_errors_fp.3
 MAN+= ERR_put_error.3
 MLINKS+= ERR_put_error.3 ERR_add_error_data.3
 MAN+= ERR_remove_state.3
+MLINKS+= ERR_remove_state.3 ERR_remove_thread_state.3
 MAN+= ERR_set_mark.3
 MLINKS+= ERR_set_mark.3 ERR_pop_to_mark.3
 MAN+= EVP_BytesToKey.3
@@ -355,6 +483,7 @@ MLINKS+= EVP_DigestInit.3 EVP_MD_CTX_cleanup.3
 MLINKS+= EVP_DigestInit.3 EVP_MD_CTX_destroy.3
 MLINKS+= EVP_DigestInit.3 EVP_MAX_MD_SIZE.3
 MLINKS+= EVP_DigestInit.3 EVP_MD_CTX_copy_ex.3
+MLINKS+= EVP_DigestInit.3 EVP_DigestFinal.3
 MLINKS+= EVP_DigestInit.3 EVP_MD_CTX_copy.3
 MLINKS+= EVP_DigestInit.3 EVP_MD_type.3
 MLINKS+= EVP_DigestInit.3 EVP_MD_pkey_type.3
@@ -436,12 +565,66 @@ MLINKS+= EVP_EncryptInit.3 EVP_CIPHER_CTX_mode.3
 MLINKS+= EVP_EncryptInit.3 EVP_CIPHER_param_to_asn1.3
 MLINKS+= EVP_EncryptInit.3 EVP_CIPHER_asn1_to_param.3
 MLINKS+= EVP_EncryptInit.3 EVP_CIPHER_CTX_set_padding.3
+MLINKS+= EVP_EncryptInit.3 EVP_enc_null.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede3_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede3.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede3_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_des_ede3_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_desx_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc4.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc4_40.3
+MLINKS+= EVP_EncryptInit.3 EVP_idea_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_idea_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_idea_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_idea_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_40_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc2_64_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_bf_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_bf_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_bf_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_bf_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_cast5_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_cast5_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_cast5_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_cast5_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc5_32_12_16_cbc.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc5_32_12_16_ecb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc5_32_12_16_cfb.3
+MLINKS+= EVP_EncryptInit.3 EVP_rc5_32_12_16_ofb.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_128_gcm.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_192_gcm.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_256_gcm.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_128_ccm.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_192_ccm.3
+MLINKS+= EVP_EncryptInit.3 EVP_aes_256_ccm.3
 MAN+= EVP_OpenInit.3
 MLINKS+= EVP_OpenInit.3 EVP_OpenUpdate.3
 MLINKS+= EVP_OpenInit.3 EVP_OpenFinal.3
 MAN+= EVP_PKEY_CTX_ctrl.3
-MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_ctrl.3
-MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_ctrl_str.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_ctrl_str.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_get_default_digest_nid.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_signature_md.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_rsa_padding.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_rsa_pss_saltlen.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_rsa_keygen_bits.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_rsa_keygen_pubexp.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_dsa_paramgen_bits.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_dh_paramgen_prime_len.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_dh_paramgen_generator.3
+MLINKS+= EVP_PKEY_CTX_ctrl.3 EVP_PKEY_CTX_set_ec_paramgen_curve_nid.3
 MAN+= EVP_PKEY_CTX_new.3
 MLINKS+= EVP_PKEY_CTX_new.3 EVP_PKEY_CTX_new_id.3
 MLINKS+= EVP_PKEY_CTX_new.3 EVP_PKEY_CTX_dup.3
@@ -522,10 +705,16 @@ MLINKS+= OPENSSL_VERSION_NUMBER.3 SSLeay_version.3
 MAN+= OPENSSL_config.3
 MLINKS+= OPENSSL_config.3 OPENSSL_no_config.3
 MAN+= OPENSSL_ia32cap.3
+MLINKS+= OPENSSL_ia32cap.3 OPENSSL_ia32cap_loc.3
+MAN+= OPENSSL_instrument_bus.3
+MLINKS+= OPENSSL_instrument_bus.3 OPENSSL_instrument_bus2.3
 MAN+= OPENSSL_load_builtin_modules.3
+MLINKS+= OPENSSL_load_builtin_modules.3 ASN1_add_oid_module.3
+MLINKS+= OPENSSL_load_builtin_modules.3 ENGINE_add_conf_module.3
 MAN+= OpenSSL_add_all_algorithms.3
 MLINKS+= OpenSSL_add_all_algorithms.3 OpenSSL_add_all_ciphers.3
 MLINKS+= OpenSSL_add_all_algorithms.3 OpenSSL_add_all_digests.3
+MLINKS+= OpenSSL_add_all_algorithms.3 EVP_cleanup.3
 MAN+= PEM_write_bio_CMS_stream.3
 MAN+= PEM_write_bio_PKCS7_stream.3
 MAN+= PKCS12_create.3
@@ -535,6 +724,7 @@ MAN+= PKCS7_encrypt.3
 MAN+= PKCS7_sign.3
 MAN+= PKCS7_sign_add_signer.3
 MAN+= PKCS7_verify.3
+MLINKS+= PKCS7_verify.3 PKCS7_get0_signers.3
 MAN+= RAND_add.3
 MLINKS+= RAND_add.3 RAND_seed.3
 MLINKS+= RAND_add.3 RAND_status.3
@@ -544,6 +734,8 @@ MAN+= RAND_bytes.3
 MLINKS+= RAND_bytes.3 RAND_pseudo_bytes.3
 MAN+= RAND_cleanup.3
 MAN+= RAND_egd.3
+MLINKS+= RAND_egd.3 RAND_egd_bytes.3
+MLINKS+= RAND_egd.3 RAND_query_egd_bytes.3
 MAN+= RAND_load_file.3
 MLINKS+= RAND_load_file.3 RAND_write_file.3
 MLINKS+= RAND_load_file.3 RAND_file_name.3
@@ -554,6 +746,7 @@ MAN+= RSA_blinding_on.3
 MLINKS+= RSA_blinding_on.3 RSA_blinding_off.3
 MAN+= RSA_check_key.3
 MAN+= RSA_generate_key.3
+MLINKS+= RSA_generate_key.3 RSA_generate_key_ex.3
 MAN+= RSA_get_ex_new_index.3
 MLINKS+= RSA_get_ex_new_index.3 RSA_set_ex_data.3
 MLINKS+= RSA_get_ex_new_index.3 RSA_get_ex_data.3
@@ -598,6 +791,7 @@ MAN+= SMIME_read_CMS.3
 MAN+= SMIME_read_PKCS7.3
 MAN+= SMIME_write_CMS.3
 MAN+= SMIME_write_PKCS7.3
+MAN+= SSLeay_version.3
 MAN+= X509_NAME_ENTRY_get_object.3
 MLINKS+= X509_NAME_ENTRY_get_object.3 X509_NAME_ENTRY_get_data.3
 MLINKS+= X509_NAME_ENTRY_get_object.3 X509_NAME_ENTRY_set_object.3
@@ -653,6 +847,17 @@ MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_get_depth.3
 MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set_time.3
 MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_add0_policy.3
 MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set1_policies.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set1_host.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_add1_host.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set_hostflags.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_get0_peername.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set1_email.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set1_ip.3
+MLINKS+= X509_VERIFY_PARAM_set_flags.3 X509_VERIFY_PARAM_set1_ip_asc.3
+MAN+= X509_check_host.3
+MLINKS+= X509_check_host.3 X509_check_email.3
+MLINKS+= X509_check_host.3 X509_check_ip.3
+MLINKS+= X509_check_host.3 X509_check_ip_asc.3
 MAN+= X509_new.3
 MLINKS+= X509_new.3 X509_free.3
 MAN+= X509_verify_cert.3
@@ -720,8 +925,18 @@ MLINKS+= d2i_DSAPublicKey.3 d2i_DSAPrivateKey.3
 MLINKS+= d2i_DSAPublicKey.3 i2d_DSAPrivateKey.3
 MLINKS+= d2i_DSAPublicKey.3 d2i_DSA_PUBKEY.3
 MLINKS+= d2i_DSAPublicKey.3 i2d_DSA_PUBKEY.3
+MLINKS+= d2i_DSAPublicKey.3 d2i_DSAparams.3
+MLINKS+= d2i_DSAPublicKey.3 i2d_DSAparams.3
 MLINKS+= d2i_DSAPublicKey.3 d2i_DSA_SIG.3
 MLINKS+= d2i_DSAPublicKey.3 i2d_DSA_SIG.3
+MAN+= d2i_ECPKParameters.3
+MLINKS+= d2i_ECPKParameters.3 i2d_ECPKParameters.3
+MLINKS+= d2i_ECPKParameters.3 d2i_ECPKParameters_bio.3
+MLINKS+= d2i_ECPKParameters.3 i2d_ECPKParameters_bio.3
+MLINKS+= d2i_ECPKParameters.3 d2i_ECPKParameters_fp.3
+MLINKS+= d2i_ECPKParameters.3 i2d_ECPKParameters_fp.3
+MLINKS+= d2i_ECPKParameters.3 ECPKParameters_print.3
+MLINKS+= d2i_ECPKParameters.3 ECPKParameters_print_fp.3
 MAN+= d2i_ECPrivateKey.3
 MLINKS+= d2i_ECPrivateKey.3 i2d_ECPrivateKey.3
 MLINKS+= d2i_ECPrivateKey.3 d2i_ECPrivate_key.3
@@ -751,7 +966,7 @@ MLINKS+= d2i_X509_ALGOR.3 i2d_X509_ALGOR.3
 MAN+= d2i_X509_CRL.3
 MLINKS+= d2i_X509_CRL.3 i2d_X509_CRL.3
 MLINKS+= d2i_X509_CRL.3 d2i_X509_CRL_bio.3
-MLINKS+= d2i_X509_CRL.3 d2i_509_CRL_fp.3
+MLINKS+= d2i_X509_CRL.3 d2i_X509_CRL_fp.3
 MLINKS+= d2i_X509_CRL.3 i2d_X509_CRL_bio.3
 MLINKS+= d2i_X509_CRL.3 i2d_X509_CRL_fp.3
 MAN+= d2i_X509_NAME.3
@@ -800,15 +1015,31 @@ MLINKS+= des.3 DES_enc_write.3
 MAN+= des_modes.7
 MAN+= dh.3
 MAN+= dsa.3
+MAN+= ec.3
 MAN+= ecdsa.3
+MLINKS+= ecdsa.3 ECDSA_SIG_new.3
+MLINKS+= ecdsa.3 ECDSA_SIG_free.3
+MLINKS+= ecdsa.3 i2d_ECDSA_SIG.3
+MLINKS+= ecdsa.3 d2i_ECDSA_SIG.3
+MLINKS+= ecdsa.3 ECDSA_size.3
+MLINKS+= ecdsa.3 ECDSA_sign_setup.3
+MLINKS+= ecdsa.3 ECDSA_sign.3
+MLINKS+= ecdsa.3 ECDSA_sign_ex.3
+MLINKS+= ecdsa.3 ECDSA_verify.3
+MLINKS+= ecdsa.3 ECDSA_do_sign.3
+MLINKS+= ecdsa.3 ECDSA_do_sign_ex.3
+MLINKS+= ecdsa.3 ECDSA_do_verify.3
 MAN+= engine.3
 MAN+= err.3
 MAN+= evp.3
 MAN+= hmac.3
 MLINKS+= hmac.3 HMAC.3
+MLINKS+= hmac.3 HMAC_CTX_init.3
 MLINKS+= hmac.3 HMAC_Init.3
+MLINKS+= hmac.3 HMAC_Init_ex.3
 MLINKS+= hmac.3 HMAC_Update.3
 MLINKS+= hmac.3 HMAC_Final.3
+MLINKS+= hmac.3 HMAC_CTX_cleanup.3
 MLINKS+= hmac.3 HMAC_cleanup.3
 MAN+= i2d_CMS_bio_stream.3
 MAN+= i2d_PKCS7_bio_stream.3
@@ -928,6 +1159,22 @@ MLINKS+= sha.3 SHA1.3
 MLINKS+= sha.3 SHA1_Init.3
 MLINKS+= sha.3 SHA1_Update.3
 MLINKS+= sha.3 SHA1_Final.3
+MLINKS+= sha.3 SHA224.3
+MLINKS+= sha.3 SHA224_Init.3
+MLINKS+= sha.3 SHA224_Update.3
+MLINKS+= sha.3 SHA224_Final.3
+MLINKS+= sha.3 SHA256.3
+MLINKS+= sha.3 SHA256_Init.3
+MLINKS+= sha.3 SHA256_Update.3
+MLINKS+= sha.3 SHA256_Final.3
+MLINKS+= sha.3 SHA384.3
+MLINKS+= sha.3 SHA384_Init.3
+MLINKS+= sha.3 SHA384_Update.3
+MLINKS+= sha.3 SHA384_Final.3
+MLINKS+= sha.3 SHA512.3
+MLINKS+= sha.3 SHA512_Init.3
+MLINKS+= sha.3 SHA512_Update.3
+MLINKS+= sha.3 SHA512_Final.3
 MAN+= threads.3
 MLINKS+= threads.3 CRYPTO_THREADID_set_callback.3
 MLINKS+= threads.3 CRYPTO_THREADID_get_callback.3
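
The EC_GROUP/EC_KEY/EC_POINT pages added above are new with this import, and the ecp_nistz256-x86_64.s module added to libcrypto accelerates P-256 underneath them. A hedged sketch of the key-generation flow those pages describe (the wrapper name is this example's own):

    #include <openssl/ec.h>
    #include <openssl/obj_mac.h>

    /* Sketch: generate a P-256 key using the interfaces covered by the
     * new EC_KEY_new.3 page; the caller owns (and must free) the key. */
    static EC_KEY *make_p256_key(void)
    {
        EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
        if (key != NULL && !EC_KEY_generate_key(key)) {
            EC_KEY_free(key);
            key = NULL;
        }
        return key;
    }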
diff --git a/secure/lib/libcrypto/asm/Makefile b/secure/lib/libcrypto/asm/Makefile
index 139a84d..be5cfc1 100644
@@ -1,6 +1,5 @@
 # This file automatically generates the asm .s files after importing a new
-# version of OpenSSL.  You need to run it with MACHINE_ARCH=x86_64
-# to get everything generated properly.
+# version of OpenSSL.
 
 OPENSSL_SRC=   ../../../../crypto/openssl
 
@@ -10,6 +9,7 @@ OPENSSL_SRC=   ../../../../crypto/openssl
        ${OPENSSL_SRC}/crypto/bn/asm \
        ${OPENSSL_SRC}/crypto/camellia/asm \
        ${OPENSSL_SRC}/crypto/des/asm \
+       ${OPENSSL_SRC}/crypto/ec/asm \
        ${OPENSSL_SRC}/crypto/md5/asm \
        ${OPENSSL_SRC}/crypto/modes/asm \
        ${OPENSSL_SRC}/crypto/perlasm \
@@ -18,16 +18,20 @@ OPENSSL_SRC=        ../../../../crypto/openssl
        ${OPENSSL_SRC}/crypto/sha/asm \
        ${OPENSSL_SRC}/crypto/whrlpool/asm
 
-.if ${MACHINE_ARCH} == "x86_64"
 # cpuid
 SRCS=  x86_64cpuid.pl
 
 # bn
-SRCS+= x86_64-mont.pl x86_64-mont5.pl x86_64-gf2m.pl modexp512-x86_64.pl
+SRCS+= x86_64-mont.pl x86_64-mont5.pl x86_64-gf2m.pl \
+       rsaz-x86_64.pl rsaz-avx2.pl
+
+# ec
+SRCS+= ecp_nistz256-x86_64.pl
 
 # aes
 SRCS+= aes-x86_64.pl vpaes-x86_64.pl bsaes-x86_64.pl \
-       aesni-x86_64.pl aesni-sha1-x86_64.pl
+       aesni-x86_64.pl aesni-sha1-x86_64.pl aesni-sha256-x86_64.pl \
+       aesni-mb-x86_64.pl
 
 # rc4
 SRCS+= rc4-x86_64.pl rc4-md5-x86_64.pl
@@ -36,7 +40,8 @@ SRCS+=        rc4-x86_64.pl rc4-md5-x86_64.pl
 SRCS+= md5-x86_64.pl
 
 # sha
-SRCS+= sha1-x86_64.pl sha256-x86_64.s sha512-x86_64.pl
+SRCS+= sha1-x86_64.pl sha256-x86_64.s sha512-x86_64.pl \
+       sha1-mb-x86_64.pl sha256-mb-x86_64.pl
 
 # whrlpool
 SRCS+= wp-x86_64.pl
@@ -45,10 +50,9 @@ SRCS+=       wp-x86_64.pl
 SRCS+= cmll-x86_64.pl
 
 # modes
-SRCS+= ghash-x86_64.pl
+SRCS+= ghash-x86_64.pl aesni-gcm-x86_64.pl
 
 PERLFLAGS=
-.endif
 
 all:   ${SRCS:S/.pl$/.s/}
 
diff --git a/secure/lib/libcrypto/asm/aes-x86_64.s b/secure/lib/libcrypto/asm/aes-x86_64.s
index 023f805..3cb86d6 100644
@@ -81,8 +81,8 @@ _x86_64_AES_encrypt:
        movl    0(%r14,%rdi,8),%edi
        movl    0(%r14,%rbp,8),%ebp
 
-       andl    $65280,%edi
-       andl    $65280,%ebp
+       andl    $0x0000ff00,%edi
+       andl    $0x0000ff00,%ebp
 
        xorl    %edi,%r10d
        xorl    %ebp,%r11d
@@ -94,8 +94,8 @@ _x86_64_AES_encrypt:
        movl    0(%r14,%rsi,8),%esi
        movl    0(%r14,%rdi,8),%edi
 
-       andl    $65280,%esi
-       andl    $65280,%edi
+       andl    $0x0000ff00,%esi
+       andl    $0x0000ff00,%edi
        shrl    $16,%ebx
        xorl    %esi,%r12d
        xorl    %edi,%r8d
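
Much of the churn in this file is perlasm now emitting immediates in hex rather than decimal; the values are unchanged. A quick C check of the masks rewritten in this and the following hunks:

    #include <assert.h>

    int main(void)
    {
        assert(65280u      == 0x0000ff00u);   /* byte-1 mask */
        assert(16711680u   == 0x00ff0000u);   /* byte-2 mask */
        assert(4278190080u == 0xff000000u);   /* byte-3 mask */
        assert(2155905152u == 0x80808080u);   /* xtime high bits */
        assert(4278124286u == 0xfefefefeu);
        assert(454761243u  == 0x1b1b1b1bu);
        assert(960 == 0x3c0 && 768 == 0x300); /* alignment masks */
        return 0;
    }
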
@@ -108,9 +108,9 @@ _x86_64_AES_encrypt:
        movl    0(%r14,%rdi,8),%edi
        movl    0(%r14,%rbp,8),%ebp
 
-       andl    $16711680,%esi
-       andl    $16711680,%edi
-       andl    $16711680,%ebp
+       andl    $0x00ff0000,%esi
+       andl    $0x00ff0000,%edi
+       andl    $0x00ff0000,%ebp
 
        xorl    %esi,%r10d
        xorl    %edi,%r11d
@@ -123,9 +123,9 @@ _x86_64_AES_encrypt:
        movl    2(%r14,%rdi,8),%edi
        movl    2(%r14,%rbp,8),%ebp
 
-       andl    $16711680,%esi
-       andl    $4278190080,%edi
-       andl    $4278190080,%ebp
+       andl    $0x00ff0000,%esi
+       andl    $0xff000000,%edi
+       andl    $0xff000000,%ebp
 
        xorl    %esi,%r8d
        xorl    %edi,%r10d
@@ -138,8 +138,8 @@ _x86_64_AES_encrypt:
        movl    2(%r14,%rdi,8),%edi
        movl    16+0(%r15),%eax
 
-       andl    $4278190080,%esi
-       andl    $4278190080,%edi
+       andl    $0xff000000,%esi
+       andl    $0xff000000,%edi
 
        xorl    %esi,%r12d
        xorl    %edi,%r8d
@@ -150,7 +150,7 @@ _x86_64_AES_encrypt:
        xorl    %r11d,%ebx
        xorl    %r12d,%ecx
        xorl    %r8d,%edx
-.byte  0xf3,0xc3                       
+.byte  0xf3,0xc3
 .size  _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
 .type  _x86_64_AES_encrypt_compact,@function
 .align 16
@@ -175,90 +175,88 @@ _x86_64_AES_encrypt_compact:
        movzbl  %al,%r10d
        movzbl  %bl,%r11d
        movzbl  %cl,%r12d
-       movzbl  (%r14,%r10,1),%r10d
-       movzbl  (%r14,%r11,1),%r11d
-       movzbl  (%r14,%r12,1),%r12d
-
        movzbl  %dl,%r8d
        movzbl  %bh,%esi
        movzbl  %ch,%edi
+       shrl    $16,%ecx
+       movzbl  %dh,%ebp
+       movzbl  (%r14,%r10,1),%r10d
+       movzbl  (%r14,%r11,1),%r11d
+       movzbl  (%r14,%r12,1),%r12d
        movzbl  (%r14,%r8,1),%r8d
-       movzbl  (%r14,%rsi,1),%r9d
-       movzbl  (%r14,%rdi,1),%r13d
 
-       movzbl  %dh,%ebp
+       movzbl  (%r14,%rsi,1),%r9d
        movzbl  %ah,%esi
-       shrl    $16,%ecx
+       movzbl  (%r14,%rdi,1),%r13d
+       movzbl  %cl,%edi
        movzbl  (%r14,%rbp,1),%ebp
        movzbl  (%r14,%rsi,1),%esi
-       shrl    $16,%edx
 
-       movzbl  %cl,%edi
        shll    $8,%r9d
+       shrl    $16,%edx
        shll    $8,%r13d
-       movzbl  (%r14,%rdi,1),%edi
        xorl    %r9d,%r10d
-       xorl    %r13d,%r11d
-
-       movzbl  %dl,%r9d
        shrl    $16,%eax
+       movzbl  %dl,%r9d
        shrl    $16,%ebx
-       movzbl  %al,%r13d
+       xorl    %r13d,%r11d
        shll    $8,%ebp
-       shll    $8,%esi
-       movzbl  (%r14,%r9,1),%r9d
-       movzbl  (%r14,%r13,1),%r13d
+       movzbl  %al,%r13d
+       movzbl  (%r14,%rdi,1),%edi
        xorl    %ebp,%r12d
-       xorl    %esi,%r8d
 
+       shll    $8,%esi
        movzbl  %bl,%ebp
-       movzbl  %dh,%esi
        shll    $16,%edi
-       movzbl  (%r14,%rbp,1),%ebp
-       movzbl  (%r14,%rsi,1),%esi
+       xorl    %esi,%r8d
+       movzbl  (%r14,%r9,1),%r9d
+       movzbl  %dh,%esi
+       movzbl  (%r14,%r13,1),%r13d
        xorl    %edi,%r10d
 
-       movzbl  %ah,%edi
        shrl    $8,%ecx
+       movzbl  %ah,%edi
+       shll    $16,%r9d
        shrl    $8,%ebx
+       shll    $16,%r13d
+       xorl    %r9d,%r11d
+       movzbl  (%r14,%rbp,1),%ebp
+       movzbl  (%r14,%rsi,1),%esi
        movzbl  (%r14,%rdi,1),%edi
        movzbl  (%r14,%rcx,1),%edx
        movzbl  (%r14,%rbx,1),%ecx
-       shll    $16,%r9d
-       shll    $16,%r13d
+
        shll    $16,%ebp
-       xorl    %r9d,%r11d
        xorl    %r13d,%r12d
-       xorl    %ebp,%r8d
-
        shll    $24,%esi
+       xorl    %ebp,%r8d
        shll    $24,%edi
-       shll    $24,%edx
        xorl    %esi,%r10d
-       shll    $24,%ecx
+       shll    $24,%edx
        xorl    %edi,%r11d
+       shll    $24,%ecx
        movl    %r10d,%eax
        movl    %r11d,%ebx
        xorl    %r12d,%ecx
        xorl    %r8d,%edx
        cmpq    16(%rsp),%r15
        je      .Lenc_compact_done
-       movl    %eax,%esi
-       movl    %ebx,%edi
-       andl    $2155905152,%esi
-       andl    $2155905152,%edi
-       movl    %esi,%r10d
-       movl    %edi,%r11d
+       movl    $0x80808080,%r10d
+       movl    $0x80808080,%r11d
+       andl    %eax,%r10d
+       andl    %ebx,%r11d
+       movl    %r10d,%esi
+       movl    %r11d,%edi
        shrl    $7,%r10d
        leal    (%rax,%rax,1),%r8d
        shrl    $7,%r11d
        leal    (%rbx,%rbx,1),%r9d
        subl    %r10d,%esi
        subl    %r11d,%edi
-       andl    $4278124286,%r8d
-       andl    $4278124286,%r9d
-       andl    $454761243,%esi
-       andl    $454761243,%edi
+       andl    $0xfefefefe,%r8d
+       andl    $0xfefefefe,%r9d
+       andl    $0x1b1b1b1b,%esi
+       andl    $0x1b1b1b1b,%edi
        movl    %eax,%r10d
        movl    %ebx,%r11d
        xorl    %esi,%r8d
@@ -266,57 +264,57 @@ _x86_64_AES_encrypt_compact:
 
        xorl    %r8d,%eax
        xorl    %r9d,%ebx
-       movl    %ecx,%esi
-       movl    %edx,%edi
+       movl    $0x80808080,%r12d
        roll    $24,%eax
+       movl    $0x80808080,%ebp
        roll    $24,%ebx
-       andl    $2155905152,%esi
-       andl    $2155905152,%edi
+       andl    %ecx,%r12d
+       andl    %edx,%ebp
        xorl    %r8d,%eax
        xorl    %r9d,%ebx
-       movl    %esi,%r12d
-       movl    %edi,%ebp
+       movl    %r12d,%esi
        rorl    $16,%r10d
+       movl    %ebp,%edi
        rorl    $16,%r11d
-       shrl    $7,%r12d
        leal    (%rcx,%rcx,1),%r8d
+       shrl    $7,%r12d
        xorl    %r10d,%eax
-       xorl    %r11d,%ebx
        shrl    $7,%ebp
-       leal    (%rdx,%rdx,1),%r9d
+       xorl    %r11d,%ebx
        rorl    $8,%r10d
+       leal    (%rdx,%rdx,1),%r9d
        rorl    $8,%r11d
        subl    %r12d,%esi
        subl    %ebp,%edi
        xorl    %r10d,%eax
        xorl    %r11d,%ebx
 
-       andl    $4278124286,%r8d
-       andl    $4278124286,%r9d
-       andl    $454761243,%esi
-       andl    $454761243,%edi
+       andl    $0xfefefefe,%r8d
+       andl    $0xfefefefe,%r9d
+       andl    $0x1b1b1b1b,%esi
+       andl    $0x1b1b1b1b,%edi
        movl    %ecx,%r12d
        movl    %edx,%ebp
        xorl    %esi,%r8d
        xorl    %edi,%r9d
 
+       rorl    $16,%r12d
        xorl    %r8d,%ecx
+       rorl    $16,%ebp
        xorl    %r9d,%edx
        roll    $24,%ecx
+       movl    0(%r14),%esi
        roll    $24,%edx
        xorl    %r8d,%ecx
-       xorl    %r9d,%edx
-       movl    0(%r14),%esi
-       rorl    $16,%r12d
-       rorl    $16,%ebp
        movl    64(%r14),%edi
-       xorl    %r12d,%ecx
-       xorl    %ebp,%edx
+       xorl    %r9d,%edx
        movl    128(%r14),%r8d
+       xorl    %r12d,%ecx
        rorl    $8,%r12d
+       xorl    %ebp,%edx
        rorl    $8,%ebp
-       movl    192(%r14),%r9d
        xorl    %r12d,%ecx
+       movl    192(%r14),%r9d
        xorl    %ebp,%edx
        jmp     .Lenc_loop_compact
 .align 16
@@ -325,7 +323,7 @@ _x86_64_AES_encrypt_compact:
        xorl    4(%r15),%ebx
        xorl    8(%r15),%ecx
        xorl    12(%r15),%edx
-.byte  0xf3,0xc3                       
+.byte  0xf3,0xc3
 .size  _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
 .globl AES_encrypt
 .type  AES_encrypt,@function
@@ -347,7 +345,7 @@ AES_encrypt:
        andq    $-64,%rsp
        subq    %rsp,%rcx
        negq    %rcx
-       andq    $960,%rcx
+       andq    $0x3c0,%rcx
        subq    %rcx,%rsp
        subq    $32,%rsp
 
@@ -372,7 +370,7 @@ AES_encrypt:
        leaq    .LAES_Te+2048(%rip),%r14
        leaq    768(%rsp),%rbp
        subq    %r14,%rbp
-       andq    $768,%rbp
+       andq    $0x300,%rbp
        leaq    (%r14,%rbp,1),%r14
 
        call    _x86_64_AES_encrypt_compact
@@ -547,7 +545,7 @@ _x86_64_AES_decrypt:
        xorl    %r11d,%ebx
        xorl    %r12d,%ecx
        xorl    %r8d,%edx
-.byte  0xf3,0xc3                       
+.byte  0xf3,0xc3
 .size  _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
 .type  _x86_64_AES_decrypt_compact,@function
 .align 16
@@ -573,70 +571,69 @@ _x86_64_AES_decrypt_compact:
        movzbl  %al,%r10d
        movzbl  %bl,%r11d
        movzbl  %cl,%r12d
-       movzbl  (%r14,%r10,1),%r10d
-       movzbl  (%r14,%r11,1),%r11d
-       movzbl  (%r14,%r12,1),%r12d
-
        movzbl  %dl,%r8d
        movzbl  %dh,%esi
        movzbl  %ah,%edi
+       shrl    $16,%edx
+       movzbl  %bh,%ebp
+       movzbl  (%r14,%r10,1),%r10d
+       movzbl  (%r14,%r11,1),%r11d
+       movzbl  (%r14,%r12,1),%r12d
        movzbl  (%r14,%r8,1),%r8d
-       movzbl  (%r14,%rsi,1),%r9d
-       movzbl  (%r14,%rdi,1),%r13d
 
-       movzbl  %bh,%ebp
+       movzbl  (%r14,%rsi,1),%r9d
        movzbl  %ch,%esi
-       shrl    $16,%ecx
+       movzbl  (%r14,%rdi,1),%r13d
        movzbl  (%r14,%rbp,1),%ebp
        movzbl  (%r14,%rsi,1),%esi
-       shrl    $16,%edx
 
-       movzbl  %cl,%edi
-       shll    $8,%r9d
+       shrl    $16,%ecx
        shll    $8,%r13d
-       movzbl  (%r14,%rdi,1),%edi
-       xorl    %r9d,%r10d
-       xorl    %r13d,%r11d
-
-       movzbl  %dl,%r9d
+       shll    $8,%r9d
+       movzbl  %cl,%edi
        shrl    $16,%eax
+       xorl    %r9d,%r10d
        shrl    $16,%ebx
-       movzbl  %al,%r13d
+       movzbl  %dl,%r9d
+
        shll    $8,%ebp
+       xorl    %r13d,%r11d
        shll    $8,%esi
-       movzbl  (%r14,%r9,1),%r9d
-       movzbl  (%r14,%r13,1),%r13d
+       movzbl  %al,%r13d
+       movzbl  (%r14,%rdi,1),%edi
        xorl    %ebp,%r12d
-       xorl    %esi,%r8d
-
        movzbl  %bl,%ebp
-       movzbl  %bh,%esi
+
        shll    $16,%edi
+       xorl    %esi,%r8d
+       movzbl  (%r14,%r9,1),%r9d
+       movzbl  %bh,%esi
        movzbl  (%r14,%rbp,1),%ebp
-       movzbl  (%r14,%rsi,1),%esi
        xorl    %edi,%r10d
-
+       movzbl  (%r14,%r13,1),%r13d
        movzbl  %ch,%edi
+
+       shll    $16,%ebp
        shll    $16,%r9d
        shll    $16,%r13d
-       movzbl  (%r14,%rdi,1),%ebx
+       xorl    %ebp,%r8d
+       movzbl  %dh,%ebp
        xorl    %r9d,%r11d
+       shrl    $8,%eax
        xorl    %r13d,%r12d
 
-       movzbl  %dh,%edi
-       shrl    $8,%eax
-       shll    $16,%ebp
-       movzbl  (%r14,%rdi,1),%ecx
+       movzbl  (%r14,%rsi,1),%esi
+       movzbl  (%r14,%rdi,1),%ebx
+       movzbl  (%r14,%rbp,1),%ecx
        movzbl  (%r14,%rax,1),%edx
-       xorl    %ebp,%r8d
 
+       movl    %r10d,%eax
        shll    $24,%esi
        shll    $24,%ebx
        shll    $24,%ecx
-       xorl    %esi,%r10d
+       xorl    %esi,%eax
        shll    $24,%edx
        xorl    %r11d,%ebx
-       movl    %r10d,%eax
        xorl    %r12d,%ecx
        xorl    %r8d,%edx
        cmpq    16(%rsp),%r15
@@ -649,12 +646,12 @@ _x86_64_AES_decrypt_compact:
        orq     %rbx,%rax
        orq     %rdx,%rcx
        movq    256+16(%r14),%rbp
-       movq    %rax,%rbx
-       movq    %rcx,%rdx
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r9
-       movq    %rdx,%r12
+       movq    %rsi,%r9
+       movq    %rsi,%r12
+       andq    %rax,%r9
+       andq    %rcx,%r12
+       movq    %r9,%rbx
+       movq    %r12,%rdx
        shrq    $7,%r9
        leaq    (%rax,%rax,1),%r8
        shrq    $7,%r12
@@ -665,15 +662,15 @@ _x86_64_AES_decrypt_compact:
        andq    %rdi,%r11
        andq    %rbp,%rbx
        andq    %rbp,%rdx
-       xorq    %r8,%rbx
-       xorq    %r11,%rdx
-       movq    %rbx,%r8
-       movq    %rdx,%r11
-
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r10
-       movq    %rdx,%r13
+       xorq    %rbx,%r8
+       xorq    %rdx,%r11
+       movq    %rsi,%r10
+       movq    %rsi,%r13
+
+       andq    %r8,%r10
+       andq    %r11,%r13
+       movq    %r10,%rbx
+       movq    %r13,%rdx
        shrq    $7,%r10
        leaq    (%r8,%r8,1),%r9
        shrq    $7,%r13
@@ -684,15 +681,15 @@ _x86_64_AES_decrypt_compact:
        andq    %rdi,%r12
        andq    %rbp,%rbx
        andq    %rbp,%rdx
-       xorq    %r9,%rbx
-       xorq    %r12,%rdx
-       movq    %rbx,%r9
-       movq    %rdx,%r12
-
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r10
-       movq    %rdx,%r13
+       xorq    %rbx,%r9
+       xorq    %rdx,%r12
+       movq    %rsi,%r10
+       movq    %rsi,%r13
+
+       andq    %r9,%r10
+       andq    %r12,%r13
+       movq    %r10,%rbx
+       movq    %r13,%rdx
        shrq    $7,%r10
        xorq    %rax,%r8
        shrq    $7,%r13
@@ -717,51 +714,51 @@ _x86_64_AES_decrypt_compact:
        movq    %rax,%rbx
        movq    %rcx,%rdx
        xorq    %r10,%r9
-       xorq    %r13,%r12
        shrq    $32,%rbx
+       xorq    %r13,%r12
        shrq    $32,%rdx
        xorq    %r8,%r10
-       xorq    %r11,%r13
        roll    $8,%eax
+       xorq    %r11,%r13
        roll    $8,%ecx
        xorq    %r9,%r10
+       roll    $8,%ebx
        xorq    %r12,%r13
 
-       roll    $8,%ebx
        roll    $8,%edx
        xorl    %r10d,%eax
-       xorl    %r13d,%ecx
        shrq    $32,%r10
+       xorl    %r13d,%ecx
        shrq    $32,%r13
        xorl    %r10d,%ebx
        xorl    %r13d,%edx
 
        movq    %r8,%r10
-       movq    %r11,%r13
-       shrq    $32,%r10
-       shrq    $32,%r13
        roll    $24,%r8d
+       movq    %r11,%r13
        roll    $24,%r11d
-       roll    $24,%r10d
-       roll    $24,%r13d
+       shrq    $32,%r10
        xorl    %r8d,%eax
+       shrq    $32,%r13
        xorl    %r11d,%ecx
+       roll    $24,%r10d
        movq    %r9,%r8
+       roll    $24,%r13d
        movq    %r12,%r11
+       shrq    $32,%r8
        xorl    %r10d,%ebx
+       shrq    $32,%r11
        xorl    %r13d,%edx
 
        movq    0(%r14),%rsi
-       shrq    $32,%r8
-       shrq    $32,%r11
-       movq    64(%r14),%rdi
        roll    $16,%r9d
+       movq    64(%r14),%rdi
        roll    $16,%r12d
        movq    128(%r14),%rbp
        roll    $16,%r8d
-       roll    $16,%r11d
        movq    192(%r14),%r10
        xorl    %r9d,%eax
+       roll    $16,%r11d
        xorl    %r12d,%ecx
        movq    256(%r14),%r13
        xorl    %r8d,%ebx
@@ -773,7 +770,7 @@ _x86_64_AES_decrypt_compact:
        xorl    4(%r15),%ebx
        xorl    8(%r15),%ecx
        xorl    12(%r15),%edx
-.byte  0xf3,0xc3                       
+.byte  0xf3,0xc3
 .size  _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
 .globl AES_decrypt
 .type  AES_decrypt,@function
@@ -795,7 +792,7 @@ AES_decrypt:
        andq    $-64,%rsp
        subq    %rsp,%rcx
        negq    %rcx
-       andq    $960,%rcx
+       andq    $0x3c0,%rcx
        subq    %rcx,%rsp
        subq    $32,%rsp
 
@@ -820,7 +817,7 @@ AES_decrypt:
        leaq    .LAES_Td+2048(%rip),%r14
        leaq    768(%rsp),%rbp
        subq    %r14,%rbp
-       andq    $768,%rbp
+       andq    $0x300,%rbp
        leaq    (%r14,%rbp,1),%r14
        shrq    $3,%rbp
        addq    %rbp,%r14
@@ -859,10 +856,6 @@ private_AES_set_encrypt_key:
 
        call    _x86_64_AES_set_encrypt_key
 
-       movq    8(%rsp),%r15
-       movq    16(%rsp),%r14
-       movq    24(%rsp),%r13
-       movq    32(%rsp),%r12
        movq    40(%rsp),%rbp
        movq    48(%rsp),%rbx
        addq    $56,%rsp
@@ -1107,7 +1100,7 @@ _x86_64_AES_set_encrypt_key:
 .Lbadpointer:
        movq    $-1,%rax
 .Lexit:
-.byte  0xf3,0xc3                       
+.byte  0xf3,0xc3
 .size  _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
 .globl private_AES_set_decrypt_key
 .type  private_AES_set_decrypt_key,@function
@@ -1160,12 +1153,12 @@ private_AES_set_decrypt_key:
        leaq    16(%r15),%r15
        movq    0(%r15),%rax
        movq    8(%r15),%rcx
-       movq    %rax,%rbx
-       movq    %rcx,%rdx
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r9
-       movq    %rdx,%r12
+       movq    %rsi,%r9
+       movq    %rsi,%r12
+       andq    %rax,%r9
+       andq    %rcx,%r12
+       movq    %r9,%rbx
+       movq    %r12,%rdx
        shrq    $7,%r9
        leaq    (%rax,%rax,1),%r8
        shrq    $7,%r12
@@ -1176,15 +1169,15 @@ private_AES_set_decrypt_key:
        andq    %rdi,%r11
        andq    %rbp,%rbx
        andq    %rbp,%rdx
-       xorq    %r8,%rbx
-       xorq    %r11,%rdx
-       movq    %rbx,%r8
-       movq    %rdx,%r11
-
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r10
-       movq    %rdx,%r13
+       xorq    %rbx,%r8
+       xorq    %rdx,%r11
+       movq    %rsi,%r10
+       movq    %rsi,%r13
+
+       andq    %r8,%r10
+       andq    %r11,%r13
+       movq    %r10,%rbx
+       movq    %r13,%rdx
        shrq    $7,%r10
        leaq    (%r8,%r8,1),%r9
        shrq    $7,%r13
@@ -1195,15 +1188,15 @@ private_AES_set_decrypt_key:
        andq    %rdi,%r12
        andq    %rbp,%rbx
        andq    %rbp,%rdx
-       xorq    %r9,%rbx
-       xorq    %r12,%rdx
-       movq    %rbx,%r9
-       movq    %rdx,%r12
-
-       andq    %rsi,%rbx
-       andq    %rsi,%rdx
-       movq    %rbx,%r10
-       movq    %rdx,%r13
+       xorq    %rbx,%r9
+       xorq    %rdx,%r12
+       movq    %rsi,%r10
+       movq    %rsi,%r13
+
+       andq    %r9,%r10
+       andq    %r12,%r13
+       movq    %r10,%rbx
+       movq    %r13,%rdx
        shrq    $7,%r10
        xorq    %rax,%r8
        shrq    $7,%r13
@@ -1228,51 +1221,51 @@ private_AES_set_decrypt_key:
        movq    %rax,%rbx
        movq    %rcx,%rdx
        xorq    %r10,%r9
-       xorq    %r13,%r12
        shrq    $32,%rbx
+       xorq    %r13,%r12
        shrq    $32,%rdx
        xorq    %r8,%r10
-       xorq    %r11,%r13
        roll    $8,%eax
+       xorq    %r11,%r13
        roll    $8,%ecx
        xorq    %r9,%r10
+       roll    $8,%ebx
        xorq    %r12,%r13
 
-       roll    $8,%ebx
        roll    $8,%edx
        xorl    %r10d,%eax
-       xorl    %r13d,%ecx
        shrq    $32,%r10
+       xorl    %r13d,%ecx
        shrq    $32,%r13
        xorl    %r10d,%ebx
        xorl    %r13d,%edx
 
        movq    %r8,%r10
-       movq    %r11,%r13
-       shrq    $32,%r10
-       shrq    $32,%r13
        roll    $24,%r8d
+       movq    %r11,%r13
        roll    $24,%r11d
-       roll    $24,%r10d
-       roll    $24,%r13d
+       shrq    $32,%r10
        xorl    %r8d,%eax
+       shrq    $32,%r13
        xorl    %r11d,%ecx
+       roll    $24,%r10d
        movq    %r9,%r8
+       roll    $24,%r13d
        movq    %r12,%r11
+       shrq    $32,%r8
        xorl    %r10d,%ebx
+       shrq    $32,%r11
        xorl    %r13d,%edx
 
 
-       shrq    $32,%r8
-       shrq    $32,%r11
-
        roll    $16,%r9d
+
        roll    $16,%r12d
 
        roll    $16,%r8d
-       roll    $16,%r11d
 
        xorl    %r9d,%eax
+       roll    $16,%r11d
        xorl    %r12d,%ecx
 
        xorl    %r8d,%ebx
@@ -1340,9 +1333,9 @@ AES_cbc_encrypt:
        movq    %r14,%r10
        leaq    2304(%r14),%r11
        movq    %r15,%r12
-       andq    $4095,%r10
-       andq    $4095,%r11
-       andq    $4095,%r12
+       andq    $0xFFF,%r10
+       andq    $0xFFF,%r11
+       andq    $0xFFF,%r12
 
        cmpq    %r11,%r12
        jb      .Lcbc_te_break_out
@@ -1351,7 +1344,7 @@ AES_cbc_encrypt:
        jmp     .Lcbc_te_ok
 .Lcbc_te_break_out:
        subq    %r10,%r12
-       andq    $4095,%r12
+       andq    $0xFFF,%r12
        addq    $320,%r12
        subq    %r12,%r15
 .align 4
@@ -1377,7 +1370,7 @@ AES_cbc_encrypt:
 
        movq    %r15,%r10
        subq    %r14,%r10
-       andq    $4095,%r10
+       andq    $0xfff,%r10
        cmpq    $2304,%r10
        jb      .Lcbc_do_ecopy
        cmpq    $4096-248,%r10
@@ -1388,7 +1381,7 @@ AES_cbc_encrypt:
        leaq    80(%rsp),%rdi
        leaq    80(%rsp),%r15
        movl    $30,%ecx
-.long  0x90A548F3      
+.long  0x90A548F3
        movl    %eax,(%rdi)
 .Lcbc_skip_ecopy:
        movq    %r15,0(%rsp)
@@ -1550,7 +1543,7 @@ AES_cbc_encrypt:
        je      .Lcbc_exit
        movl    $30,%ecx
        xorq    %rax,%rax
-.long  0x90AB48F3      
+.long  0x90AB48F3
 
        jmp     .Lcbc_exit
 
@@ -1564,7 +1557,7 @@ AES_cbc_encrypt:
        leaq    -88-63(%rcx),%r10
        subq    %rbp,%r10
        negq    %r10
-       andq    $960,%r10
+       andq    $0x3c0,%r10
        subq    %r10,%rbp
 
        xchgq   %rsp,%rbp
@@ -1593,7 +1586,7 @@ AES_cbc_encrypt:
        leaq    2048(%r14),%r14
        leaq    768-8(%rsp),%rax
        subq    %r14,%rax
-       andq    $768,%rax
+       andq    $0x300,%rax
        leaq    (%r14,%rax,1),%r14
 
        cmpq    $0,%rbx
@@ -1605,7 +1598,7 @@ AES_cbc_encrypt:
        movl    4(%rbp),%ebx
        movl    8(%rbp),%ecx
        movl    12(%rbp),%edx
-       jz      .Lcbc_slow_enc_tail     
+       jz      .Lcbc_slow_enc_tail
 
 .align 4
 .Lcbc_slow_enc_loop:
@@ -1650,16 +1643,16 @@ AES_cbc_encrypt:
        movq    %r10,%rcx
        movq    %r8,%rsi
        movq    %r9,%rdi
-.long  0x9066A4F3              
+.long  0x9066A4F3
        movq    $16,%rcx
        subq    %r10,%rcx
        xorq    %rax,%rax
-.long  0x9066AAF3              
+.long  0x9066AAF3
        movq    %r9,%r8
        movq    $16,%r10
        movq    %r11,%rax
        movq    %r12,%rcx
-       jmp     .Lcbc_slow_enc_loop     
+       jmp     .Lcbc_slow_enc_loop
 
 .align 16
 .LSLOW_DECRYPT:
@@ -1735,7 +1728,7 @@ AES_cbc_encrypt:
        movq    %r9,%rdi
        leaq    64(%rsp),%rsi
        leaq    16(%r10),%rcx
-.long  0x9066A4F3      
+.long  0x9066A4F3
        jmp     .Lcbc_exit
 
 .align 16
diff --git a/secure/lib/libcrypto/asm/aesni-gcm-x86_64.s b/secure/lib/libcrypto/asm/aesni-gcm-x86_64.s
new file mode 100644
index 0000000..7eaaaa0
--- /dev/null
@@ -0,0 +1,16 @@
+.text  
+
+.globl aesni_gcm_encrypt
+.type  aesni_gcm_encrypt,@function
+aesni_gcm_encrypt:
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+.size  aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl aesni_gcm_decrypt
+.type  aesni_gcm_decrypt,@function
+aesni_gcm_decrypt:
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+.size  aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.section .note.GNU-stack,"",%progbits
diff --git a/secure/lib/libcrypto/asm/aesni-mb-x86_64.s b/secure/lib/libcrypto/asm/aesni-mb-x86_64.s
new file mode 100644
index 0000000..7c8d2e6
--- /dev/null
@@ -0,0 +1,507 @@
+.text  
+
+
+
+.globl aesni_multi_cbc_encrypt
+.type  aesni_multi_cbc_encrypt,@function
+.align 32
+aesni_multi_cbc_encrypt:
+       movq    %rsp,%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+
+
+
+
+
+       subq    $48,%rsp
+       andq    $-64,%rsp
+       movq    %rax,16(%rsp)
+
+.Lenc4x_body:
+       movdqu  (%rsi),%xmm12
+       leaq    120(%rsi),%rsi
+       leaq    80(%rdi),%rdi
+
+.Lenc4x_loop_grande:
+       movl    %edx,24(%rsp)
+       xorl    %edx,%edx
+       movl    -64(%rdi),%ecx
+       movq    -80(%rdi),%r8
+       cmpl    %edx,%ecx
+       movq    -72(%rdi),%r12
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  -56(%rdi),%xmm2
+       movl    %ecx,32(%rsp)
+       cmovleq %rsp,%r8
+       movl    -24(%rdi),%ecx
+       movq    -40(%rdi),%r9
+       cmpl    %edx,%ecx
+       movq    -32(%rdi),%r13
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  -16(%rdi),%xmm3
+       movl    %ecx,36(%rsp)
+       cmovleq %rsp,%r9
+       movl    16(%rdi),%ecx
+       movq    0(%rdi),%r10
+       cmpl    %edx,%ecx
+       movq    8(%rdi),%r14
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  24(%rdi),%xmm4
+       movl    %ecx,40(%rsp)
+       cmovleq %rsp,%r10
+       movl    56(%rdi),%ecx
+       movq    40(%rdi),%r11
+       cmpl    %edx,%ecx
+       movq    48(%rdi),%r15
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  64(%rdi),%xmm5
+       movl    %ecx,44(%rsp)
+       cmovleq %rsp,%r11
+       testl   %edx,%edx
+       jz      .Lenc4x_done
+
+       movups  16-120(%rsi),%xmm1
+       pxor    %xmm12,%xmm2
+       movups  32-120(%rsi),%xmm0
+       pxor    %xmm12,%xmm3
+       movl    240-120(%rsi),%eax
+       pxor    %xmm12,%xmm4
+       movdqu  (%r8),%xmm6
+       pxor    %xmm12,%xmm5
+       movdqu  (%r9),%xmm7
+       pxor    %xmm6,%xmm2
+       movdqu  (%r10),%xmm8
+       pxor    %xmm7,%xmm3
+       movdqu  (%r11),%xmm9
+       pxor    %xmm8,%xmm4
+       pxor    %xmm9,%xmm5
+       movdqa  32(%rsp),%xmm10
+       xorq    %rbx,%rbx
+       jmp     .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+       addq    $16,%rbx
+       leaq    16(%rsp),%rbp
+       movl    $1,%ecx
+       subq    %rbx,%rbp
+
+.byte  102,15,56,220,209
+       prefetcht0      31(%r8,%rbx,1)
+       prefetcht0      31(%r9,%rbx,1)
+.byte  102,15,56,220,217
+       prefetcht0      31(%r10,%rbx,1)
+       prefetcht0      31(%r10,%rbx,1)
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  48-120(%rsi),%xmm1
+       cmpl    32(%rsp),%ecx
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+       cmovgeq %rbp,%r8
+       cmovgq  %rbp,%r12
+.byte  102,15,56,220,232
+       movups  -56(%rsi),%xmm0
+       cmpl    36(%rsp),%ecx
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+       cmovgeq %rbp,%r9
+       cmovgq  %rbp,%r13
+.byte  102,15,56,220,233
+       movups  -40(%rsi),%xmm1
+       cmpl    40(%rsp),%ecx
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+       cmovgeq %rbp,%r10
+       cmovgq  %rbp,%r14
+.byte  102,15,56,220,232
+       movups  -24(%rsi),%xmm0
+       cmpl    44(%rsp),%ecx
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+       cmovgeq %rbp,%r11
+       cmovgq  %rbp,%r15
+.byte  102,15,56,220,233
+       movups  -8(%rsi),%xmm1
+       movdqa  %xmm10,%xmm11
+.byte  102,15,56,220,208
+       prefetcht0      15(%r12,%rbx,1)
+       prefetcht0      15(%r13,%rbx,1)
+.byte  102,15,56,220,216
+       prefetcht0      15(%r14,%rbx,1)
+       prefetcht0      15(%r15,%rbx,1)
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  128-120(%rsi),%xmm0
+       pxor    %xmm12,%xmm12
+
+.byte  102,15,56,220,209
+       pcmpgtd %xmm12,%xmm11
+       movdqu  -120(%rsi),%xmm12
+.byte  102,15,56,220,217
+       paddd   %xmm11,%xmm10
+       movdqa  %xmm10,32(%rsp)
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  144-120(%rsi),%xmm1
+
+       cmpl    $11,%eax
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  160-120(%rsi),%xmm0
+
+       jb      .Lenc4x_tail
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  176-120(%rsi),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  192-120(%rsi),%xmm0
+
+       je      .Lenc4x_tail
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movups  208-120(%rsi),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movups  224-120(%rsi),%xmm0
+       jmp     .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movdqu  (%r8,%rbx,1),%xmm6
+       movdqu  16-120(%rsi),%xmm1
+
+.byte  102,15,56,221,208
+       movdqu  (%r9,%rbx,1),%xmm7
+       pxor    %xmm12,%xmm6
+.byte  102,15,56,221,216
+       movdqu  (%r10,%rbx,1),%xmm8
+       pxor    %xmm12,%xmm7
+.byte  102,15,56,221,224
+       movdqu  (%r11,%rbx,1),%xmm9
+       pxor    %xmm12,%xmm8
+.byte  102,15,56,221,232
+       movdqu  32-120(%rsi),%xmm0
+       pxor    %xmm12,%xmm9
+
+       movups  %xmm2,-16(%r12,%rbx,1)
+       pxor    %xmm6,%xmm2
+       movups  %xmm3,-16(%r13,%rbx,1)
+       pxor    %xmm7,%xmm3
+       movups  %xmm4,-16(%r14,%rbx,1)
+       pxor    %xmm8,%xmm4
+       movups  %xmm5,-16(%r15,%rbx,1)
+       pxor    %xmm9,%xmm5
+
+       decl    %edx
+       jnz     .Loop_enc4x
+
+       movq    16(%rsp),%rax
+       movl    24(%rsp),%edx
+
+
+
+
+
+
+
+
+
+
+       leaq    160(%rdi),%rdi
+       decl    %edx
+       jnz     .Lenc4x_loop_grande
+
+.Lenc4x_done:
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Lenc4x_epilogue:
+       .byte   0xf3,0xc3
+.size  aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type  aesni_multi_cbc_decrypt,@function
+.align 32
+aesni_multi_cbc_decrypt:
+       movq    %rsp,%rax
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+
+
+
+
+
+       subq    $48,%rsp
+       andq    $-64,%rsp
+       movq    %rax,16(%rsp)
+
+.Ldec4x_body:
+       movdqu  (%rsi),%xmm12
+       leaq    120(%rsi),%rsi
+       leaq    80(%rdi),%rdi
+
+.Ldec4x_loop_grande:
+       movl    %edx,24(%rsp)
+       xorl    %edx,%edx
+       movl    -64(%rdi),%ecx
+       movq    -80(%rdi),%r8
+       cmpl    %edx,%ecx
+       movq    -72(%rdi),%r12
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  -56(%rdi),%xmm6
+       movl    %ecx,32(%rsp)
+       cmovleq %rsp,%r8
+       movl    -24(%rdi),%ecx
+       movq    -40(%rdi),%r9
+       cmpl    %edx,%ecx
+       movq    -32(%rdi),%r13
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  -16(%rdi),%xmm7
+       movl    %ecx,36(%rsp)
+       cmovleq %rsp,%r9
+       movl    16(%rdi),%ecx
+       movq    0(%rdi),%r10
+       cmpl    %edx,%ecx
+       movq    8(%rdi),%r14
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  24(%rdi),%xmm8
+       movl    %ecx,40(%rsp)
+       cmovleq %rsp,%r10
+       movl    56(%rdi),%ecx
+       movq    40(%rdi),%r11
+       cmpl    %edx,%ecx
+       movq    48(%rdi),%r15
+       cmovgl  %ecx,%edx
+       testl   %ecx,%ecx
+       movdqu  64(%rdi),%xmm9
+       movl    %ecx,44(%rsp)
+       cmovleq %rsp,%r11
+       testl   %edx,%edx
+       jz      .Ldec4x_done
+
+       movups  16-120(%rsi),%xmm1
+       movups  32-120(%rsi),%xmm0
+       movl    240-120(%rsi),%eax
+       movdqu  (%r8),%xmm2
+       movdqu  (%r9),%xmm3
+       pxor    %xmm12,%xmm2
+       movdqu  (%r10),%xmm4
+       pxor    %xmm12,%xmm3
+       movdqu  (%r11),%xmm5
+       pxor    %xmm12,%xmm4
+       pxor    %xmm12,%xmm5
+       movdqa  32(%rsp),%xmm10
+       xorq    %rbx,%rbx
+       jmp     .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+       addq    $16,%rbx
+       leaq    16(%rsp),%rbp
+       movl    $1,%ecx
+       subq    %rbx,%rbp
+
+.byte  102,15,56,222,209
+       prefetcht0      31(%r8,%rbx,1)
+       prefetcht0      31(%r9,%rbx,1)
+.byte  102,15,56,222,217
+       prefetcht0      31(%r10,%rbx,1)
+       prefetcht0      31(%r11,%rbx,1)
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  48-120(%rsi),%xmm1
+       cmpl    32(%rsp),%ecx
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+       cmovgeq %rbp,%r8
+       cmovgq  %rbp,%r12
+.byte  102,15,56,222,232
+       movups  -56(%rsi),%xmm0
+       cmpl    36(%rsp),%ecx
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+       cmovgeq %rbp,%r9
+       cmovgq  %rbp,%r13
+.byte  102,15,56,222,233
+       movups  -40(%rsi),%xmm1
+       cmpl    40(%rsp),%ecx
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+       cmovgeq %rbp,%r10
+       cmovgq  %rbp,%r14
+.byte  102,15,56,222,232
+       movups  -24(%rsi),%xmm0
+       cmpl    44(%rsp),%ecx
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+       cmovgeq %rbp,%r11
+       cmovgq  %rbp,%r15
+.byte  102,15,56,222,233
+       movups  -8(%rsi),%xmm1
+       movdqa  %xmm10,%xmm11
+.byte  102,15,56,222,208
+       prefetcht0      15(%r12,%rbx,1)
+       prefetcht0      15(%r13,%rbx,1)
+.byte  102,15,56,222,216
+       prefetcht0      15(%r14,%rbx,1)
+       prefetcht0      15(%r15,%rbx,1)
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  128-120(%rsi),%xmm0
+       pxor    %xmm12,%xmm12
+
+.byte  102,15,56,222,209
+       pcmpgtd %xmm12,%xmm11
+       movdqu  -120(%rsi),%xmm12
+.byte  102,15,56,222,217
+       paddd   %xmm11,%xmm10
+       movdqa  %xmm10,32(%rsp)
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  144-120(%rsi),%xmm1
+
+       cmpl    $11,%eax
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  160-120(%rsi),%xmm0
+
+       jb      .Ldec4x_tail
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  176-120(%rsi),%xmm1
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  192-120(%rsi),%xmm0
+
+       je      .Ldec4x_tail
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+.byte  102,15,56,222,233
+       movups  208-120(%rsi),%xmm1
+
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       movups  224-120(%rsi),%xmm0
+       jmp     .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,222,225
+       pxor    %xmm0,%xmm6
+       pxor    %xmm0,%xmm7
+.byte  102,15,56,222,233
+       movdqu  16-120(%rsi),%xmm1
+       pxor    %xmm0,%xmm8
+       pxor    %xmm0,%xmm9
+       movdqu  32-120(%rsi),%xmm0
+
+.byte  102,15,56,223,214
+.byte  102,15,56,223,223
+       movdqu  -16(%r8,%rbx,1),%xmm6
+       movdqu  -16(%r9,%rbx,1),%xmm7
+.byte  102,65,15,56,223,224
+.byte  102,65,15,56,223,233
+       movdqu  -16(%r10,%rbx,1),%xmm8
+       movdqu  -16(%r11,%rbx,1),%xmm9
+
+       movups  %xmm2,-16(%r12,%rbx,1)
+       movdqu  (%r8,%rbx,1),%xmm2
+       movups  %xmm3,-16(%r13,%rbx,1)
+       movdqu  (%r9,%rbx,1),%xmm3
+       pxor    %xmm12,%xmm2
+       movups  %xmm4,-16(%r14,%rbx,1)
+       movdqu  (%r10,%rbx,1),%xmm4
+       pxor    %xmm12,%xmm3
+       movups  %xmm5,-16(%r15,%rbx,1)
+       movdqu  (%r11,%rbx,1),%xmm5
+       pxor    %xmm12,%xmm4
+       pxor    %xmm12,%xmm5
+
+       decl    %edx
+       jnz     .Loop_dec4x
+
+       movq    16(%rsp),%rax
+       movl    24(%rsp),%edx
+
+       leaq    160(%rdi),%rdi
+       decl    %edx
+       jnz     .Ldec4x_loop_grande
+
+.Ldec4x_done:
+       movq    -48(%rax),%r15
+       movq    -40(%rax),%r14
+       movq    -32(%rax),%r13
+       movq    -24(%rax),%r12
+       movq    -16(%rax),%rbp
+       movq    -8(%rax),%rbx
+       leaq    (%rax),%rsp
+.Ldec4x_epilogue:
+       .byte   0xf3,0xc3
+.size  aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.section .note.GNU-stack,"",%progbits
diff --git a/secure/lib/libcrypto/asm/aesni-sha1-x86_64.s b/secure/lib/libcrypto/asm/aesni-sha1-x86_64.s
index a65ae57..53e360a 100644
@@ -3,16 +3,18 @@
 
 .globl aesni_cbc_sha1_enc
 .type  aesni_cbc_sha1_enc,@function
-.align 16
+.align 32
 aesni_cbc_sha1_enc:
 
        movl    OPENSSL_ia32cap_P+0(%rip),%r10d
-       movl    OPENSSL_ia32cap_P+4(%rip),%r11d
+       movq    OPENSSL_ia32cap_P+4(%rip),%r11
+       btq     $61,%r11
+       jc      aesni_cbc_sha1_enc_shaext
        jmp     aesni_cbc_sha1_enc_ssse3
        .byte   0xf3,0xc3
 .size  aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
 .type  aesni_cbc_sha1_enc_ssse3,@function
-.align 16
+.align 32
 aesni_cbc_sha1_enc_ssse3:
        movq    8(%rsp),%r10
 
@@ -29,12 +31,12 @@ aesni_cbc_sha1_enc_ssse3:
        movq    %rdi,%r12
        movq    %rsi,%r13
        movq    %rdx,%r14
-       movq    %rcx,%r15
-       movdqu  (%r8),%xmm11
+       leaq    112(%rcx),%r15
+       movdqu  (%r8),%xmm2
        movq    %r8,88(%rsp)
        shlq    $6,%r14
        subq    %r12,%r13
-       movl    240(%r15),%r8d
+       movl    240-112(%r15),%r8d
        addq    %r10,%r14
 
        leaq    K_XX_XX(%rip),%r11
@@ -44,1188 +46,1168 @@ aesni_cbc_sha1_enc_ssse3:
        movl    12(%r9),%edx
        movl    %ebx,%esi
        movl    16(%r9),%ebp
+       movl    %ecx,%edi
+       xorl    %edx,%edi
+       andl    %edi,%esi
 
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
-       movdqu  0(%r10),%xmm0
-       movdqu  16(%r10),%xmm1
-       movdqu  32(%r10),%xmm2
-       movdqu  48(%r10),%xmm3
-.byte  102,15,56,0,198
+       movdqa  64(%r11),%xmm3
+       movdqa  0(%r11),%xmm13
+       movdqu  0(%r10),%xmm4
+       movdqu  16(%r10),%xmm5
+       movdqu  32(%r10),%xmm6
+       movdqu  48(%r10),%xmm7
+.byte  102,15,56,0,227
+.byte  102,15,56,0,235
+.byte  102,15,56,0,243
        addq    $64,%r10
-.byte  102,15,56,0,206
-.byte  102,15,56,0,214
-.byte  102,15,56,0,222
-       paddd   %xmm9,%xmm0
-       paddd   %xmm9,%xmm1
-       paddd   %xmm9,%xmm2
-       movdqa  %xmm0,0(%rsp)
-       psubd   %xmm9,%xmm0
-       movdqa  %xmm1,16(%rsp)
-       psubd   %xmm9,%xmm1
-       movdqa  %xmm2,32(%rsp)
-       psubd   %xmm9,%xmm2
-       movups  (%r15),%xmm13
-       movups  16(%r15),%xmm14
+       paddd   %xmm13,%xmm4
+.byte  102,15,56,0,251
+       paddd   %xmm13,%xmm5
+       paddd   %xmm13,%xmm6
+       movdqa  %xmm4,0(%rsp)
+       psubd   %xmm13,%xmm4
+       movdqa  %xmm5,16(%rsp)
+       psubd   %xmm13,%xmm5
+       movdqa  %xmm6,32(%rsp)
+       psubd   %xmm13,%xmm6
+       movups  -112(%r15),%xmm15
+       movups  16-112(%r15),%xmm0
        jmp     .Loop_ssse3
-.align 16
+.align 32
 .Loop_ssse3:
-       movdqa  %xmm1,%xmm4
-       addl    0(%rsp),%ebp
-       movups  0(%r12),%xmm12
-       xorps   %xmm13,%xmm12
-       xorps   %xmm12,%xmm11
-.byte  102,69,15,56,220,222
-       movups  32(%r15),%xmm15
-       xorl    %edx,%ecx
-       movdqa  %xmm3,%xmm8
-.byte  102,15,58,15,224,8
+       rorl    $2,%ebx
+       movups  0(%r12),%xmm14
+       xorps   %xmm15,%xmm14
+       xorps   %xmm14,%xmm2
+       movups  -80(%r15),%xmm1
+.byte  102,15,56,220,208
+       pshufd  $238,%xmm4,%xmm8
+       xorl    %edx,%esi
+       movdqa  %xmm7,%xmm12
+       paddd   %xmm7,%xmm13
        movl    %eax,%edi
+       addl    0(%rsp),%ebp
+       punpcklqdq      %xmm5,%xmm8
+       xorl    %ecx,%ebx
        roll    $5,%eax
-       paddd   %xmm3,%xmm9
-       andl    %ecx,%esi
-       xorl    %edx,%ecx
-       psrldq  $4,%xmm8
-       xorl    %edx,%esi
-       addl    %eax,%ebp
-       pxor    %xmm0,%xmm4
-       rorl    $2,%ebx
        addl    %esi,%ebp
-       pxor    %xmm2,%xmm8
-       addl    4(%rsp),%edx
-       xorl    %ecx,%ebx
-       movl    %ebp,%esi
-       roll    $5,%ebp
-       pxor    %xmm8,%xmm4
+       psrldq  $4,%xmm12
        andl    %ebx,%edi
        xorl    %ecx,%ebx
-       movdqa  %xmm9,48(%rsp)
-       xorl    %ecx,%edi
-.byte  102,69,15,56,220,223
-       movups  48(%r15),%xmm14
-       addl    %ebp,%edx
-       movdqa  %xmm4,%xmm10
-       movdqa  %xmm4,%xmm8
+       pxor    %xmm4,%xmm8
+       addl    %eax,%ebp
        rorl    $7,%eax
-       addl    %edi,%edx
-       addl    8(%rsp),%ecx
+       pxor    %xmm6,%xmm12
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    4(%rsp),%edx
+       pxor    %xmm12,%xmm8
        xorl    %ebx,%eax
-       pslldq  $12,%xmm10
-       paddd   %xmm4,%xmm4
-       movl    %edx,%edi
-       roll    $5,%edx
+       roll    $5,%ebp
+       movdqa  %xmm13,48(%rsp)
+       addl    %edi,%edx
+       movups  -64(%r15),%xmm0
+.byte  102,15,56,220,209
        andl    %eax,%esi
+       movdqa  %xmm8,%xmm3
        xorl    %ebx,%eax
-       psrld   $31,%xmm8
-       xorl    %ebx,%esi
-       addl    %edx,%ecx
-       movdqa  %xmm10,%xmm9
+       addl    %ebp,%edx
        rorl    $7,%ebp
-       addl    %esi,%ecx
-       psrld   $30,%xmm10
-       por     %xmm8,%xmm4
-       addl    12(%rsp),%ebx
+       movdqa  %xmm8,%xmm12
+       xorl    %ebx,%esi
+       pslldq  $12,%xmm3
+       paddd   %xmm8,%xmm8
+       movl    %edx,%edi
+       addl    8(%rsp),%ecx
+       psrld   $31,%xmm12
        xorl    %eax,%ebp
-       movl    %ecx,%esi
-       roll    $5,%ecx
-.byte  102,69,15,56,220,222
-       movups  64(%r15),%xmm15
-       pslld   $2,%xmm9
-       pxor    %xmm10,%xmm4
+       roll    $5,%edx
+       addl    %esi,%ecx
+       movdqa  %xmm3,%xmm13
        andl    %ebp,%edi
        xorl    %eax,%ebp
-       movdqa  0(%r11),%xmm10
-       xorl    %eax,%edi
-       addl    %ecx,%ebx
-       pxor    %xmm9,%xmm4
+       psrld   $30,%xmm3
+       addl    %edx,%ecx
        rorl    $7,%edx
-       addl    %edi,%ebx
-       movdqa  %xmm2,%xmm5
-       addl    16(%rsp),%eax
+       por     %xmm12,%xmm8
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    12(%rsp),%ebx
+       movups  -48(%r15),%xmm1
+.byte  102,15,56,220,208
+       pslld   $2,%xmm13
+       pxor    %xmm3,%xmm8
        xorl    %ebp,%edx
-       movdqa  %xmm4,%xmm9
-.byte  102,15,58,15,233,8
-       movl    %ebx,%edi
-       roll    $5,%ebx
-       paddd   %xmm4,%xmm10
+       movdqa  0(%r11),%xmm3
+       roll    $5,%ecx
+       addl    %edi,%ebx
        andl    %edx,%esi
+       pxor    %xmm13,%xmm8
        xorl    %ebp,%edx
-       psrldq  $4,%xmm9
-       xorl    %ebp,%esi
-       addl    %ebx,%eax
-       pxor    %xmm1,%xmm5
+       addl    %ecx,%ebx
        rorl    $7,%ecx
-       addl    %esi,%eax
-       pxor    %xmm3,%xmm9
-       addl    20(%rsp),%ebp
-.byte  102,69,15,56,220,223
-       movups  80(%r15),%xmm14
+       pshufd  $238,%xmm5,%xmm9
+       xorl    %ebp,%esi
+       movdqa  %xmm8,%xmm13
+       paddd   %xmm8,%xmm3
+       movl    %ebx,%edi
+       addl    16(%rsp),%eax
+       punpcklqdq      %xmm6,%xmm9
        xorl    %edx,%ecx
-       movl    %eax,%esi
-       roll    $5,%eax
-       pxor    %xmm9,%xmm5
+       roll    $5,%ebx
+       addl    %esi,%eax
+       psrldq  $4,%xmm13
        andl    %ecx,%edi
        xorl    %edx,%ecx
-       movdqa  %xmm10,0(%rsp)
-       xorl    %edx,%edi
-       addl    %eax,%ebp
-       movdqa  %xmm5,%xmm8
-       movdqa  %xmm5,%xmm9
+       pxor    %xmm5,%xmm9
+       addl    %ebx,%eax
        rorl    $7,%ebx
-       addl    %edi,%ebp
-       addl    24(%rsp),%edx
+       movups  -32(%r15),%xmm0
+.byte  102,15,56,220,209
+       pxor    %xmm7,%xmm13
+       xorl    %edx,%edi
+       movl    %eax,%esi
+       addl    20(%rsp),%ebp
+       pxor    %xmm13,%xmm9
        xorl    %ecx,%ebx
-       pslldq  $12,%xmm8
-       paddd   %xmm5,%xmm5
-       movl    %ebp,%edi
-       roll    $5,%ebp
+       roll    $5,%eax
+       movdqa  %xmm3,0(%rsp)
+       addl    %edi,%ebp
        andl    %ebx,%esi
+       movdqa  %xmm9,%xmm12
        xorl    %ecx,%ebx
-       psrld   $31,%xmm9
-       xorl    %ecx,%esi
-.byte  102,69,15,56,220,222
-       movups  96(%r15),%xmm15
-       addl    %ebp,%edx
-       movdqa  %xmm8,%xmm10
+       addl    %eax,%ebp
        rorl    $7,%eax
-       addl    %esi,%edx
-       psrld   $30,%xmm8
-       por     %xmm9,%xmm5
-       addl    28(%rsp),%ecx
+       movdqa  %xmm9,%xmm13
+       xorl    %ecx,%esi
+       pslldq  $12,%xmm12
+       paddd   %xmm9,%xmm9
+       movl    %ebp,%edi
+       addl    24(%rsp),%edx
+       psrld   $31,%xmm13
        xorl    %ebx,%eax
-       movl    %edx,%esi
-       roll    $5,%edx
-       pslld   $2,%xmm10
-       pxor    %xmm8,%xmm5
+       roll    $5,%ebp
+       addl    %esi,%edx
+       movups  -16(%r15),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm12,%xmm3
        andl    %eax,%edi
        xorl    %ebx,%eax
-       movdqa  16(%r11),%xmm8
-       xorl    %ebx,%edi
-       addl    %edx,%ecx
-       pxor    %xmm10,%xmm5
+       psrld   $30,%xmm12
+       addl    %ebp,%edx
        rorl    $7,%ebp
-       addl    %edi,%ecx
-       movdqa  %xmm3,%xmm6
-       addl    32(%rsp),%ebx
+       por     %xmm13,%xmm9
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    28(%rsp),%ecx
+       pslld   $2,%xmm3
+       pxor    %xmm12,%xmm9
        xorl    %eax,%ebp
-       movdqa  %xmm5,%xmm10
-.byte  102,15,58,15,242,8
-       movl    %ecx,%edi
-       roll    $5,%ecx
-.byte  102,69,15,56,220,223
-       movups  112(%r15),%xmm14
-       paddd   %xmm5,%xmm8
+       movdqa  16(%r11),%xmm12
+       roll    $5,%edx
+       addl    %edi,%ecx
        andl    %ebp,%esi
+       pxor    %xmm3,%xmm9
        xorl    %eax,%ebp
-       psrldq  $4,%xmm10
-       xorl    %eax,%esi
-       addl    %ecx,%ebx
-       pxor    %xmm2,%xmm6
+       addl    %edx,%ecx
        rorl    $7,%edx
-       addl    %esi,%ebx
-       pxor    %xmm4,%xmm10
-       addl    36(%rsp),%eax
+       pshufd  $238,%xmm6,%xmm10
+       xorl    %eax,%esi
+       movdqa  %xmm9,%xmm3
+       paddd   %xmm9,%xmm12
+       movl    %ecx,%edi
+       addl    32(%rsp),%ebx
+       movups  0(%r15),%xmm0
+.byte  102,15,56,220,209
+       punpcklqdq      %xmm7,%xmm10
        xorl    %ebp,%edx
-       movl    %ebx,%esi
-       roll    $5,%ebx
-       pxor    %xmm10,%xmm6
+       roll    $5,%ecx
+       addl    %esi,%ebx
+       psrldq  $4,%xmm3
        andl    %edx,%edi
        xorl    %ebp,%edx
-       movdqa  %xmm8,16(%rsp)
-       xorl    %ebp,%edi
-       addl    %ebx,%eax
-       movdqa  %xmm6,%xmm9
-       movdqa  %xmm6,%xmm10
+       pxor    %xmm6,%xmm10
+       addl    %ecx,%ebx
        rorl    $7,%ecx
-       addl    %edi,%eax
-       addl    40(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  128(%r15),%xmm15
+       pxor    %xmm8,%xmm3
+       xorl    %ebp,%edi
+       movl    %ebx,%esi
+       addl    36(%rsp),%eax
+       pxor    %xmm3,%xmm10
        xorl    %edx,%ecx
-       pslldq  $12,%xmm9
-       paddd   %xmm6,%xmm6
-       movl    %eax,%edi
-       roll    $5,%eax
+       roll    $5,%ebx
+       movdqa  %xmm12,16(%rsp)
+       addl    %edi,%eax
        andl    %ecx,%esi
+       movdqa  %xmm10,%xmm13
        xorl    %edx,%ecx
-       psrld   $31,%xmm10
-       xorl    %edx,%esi
-       addl    %eax,%ebp
-       movdqa  %xmm9,%xmm8
+       addl    %ebx,%eax
        rorl    $7,%ebx
-       addl    %esi,%ebp
-       psrld   $30,%xmm9
-       por     %xmm10,%xmm6
-       addl    44(%rsp),%edx
+       movups  16(%r15),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm10,%xmm3
+       xorl    %edx,%esi
+       pslldq  $12,%xmm13
+       paddd   %xmm10,%xmm10
+       movl    %eax,%edi
+       addl    40(%rsp),%ebp
+       psrld   $31,%xmm3
        xorl    %ecx,%ebx
-       movl    %ebp,%esi
-       roll    $5,%ebp
-       pslld   $2,%xmm8
-       pxor    %xmm9,%xmm6
+       roll    $5,%eax
+       addl    %esi,%ebp
+       movdqa  %xmm13,%xmm12
        andl    %ebx,%edi
        xorl    %ecx,%ebx
-       movdqa  16(%r11),%xmm9
-       xorl    %ecx,%edi
-.byte  102,69,15,56,220,223
-       movups  144(%r15),%xmm14
-       addl    %ebp,%edx
-       pxor    %xmm8,%xmm6
+       psrld   $30,%xmm13
+       addl    %eax,%ebp
        rorl    $7,%eax
-       addl    %edi,%edx
-       movdqa  %xmm4,%xmm7
-       addl    48(%rsp),%ecx
+       por     %xmm3,%xmm10
+       xorl    %ecx,%edi
+       movl    %ebp,%esi
+       addl    44(%rsp),%edx
+       pslld   $2,%xmm12
+       pxor    %xmm13,%xmm10
        xorl    %ebx,%eax
-       movdqa  %xmm6,%xmm8
-.byte  102,15,58,15,251,8
-       movl    %edx,%edi
-       roll    $5,%edx
-       paddd   %xmm6,%xmm9
+       movdqa  16(%r11),%xmm13
+       roll    $5,%ebp
+       addl    %edi,%edx
+       movups  32(%r15),%xmm0
+.byte  102,15,56,220,209
        andl    %eax,%esi
+       pxor    %xmm12,%xmm10
        xorl    %ebx,%eax
-       psrldq  $4,%xmm8
-       xorl    %ebx,%esi
-       addl    %edx,%ecx
-       pxor    %xmm3,%xmm7
+       addl    %ebp,%edx
        rorl    $7,%ebp
-       addl    %esi,%ecx
-       pxor    %xmm5,%xmm8
-       addl    52(%rsp),%ebx
+       pshufd  $238,%xmm7,%xmm11
+       xorl    %ebx,%esi
+       movdqa  %xmm10,%xmm12
+       paddd   %xmm10,%xmm13
+       movl    %edx,%edi
+       addl    48(%rsp),%ecx
+       punpcklqdq      %xmm8,%xmm11
        xorl    %eax,%ebp
-       movl    %ecx,%esi
-       roll    $5,%ecx
-.byte  102,69,15,56,220,222
-       movups  160(%r15),%xmm15
-       pxor    %xmm8,%xmm7
+       roll    $5,%edx
+       addl    %esi,%ecx
+       psrldq  $4,%xmm12
        andl    %ebp,%edi
        xorl    %eax,%ebp
-       movdqa  %xmm9,32(%rsp)
-       xorl    %eax,%edi
-       addl    %ecx,%ebx
-       movdqa  %xmm7,%xmm10
-       movdqa  %xmm7,%xmm8
+       pxor    %xmm7,%xmm11
+       addl    %edx,%ecx
        rorl    $7,%edx
-       addl    %edi,%ebx
-       addl    56(%rsp),%eax
+       pxor    %xmm9,%xmm12
+       xorl    %eax,%edi
+       movl    %ecx,%esi
+       addl    52(%rsp),%ebx
+       movups  48(%r15),%xmm1
+.byte  102,15,56,220,208
+       pxor    %xmm12,%xmm11
        xorl    %ebp,%edx
-       pslldq  $12,%xmm10
-       paddd   %xmm7,%xmm7
-       movl    %ebx,%edi
-       roll    $5,%ebx
+       roll    $5,%ecx
+       movdqa  %xmm13,32(%rsp)
+       addl    %edi,%ebx
        andl    %edx,%esi
+       movdqa  %xmm11,%xmm3
        xorl    %ebp,%edx
-       psrld   $31,%xmm8
-       xorl    %ebp,%esi
-       addl    %ebx,%eax
-       movdqa  %xmm10,%xmm9
+       addl    %ecx,%ebx
        rorl    $7,%ecx
+       movdqa  %xmm11,%xmm12
+       xorl    %ebp,%esi
+       pslldq  $12,%xmm3
+       paddd   %xmm11,%xmm11
+       movl    %ebx,%edi
+       addl    56(%rsp),%eax
+       psrld   $31,%xmm12
+       xorl    %edx,%ecx
+       roll    $5,%ebx
        addl    %esi,%eax
-       psrld   $30,%xmm10
-       por     %xmm8,%xmm7
-       addl    60(%rsp),%ebp
+       movdqa  %xmm3,%xmm13
+       andl    %ecx,%edi
+       xorl    %edx,%ecx
+       psrld   $30,%xmm3
+       addl    %ebx,%eax
+       rorl    $7,%ebx
        cmpl    $11,%r8d
        jb      .Laesenclast1
-       movups  176(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  192(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  64(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%r15),%xmm1
+.byte  102,15,56,220,208
        je      .Laesenclast1
-       movups  208(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  224(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  96(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%r15),%xmm1
+.byte  102,15,56,220,208
 .Laesenclast1:
-.byte  102,69,15,56,221,223
-       movups  16(%r15),%xmm14
-       xorl    %edx,%ecx
+.byte  102,15,56,221,209
+       movups  16-112(%r15),%xmm0
+       por     %xmm12,%xmm11
+       xorl    %edx,%edi
        movl    %eax,%esi
+       addl    60(%rsp),%ebp
+       pslld   $2,%xmm13
+       pxor    %xmm3,%xmm11
+       xorl    %ecx,%ebx
+       movdqa  16(%r11),%xmm3
        roll    $5,%eax
-       pslld   $2,%xmm9
-       pxor    %xmm10,%xmm7
-       andl    %ecx,%edi
-       xorl    %edx,%ecx
-       movdqa  16(%r11),%xmm10
-       xorl    %edx,%edi
-       addl    %eax,%ebp
-       pxor    %xmm9,%xmm7
-       rorl    $7,%ebx
        addl    %edi,%ebp
-       movdqa  %xmm7,%xmm9
-       addl    0(%rsp),%edx
-       pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,206,8
-       xorl    %ecx,%ebx
-       movl    %ebp,%edi
-       roll    $5,%ebp
-       pxor    %xmm1,%xmm0
        andl    %ebx,%esi
+       pxor    %xmm13,%xmm11
+       pshufd  $238,%xmm10,%xmm13
        xorl    %ecx,%ebx
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm7,%xmm10
-       xorl    %ecx,%esi
-       movups  16(%r12),%xmm12
-       xorps   %xmm13,%xmm12
-       movups  %xmm11,0(%r13,%r12,1)
-       xorps   %xmm12,%xmm11
-.byte  102,69,15,56,220,222
-       movups  32(%r15),%xmm15
-       addl    %ebp,%edx
-       pxor    %xmm9,%xmm0
+       addl    %eax,%ebp
        rorl    $7,%eax
-       addl    %esi,%edx
-       addl    4(%rsp),%ecx
+       pxor    %xmm8,%xmm4
+       xorl    %ecx,%esi
+       movl    %ebp,%edi
+       addl    0(%rsp),%edx
+       punpcklqdq      %xmm11,%xmm13
        xorl    %ebx,%eax
-       movdqa  %xmm0,%xmm9
-       movdqa  %xmm10,48(%rsp)
-       movl    %edx,%esi
-       roll    $5,%edx
+       roll    $5,%ebp
+       pxor    %xmm5,%xmm4
+       addl    %esi,%edx
+       movups  16(%r12),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,0(%r12,%r13,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%r15),%xmm1
+.byte  102,15,56,220,208
        andl    %eax,%edi
+       movdqa  %xmm3,%xmm12
        xorl    %ebx,%eax
-       pslld   $2,%xmm0
-       xorl    %ebx,%edi
-       addl    %edx,%ecx
-       psrld   $30,%xmm9
+       paddd   %xmm11,%xmm3
+       addl    %ebp,%edx
+       pxor    %xmm13,%xmm4
        rorl    $7,%ebp
-       addl    %edi,%ecx
-       addl    8(%rsp),%ebx
+       xorl    %ebx,%edi
+       movl    %edx,%esi
+       addl    4(%rsp),%ecx
+       movdqa  %xmm4,%xmm13
        xorl    %eax,%ebp
-       movl    %ecx,%edi
-       roll    $5,%ecx
-.byte  102,69,15,56,220,223
-       movups  48(%r15),%xmm14
-       por     %xmm9,%xmm0
+       roll    $5,%edx
+       movdqa  %xmm3,48(%rsp)
+       addl    %edi,%ecx
        andl    %ebp,%esi
        xorl    %eax,%ebp
-       movdqa  %xmm0,%xmm10
-       xorl    %eax,%esi
-       addl    %ecx,%ebx
+       pslld   $2,%xmm4
+       addl    %edx,%ecx
        rorl    $7,%edx
-       addl    %esi,%ebx
-       addl    12(%rsp),%eax
+       psrld   $30,%xmm13
+       xorl    %eax,%esi
+       movl    %ecx,%edi
+       addl    8(%rsp),%ebx
+       movups  -64(%r15),%xmm0
+.byte  102,15,56,220,209
+       por     %xmm13,%xmm4
        xorl    %ebp,%edx
-       movl    %ebx,%esi
-       roll    $5,%ebx
+       roll    $5,%ecx
+       pshufd  $238,%xmm11,%xmm3
+       addl    %esi,%ebx
        andl    %edx,%edi
        xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       addl    12(%rsp),%eax
        xorl    %ebp,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
+       movl    %ebx,%esi
+       roll    $5,%ebx
        addl    %edi,%eax
-       addl    16(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  64(%r15),%xmm15
-       pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,215,8
        xorl    %edx,%esi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       pxor    %xmm9,%xmm5
+       addl    16(%rsp),%ebp
+       movups  -48(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%esi
+       punpcklqdq      %xmm4,%xmm3
        movl    %eax,%edi
        roll    $5,%eax
-       pxor    %xmm2,%xmm1
-       xorl    %ecx,%esi
-       addl    %eax,%ebp
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm0,%xmm8
-       rorl    $7,%ebx
+       pxor    %xmm6,%xmm5
        addl    %esi,%ebp
-       pxor    %xmm10,%xmm1
-       addl    20(%rsp),%edx
        xorl    %ecx,%edi
+       movdqa  %xmm12,%xmm13
+       rorl    $7,%ebx
+       paddd   %xmm4,%xmm12
+       addl    %eax,%ebp
+       pxor    %xmm3,%xmm5
+       addl    20(%rsp),%edx
+       xorl    %ebx,%edi
        movl    %ebp,%esi
        roll    $5,%ebp
-       movdqa  %xmm1,%xmm10
-       movdqa  %xmm8,0(%rsp)
-       xorl    %ebx,%edi
-       addl    %ebp,%edx
-       rorl    $7,%eax
+       movdqa  %xmm5,%xmm3
        addl    %edi,%edx
-       pslld   $2,%xmm1
-       addl    24(%rsp),%ecx
        xorl    %ebx,%esi
-       psrld   $30,%xmm10
+       movdqa  %xmm12,0(%rsp)
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    24(%rsp),%ecx
+       pslld   $2,%xmm5
+       xorl    %eax,%esi
        movl    %edx,%edi
+       psrld   $30,%xmm3
        roll    $5,%edx
-       xorl    %eax,%esi
-.byte  102,69,15,56,220,223
-       movups  80(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %esi,%ecx
-       por     %xmm10,%xmm1
-       addl    28(%rsp),%ebx
+       movups  -32(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%edi
-       movdqa  %xmm1,%xmm8
+       rorl    $7,%ebp
+       por     %xmm3,%xmm5
+       addl    %edx,%ecx
+       addl    28(%rsp),%ebx
+       pshufd  $238,%xmm4,%xmm12
+       xorl    %ebp,%edi
        movl    %ecx,%esi
        roll    $5,%ecx
-       xorl    %ebp,%edi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %edi,%ebx
-       addl    32(%rsp),%eax
-       pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,192,8
        xorl    %ebp,%esi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       pxor    %xmm10,%xmm6
+       addl    32(%rsp),%eax
+       xorl    %edx,%esi
+       punpcklqdq      %xmm5,%xmm12
        movl    %ebx,%edi
        roll    $5,%ebx
-       pxor    %xmm3,%xmm2
-       xorl    %edx,%esi
-       addl    %ebx,%eax
-       movdqa  32(%r11),%xmm10
-       paddd   %xmm1,%xmm9
-       rorl    $7,%ecx
+       pxor    %xmm7,%xmm6
        addl    %esi,%eax
-       pxor    %xmm8,%xmm2
-       addl    36(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  96(%r15),%xmm15
        xorl    %edx,%edi
+       movdqa  32(%r11),%xmm3
+       rorl    $7,%ecx
+       paddd   %xmm5,%xmm13
+       addl    %ebx,%eax
+       pxor    %xmm12,%xmm6
+       addl    36(%rsp),%ebp
+       movups  -16(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%edi
        movl    %eax,%esi
        roll    $5,%eax
-       movdqa  %xmm2,%xmm8
-       movdqa  %xmm9,16(%rsp)
-       xorl    %ecx,%edi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
+       movdqa  %xmm6,%xmm12
        addl    %edi,%ebp
-       pslld   $2,%xmm2
-       addl    40(%rsp),%edx
        xorl    %ecx,%esi
-       psrld   $30,%xmm8
+       movdqa  %xmm13,16(%rsp)
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    40(%rsp),%edx
+       pslld   $2,%xmm6
+       xorl    %ebx,%esi
        movl    %ebp,%edi
+       psrld   $30,%xmm12
        roll    $5,%ebp
-       xorl    %ebx,%esi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %esi,%edx
-       por     %xmm8,%xmm2
-       addl    44(%rsp),%ecx
        xorl    %ebx,%edi
-       movdqa  %xmm2,%xmm9
+       rorl    $7,%eax
+       por     %xmm12,%xmm6
+       addl    %ebp,%edx
+       addl    44(%rsp),%ecx
+       pshufd  $238,%xmm5,%xmm13
+       xorl    %eax,%edi
        movl    %edx,%esi
        roll    $5,%edx
-       xorl    %eax,%edi
-.byte  102,69,15,56,220,223
-       movups  112(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %edi,%ecx
-       addl    48(%rsp),%ebx
-       pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,201,8
+       movups  0(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%esi
+       rorl    $7,%ebp
+       addl    %edx,%ecx
+       pxor    %xmm11,%xmm7
+       addl    48(%rsp),%ebx
+       xorl    %ebp,%esi
+       punpcklqdq      %xmm6,%xmm13
        movl    %ecx,%edi
        roll    $5,%ecx
-       pxor    %xmm4,%xmm3
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm2,%xmm10
-       rorl    $7,%edx
+       pxor    %xmm8,%xmm7
        addl    %esi,%ebx
-       pxor    %xmm9,%xmm3
-       addl    52(%rsp),%eax
        xorl    %ebp,%edi
+       movdqa  %xmm3,%xmm12
+       rorl    $7,%edx
+       paddd   %xmm6,%xmm3
+       addl    %ecx,%ebx
+       pxor    %xmm13,%xmm7
+       addl    52(%rsp),%eax
+       xorl    %edx,%edi
        movl    %ebx,%esi
        roll    $5,%ebx
-       movdqa  %xmm3,%xmm9
-       movdqa  %xmm10,32(%rsp)
-       xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
+       movdqa  %xmm7,%xmm13
        addl    %edi,%eax
-       pslld   $2,%xmm3
-       addl    56(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  128(%r15),%xmm15
        xorl    %edx,%esi
-       psrld   $30,%xmm9
+       movdqa  %xmm3,32(%rsp)
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    56(%rsp),%ebp
+       movups  16(%r15),%xmm1
+.byte  102,15,56,220,208
+       pslld   $2,%xmm7
+       xorl    %ecx,%esi
        movl    %eax,%edi
+       psrld   $30,%xmm13
        roll    $5,%eax
-       xorl    %ecx,%esi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %esi,%ebp
-       por     %xmm9,%xmm3
-       addl    60(%rsp),%edx
        xorl    %ecx,%edi
-       movdqa  %xmm3,%xmm10
+       rorl    $7,%ebx
+       por     %xmm13,%xmm7
+       addl    %eax,%ebp
+       addl    60(%rsp),%edx
+       pshufd  $238,%xmm6,%xmm3
+       xorl    %ebx,%edi
        movl    %ebp,%esi
        roll    $5,%ebp
-       xorl    %ebx,%edi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %edi,%edx
-       addl    0(%rsp),%ecx
-       pxor    %xmm0,%xmm4
-.byte  102,68,15,58,15,210,8
        xorl    %ebx,%esi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       pxor    %xmm4,%xmm8
+       addl    0(%rsp),%ecx
+       xorl    %eax,%esi
+       punpcklqdq      %xmm7,%xmm3
        movl    %edx,%edi
        roll    $5,%edx
-       pxor    %xmm5,%xmm4
-       xorl    %eax,%esi
-.byte  102,69,15,56,220,223
-       movups  144(%r15),%xmm14
-       addl    %edx,%ecx
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm3,%xmm8
-       rorl    $7,%ebp
+       pxor    %xmm9,%xmm8
        addl    %esi,%ecx
-       pxor    %xmm10,%xmm4
-       addl    4(%rsp),%ebx
+       movups  32(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%edi
+       movdqa  %xmm12,%xmm13
+       rorl    $7,%ebp
+       paddd   %xmm7,%xmm12
+       addl    %edx,%ecx
+       pxor    %xmm3,%xmm8
+       addl    4(%rsp),%ebx
+       xorl    %ebp,%edi
        movl    %ecx,%esi
        roll    $5,%ecx
-       movdqa  %xmm4,%xmm10
-       movdqa  %xmm8,48(%rsp)
-       xorl    %ebp,%edi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
+       movdqa  %xmm8,%xmm3
        addl    %edi,%ebx
-       pslld   $2,%xmm4
-       addl    8(%rsp),%eax
        xorl    %ebp,%esi
-       psrld   $30,%xmm10
+       movdqa  %xmm12,48(%rsp)
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    8(%rsp),%eax
+       pslld   $2,%xmm8
+       xorl    %edx,%esi
        movl    %ebx,%edi
+       psrld   $30,%xmm3
        roll    $5,%ebx
-       xorl    %edx,%esi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %esi,%eax
-       por     %xmm10,%xmm4
-       addl    12(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  160(%r15),%xmm15
        xorl    %edx,%edi
-       movdqa  %xmm4,%xmm8
+       rorl    $7,%ecx
+       por     %xmm3,%xmm8
+       addl    %ebx,%eax
+       addl    12(%rsp),%ebp
+       movups  48(%r15),%xmm1
+.byte  102,15,56,220,208
+       pshufd  $238,%xmm7,%xmm12
+       xorl    %ecx,%edi
        movl    %eax,%esi
        roll    $5,%eax
-       xorl    %ecx,%edi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %edi,%ebp
-       addl    16(%rsp),%edx
-       pxor    %xmm1,%xmm5
-.byte  102,68,15,58,15,195,8
        xorl    %ecx,%esi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       pxor    %xmm5,%xmm9
+       addl    16(%rsp),%edx
+       xorl    %ebx,%esi
+       punpcklqdq      %xmm8,%xmm12
        movl    %ebp,%edi
        roll    $5,%ebp
-       pxor    %xmm6,%xmm5
-       xorl    %ebx,%esi
-       addl    %ebp,%edx
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm4,%xmm9
-       rorl    $7,%eax
+       pxor    %xmm10,%xmm9
        addl    %esi,%edx
-       pxor    %xmm8,%xmm5
+       xorl    %ebx,%edi
+       movdqa  %xmm13,%xmm3
+       rorl    $7,%eax
+       paddd   %xmm8,%xmm13
+       addl    %ebp,%edx
+       pxor    %xmm12,%xmm9
        addl    20(%rsp),%ecx
-       xorl    %ebx,%edi
+       xorl    %eax,%edi
        movl    %edx,%esi
        roll    $5,%edx
-       movdqa  %xmm5,%xmm8
-       movdqa  %xmm9,0(%rsp)
-       xorl    %eax,%edi
+       movdqa  %xmm9,%xmm12
+       addl    %edi,%ecx
        cmpl    $11,%r8d
        jb      .Laesenclast2
-       movups  176(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  192(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  64(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%r15),%xmm1
+.byte  102,15,56,220,208
        je      .Laesenclast2
-       movups  208(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  224(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  96(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%r15),%xmm1
+.byte  102,15,56,220,208
 .Laesenclast2:
-.byte  102,69,15,56,221,223
-       movups  16(%r15),%xmm14
-       addl    %edx,%ecx
+.byte  102,15,56,221,209
+       movups  16-112(%r15),%xmm0
+       xorl    %eax,%esi
+       movdqa  %xmm13,0(%rsp)
        rorl    $7,%ebp
-       addl    %edi,%ecx
-       pslld   $2,%xmm5
+       addl    %edx,%ecx
        addl    24(%rsp),%ebx
-       xorl    %eax,%esi
-       psrld   $30,%xmm8
+       pslld   $2,%xmm9
+       xorl    %ebp,%esi
        movl    %ecx,%edi
+       psrld   $30,%xmm12
        roll    $5,%ecx
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %esi,%ebx
-       por     %xmm8,%xmm5
-       addl    28(%rsp),%eax
        xorl    %ebp,%edi
-       movdqa  %xmm5,%xmm9
+       rorl    $7,%edx
+       por     %xmm12,%xmm9
+       addl    %ecx,%ebx
+       addl    28(%rsp),%eax
+       pshufd  $238,%xmm8,%xmm13
+       rorl    $7,%ecx
        movl    %ebx,%esi
-       roll    $5,%ebx
        xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
+       roll    $5,%ebx
        addl    %edi,%eax
-       movl    %ecx,%edi
-       movups  32(%r12),%xmm12
-       xorps   %xmm13,%xmm12
-       movups  %xmm11,16(%r13,%r12,1)
-       xorps   %xmm12,%xmm11
-.byte  102,69,15,56,220,222
-       movups  32(%r15),%xmm15
-       pxor    %xmm2,%xmm6
-.byte  102,68,15,58,15,204,8
+       xorl    %ecx,%esi
        xorl    %edx,%ecx
+       addl    %ebx,%eax
+       pxor    %xmm6,%xmm10
        addl    32(%rsp),%ebp
-       andl    %edx,%edi
-       pxor    %xmm7,%xmm6
+       movups  32(%r12),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,16(%r13,%r12,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%r15),%xmm1
+.byte  102,15,56,220,208
        andl    %ecx,%esi
+       xorl    %edx,%ecx
        rorl    $7,%ebx
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm5,%xmm10
-       addl    %edi,%ebp
+       punpcklqdq      %xmm9,%xmm13
        movl    %eax,%edi
-       pxor    %xmm9,%xmm6
+       xorl    %ecx,%esi
+       pxor    %xmm11,%xmm10
        roll    $5,%eax
        addl    %esi,%ebp
-       xorl    %edx,%ecx
-       addl    %eax,%ebp
-       movdqa  %xmm6,%xmm9
-       movdqa  %xmm10,16(%rsp)
-       movl    %ebx,%esi
+       movdqa  %xmm3,%xmm12
+       xorl    %ebx,%edi
+       paddd   %xmm9,%xmm3
        xorl    %ecx,%ebx
+       pxor    %xmm13,%xmm10
+       addl    %eax,%ebp
        addl    36(%rsp),%edx
-       andl    %ecx,%esi
-       pslld   $2,%xmm6
        andl    %ebx,%edi
+       xorl    %ecx,%ebx
        rorl    $7,%eax
-       psrld   $30,%xmm9
-       addl    %esi,%edx
+       movdqa  %xmm10,%xmm13
        movl    %ebp,%esi
+       xorl    %ebx,%edi
+       movdqa  %xmm3,16(%rsp)
        roll    $5,%ebp
-.byte  102,69,15,56,220,223
-       movups  48(%r15),%xmm14
        addl    %edi,%edx
-       xorl    %ecx,%ebx
-       addl    %ebp,%edx
-       por     %xmm9,%xmm6
-       movl    %eax,%edi
+       movups  -64(%r15),%xmm0
+.byte  102,15,56,220,209
+       xorl    %eax,%esi
+       pslld   $2,%xmm10
        xorl    %ebx,%eax
-       movdqa  %xmm6,%xmm10
+       addl    %ebp,%edx
+       psrld   $30,%xmm13
        addl    40(%rsp),%ecx
-       andl    %ebx,%edi
        andl    %eax,%esi
+       xorl    %ebx,%eax
+       por     %xmm13,%xmm10
        rorl    $7,%ebp
-       addl    %edi,%ecx
        movl    %edx,%edi
+       xorl    %eax,%esi
        roll    $5,%edx
+       pshufd  $238,%xmm9,%xmm3
        addl    %esi,%ecx
-       xorl    %ebx,%eax
-       addl    %edx,%ecx
-       movl    %ebp,%esi
+       xorl    %ebp,%edi
        xorl    %eax,%ebp
+       addl    %edx,%ecx
        addl    44(%rsp),%ebx
-       andl    %eax,%esi
        andl    %ebp,%edi
-.byte  102,69,15,56,220,222
-       movups  64(%r15),%xmm15
+       xorl    %eax,%ebp
        rorl    $7,%edx
-       addl    %esi,%ebx
+       movups  -48(%r15),%xmm1
+.byte  102,15,56,220,208
        movl    %ecx,%esi
+       xorl    %ebp,%edi
        roll    $5,%ecx
        addl    %edi,%ebx
-       xorl    %eax,%ebp
-       addl    %ecx,%ebx
-       movl    %edx,%edi
-       pxor    %xmm3,%xmm7
-.byte  102,68,15,58,15,213,8
+       xorl    %edx,%esi
        xorl    %ebp,%edx
+       addl    %ecx,%ebx
+       pxor    %xmm7,%xmm11
        addl    48(%rsp),%eax
-       andl    %ebp,%edi
-       pxor    %xmm0,%xmm7
        andl    %edx,%esi
+       xorl    %ebp,%edx
        rorl    $7,%ecx
-       movdqa  48(%r11),%xmm9
-       paddd   %xmm6,%xmm8
-       addl    %edi,%eax
+       punpcklqdq      %xmm10,%xmm3
        movl    %ebx,%edi
-       pxor    %xmm10,%xmm7
+       xorl    %edx,%esi
+       pxor    %xmm4,%xmm11
        roll    $5,%ebx
        addl    %esi,%eax
-       xorl    %ebp,%edx
-       addl    %ebx,%eax
-       movdqa  %xmm7,%xmm10
-       movdqa  %xmm8,32(%rsp)
-       movl    %ecx,%esi
-.byte  102,69,15,56,220,223
-       movups  80(%r15),%xmm14
+       movdqa  48(%r11),%xmm13
+       xorl    %ecx,%edi
+       paddd   %xmm10,%xmm12
        xorl    %edx,%ecx
+       pxor    %xmm3,%xmm11
+       addl    %ebx,%eax
        addl    52(%rsp),%ebp
-       andl    %edx,%esi
-       pslld   $2,%xmm7
+       movups  -32(%r15),%xmm0
+.byte  102,15,56,220,209
        andl    %ecx,%edi
+       xorl    %edx,%ecx
        rorl    $7,%ebx
-       psrld   $30,%xmm10
-       addl    %esi,%ebp
+       movdqa  %xmm11,%xmm3
        movl    %eax,%esi
+       xorl    %ecx,%edi
+       movdqa  %xmm12,32(%rsp)
        roll    $5,%eax
        addl    %edi,%ebp
-       xorl    %edx,%ecx
-       addl    %eax,%ebp
-       por     %xmm10,%xmm7
-       movl    %ebx,%edi
+       xorl    %ebx,%esi
+       pslld   $2,%xmm11
        xorl    %ecx,%ebx
-       movdqa  %xmm7,%xmm8
+       addl    %eax,%ebp
+       psrld   $30,%xmm3
        addl    56(%rsp),%edx
-       andl    %ecx,%edi
        andl    %ebx,%esi
+       xorl    %ecx,%ebx
+       por     %xmm3,%xmm11
        rorl    $7,%eax
-       addl    %edi,%edx
        movl    %ebp,%edi
+       xorl    %ebx,%esi
        roll    $5,%ebp
-.byte  102,69,15,56,220,222
-       movups  96(%r15),%xmm15
+       pshufd  $238,%xmm10,%xmm12
        addl    %esi,%edx
-       xorl    %ecx,%ebx
-       addl    %ebp,%edx
-       movl    %eax,%esi
+       movups  -16(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %eax,%edi
        xorl    %ebx,%eax
+       addl    %ebp,%edx
        addl    60(%rsp),%ecx
-       andl    %ebx,%esi
        andl    %eax,%edi
+       xorl    %ebx,%eax
        rorl    $7,%ebp
-       addl    %esi,%ecx
        movl    %edx,%esi
+       xorl    %eax,%edi
        roll    $5,%edx
        addl    %edi,%ecx
-       xorl    %ebx,%eax
-       addl    %edx,%ecx
-       movl    %ebp,%edi
-       pxor    %xmm4,%xmm0
-.byte  102,68,15,58,15,198,8
+       xorl    %ebp,%esi
        xorl    %eax,%ebp
+       addl    %edx,%ecx
+       pxor    %xmm8,%xmm4
        addl    0(%rsp),%ebx
-       andl    %eax,%edi
-       pxor    %xmm1,%xmm0
        andl    %ebp,%esi
-.byte  102,69,15,56,220,223
-       movups  112(%r15),%xmm14
+       xorl    %eax,%ebp
        rorl    $7,%edx
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm7,%xmm9
-       addl    %edi,%ebx
+       movups  0(%r15),%xmm0
+.byte  102,15,56,220,209
+       punpcklqdq      %xmm11,%xmm12
        movl    %ecx,%edi
-       pxor    %xmm8,%xmm0
+       xorl    %ebp,%esi
+       pxor    %xmm5,%xmm4
        roll    $5,%ecx
        addl    %esi,%ebx
-       xorl    %eax,%ebp
-       addl    %ecx,%ebx
-       movdqa  %xmm0,%xmm8
-       movdqa  %xmm9,48(%rsp)
-       movl    %edx,%esi
+       movdqa  %xmm13,%xmm3
+       xorl    %edx,%edi
+       paddd   %xmm11,%xmm13
        xorl    %ebp,%edx
+       pxor    %xmm12,%xmm4
+       addl    %ecx,%ebx
        addl    4(%rsp),%eax
-       andl    %ebp,%esi
-       pslld   $2,%xmm0
        andl    %edx,%edi
+       xorl    %ebp,%edx
        rorl    $7,%ecx
-       psrld   $30,%xmm8
-       addl    %esi,%eax
+       movdqa  %xmm4,%xmm12
        movl    %ebx,%esi
+       xorl    %edx,%edi
+       movdqa  %xmm13,48(%rsp)
        roll    $5,%ebx
        addl    %edi,%eax
-       xorl    %ebp,%edx
-       addl    %ebx,%eax
-       por     %xmm8,%xmm0
-       movl    %ecx,%edi
-.byte  102,69,15,56,220,222
-       movups  128(%r15),%xmm15
+       xorl    %ecx,%esi
+       pslld   $2,%xmm4
        xorl    %edx,%ecx
-       movdqa  %xmm0,%xmm9
+       addl    %ebx,%eax
+       psrld   $30,%xmm12
        addl    8(%rsp),%ebp
-       andl    %edx,%edi
+       movups  16(%r15),%xmm1
+.byte  102,15,56,220,208
        andl    %ecx,%esi
+       xorl    %edx,%ecx
+       por     %xmm12,%xmm4
        rorl    $7,%ebx
-       addl    %edi,%ebp
        movl    %eax,%edi
+       xorl    %ecx,%esi
        roll    $5,%eax
+       pshufd  $238,%xmm11,%xmm13
        addl    %esi,%ebp
-       xorl    %edx,%ecx
-       addl    %eax,%ebp
-       movl    %ebx,%esi
+       xorl    %ebx,%edi
        xorl    %ecx,%ebx
+       addl    %eax,%ebp
        addl    12(%rsp),%edx
-       andl    %ecx,%esi
        andl    %ebx,%edi
+       xorl    %ecx,%ebx
        rorl    $7,%eax
-       addl    %esi,%edx
        movl    %ebp,%esi
+       xorl    %ebx,%edi
        roll    $5,%ebp
-.byte  102,69,15,56,220,223
-       movups  144(%r15),%xmm14
        addl    %edi,%edx
-       xorl    %ecx,%ebx
-       addl    %ebp,%edx
-       movl    %eax,%edi
-       pxor    %xmm5,%xmm1
-.byte  102,68,15,58,15,207,8
+       movups  32(%r15),%xmm0
+.byte  102,15,56,220,209
+       xorl    %eax,%esi
        xorl    %ebx,%eax
+       addl    %ebp,%edx
+       pxor    %xmm9,%xmm5
        addl    16(%rsp),%ecx
-       andl    %ebx,%edi
-       pxor    %xmm2,%xmm1
        andl    %eax,%esi
+       xorl    %ebx,%eax
        rorl    $7,%ebp
-       movdqa  %xmm10,%xmm8
-       paddd   %xmm0,%xmm10
-       addl    %edi,%ecx
+       punpcklqdq      %xmm4,%xmm13
        movl    %edx,%edi
-       pxor    %xmm9,%xmm1
+       xorl    %eax,%esi
+       pxor    %xmm6,%xmm5
        roll    $5,%edx
        addl    %esi,%ecx
-       xorl    %ebx,%eax
-       addl    %edx,%ecx
-       movdqa  %xmm1,%xmm9
-       movdqa  %xmm10,0(%rsp)
-       movl    %ebp,%esi
+       movdqa  %xmm3,%xmm12
+       xorl    %ebp,%edi
+       paddd   %xmm4,%xmm3
        xorl    %eax,%ebp
+       pxor    %xmm13,%xmm5
+       addl    %edx,%ecx
        addl    20(%rsp),%ebx
-       andl    %eax,%esi
-       pslld   $2,%xmm1
        andl    %ebp,%edi
-.byte  102,69,15,56,220,222
-       movups  160(%r15),%xmm15
+       xorl    %eax,%ebp
        rorl    $7,%edx
-       psrld   $30,%xmm9
-       addl    %esi,%ebx
+       movups  48(%r15),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm5,%xmm13
        movl    %ecx,%esi
+       xorl    %ebp,%edi
+       movdqa  %xmm3,0(%rsp)
        roll    $5,%ecx
        addl    %edi,%ebx
-       xorl    %eax,%ebp
-       addl    %ecx,%ebx
-       por     %xmm9,%xmm1
-       movl    %edx,%edi
+       xorl    %edx,%esi
+       pslld   $2,%xmm5
        xorl    %ebp,%edx
-       movdqa  %xmm1,%xmm10
+       addl    %ecx,%ebx
+       psrld   $30,%xmm13
        addl    24(%rsp),%eax
-       andl    %ebp,%edi
        andl    %edx,%esi
+       xorl    %ebp,%edx
+       por     %xmm13,%xmm5
        rorl    $7,%ecx
-       addl    %edi,%eax
        movl    %ebx,%edi
+       xorl    %edx,%esi
        roll    $5,%ebx
+       pshufd  $238,%xmm4,%xmm3
        addl    %esi,%eax
-       xorl    %ebp,%edx
+       xorl    %ecx,%edi
+       xorl    %edx,%ecx
        addl    %ebx,%eax
-       movl    %ecx,%esi
+       addl    28(%rsp),%ebp
        cmpl    $11,%r8d
        jb      .Laesenclast3
-       movups  176(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  192(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  64(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%r15),%xmm1
+.byte  102,15,56,220,208
        je      .Laesenclast3
-       movups  208(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  224(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  96(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%r15),%xmm1
+.byte  102,15,56,220,208
 .Laesenclast3:
-.byte  102,69,15,56,221,223
-       movups  16(%r15),%xmm14
-       xorl    %edx,%ecx
-       addl    28(%rsp),%ebp
-       andl    %edx,%esi
+.byte  102,15,56,221,209
+       movups  16-112(%r15),%xmm0
        andl    %ecx,%edi
+       xorl    %edx,%ecx
        rorl    $7,%ebx
-       addl    %esi,%ebp
        movl    %eax,%esi
+       xorl    %ecx,%edi
        roll    $5,%eax
        addl    %edi,%ebp
-       xorl    %edx,%ecx
-       addl    %eax,%ebp
-       movl    %ebx,%edi
-       pxor    %xmm6,%xmm2
-.byte  102,68,15,58,15,208,8
+       xorl    %ebx,%esi
        xorl    %ecx,%ebx
+       addl    %eax,%ebp
+       pxor    %xmm10,%xmm6
        addl    32(%rsp),%edx
-       andl    %ecx,%edi
-       pxor    %xmm3,%xmm2
        andl    %ebx,%esi
+       xorl    %ecx,%ebx
        rorl    $7,%eax
-       movdqa  %xmm8,%xmm9
-       paddd   %xmm1,%xmm8
-       addl    %edi,%edx
+       punpcklqdq      %xmm5,%xmm3
        movl    %ebp,%edi
-       pxor    %xmm10,%xmm2
+       xorl    %ebx,%esi
+       pxor    %xmm7,%xmm6
        roll    $5,%ebp
-       movups  48(%r12),%xmm12
-       xorps   %xmm13,%xmm12
-       movups  %xmm11,32(%r13,%r12,1)
-       xorps   %xmm12,%xmm11
-.byte  102,69,15,56,220,222
-       movups  32(%r15),%xmm15
        addl    %esi,%edx
-       xorl    %ecx,%ebx
-       addl    %ebp,%edx
-       movdqa  %xmm2,%xmm10
-       movdqa  %xmm8,16(%rsp)
-       movl    %eax,%esi
+       movups  48(%r12),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,32(%r13,%r12,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%r15),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm12,%xmm13
+       xorl    %eax,%edi
+       paddd   %xmm5,%xmm12
        xorl    %ebx,%eax
+       pxor    %xmm3,%xmm6
+       addl    %ebp,%edx
        addl    36(%rsp),%ecx
-       andl    %ebx,%esi
-       pslld   $2,%xmm2
        andl    %eax,%edi
+       xorl    %ebx,%eax
        rorl    $7,%ebp
-       psrld   $30,%xmm10
-       addl    %esi,%ecx
+       movdqa  %xmm6,%xmm3
        movl    %edx,%esi
+       xorl    %eax,%edi
+       movdqa  %xmm12,16(%rsp)
        roll    $5,%edx
        addl    %edi,%ecx
-       xorl    %ebx,%eax
-       addl    %edx,%ecx
-       por     %xmm10,%xmm2
-       movl    %ebp,%edi
+       xorl    %ebp,%esi
+       pslld   $2,%xmm6
        xorl    %eax,%ebp
-       movdqa  %xmm2,%xmm8
+       addl    %edx,%ecx
+       psrld   $30,%xmm3
        addl    40(%rsp),%ebx
-       andl    %eax,%edi
        andl    %ebp,%esi
-.byte  102,69,15,56,220,223
-       movups  48(%r15),%xmm14
+       xorl    %eax,%ebp
+       por     %xmm3,%xmm6
        rorl    $7,%edx
-       addl    %edi,%ebx
+       movups  -64(%r15),%xmm0
+.byte  102,15,56,220,209
        movl    %ecx,%edi
+       xorl    %ebp,%esi
        roll    $5,%ecx
+       pshufd  $238,%xmm5,%xmm12
        addl    %esi,%ebx
-       xorl    %eax,%ebp
-       addl    %ecx,%ebx
-       movl    %edx,%esi
+       xorl    %edx,%edi
        xorl    %ebp,%edx
+       addl    %ecx,%ebx
        addl    44(%rsp),%eax
-       andl    %ebp,%esi
        andl    %edx,%edi
+       xorl    %ebp,%edx
        rorl    $7,%ecx
-       addl    %esi,%eax
        movl    %ebx,%esi
+       xorl    %edx,%edi
        roll    $5,%ebx
        addl    %edi,%eax
-       xorl    %ebp,%edx
+       xorl    %edx,%esi
        addl    %ebx,%eax
+       pxor    %xmm11,%xmm7
        addl    48(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  64(%r15),%xmm15
-       pxor    %xmm7,%xmm3
-.byte  102,68,15,58,15,193,8
-       xorl    %edx,%esi
+       movups  -48(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%esi
+       punpcklqdq      %xmm6,%xmm12
        movl    %eax,%edi
        roll    $5,%eax
-       pxor    %xmm4,%xmm3
-       xorl    %ecx,%esi
-       addl    %eax,%ebp
-       movdqa  %xmm9,%xmm10
-       paddd   %xmm2,%xmm9
-       rorl    $7,%ebx
+       pxor    %xmm8,%xmm7
        addl    %esi,%ebp
-       pxor    %xmm8,%xmm3
-       addl    52(%rsp),%edx
        xorl    %ecx,%edi
+       movdqa  %xmm13,%xmm3
+       rorl    $7,%ebx
+       paddd   %xmm6,%xmm13
+       addl    %eax,%ebp
+       pxor    %xmm12,%xmm7
+       addl    52(%rsp),%edx
+       xorl    %ebx,%edi
        movl    %ebp,%esi
        roll    $5,%ebp
-       movdqa  %xmm3,%xmm8
-       movdqa  %xmm9,32(%rsp)
-       xorl    %ebx,%edi
-       addl    %ebp,%edx
-       rorl    $7,%eax
+       movdqa  %xmm7,%xmm12
        addl    %edi,%edx
-       pslld   $2,%xmm3
-       addl    56(%rsp),%ecx
        xorl    %ebx,%esi
-       psrld   $30,%xmm8
+       movdqa  %xmm13,32(%rsp)
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    56(%rsp),%ecx
+       pslld   $2,%xmm7
+       xorl    %eax,%esi
        movl    %edx,%edi
+       psrld   $30,%xmm12
        roll    $5,%edx
-       xorl    %eax,%esi
-.byte  102,69,15,56,220,223
-       movups  80(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %esi,%ecx
-       por     %xmm8,%xmm3
-       addl    60(%rsp),%ebx
+       movups  -32(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%edi
+       rorl    $7,%ebp
+       por     %xmm12,%xmm7
+       addl    %edx,%ecx
+       addl    60(%rsp),%ebx
+       xorl    %ebp,%edi
        movl    %ecx,%esi
        roll    $5,%ecx
-       xorl    %ebp,%edi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %edi,%ebx
+       xorl    %ebp,%esi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
        addl    0(%rsp),%eax
-       paddd   %xmm3,%xmm10
-       xorl    %ebp,%esi
+       xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
-       xorl    %edx,%esi
-       movdqa  %xmm10,48(%rsp)
-       addl    %ebx,%eax
-       rorl    $7,%ecx
+       paddd   %xmm7,%xmm3
        addl    %esi,%eax
-       addl    4(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  96(%r15),%xmm15
        xorl    %edx,%edi
+       movdqa  %xmm3,48(%rsp)
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    4(%rsp),%ebp
+       movups  -16(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%edi
        movl    %eax,%esi
        roll    $5,%eax
-       xorl    %ecx,%edi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %edi,%ebp
-       addl    8(%rsp),%edx
        xorl    %ecx,%esi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    8(%rsp),%edx
+       xorl    %ebx,%esi
        movl    %ebp,%edi
        roll    $5,%ebp
-       xorl    %ebx,%esi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %esi,%edx
-       addl    12(%rsp),%ecx
        xorl    %ebx,%edi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    12(%rsp),%ecx
+       xorl    %eax,%edi
        movl    %edx,%esi
        roll    $5,%edx
-       xorl    %eax,%edi
-.byte  102,69,15,56,220,223
-       movups  112(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %edi,%ecx
+       movups  0(%r15),%xmm0
+.byte  102,15,56,220,209
+       xorl    %eax,%esi
+       rorl    $7,%ebp
+       addl    %edx,%ecx
        cmpq    %r14,%r10
        je      .Ldone_ssse3
-       movdqa  64(%r11),%xmm6
-       movdqa  0(%r11),%xmm9
-       movdqu  0(%r10),%xmm0
-       movdqu  16(%r10),%xmm1
-       movdqu  32(%r10),%xmm2
-       movdqu  48(%r10),%xmm3
-.byte  102,15,56,0,198
+       movdqa  64(%r11),%xmm3
+       movdqa  0(%r11),%xmm13
+       movdqu  0(%r10),%xmm4
+       movdqu  16(%r10),%xmm5
+       movdqu  32(%r10),%xmm6
+       movdqu  48(%r10),%xmm7
+.byte  102,15,56,0,227
        addq    $64,%r10
        addl    16(%rsp),%ebx
-       xorl    %eax,%esi
-.byte  102,15,56,0,206
+       xorl    %ebp,%esi
        movl    %ecx,%edi
+.byte  102,15,56,0,235
        roll    $5,%ecx
-       paddd   %xmm9,%xmm0
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %esi,%ebx
-       movdqa  %xmm0,0(%rsp)
-       addl    20(%rsp),%eax
        xorl    %ebp,%edi
-       psubd   %xmm9,%xmm0
+       rorl    $7,%edx
+       paddd   %xmm13,%xmm4
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
        movl    %ebx,%esi
+       movdqa  %xmm4,0(%rsp)
        roll    $5,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %edi,%eax
-       addl    24(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  128(%r15),%xmm15
        xorl    %edx,%esi
+       rorl    $7,%ecx
+       psubd   %xmm13,%xmm4
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       movups  16(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%esi
        movl    %eax,%edi
        roll    $5,%eax
-       xorl    %ecx,%esi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %esi,%ebp
-       addl    28(%rsp),%edx
        xorl    %ecx,%edi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
        movl    %ebp,%esi
        roll    $5,%ebp
-       xorl    %ebx,%edi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %edi,%edx
-       addl    32(%rsp),%ecx
        xorl    %ebx,%esi
-.byte  102,15,56,0,214
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
        movl    %edx,%edi
+.byte  102,15,56,0,243
        roll    $5,%edx
-       paddd   %xmm9,%xmm1
-       xorl    %eax,%esi
-.byte  102,69,15,56,220,223
-       movups  144(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %esi,%ecx
-       movdqa  %xmm1,16(%rsp)
-       addl    36(%rsp),%ebx
+       movups  32(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%edi
-       psubd   %xmm9,%xmm1
+       rorl    $7,%ebp
+       paddd   %xmm13,%xmm5
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
        movl    %ecx,%esi
+       movdqa  %xmm5,16(%rsp)
        roll    $5,%ecx
-       xorl    %ebp,%edi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %edi,%ebx
-       addl    40(%rsp),%eax
        xorl    %ebp,%esi
+       rorl    $7,%edx
+       psubd   %xmm13,%xmm5
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
-       xorl    %edx,%esi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %esi,%eax
-       addl    44(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  160(%r15),%xmm15
        xorl    %edx,%edi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       movups  48(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%edi
        movl    %eax,%esi
        roll    $5,%eax
-       xorl    %ecx,%edi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %edi,%ebp
-       addl    48(%rsp),%edx
        xorl    %ecx,%esi
-.byte  102,15,56,0,222
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
        movl    %ebp,%edi
+.byte  102,15,56,0,251
        roll    $5,%ebp
-       paddd   %xmm9,%xmm2
-       xorl    %ebx,%esi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %esi,%edx
-       movdqa  %xmm2,32(%rsp)
-       addl    52(%rsp),%ecx
        xorl    %ebx,%edi
-       psubd   %xmm9,%xmm2
+       rorl    $7,%eax
+       paddd   %xmm13,%xmm6
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
        movl    %edx,%esi
+       movdqa  %xmm6,32(%rsp)
        roll    $5,%edx
-       xorl    %eax,%edi
+       addl    %edi,%ecx
        cmpl    $11,%r8d
        jb      .Laesenclast4
-       movups  176(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  192(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  64(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%r15),%xmm1
+.byte  102,15,56,220,208
        je      .Laesenclast4
-       movups  208(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  224(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  96(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%r15),%xmm1
+.byte  102,15,56,220,208
 .Laesenclast4:
-.byte  102,69,15,56,221,223
-       movups  16(%r15),%xmm14
-       addl    %edx,%ecx
+.byte  102,15,56,221,209
+       movups  16-112(%r15),%xmm0
+       xorl    %eax,%esi
        rorl    $7,%ebp
-       addl    %edi,%ecx
+       psubd   %xmm13,%xmm6
+       addl    %edx,%ecx
        addl    56(%rsp),%ebx
-       xorl    %eax,%esi
+       xorl    %ebp,%esi
        movl    %ecx,%edi
        roll    $5,%ecx
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %esi,%ebx
-       addl    60(%rsp),%eax
        xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
        movl    %ebx,%esi
        roll    $5,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %edi,%eax
-       movups  %xmm11,48(%r13,%r12,1)
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       movups  %xmm2,48(%r13,%r12,1)
        leaq    64(%r12),%r12
 
        addl    0(%r9),%eax
@@ -1237,129 +1219,130 @@ aesni_cbc_sha1_enc_ssse3:
        movl    %esi,4(%r9)
        movl    %esi,%ebx
        movl    %ecx,8(%r9)
+       movl    %ecx,%edi
        movl    %edx,12(%r9)
+       xorl    %edx,%edi
        movl    %ebp,16(%r9)
+       andl    %edi,%esi
        jmp     .Loop_ssse3
 
-.align 16
 .Ldone_ssse3:
        addl    16(%rsp),%ebx
-       xorl    %eax,%esi
+       xorl    %ebp,%esi
        movl    %ecx,%edi
        roll    $5,%ecx
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %esi,%ebx
-       addl    20(%rsp),%eax
        xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    20(%rsp),%eax
+       xorl    %edx,%edi
        movl    %ebx,%esi
        roll    $5,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %edi,%eax
-       addl    24(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  128(%r15),%xmm15
        xorl    %edx,%esi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    24(%rsp),%ebp
+       movups  16(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%esi
        movl    %eax,%edi
        roll    $5,%eax
-       xorl    %ecx,%esi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %esi,%ebp
-       addl    28(%rsp),%edx
        xorl    %ecx,%edi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    28(%rsp),%edx
+       xorl    %ebx,%edi
        movl    %ebp,%esi
        roll    $5,%ebp
-       xorl    %ebx,%edi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %edi,%edx
-       addl    32(%rsp),%ecx
        xorl    %ebx,%esi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    32(%rsp),%ecx
+       xorl    %eax,%esi
        movl    %edx,%edi
        roll    $5,%edx
-       xorl    %eax,%esi
-.byte  102,69,15,56,220,223
-       movups  144(%r15),%xmm14
-       addl    %edx,%ecx
-       rorl    $7,%ebp
        addl    %esi,%ecx
-       addl    36(%rsp),%ebx
+       movups  32(%r15),%xmm0
+.byte  102,15,56,220,209
        xorl    %eax,%edi
+       rorl    $7,%ebp
+       addl    %edx,%ecx
+       addl    36(%rsp),%ebx
+       xorl    %ebp,%edi
        movl    %ecx,%esi
        roll    $5,%ecx
-       xorl    %ebp,%edi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %edi,%ebx
-       addl    40(%rsp),%eax
        xorl    %ebp,%esi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    40(%rsp),%eax
+       xorl    %edx,%esi
        movl    %ebx,%edi
        roll    $5,%ebx
-       xorl    %edx,%esi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %esi,%eax
-       addl    44(%rsp),%ebp
-.byte  102,69,15,56,220,222
-       movups  160(%r15),%xmm15
        xorl    %edx,%edi
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       addl    44(%rsp),%ebp
+       movups  48(%r15),%xmm1
+.byte  102,15,56,220,208
+       xorl    %ecx,%edi
        movl    %eax,%esi
        roll    $5,%eax
-       xorl    %ecx,%edi
-       addl    %eax,%ebp
-       rorl    $7,%ebx
        addl    %edi,%ebp
-       addl    48(%rsp),%edx
        xorl    %ecx,%esi
+       rorl    $7,%ebx
+       addl    %eax,%ebp
+       addl    48(%rsp),%edx
+       xorl    %ebx,%esi
        movl    %ebp,%edi
        roll    $5,%ebp
-       xorl    %ebx,%esi
-       addl    %ebp,%edx
-       rorl    $7,%eax
        addl    %esi,%edx
-       addl    52(%rsp),%ecx
        xorl    %ebx,%edi
+       rorl    $7,%eax
+       addl    %ebp,%edx
+       addl    52(%rsp),%ecx
+       xorl    %eax,%edi
        movl    %edx,%esi
        roll    $5,%edx
-       xorl    %eax,%edi
+       addl    %edi,%ecx
        cmpl    $11,%r8d
        jb      .Laesenclast5
-       movups  176(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  192(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  64(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%r15),%xmm1
+.byte  102,15,56,220,208
        je      .Laesenclast5
-       movups  208(%r15),%xmm14
-.byte  102,69,15,56,220,223
-       movups  224(%r15),%xmm15
-.byte  102,69,15,56,220,222
+       movups  96(%r15),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%r15),%xmm1
+.byte  102,15,56,220,208
 .Laesenclast5:
-.byte  102,69,15,56,221,223
-       movups  16(%r15),%xmm14
-       addl    %edx,%ecx
+.byte  102,15,56,221,209
+       movups  16-112(%r15),%xmm0
+       xorl    %eax,%esi
        rorl    $7,%ebp
-       addl    %edi,%ecx
+       addl    %edx,%ecx
        addl    56(%rsp),%ebx
-       xorl    %eax,%esi
+       xorl    %ebp,%esi
        movl    %ecx,%edi
        roll    $5,%ecx
-       xorl    %ebp,%esi
-       addl    %ecx,%ebx
-       rorl    $7,%edx
        addl    %esi,%ebx
-       addl    60(%rsp),%eax
        xorl    %ebp,%edi
+       rorl    $7,%edx
+       addl    %ecx,%ebx
+       addl    60(%rsp),%eax
+       xorl    %edx,%edi
        movl    %ebx,%esi
        roll    $5,%ebx
-       xorl    %edx,%edi
-       addl    %ebx,%eax
-       rorl    $7,%ecx
        addl    %edi,%eax
-       movups  %xmm11,48(%r13,%r12,1)
+       rorl    $7,%ecx
+       addl    %ebx,%eax
+       movups  %xmm2,48(%r13,%r12,1)
        movq    88(%rsp),%r8
 
        addl    0(%r9),%eax
@@ -1372,7 +1355,7 @@ aesni_cbc_sha1_enc_ssse3:
        movl    %ecx,8(%r9)
        movl    %edx,12(%r9)
        movl    %ebp,16(%r9)
-       movups  %xmm11,(%r8)
+       movups  %xmm2,(%r8)
        leaq    104(%rsp),%rsi
        movq    0(%rsi),%r15
        movq    8(%rsi),%r14
@@ -1386,12 +1369,314 @@ aesni_cbc_sha1_enc_ssse3:
 .size  aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
 .align 64
 K_XX_XX:
-.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     
-.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     
-.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     
-.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     
-.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte  0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
 
 .byte  65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
+.type  aesni_cbc_sha1_enc_shaext,@function
+.align 32
+aesni_cbc_sha1_enc_shaext:
+       movq    8(%rsp),%r10
+       movdqu  (%r9),%xmm8
+       movd    16(%r9),%xmm9
+       movdqa  K_XX_XX+80(%rip),%xmm7
+
+       movl    240(%rcx),%r11d
+       subq    %rdi,%rsi
+       movups  (%rcx),%xmm15
+       movups  16(%rcx),%xmm0
+       leaq    112(%rcx),%rcx
+
+       pshufd  $27,%xmm8,%xmm8
+       pshufd  $27,%xmm9,%xmm9
+       jmp     .Loop_shaext
+
+.align 16
+.Loop_shaext:
+       movups  0(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       xorps   %xmm14,%xmm2
+       movups  -80(%rcx),%xmm1
+.byte  102,15,56,220,208
+       movdqu  (%r10),%xmm3
+       movdqa  %xmm9,%xmm12
+.byte  102,15,56,0,223
+       movdqu  16(%r10),%xmm4
+       movdqa  %xmm8,%xmm11
+       movups  -64(%rcx),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,0,231
+
+       paddd   %xmm3,%xmm9
+       movdqu  32(%r10),%xmm5
+       leaq    64(%r10),%r10
+       pxor    %xmm12,%xmm3
+       movups  -48(%rcx),%xmm1
+.byte  102,15,56,220,208
+       pxor    %xmm12,%xmm3
+       movdqa  %xmm8,%xmm10
+.byte  102,15,56,0,239
+.byte  69,15,58,204,193,0
+.byte  68,15,56,200,212
+       movups  -32(%rcx),%xmm0
+.byte  102,15,56,220,209
+.byte  15,56,201,220
+       movdqu  -16(%r10),%xmm6
+       movdqa  %xmm8,%xmm9
+.byte  102,15,56,0,247
+       movups  -16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  69,15,58,204,194,0
+.byte  68,15,56,200,205
+       pxor    %xmm5,%xmm3
+.byte  15,56,201,229
+       movups  0(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,0
+.byte  68,15,56,200,214
+       movups  16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,222
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+       movups  32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,0
+.byte  68,15,56,200,203
+       movups  48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,227
+       pxor    %xmm3,%xmm5
+.byte  15,56,201,243
+       cmpl    $11,%r11d
+       jb      .Laesenclast6
+       movups  64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%rcx),%xmm1
+.byte  102,15,56,220,208
+       je      .Laesenclast6
+       movups  96(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%rcx),%xmm1
+.byte  102,15,56,220,208
+.Laesenclast6:
+.byte  102,15,56,221,209
+       movups  16-112(%rcx),%xmm0
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,0
+.byte  68,15,56,200,212
+       movups  16(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,0(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,236
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,220
+       movups  -64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,1
+.byte  68,15,56,200,205
+       movups  -48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,245
+       pxor    %xmm5,%xmm3
+.byte  15,56,201,229
+       movups  -32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,1
+.byte  68,15,56,200,214
+       movups  -16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,222
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+       movups  0(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,1
+.byte  68,15,56,200,203
+       movups  16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,227
+       pxor    %xmm3,%xmm5
+.byte  15,56,201,243
+       movups  32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,1
+.byte  68,15,56,200,212
+       movups  48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,236
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,220
+       cmpl    $11,%r11d
+       jb      .Laesenclast7
+       movups  64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%rcx),%xmm1
+.byte  102,15,56,220,208
+       je      .Laesenclast7
+       movups  96(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%rcx),%xmm1
+.byte  102,15,56,220,208
+.Laesenclast7:
+.byte  102,15,56,221,209
+       movups  16-112(%rcx),%xmm0
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,1
+.byte  68,15,56,200,205
+       movups  32(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,16(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,245
+       pxor    %xmm5,%xmm3
+.byte  15,56,201,229
+       movups  -64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,2
+.byte  68,15,56,200,214
+       movups  -48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,222
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+       movups  -32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,2
+.byte  68,15,56,200,203
+       movups  -16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,227
+       pxor    %xmm3,%xmm5
+.byte  15,56,201,243
+       movups  0(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,2
+.byte  68,15,56,200,212
+       movups  16(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,236
+       pxor    %xmm4,%xmm6
+.byte  15,56,201,220
+       movups  32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,2
+.byte  68,15,56,200,205
+       movups  48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,245
+       pxor    %xmm5,%xmm3
+.byte  15,56,201,229
+       cmpl    $11,%r11d
+       jb      .Laesenclast8
+       movups  64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%rcx),%xmm1
+.byte  102,15,56,220,208
+       je      .Laesenclast8
+       movups  96(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%rcx),%xmm1
+.byte  102,15,56,220,208
+.Laesenclast8:
+.byte  102,15,56,221,209
+       movups  16-112(%rcx),%xmm0
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,2
+.byte  68,15,56,200,214
+       movups  48(%rdi),%xmm14
+       xorps   %xmm15,%xmm14
+       movups  %xmm2,32(%rsi,%rdi,1)
+       xorps   %xmm14,%xmm2
+       movups  -80(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,222
+       pxor    %xmm6,%xmm4
+.byte  15,56,201,238
+       movups  -64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,3
+.byte  68,15,56,200,203
+       movups  -48(%rcx),%xmm1
+.byte  102,15,56,220,208
+.byte  15,56,202,227
+       pxor    %xmm3,%xmm5
+.byte  15,56,201,243
+       movups  -32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,3
+.byte  68,15,56,200,212
+.byte  15,56,202,236
+       pxor    %xmm4,%xmm6
+       movups  -16(%rcx),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,3
+.byte  68,15,56,200,205
+.byte  15,56,202,245
+       movups  0(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movdqa  %xmm12,%xmm5
+       movdqa  %xmm8,%xmm10
+.byte  69,15,58,204,193,3
+.byte  68,15,56,200,214
+       movups  16(%rcx),%xmm1
+.byte  102,15,56,220,208
+       movdqa  %xmm8,%xmm9
+.byte  69,15,58,204,194,3
+.byte  68,15,56,200,205
+       movups  32(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  48(%rcx),%xmm1
+.byte  102,15,56,220,208
+       cmpl    $11,%r11d
+       jb      .Laesenclast9
+       movups  64(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  80(%rcx),%xmm1
+.byte  102,15,56,220,208
+       je      .Laesenclast9
+       movups  96(%rcx),%xmm0
+.byte  102,15,56,220,209
+       movups  112(%rcx),%xmm1
+.byte  102,15,56,220,208
+.Laesenclast9:
+.byte  102,15,56,221,209
+       movups  16-112(%rcx),%xmm0
+       decq    %rdx
+
+       paddd   %xmm11,%xmm8
+       movups  %xmm2,48(%rsi,%rdi,1)
+       leaq    64(%rdi),%rdi
+       jnz     .Loop_shaext
+
+       pshufd  $27,%xmm8,%xmm8
+       pshufd  $27,%xmm9,%xmm9
+       movups  %xmm2,(%r8)
+       movdqu  %xmm8,(%r9)
+       movd    %xmm9,16(%r9)
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
 .section .note.GNU-stack,"",%progbits
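
A note on the rewrite of aesni-sha1-x86_64.s above. Most of the churn is register renaming: the 1.0.2 stitch keeps the CBC block in %xmm2 and cycles the round keys through %xmm0/%xmm1, with the key pointer biased by 112 bytes so that offsets such as -80(%r15) and 16-112(%r15) address the schedule directly. Within each round group, the scalar addl/xorl/roll/rorl sequences are the SHA-1 round function, the pxor/punpcklqdq/pslld/psrld/por lane work expands the SHA-1 message schedule four words at a time, and the interleaved .byte 102,15,56,220,... sequences are aesenc rounds pushing the CBC data through in parallel, so the integer and AES units stay busy at the same time. The new aesni_cbc_sha1_enc_shaext entry point instead drives the SHA instruction extensions (sha1rnds4, sha1nexte, sha1msg1, sha1msg2, emitted as raw .byte so that older assemblers can still build the file); the .byte 0xf,0xe,...,0x0 row appended to K_XX_XX is the byte-reversal mask it loads from K_XX_XX+80(%rip). For reference, the schedule step that the SSE lane work vectorises, as plain C (rol32 and sha1_schedule are illustrative names, not OpenSSL functions):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* w[0..15] hold the 64-byte block as big-endian words; the stitch
     * above computes w[16..79] four at a time with pxor/pslld/psrld/por. */
    static void sha1_schedule(uint32_t w[80])
    {
        for (int t = 16; t < 80; t++)
            w[t] = rol32(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
    }
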
diff --git a/secure/lib/libcrypto/asm/aesni-sha256-x86_64.s b/secure/lib/libcrypto/asm/aesni-sha256-x86_64.s
new file mode 100644
index 0000000..26f0e10
--- /dev/null
+++ b/secure/lib/libcrypto/asm/aesni-sha256-x86_64.s
@@ -0,0 +1,58 @@
+.text  
+
+
+.globl aesni_cbc_sha256_enc
+.type  aesni_cbc_sha256_enc,@function
+.align 16
+aesni_cbc_sha256_enc:
+       xorl    %eax,%eax
+       cmpq    $0,%rdi
+       je      .Lprobe
+       ud2
+.Lprobe:
+       .byte   0xf3,0xc3
+.size  aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc
+
+.align 64
+.type  K256,@object
+K256:
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long  0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
+.long  0,0,0,0,   0,0,0,0
+.byte  65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.section .note.GNU-stack,"",%progbits
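
The new aesni-sha256-x86_64.s above carries no stitched AES-CBC+SHA-256 implementation for this build: only the K256 constant table and a probe stub. aesni_cbc_sha256_enc clears %eax and returns when its first argument is zero, and hits ud2 otherwise, so a probing caller reads the 0 as "not available" and falls back to separate AES and SHA-256 code paths; upstream appears to generate the real stitched body only for AVX-capable assemblers, which is presumably why just the stub is emitted here. A sketch of the observable behaviour (not the OpenSSL prototype):

    /* arg mirrors %rdi; a zero return from the probe means the stitched
     * cipher is not compiled in. */
    int aesni_cbc_sha256_enc_stub(const void *arg)
    {
        if (arg == 0)
            return 0;       /* xorl %eax,%eax; je .Lprobe; ret */
        __builtin_trap();   /* ud2: never called for real work */
    }
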
diff --git a/secure/lib/libcrypto/asm/aesni-x86_64.s b/secure/lib/libcrypto/asm/aesni-x86_64.s
index 50b2183..5c801bc 100644
--- a/secure/lib/libcrypto/asm/aesni-x86_64.s
+++ b/secure/lib/libcrypto/asm/aesni-x86_64.s
@@ -1,4 +1,5 @@
 .text  
+
 .globl aesni_encrypt
 .type  aesni_encrypt,@function
 .align 16
@@ -14,9 +15,12 @@ aesni_encrypt:
        decl    %eax
        movups  (%rdx),%xmm1
        leaq    16(%rdx),%rdx
-       jnz     .Loop_enc1_1    
+       jnz     .Loop_enc1_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_encrypt,.-aesni_encrypt
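
The pxor lines added to aesni_encrypt above (and to aesni_decrypt in the next hunk) scrub the round keys in %xmm0/%xmm1 and the data block in %xmm2 before returning, so that key and plaintext material is not left behind in SIMD registers for later code to observe; the same store-then-clear pattern recurs throughout this file, for example after every movups store in the ECB paths further down. The equivalent in intrinsics (illustrative only):

    #include <immintrin.h>

    /* Zero the registers that held round keys and the processed block. */
    static void scrub_regs(__m128i *k0, __m128i *k1, __m128i *blk)
    {
        *k0  = _mm_setzero_si128();  /* pxor %xmm0,%xmm0 */
        *k1  = _mm_setzero_si128();  /* pxor %xmm1,%xmm1 */
        *blk = _mm_setzero_si128();  /* pxor %xmm2,%xmm2, after the output store */
    }
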
 
@@ -35,34 +39,96 @@ aesni_decrypt:
        decl    %eax
        movups  (%rdx),%xmm1
        leaq    16(%rdx),%rdx
-       jnz     .Loop_dec1_2    
+       jnz     .Loop_dec1_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_decrypt, .-aesni_decrypt
+.type  _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+       movups  (%rcx),%xmm0
+       shll    $4,%eax
+       movups  16(%rcx),%xmm1
+       xorps   %xmm0,%xmm2
+       xorps   %xmm0,%xmm3
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+       addq    $16,%rax
+
+.Lenc_loop2:
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Lenc_loop2
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,221,208
+.byte  102,15,56,221,216
+       .byte   0xf3,0xc3
+.size  _aesni_encrypt2,.-_aesni_encrypt2
+.type  _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+       movups  (%rcx),%xmm0
+       shll    $4,%eax
+       movups  16(%rcx),%xmm1
+       xorps   %xmm0,%xmm2
+       xorps   %xmm0,%xmm3
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+       addq    $16,%rax
+
+.Ldec_loop2:
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+.byte  102,15,56,222,208
+.byte  102,15,56,222,216
+       movups  -16(%rcx,%rax,1),%xmm0
+       jnz     .Ldec_loop2
+
+.byte  102,15,56,222,209
+.byte  102,15,56,222,217
+.byte  102,15,56,223,208
+.byte  102,15,56,223,216
+       .byte   0xf3,0xc3
+.size  _aesni_decrypt2,.-_aesni_decrypt2
 .type  _aesni_encrypt3,@function
 .align 16
 _aesni_encrypt3:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
        xorps   %xmm0,%xmm4
-       movups  (%rcx),%xmm0
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+       addq    $16,%rax
 
 .Lenc_loop3:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
 .byte  102,15,56,220,225
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,224
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lenc_loop3
 
 .byte  102,15,56,220,209
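
Two changes run through all of the _aesni_encryptN/_aesni_decryptN helpers here. First, dedicated 2-block variants (_aesni_encrypt2/_aesni_decrypt2 above) replace the old trick of calling the 3-block helper with a zeroed dummy register. Second, the round-key loop is re-addressed: instead of halving the round count (shrl $1) and spending a decl and a leaq on every iteration, the key pointer is parked past the end of the schedule (shll $4 followed by leaq 32(%rcx,%rax,1)) and a negative index counts up toward zero, so a single addq $32 both advances the keys and sets the flags that jnz tests. A literal C rendering of the new walk; use2() is a hypothetical stand-in for one aesenc applied to both blocks, and the caller arranges the count so that the index reaches zero exactly when only the final aesenc/aesenclast keys remain:

    #include <stdint.h>

    static void walk_round_keys(const uint8_t *key, long eax,
                                void (*use2)(const uint8_t *rk))
    {
        long rax = eax << 4;                  /* shll $4,%eax */
        const uint8_t *k1 = key + 16;         /* movups 16(%rcx),%xmm1 */
        const uint8_t *k0 = key + 32;         /* movups 32(%rcx),%xmm0 */
        const uint8_t *rcx = key + 32 + rax;  /* leaq 32(%rcx,%rax,1),%rcx */
        rax = -rax + 16;                      /* negq %rax; addq $16,%rax */

        do {                                  /* .Lenc_loop2 */
            use2(k1);
            k1 = rcx + rax;                   /* movups (%rcx,%rax,1),%xmm1 */
            rax += 32;                        /* addq $32,%rax */
            use2(k0);
            k0 = rcx + rax - 16;              /* movups -16(%rcx,%rax,1),%xmm0 */
        } while (rax != 0);                   /* jnz .Lenc_loop2 */

        use2(k1);                             /* final aesenc ... */
        (void)k0;                             /* ... then aesenclast with k0 */
    }

The stray .byte 0x0f,0x1f,0x00 that appears in the 4-block variants is a three-byte NOP (nopl (%rax)) used only as padding at the loop entry.
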
@@ -77,25 +143,26 @@ _aesni_encrypt3:
 .align 16
 _aesni_decrypt3:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
        xorps   %xmm0,%xmm4
-       movups  (%rcx),%xmm0
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+       addq    $16,%rax
 
 .Ldec_loop3:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %eax
 .byte  102,15,56,222,225
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,222,224
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Ldec_loop3
 
 .byte  102,15,56,222,209
@@ -110,28 +177,30 @@ _aesni_decrypt3:
 .align 16
 _aesni_encrypt4:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
        xorps   %xmm0,%xmm4
        xorps   %xmm0,%xmm5
-       movups  (%rcx),%xmm0
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+.byte  0x0f,0x1f,0x00
+       addq    $16,%rax
 
 .Lenc_loop4:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lenc_loop4
 
 .byte  102,15,56,220,209
@@ -148,28 +217,30 @@ _aesni_encrypt4:
 .align 16
 _aesni_decrypt4:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
        xorps   %xmm0,%xmm4
        xorps   %xmm0,%xmm5
-       movups  (%rcx),%xmm0
+       movups  32(%rcx),%xmm0
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+.byte  0x0f,0x1f,0x00
+       addq    $16,%rax
 
 .Ldec_loop4:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %eax
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Ldec_loop4
 
 .byte  102,15,56,222,209
@@ -186,43 +257,40 @@ _aesni_decrypt4:
 .align 16
 _aesni_encrypt6:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,220,209
        pxor    %xmm0,%xmm4
+.byte  102,15,56,220,209
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
 .byte  102,15,56,220,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,220,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
+.byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
-       decl    %eax
-.byte  102,15,56,220,241
-       movups  (%rcx),%xmm0
-.byte  102,15,56,220,249
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
        jmp     .Lenc_loop6_enter
 .align 16
 .Lenc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
 .byte  102,15,56,220,225
+.Lenc_loop6_enter:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.Lenc_loop6_enter:
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lenc_loop6
 
 .byte  102,15,56,220,209
@@ -243,43 +311,40 @@ _aesni_encrypt6:
 .align 16
 _aesni_decrypt6:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        pxor    %xmm0,%xmm3
-.byte  102,15,56,222,209
        pxor    %xmm0,%xmm4
+.byte  102,15,56,222,209
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
 .byte  102,15,56,222,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,222,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,222,233
+.byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
-       decl    %eax
-.byte  102,15,56,222,241
-       movups  (%rcx),%xmm0
-.byte  102,15,56,222,249
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
        jmp     .Ldec_loop6_enter
 .align 16
 .Ldec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %eax
 .byte  102,15,56,222,225
+.Ldec_loop6_enter:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.Ldec_loop6_enter:
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Ldec_loop6
 
 .byte  102,15,56,222,209
@@ -300,52 +365,46 @@ _aesni_decrypt6:
 .align 16
 _aesni_encrypt8:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
-.byte  102,15,56,220,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,220,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,220,233
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+.byte  102,15,56,220,209
        pxor    %xmm0,%xmm7
-       decl    %eax
-.byte  102,15,56,220,241
        pxor    %xmm0,%xmm8
-.byte  102,15,56,220,249
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm9
-       movups  (%rcx),%xmm0
-.byte  102,68,15,56,220,193
-.byte  102,68,15,56,220,201
-       movups  16(%rcx),%xmm1
-       jmp     .Lenc_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Lenc_loop8_inner
 .align 16
 .Lenc_loop8:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
+.Lenc_loop8_inner:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
 .byte  102,68,15,56,220,193
 .byte  102,68,15,56,220,201
-       movups  16(%rcx),%xmm1
 .Lenc_loop8_enter:
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
 .byte  102,68,15,56,220,192
 .byte  102,68,15,56,220,200
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lenc_loop8
 
 .byte  102,15,56,220,209
@@ -370,52 +429,46 @@ _aesni_encrypt8:
 .align 16
 _aesni_decrypt8:
        movups  (%rcx),%xmm0
-       shrl    $1,%eax
+       shll    $4,%eax
        movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm0,%xmm3
-.byte  102,15,56,222,209
        pxor    %xmm0,%xmm4
-.byte  102,15,56,222,217
        pxor    %xmm0,%xmm5
-.byte  102,15,56,222,225
        pxor    %xmm0,%xmm6
-.byte  102,15,56,222,233
+       leaq    32(%rcx,%rax,1),%rcx
+       negq    %rax
+.byte  102,15,56,222,209
        pxor    %xmm0,%xmm7
-       decl    %eax
-.byte  102,15,56,222,241
        pxor    %xmm0,%xmm8
-.byte  102,15,56,222,249
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm9
-       movups  (%rcx),%xmm0
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  16(%rcx),%xmm1
-       jmp     .Ldec_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Ldec_loop8_inner
 .align 16
 .Ldec_loop8:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %eax
+.Ldec_loop8_inner:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
 .byte  102,68,15,56,222,193
 .byte  102,68,15,56,222,201
-       movups  16(%rcx),%xmm1
 .Ldec_loop8_enter:
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
 .byte  102,68,15,56,222,192
 .byte  102,68,15,56,222,200
-       movups  (%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Ldec_loop8
 
 .byte  102,15,56,222,209
@@ -450,7 +503,7 @@ aesni_ecb_encrypt:
        testl   %r8d,%r8d
        jz      .Lecb_decrypt
 
-       cmpq    $128,%rdx
+       cmpq    $0x80,%rdx
        jb      .Lecb_enc_tail
 
        movdqu  (%rdi),%xmm2
@@ -462,7 +515,7 @@ aesni_ecb_encrypt:
        movdqu  96(%rdi),%xmm8
        movdqu  112(%rdi),%xmm9
        leaq    128(%rdi),%rdi
-       subq    $128,%rdx
+       subq    $0x80,%rdx
        jmp     .Lecb_enc_loop8_enter
 .align 16
 .Lecb_enc_loop8:
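
A good part of the remaining churn in this file is cosmetic: the perlasm translator now prints size constants in hex, so cmpq $128 becomes cmpq $0x80, subq $128 becomes subq $0x80, and so on, with no change in behaviour.
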
@@ -490,7 +543,7 @@ aesni_ecb_encrypt:
 
        call    _aesni_encrypt8
 
-       subq    $128,%rdx
+       subq    $0x80,%rdx
        jnc     .Lecb_enc_loop8
 
        movups  %xmm2,(%rsi)
@@ -504,26 +557,27 @@ aesni_ecb_encrypt:
        movups  %xmm8,96(%rsi)
        movups  %xmm9,112(%rsi)
        leaq    128(%rsi),%rsi
-       addq    $128,%rdx
+       addq    $0x80,%rdx
        jz      .Lecb_ret
 
 .Lecb_enc_tail:
        movups  (%rdi),%xmm2
-       cmpq    $32,%rdx
+       cmpq    $0x20,%rdx
        jb      .Lecb_enc_one
        movups  16(%rdi),%xmm3
        je      .Lecb_enc_two
        movups  32(%rdi),%xmm4
-       cmpq    $64,%rdx
+       cmpq    $0x40,%rdx
        jb      .Lecb_enc_three
        movups  48(%rdi),%xmm5
        je      .Lecb_enc_four
        movups  64(%rdi),%xmm6
-       cmpq    $96,%rdx
+       cmpq    $0x60,%rdx
        jb      .Lecb_enc_five
        movups  80(%rdi),%xmm7
        je      .Lecb_enc_six
        movdqu  96(%rdi),%xmm8
+       xorps   %xmm9,%xmm9
        call    _aesni_encrypt8
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
@@ -544,14 +598,13 @@ aesni_ecb_encrypt:
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_3    
+       jnz     .Loop_enc1_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%rsi)
        jmp     .Lecb_ret
 .align 16
 .Lecb_enc_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_encrypt3
+       call    _aesni_encrypt2
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
        jmp     .Lecb_ret
@@ -593,7 +646,7 @@ aesni_ecb_encrypt:
 
 .align 16
 .Lecb_decrypt:
-       cmpq    $128,%rdx
+       cmpq    $0x80,%rdx
        jb      .Lecb_dec_tail
 
        movdqu  (%rdi),%xmm2
@@ -605,7 +658,7 @@ aesni_ecb_encrypt:
        movdqu  96(%rdi),%xmm8
        movdqu  112(%rdi),%xmm9
        leaq    128(%rdi),%rdi
-       subq    $128,%rdx
+       subq    $0x80,%rdx
        jmp     .Lecb_dec_loop8_enter
 .align 16
 .Lecb_dec_loop8:
@@ -634,49 +687,66 @@ aesni_ecb_encrypt:
        call    _aesni_decrypt8
 
        movups  (%r11),%xmm0
-       subq    $128,%rdx
+       subq    $0x80,%rdx
        jnc     .Lecb_dec_loop8
 
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movq    %r11,%rcx
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movl    %r10d,%eax
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
        movups  %xmm9,112(%rsi)
+       pxor    %xmm9,%xmm9
        leaq    128(%rsi),%rsi
-       addq    $128,%rdx
+       addq    $0x80,%rdx
        jz      .Lecb_ret
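 # From here on the decrypt path scrubs each xmm register with pxor as
 # soon as its output block has been stored; this looks like the upstream
 # hardening against leaving plaintext or key material behind in SIMD
 # registers on return.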
 
 .Lecb_dec_tail:
        movups  (%rdi),%xmm2
-       cmpq    $32,%rdx
+       cmpq    $0x20,%rdx
        jb      .Lecb_dec_one
        movups  16(%rdi),%xmm3
        je      .Lecb_dec_two
        movups  32(%rdi),%xmm4
-       cmpq    $64,%rdx
+       cmpq    $0x40,%rdx
        jb      .Lecb_dec_three
        movups  48(%rdi),%xmm5
        je      .Lecb_dec_four
        movups  64(%rdi),%xmm6
-       cmpq    $96,%rdx
+       cmpq    $0x60,%rdx
        jb      .Lecb_dec_five
        movups  80(%rdi),%xmm7
        je      .Lecb_dec_six
        movups  96(%rdi),%xmm8
        movups  (%rcx),%xmm0
+       xorps   %xmm9,%xmm9
        call    _aesni_decrypt8
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_one:
@@ -689,53 +759,76 @@ aesni_ecb_encrypt:
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_dec1_4    
+       jnz     .Loop_dec1_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_decrypt3
+       call    _aesni_decrypt2
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_five:
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_six:
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
 
 .Lecb_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 .size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
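 # aesni_ccm64_encrypt_blocks: the CCM counter moves from %xmm9 to %xmm6
 # (and the increment constant to %xmm9), and the round count is turned
 # into a byte offset (shll $4) so round keys are fetched with indexed
 # addressing rather than a decrement-and-advance loop.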
 .globl aesni_ccm64_encrypt_blocks
@@ -743,56 +836,62 @@ aesni_ecb_encrypt:
 .align 16
 aesni_ccm64_encrypt_blocks:
        movl    240(%rcx),%eax
-       movdqu  (%r8),%xmm9
-       movdqa  .Lincrement64(%rip),%xmm6
+       movdqu  (%r8),%xmm6
+       movdqa  .Lincrement64(%rip),%xmm9
        movdqa  .Lbswap_mask(%rip),%xmm7
 
-       shrl    $1,%eax
+       shll    $4,%eax
+       movl    $16,%r10d
        leaq    0(%rcx),%r11
        movdqu  (%r9),%xmm3
-       movdqa  %xmm9,%xmm2
-       movl    %eax,%r10d
-.byte  102,68,15,56,0,207
+       movdqa  %xmm6,%xmm2
+       leaq    32(%rcx,%rax,1),%rcx
+.byte  102,15,56,0,247
+       subq    %rax,%r10
        jmp     .Lccm64_enc_outer
 .align 16
 .Lccm64_enc_outer:
        movups  (%r11),%xmm0
-       movl    %r10d,%eax
+       movq    %r10,%rax
        movups  (%rdi),%xmm8
 
        xorps   %xmm0,%xmm2
        movups  16(%r11),%xmm1
        xorps   %xmm8,%xmm0
-       leaq    32(%r11),%rcx
        xorps   %xmm0,%xmm3
-       movups  (%rcx),%xmm0
+       movups  32(%r11),%xmm0
 
 .Lccm64_enc2_loop:
 .byte  102,15,56,220,209
-       decl    %eax
 .byte  102,15,56,220,217
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,216
-       movups  0(%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       paddq   %xmm6,%xmm9
+       paddq   %xmm9,%xmm6
+       decq    %rdx
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
 
-       decq    %rdx
        leaq    16(%rdi),%rdi
        xorps   %xmm2,%xmm8
-       movdqa  %xmm9,%xmm2
+       movdqa  %xmm6,%xmm2
        movups  %xmm8,(%rsi)
-       leaq    16(%rsi),%rsi
 .byte  102,15,56,0,215
+       leaq    16(%rsi),%rsi
        jnz     .Lccm64_enc_outer
 
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
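 # aesni_ccm64_decrypt_blocks mirrors the encrypt side: each block is
 # XORed against the encrypted counter, then folded into the CBC-MAC
 # state in %xmm3, using the same indexed round-key addressing.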
 .globl aesni_ccm64_decrypt_blocks
@@ -800,15 +899,15 @@ aesni_ccm64_encrypt_blocks:
 .align 16
 aesni_ccm64_decrypt_blocks:
        movl    240(%rcx),%eax
-       movups  (%r8),%xmm9
+       movups  (%r8),%xmm6
        movdqu  (%r9),%xmm3
-       movdqa  .Lincrement64(%rip),%xmm6
+       movdqa  .Lincrement64(%rip),%xmm9
        movdqa  .Lbswap_mask(%rip),%xmm7
 
-       movaps  %xmm9,%xmm2
+       movaps  %xmm6,%xmm2
        movl    %eax,%r10d
        movq    %rcx,%r11
-.byte  102,68,15,56,0,207
+.byte  102,15,56,0,247
        movups  (%rcx),%xmm0
        movups  16(%rcx),%xmm1
        leaq    32(%rcx),%rcx
@@ -818,17 +917,21 @@ aesni_ccm64_decrypt_blocks:
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_5    
+       jnz     .Loop_enc1_5
 .byte  102,15,56,221,209
+       shll    $4,%r10d
+       movl    $16,%eax
        movups  (%rdi),%xmm8
-       paddq   %xmm6,%xmm9
+       paddq   %xmm9,%xmm6
        leaq    16(%rdi),%rdi
+       subq    %r10,%rax
+       leaq    32(%r11,%r10,1),%rcx
+       movq    %rax,%r10
        jmp     .Lccm64_dec_outer
 .align 16
 .Lccm64_dec_outer:
        xorps   %xmm2,%xmm8
-       movdqa  %xmm9,%xmm2
-       movl    %r10d,%eax
+       movdqa  %xmm6,%xmm2
        movups  %xmm8,(%rsi)
        leaq    16(%rsi),%rsi
 .byte  102,15,56,0,215
@@ -837,36 +940,36 @@ aesni_ccm64_decrypt_blocks:
        jz      .Lccm64_dec_break
 
        movups  (%r11),%xmm0
-       shrl    $1,%eax
+       movq    %r10,%rax
        movups  16(%r11),%xmm1
        xorps   %xmm0,%xmm8
-       leaq    32(%r11),%rcx
        xorps   %xmm0,%xmm2
        xorps   %xmm8,%xmm3
-       movups  (%rcx),%xmm0
-
+       movups  32(%r11),%xmm0
+       jmp     .Lccm64_dec2_loop
+.align 16
 .Lccm64_dec2_loop:
 .byte  102,15,56,220,209
-       decl    %eax
 .byte  102,15,56,220,217
-       movups  16(%rcx),%xmm1
+       movups  (%rcx,%rax,1),%xmm1
+       addq    $32,%rax
 .byte  102,15,56,220,208
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,216
-       movups  0(%rcx),%xmm0
+       movups  -16(%rcx,%rax,1),%xmm0
        jnz     .Lccm64_dec2_loop
        movups  (%rdi),%xmm8
-       paddq   %xmm6,%xmm9
+       paddq   %xmm9,%xmm6
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       leaq    16(%rdi),%rdi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
+       leaq    16(%rdi),%rdi
        jmp     .Lccm64_dec_outer
 
 .align 16
 .Lccm64_dec_break:
 
+       movl    240(%r11),%eax
        movups  (%r11),%xmm0
        movups  16(%r11),%xmm1
        xorps   %xmm0,%xmm8
@@ -877,9 +980,15 @@ aesni_ccm64_decrypt_blocks:
        decl    %eax
        movups  (%r11),%xmm1
        leaq    16(%r11),%r11
-       jnz     .Loop_enc1_6    
+       jnz     .Loop_enc1_6
 .byte  102,15,56,221,217
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
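 # aesni_ctr32_encrypt_blocks is substantially rewritten: single-block
 # requests take an inlined shortcut, and the bulk path keeps eight
 # counter blocks, pre-XORed with round key 0, in an aligned 128-byte
 # stack buffer. The OPENSSL_ia32cap_P test (mask 71303168 = 0x4400000
 # against 4194304 = 0x400000) appears to pick a MOVBE-based six-block
 # loop for CPUs with MOVBE but no XSAVE, i.e. Atom-class cores;
 # everything else uses the eight-block loop.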
 .globl aesni_ctr32_encrypt_blocks
@@ -887,490 +996,859 @@ aesni_ccm64_decrypt_blocks:
 .align 16
 aesni_ctr32_encrypt_blocks:
        cmpq    $1,%rdx
-       je      .Lctr32_one_shortcut
+       jne     .Lctr32_bulk
+
+
+
+       movups  (%r8),%xmm2
+       movups  (%rdi),%xmm3
+       movl    240(%rcx),%edx
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_enc1_7:
+.byte  102,15,56,220,209
+       decl    %edx
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_enc1_7
+.byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm2
+       jmp     .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+       leaq    (%rsp),%rax
+       pushq   %rbp
+       subq    $128,%rsp
+       andq    $-16,%rsp
+       leaq    -8(%rax),%rbp
+
 
-       movdqu  (%r8),%xmm14
-       movdqa  .Lbswap_mask(%rip),%xmm15
-       xorl    %eax,%eax
-.byte  102,69,15,58,22,242,3
-.byte  102,68,15,58,34,240,3
 
+
+       movdqu  (%r8),%xmm2
+       movdqu  (%rcx),%xmm0
+       movl    12(%r8),%r8d
+       pxor    %xmm0,%xmm2
+       movl    12(%rcx),%r11d
+       movdqa  %xmm2,0(%rsp)
+       bswapl  %r8d
+       movdqa  %xmm2,%xmm3
+       movdqa  %xmm2,%xmm4
+       movdqa  %xmm2,%xmm5
+       movdqa  %xmm2,64(%rsp)
+       movdqa  %xmm2,80(%rsp)
+       movdqa  %xmm2,96(%rsp)
+       movq    %rdx,%r10
+       movdqa  %xmm2,112(%rsp)
+
+       leaq    1(%r8),%rax
+       leaq    2(%r8),%rdx
+       bswapl  %eax
+       bswapl  %edx
+       xorl    %r11d,%eax
+       xorl    %r11d,%edx
+.byte  102,15,58,34,216,3
+       leaq    3(%r8),%rax
+       movdqa  %xmm3,16(%rsp)
+.byte  102,15,58,34,226,3
+       bswapl  %eax
+       movq    %r10,%rdx
+       leaq    4(%r8),%r10
+       movdqa  %xmm4,32(%rsp)
+       xorl    %r11d,%eax
+       bswapl  %r10d
+.byte  102,15,58,34,232,3
+       xorl    %r11d,%r10d
+       movdqa  %xmm5,48(%rsp)
+       leaq    5(%r8),%r9
+       movl    %r10d,64+12(%rsp)
+       bswapl  %r9d
+       leaq    6(%r8),%r10
        movl    240(%rcx),%eax
+       xorl    %r11d,%r9d
        bswapl  %r10d
-       pxor    %xmm12,%xmm12
-       pxor    %xmm13,%xmm13
-.byte  102,69,15,58,34,226,0
-       leaq    3(%r10),%r11
-.byte  102,69,15,58,34,235,0
-       incl    %r10d
-.byte  102,69,15,58,34,226,1
-       incq    %r11
-.byte  102,69,15,58,34,235,1
-       incl    %r10d
-.byte  102,69,15,58,34,226,2
-       incq    %r11
-.byte  102,69,15,58,34,235,2
-       movdqa  %xmm12,-40(%rsp)
-.byte  102,69,15,56,0,231
-       movdqa  %xmm13,-24(%rsp)
-.byte  102,69,15,56,0,239
-
-       pshufd  $192,%xmm12,%xmm2
-       pshufd  $128,%xmm12,%xmm3
-       pshufd  $64,%xmm12,%xmm4
-       cmpq    $6,%rdx
+       movl    %r9d,80+12(%rsp)
+       xorl    %r11d,%r10d
+       leaq    7(%r8),%r9
+       movl    %r10d,96+12(%rsp)
+       bswapl  %r9d
+       movl    OPENSSL_ia32cap_P+4(%rip),%r10d
+       xorl    %r11d,%r9d
+       andl    $71303168,%r10d
+       movl    %r9d,112+12(%rsp)
+
+       movups  16(%rcx),%xmm1
+
+       movdqa  64(%rsp),%xmm6
+       movdqa  80(%rsp),%xmm7
+
+       cmpq    $8,%rdx
        jb      .Lctr32_tail
-       shrl    $1,%eax
-       movq    %rcx,%r11
-       movl    %eax,%r10d
+
        subq    $6,%rdx
+       cmpl    $4194304,%r10d
+       je      .Lctr32_6x
+
+       leaq    128(%rcx),%rcx
+       subq    $2,%rdx
+       jmp     .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+       shll    $4,%eax
+       movl    $48,%r10d
+       bswapl  %r11d
+       leaq    32(%rcx,%rax,1),%rcx
+       subq    %rax,%r10
        jmp     .Lctr32_loop6
 
 .align 16
 .Lctr32_loop6:
-       pshufd  $192,%xmm13,%xmm5
-       por     %xmm14,%xmm2
-       movups  (%r11),%xmm0
-       pshufd  $128,%xmm13,%xmm6
-       por     %xmm14,%xmm3
-       movups  16(%r11),%xmm1
-       pshufd  $64,%xmm13,%xmm7
-       por     %xmm14,%xmm4
-       por     %xmm14,%xmm5
-       xorps   %xmm0,%xmm2
-       por     %xmm14,%xmm6
-       por     %xmm14,%xmm7
+       addl    $6,%r8d
+       movups  -48(%rcx,%r10,1),%xmm0
+.byte  102,15,56,220,209
+       movl    %r8d,%eax
+       xorl    %r11d,%eax
+.byte  102,15,56,220,217
+.byte  0x0f,0x38,0xf1,0x44,0x24,12
+       leal    1(%r8),%eax
+.byte  102,15,56,220,225
+       xorl    %r11d,%eax
+.byte  0x0f,0x38,0xf1,0x44,0x24,28
+.byte  102,15,56,220,233
+       leal    2(%r8),%eax
+       xorl    %r11d,%eax
+.byte  102,15,56,220,241
+.byte  0x0f,0x38,0xf1,0x44,0x24,44
+       leal    3(%r8),%eax
+.byte  102,15,56,220,249
+       movups  -32(%rcx,%r10,1),%xmm1
+       xorl    %r11d,%eax
+
+.byte  102,15,56,220,208
+.byte  0x0f,0x38,0xf1,0x44,0x24,60
+       leal    4(%r8),%eax
+.byte  102,15,56,220,216
+       xorl    %r11d,%eax
+.byte  0x0f,0x38,0xf1,0x44,0x24,76
+.byte  102,15,56,220,224
+       leal    5(%r8),%eax
+       xorl    %r11d,%eax
+.byte  102,15,56,220,232
+.byte  0x0f,0x38,0xf1,0x44,0x24,92
+       movq    %r10,%rax
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  -16(%rcx,%r10,1),%xmm0
 
+       call    .Lenc_loop6
 
+       movdqu  (%rdi),%xmm8
+       movdqu  16(%rdi),%xmm9
+       movdqu  32(%rdi),%xmm10
+       movdqu  48(%rdi),%xmm11
+       movdqu  64(%rdi),%xmm12
+       movdqu  80(%rdi),%xmm13
+       leaq    96(%rdi),%rdi
+       movups  -64(%rcx,%r10,1),%xmm1
+       pxor    %xmm2,%xmm8
+       movaps  0(%rsp),%xmm2
+       pxor    %xmm3,%xmm9
+       movaps  16(%rsp),%xmm3
+       pxor    %xmm4,%xmm10
+       movaps  32(%rsp),%xmm4
+       pxor    %xmm5,%xmm11
+       movaps  48(%rsp),%xmm5
+       pxor    %xmm6,%xmm12
+       movaps  64(%rsp),%xmm6
+       pxor    %xmm7,%xmm13
+       movaps  80(%rsp),%xmm7
+       movdqu  %xmm8,(%rsi)
+       movdqu  %xmm9,16(%rsi)
+       movdqu  %xmm10,32(%rsi)
+       movdqu  %xmm11,48(%rsi)
+       movdqu  %xmm12,64(%rsi)
+       movdqu  %xmm13,80(%rsi)
+       leaq    96(%rsi),%rsi
 
+       subq    $6,%rdx
+       jnc     .Lctr32_loop6
 
-       pxor    %xmm0,%xmm3
+       addq    $6,%rdx
+       jz      .Lctr32_done
+
+       leal    -48(%r10),%eax
+       leaq    -80(%rcx,%r10,1),%rcx
+       negl    %eax
+       shrl    $4,%eax
+       jmp     .Lctr32_tail
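+# Eight-block main loop: the counter is advanced by 8, and the
+# byte-swapped, key0-XORed counter words are written directly into the
+# high dword (offset 12) of each stacked block, interleaved with the AES
+# rounds of the previous batch.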
+
+.align 32
+.Lctr32_loop8:
+       addl    $8,%r8d
+       movdqa  96(%rsp),%xmm8
 .byte  102,15,56,220,209
-       leaq    32(%r11),%rcx
-       pxor    %xmm0,%xmm4
+       movl    %r8d,%r9d
+       movdqa  112(%rsp),%xmm9
 .byte  102,15,56,220,217
-       movdqa  .Lincrement32(%rip),%xmm13
-       pxor    %xmm0,%xmm5
+       bswapl  %r9d
+       movups  32-128(%rcx),%xmm0
 .byte  102,15,56,220,225
-       movdqa  -40(%rsp),%xmm12
-       pxor    %xmm0,%xmm6
+       xorl    %r11d,%r9d
+       nop
 .byte  102,15,56,220,233
-       pxor    %xmm0,%xmm7
-       movups  (%rcx),%xmm0
-       decl    %eax
+       movl    %r9d,0+12(%rsp)
+       leaq    1(%r8),%r9
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-       jmp     .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  48-128(%rcx),%xmm1
+       bswapl  %r9d
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movl    %r9d,16+12(%rsp)
+       leaq    2(%r8),%r9
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  64-128(%rcx),%xmm0
+       bswapl  %r9d
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
+       movl    %r9d,32+12(%rsp)
+       leaq    3(%r8),%r9
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.Lctr32_enc_loop6_enter:
-       movups  16(%rcx),%xmm1
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  80-128(%rcx),%xmm1
+       bswapl  %r9d
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
+       movl    %r9d,48+12(%rsp)
+       leaq    4(%r8),%r9
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%rcx),%xmm0
-       jnz     .Lctr32_enc_loop6
-
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  96-128(%rcx),%xmm0
+       bswapl  %r9d
 .byte  102,15,56,220,209
-       paddd   %xmm13,%xmm12
 .byte  102,15,56,220,217
-       paddd   -24(%rsp),%xmm13
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
 .byte  102,15,56,220,225
-       movdqa  %xmm12,-40(%rsp)
 .byte  102,15,56,220,233
-       movdqa  %xmm13,-24(%rsp)
+       movl    %r9d,64+12(%rsp)
+       leaq    5(%r8),%r9
 .byte  102,15,56,220,241
-.byte  102,69,15,56,0,231
 .byte  102,15,56,220,249
-.byte  102,69,15,56,0,239
-
-.byte  102,15,56,221,208
-       movups  (%rdi),%xmm8
-.byte  102,15,56,221,216
-       movups  16(%rdi),%xmm9
-.byte  102,15,56,221,224
-       movups  32(%rdi),%xmm10
-.byte  102,15,56,221,232
-       movups  48(%rdi),%xmm11
-.byte  102,15,56,221,240
-       movups  64(%rdi),%xmm1
-.byte  102,15,56,221,248
-       movups  80(%rdi),%xmm0
-       leaq    96(%rdi),%rdi
-
-       xorps   %xmm2,%xmm8
-       pshufd  $192,%xmm12,%xmm2
-       xorps   %xmm3,%xmm9
-       pshufd  $128,%xmm12,%xmm3
-       movups  %xmm8,(%rsi)
-       xorps   %xmm4,%xmm10
-       pshufd  $64,%xmm12,%xmm4
-       movups  %xmm9,16(%rsi)
-       xorps   %xmm5,%xmm11
-       movups  %xmm10,32(%rsi)
-       xorps   %xmm6,%xmm1
-       movups  %xmm11,48(%rsi)
-       xorps   %xmm7,%xmm0
-       movups  %xmm1,64(%rsi)
-       movups  %xmm0,80(%rsi)
-       leaq    96(%rsi),%rsi
-       movl    %r10d,%eax
-       subq    $6,%rdx
-       jnc     .Lctr32_loop6
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  112-128(%rcx),%xmm1
+       bswapl  %r9d
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       movl    %r9d,80+12(%rsp)
+       leaq    6(%r8),%r9
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  128-128(%rcx),%xmm0
+       bswapl  %r9d
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       xorl    %r11d,%r9d
+.byte  0x66,0x90
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       movl    %r9d,96+12(%rsp)
+       leaq    7(%r8),%r9
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  144-128(%rcx),%xmm1
+       bswapl  %r9d
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+       xorl    %r11d,%r9d
+       movdqu  0(%rdi),%xmm10
+.byte  102,15,56,220,232
+       movl    %r9d,112+12(%rsp)
+       cmpl    $11,%eax
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  160-128(%rcx),%xmm0
 
-       addq    $6,%rdx
+       jb      .Lctr32_enc_done
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  176-128(%rcx),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  192-128(%rcx),%xmm0
+       je      .Lctr32_enc_done
+
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movups  208-128(%rcx),%xmm1
+
+.byte  102,15,56,220,208
+.byte  102,15,56,220,216
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+.byte  102,68,15,56,220,192
+.byte  102,68,15,56,220,200
+       movups  224-128(%rcx),%xmm0
+       jmp     .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+       movdqu  16(%rdi),%xmm11
+       pxor    %xmm0,%xmm10
+       movdqu  32(%rdi),%xmm12
+       pxor    %xmm0,%xmm11
+       movdqu  48(%rdi),%xmm13
+       pxor    %xmm0,%xmm12
+       movdqu  64(%rdi),%xmm14
+       pxor    %xmm0,%xmm13
+       movdqu  80(%rdi),%xmm15
+       pxor    %xmm0,%xmm14
+       pxor    %xmm0,%xmm15
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+.byte  102,68,15,56,220,193
+.byte  102,68,15,56,220,201
+       movdqu  96(%rdi),%xmm1
+       leaq    128(%rdi),%rdi
+
+.byte  102,65,15,56,221,210
+       pxor    %xmm0,%xmm1
+       movdqu  112-128(%rdi),%xmm10
+.byte  102,65,15,56,221,219
+       pxor    %xmm0,%xmm10
+       movdqa  0(%rsp),%xmm11
+.byte  102,65,15,56,221,228
+.byte  102,65,15,56,221,237
+       movdqa  16(%rsp),%xmm12
+       movdqa  32(%rsp),%xmm13
+.byte  102,65,15,56,221,246
+.byte  102,65,15,56,221,255
+       movdqa  48(%rsp),%xmm14
+       movdqa  64(%rsp),%xmm15
+.byte  102,68,15,56,221,193
+       movdqa  80(%rsp),%xmm0
+       movups  16-128(%rcx),%xmm1
+.byte  102,69,15,56,221,202
+
+       movups  %xmm2,(%rsi)
+       movdqa  %xmm11,%xmm2
+       movups  %xmm3,16(%rsi)
+       movdqa  %xmm12,%xmm3
+       movups  %xmm4,32(%rsi)
+       movdqa  %xmm13,%xmm4
+       movups  %xmm5,48(%rsi)
+       movdqa  %xmm14,%xmm5
+       movups  %xmm6,64(%rsi)
+       movdqa  %xmm15,%xmm6
+       movups  %xmm7,80(%rsi)
+       movdqa  %xmm0,%xmm7
+       movups  %xmm8,96(%rsi)
+       movups  %xmm9,112(%rsi)
+       leaq    128(%rsi),%rsi
+
+       subq    $8,%rdx
+       jnc     .Lctr32_loop8
+
+       addq    $8,%rdx
        jz      .Lctr32_done
-       movq    %r11,%rcx
-       leal    1(%rax,%rax,1),%eax
+       leaq    -128(%rcx),%rcx
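+# Tail: at most seven blocks remain. One to three blocks use
+# .Lctr32_loop3, exactly four use .Lctr32_loop4, and five to seven go
+# through the eight-wide round helper with unused trailing blocks.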
 
 .Lctr32_tail:
-       por     %xmm14,%xmm2
-       movups  (%rdi),%xmm8
-       cmpq    $2,%rdx
-       jb      .Lctr32_one
 
-       por     %xmm14,%xmm3
-       movups  16(%rdi),%xmm9
-       je      .Lctr32_two
 
-       pshufd  $192,%xmm13,%xmm5
-       por     %xmm14,%xmm4
-       movups  32(%rdi),%xmm10
+       leaq    16(%rcx),%rcx
        cmpq    $4,%rdx
-       jb      .Lctr32_three
+       jb      .Lctr32_loop3
+       je      .Lctr32_loop4
 
-       pshufd  $128,%xmm13,%xmm6
-       por     %xmm14,%xmm5
-       movups  48(%rdi),%xmm11
-       je      .Lctr32_four
 
-       por     %xmm14,%xmm6
-       xorps   %xmm7,%xmm7
+       shll    $4,%eax
+       movdqa  96(%rsp),%xmm8
+       pxor    %xmm9,%xmm9
 
-       call    _aesni_encrypt6
+       movups  16(%rcx),%xmm0
+.byte  102,15,56,220,209
+.byte  102,15,56,220,217
+       leaq    32-16(%rcx,%rax,1),%rcx
+       negq    %rax
+.byte  102,15,56,220,225
+       addq    $16,%rax
+       movups  (%rdi),%xmm10
+.byte  102,15,56,220,233
+.byte  102,15,56,220,241
+       movups  16(%rdi),%xmm11
+       movups  32(%rdi),%xmm12
+.byte  102,15,56,220,249
+.byte  102,68,15,56,220,193
 
-       movups  64(%rdi),%xmm1
-       xorps   %xmm2,%xmm8
-       xorps   %xmm3,%xmm9
-       movups  %xmm8,(%rsi)
-       xorps   %xmm4,%xmm10
-       movups  %xmm9,16(%rsi)
-       xorps   %xmm5,%xmm11
-       movups  %xmm10,32(%rsi)
-       xorps   %xmm6,%xmm1
-       movups  %xmm11,48(%rsi)
-       movups  %xmm1,64(%rsi)
+       call    .Lenc_loop8_enter
+
+       movdqu  48(%rdi),%xmm13
+       pxor    %xmm10,%xmm2
+       movdqu  64(%rdi),%xmm10
+       pxor    %xmm11,%xmm3
+       movdqu  %xmm2,(%rsi)
+       pxor    %xmm12,%xmm4
+       movdqu  %xmm3,16(%rsi)
+       pxor    %xmm13,%xmm5
+       movdqu  %xmm4,32(%rsi)
+       pxor    %xmm10,%xmm6
+       movdqu  %xmm5,48(%rsi)
+       movdqu  %xmm6,64(%rsi)
+       cmpq    $6,%rdx
+       jb      .Lctr32_done
+
+       movups  80(%rdi),%xmm11
+       xorps   %xmm11,%xmm7
+       movups  %xmm7,80(%rsi)
+       je      .Lctr32_done
+
+       movups  96(%rdi),%xmm12
+       xorps   %xmm12,%xmm8
+       movups  %xmm8,96(%rsi)
        jmp     .Lctr32_done
 
-.align 16
-.Lctr32_one_shortcut:
-       movups  (%r8),%xmm2
-       movups  (%rdi),%xmm8
-       movl    240(%rcx),%eax
-.Lctr32_one:
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-.Loop_enc1_7:
+.align 32
+.Lctr32_loop4:
 .byte  102,15,56,220,209
+       leaq    16(%rcx),%rcx
        decl    %eax
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
        movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_7    
+       jnz     .Lctr32_loop4
 .byte  102,15,56,221,209
-       xorps   %xmm2,%xmm8
-       movups  %xmm8,(%rsi)
-       jmp     .Lctr32_done
+.byte  102,15,56,221,217
+       movups  (%rdi),%xmm10
+       movups  16(%rdi),%xmm11
+.byte  102,15,56,221,225
+.byte  102,15,56,221,233
+       movups  32(%rdi),%xmm12
+       movups  48(%rdi),%xmm13
 
-.align 16
-.Lctr32_two:
-       xorps   %xmm4,%xmm4
-       call    _aesni_encrypt3
-       xorps   %xmm2,%xmm8
-       xorps   %xmm3,%xmm9
-       movups  %xmm8,(%rsi)
-       movups  %xmm9,16(%rsi)
+       xorps   %xmm10,%xmm2
+       movups  %xmm2,(%rsi)
+       xorps   %xmm11,%xmm3
+       movups  %xmm3,16(%rsi)
+       pxor    %xmm12,%xmm4
+       movdqu  %xmm4,32(%rsi)
+       pxor    %xmm13,%xmm5
+       movdqu  %xmm5,48(%rsi)
        jmp     .Lctr32_done
 
-.align 16
-.Lctr32_three:
-       call    _aesni_encrypt3
-       xorps   %xmm2,%xmm8
-       xorps   %xmm3,%xmm9
-       movups  %xmm8,(%rsi)
-       xorps   %xmm4,%xmm10
-       movups  %xmm9,16(%rsi)
-       movups  %xmm10,32(%rsi)
-       jmp     .Lctr32_done
+.align 32
+.Lctr32_loop3:
+.byte  102,15,56,220,209
+       leaq    16(%rcx),%rcx
+       decl    %eax
+.byte  102,15,56,220,217
+.byte  102,15,56,220,225
+       movups  (%rcx),%xmm1
+       jnz     .Lctr32_loop3
+.byte  102,15,56,221,209
+.byte  102,15,56,221,217
+.byte  102,15,56,221,225
 
-.align 16
-.Lctr32_four:
-       call    _aesni_encrypt4
-       xorps   %xmm2,%xmm8
-       xorps   %xmm3,%xmm9
-       movups  %xmm8,(%rsi)
-       xorps   %xmm4,%xmm10
-       movups  %xmm9,16(%rsi)
-       xorps   %xmm5,%xmm11
-       movups  %xmm10,32(%rsi)
-       movups  %xmm11,48(%rsi)
+       movups  (%rdi),%xmm10
+       xorps   %xmm10,%xmm2
+       movups  %xmm2,(%rsi)
+       cmpq    $2,%rdx
+       jb      .Lctr32_done
+
+       movups  16(%rdi),%xmm11
+       xorps   %xmm11,%xmm3
+       movups  %xmm3,16(%rsi)
+       je      .Lctr32_done
+
+       movups  32(%rdi),%xmm12
+       xorps   %xmm12,%xmm4
+       movups  %xmm4,32(%rsi)
 
 .Lctr32_done:
+       xorps   %xmm0,%xmm0
+       xorl    %r11d,%r11d
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,112(%rsp)
+       pxor    %xmm15,%xmm15
+       leaq    (%rbp),%rsp
+       popq    %rbp
+.Lctr32_epilogue:
        .byte   0xf3,0xc3
 .size  aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
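 # aesni_xts_encrypt: tweak generation is reworked to an arithmetic-shift
 # sequence (pshufd $0x5f, psrad $31, pand against .Lxts_magic) in place
 # of the old pcmpgtd carry test, and all tweaks are kept pre-XORed with
 # round key 0 so the whitening XOR is fused into the tweak XOR.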
 .globl aesni_xts_encrypt
 .type  aesni_xts_encrypt,@function
 .align 16
 aesni_xts_encrypt:
-       leaq    -104(%rsp),%rsp
-       movups  (%r9),%xmm15
+       leaq    (%rsp),%rax
+       pushq   %rbp
+       subq    $112,%rsp
+       andq    $-16,%rsp
+       leaq    -8(%rax),%rbp
+       movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
        movups  (%r8),%xmm0
        movups  16(%r8),%xmm1
        leaq    32(%r8),%r8
-       xorps   %xmm0,%xmm15
+       xorps   %xmm0,%xmm2
 .Loop_enc1_8:
-.byte  102,68,15,56,220,249
+.byte  102,15,56,220,209
        decl    %eax
        movups  (%r8),%xmm1
        leaq    16(%r8),%r8
-       jnz     .Loop_enc1_8    
-.byte  102,68,15,56,221,249
+       jnz     .Loop_enc1_8
+.byte  102,15,56,221,209
+       movups  (%rcx),%xmm0
        movq    %rcx,%r11
        movl    %r10d,%eax
+       shll    $4,%r10d
        movq    %rdx,%r9
        andq    $-16,%rdx
 
+       movups  16(%rcx,%r10,1),%xmm1
+
        movdqa  .Lxts_magic(%rip),%xmm8
-       pxor    %xmm14,%xmm14
-       pcmpgtd %xmm15,%xmm14
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       movdqa  %xmm2,%xmm15
+       pshufd  $0x5f,%xmm2,%xmm9
+       pxor    %xmm0,%xmm1
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm10
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm10
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm11
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm11
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm12
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm12
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm13
+       psrad   $31,%xmm14
+       paddq   %xmm15,%xmm15
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm13
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm15,%xmm14
+       psrad   $31,%xmm9
        paddq   %xmm15,%xmm15
        pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
+       pxor    %xmm0,%xmm14
        pxor    %xmm9,%xmm15
+       movaps  %xmm1,96(%rsp)
+
        subq    $96,%rdx
        jc      .Lxts_enc_short
 
-       shrl    $1,%eax
-       subl    $1,%eax
-       movl    %eax,%r10d
+       movl    $16+96,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       subq    %r10,%rax
+       movups  16(%r11),%xmm1
+       movq    %rax,%r10
+       leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_enc_grandloop
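 # Grand loop: six blocks (96 bytes) per iteration, with the next six
 # tweaks computed in the gaps between AES round instructions instead of
 # in a separate pass over the data.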
 
-.align 16
+.align 32
 .Lxts_enc_grandloop:
-       pshufd  $19,%xmm14,%xmm9
-       movdqa  %xmm15,%xmm14
-       paddq   %xmm15,%xmm15
        movdqu  0(%rdi),%xmm2
-       pand    %xmm8,%xmm9
+       movdqa  %xmm0,%xmm8
        movdqu  16(%rdi),%xmm3
-       pxor    %xmm9,%xmm15
-
-       movdqu  32(%rdi),%xmm4
        pxor    %xmm10,%xmm2
-       movdqu  48(%rdi),%xmm5
+       movdqu  32(%rdi),%xmm4
        pxor    %xmm11,%xmm3
-       movdqu  64(%rdi),%xmm6
+.byte  102,15,56,220,209
+       movdqu  48(%rdi),%xmm5
        pxor    %xmm12,%xmm4
-       movdqu  80(%rdi),%xmm7
-       leaq    96(%rdi),%rdi
+.byte  102,15,56,220,217
+       movdqu  64(%rdi),%xmm6
        pxor    %xmm13,%xmm5
-       movups  (%r11),%xmm0
+.byte  102,15,56,220,225
+       movdqu  80(%rdi),%xmm7
+       pxor    %xmm15,%xmm8
+       movdqa  96(%rsp),%xmm9
        pxor    %xmm14,%xmm6
-       pxor    %xmm15,%xmm7
-
-
+.byte  102,15,56,220,233
+       movups  32(%r11),%xmm0
+       leaq    96(%rdi),%rdi
+       pxor    %xmm8,%xmm7
 
-       movups  16(%r11),%xmm1
-       pxor    %xmm0,%xmm2
-       pxor    %xmm0,%xmm3
+       pxor    %xmm9,%xmm10
+.byte  102,15,56,220,241
+       pxor    %xmm9,%xmm11
        movdqa  %xmm10,0(%rsp)
-.byte  102,15,56,220,209
-       leaq    32(%r11),%rcx
-       pxor    %xmm0,%xmm4
+.byte  102,15,56,220,249
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm12
+
+.byte  102,15,56,220,208
+       pxor    %xmm9,%xmm13
        movdqa  %xmm11,16(%rsp)
-.byte  102,15,56,220,217
-       pxor    %xmm0,%xmm5
+.byte  102,15,56,220,216
+       pxor    %xmm9,%xmm14
        movdqa  %xmm12,32(%rsp)
-.byte  102,15,56,220,225
-       pxor    %xmm0,%xmm6
-       movdqa  %xmm13,48(%rsp)
-.byte  102,15,56,220,233
-       pxor    %xmm0,%xmm7
-       movups  (%rcx),%xmm0
-       decl    %eax
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+       pxor    %xmm9,%xmm8
        movdqa  %xmm14,64(%rsp)
-.byte  102,15,56,220,241
-       movdqa  %xmm15,80(%rsp)
-.byte  102,15,56,220,249
-       pxor    %xmm14,%xmm14
-       pcmpgtd %xmm15,%xmm14
-       jmp     .Lxts_enc_loop6_enter
-
-.align 16
+.byte  102,15,56,220,240
+.byte  102,15,56,220,248
+       movups  64(%r11),%xmm0
+       movdqa  %xmm8,80(%rsp)
+       pshufd  $0x5f,%xmm15,%xmm9
+       jmp     .Lxts_enc_loop6
+.align 32
 .Lxts_enc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-       decl    %eax
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.Lxts_enc_loop6_enter:
-       movups  16(%rcx),%xmm1
+       movups  -64(%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,220,224
 .byte  102,15,56,220,232
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
-       movups  (%rcx),%xmm0
+       movups  -80(%rcx,%rax,1),%xmm0
        jnz     .Lxts_enc_loop6
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       paddq   %xmm15,%xmm15
+       movdqa  (%r8),%xmm8
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
 .byte  102,15,56,220,209
-       pand    %xmm8,%xmm9
+       paddq   %xmm15,%xmm15
+       psrad   $31,%xmm14
 .byte  102,15,56,220,217
-       pcmpgtd %xmm15,%xmm14
+       pand    %xmm8,%xmm14
+       movups  (%r11),%xmm10
 .byte  102,15,56,220,225
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
+       pxor    %xmm14,%xmm15
+       movaps  %xmm10,%xmm11
 .byte  102,15,56,220,249
-       movups  16(%rcx),%xmm1
+       movups  -64(%rcx),%xmm1
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm10
-       paddq   %xmm15,%xmm15
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,220,208
-       pand    %xmm8,%xmm9
+       paddd   %xmm9,%xmm9
+       pxor    %xmm15,%xmm10
 .byte  102,15,56,220,216
-       pcmpgtd %xmm15,%xmm14
+       psrad   $31,%xmm14
+       paddq   %xmm15,%xmm15
 .byte  102,15,56,220,224
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,220,232
+       pand    %xmm8,%xmm14
+       movaps  %xmm11,%xmm12
 .byte  102,15,56,220,240
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,220,248
-       movups  32(%rcx),%xmm0
+       movups  -48(%rcx),%xmm0
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm11
-       paddq   %xmm15,%xmm15
+       paddd   %xmm9,%xmm9
 .byte  102,15,56,220,209
-       pand    %xmm8,%xmm9
+       pxor    %xmm15,%xmm11
+       psrad   $31,%xmm14
 .byte  102,15,56,220,217
-       pcmpgtd %xmm15,%xmm14
+       paddq   %xmm15,%xmm15
+       pand    %xmm8,%xmm14
 .byte  102,15,56,220,225
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,220,233
+       movdqa  %xmm13,48(%rsp)
+       pxor    %xmm14,%xmm15
 .byte  102,15,56,220,241
+       movaps  %xmm12,%xmm13
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,220,249
+       movups  -32(%rcx),%xmm1
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm12
+       paddd   %xmm9,%xmm9
+.byte  102,15,56,220,208
+       pxor    %xmm15,%xmm12
+       psrad   $31,%xmm14
+.byte  102,15,56,220,216
        paddq   %xmm15,%xmm15
-.byte  102,15,56,221,208
-       pand    %xmm8,%xmm9
-.byte  102,15,56,221,216
-       pcmpgtd %xmm15,%xmm14
-.byte  102,15,56,221,224
-       pxor    %xmm9,%xmm15
-.byte  102,15,56,221,232
-.byte  102,15,56,221,240
-.byte  102,15,56,221,248
+       pand    %xmm8,%xmm14
+.byte  102,15,56,220,224
+.byte  102,15,56,220,232
+.byte  102,15,56,220,240
+       pxor    %xmm14,%xmm15
+       movaps  %xmm13,%xmm14
+.byte  102,15,56,220,248
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm13
+       movdqa  %xmm9,%xmm0
+       paddd   %xmm9,%xmm9
+.byte  102,15,56,220,209
+       pxor    %xmm15,%xmm13
+       psrad   $31,%xmm0
+.byte  102,15,56,220,217
+       paddq   %xmm15,%xmm15
+       pand    %xmm8,%xmm0
+.byte  102,15,56,220,225
+.byte  102,15,56,220,233
+       pxor    %xmm0,%xmm15
+       movups  (%r11),%xmm0
+.byte  102,15,56,220,241
+.byte  102,15,56,220,249
+       movups  16(%r11),%xmm1
+
+       pxor    %xmm15,%xmm14
+.byte  102,15,56,221,84,36,0
+       psrad   $31,%xmm9
        paddq   %xmm15,%xmm15
-       xorps   0(%rsp),%xmm2
+.byte  102,15,56,221,92,36,16
+.byte  102,15,56,221,100,36,32
        pand    %xmm8,%xmm9
-       xorps   16(%rsp),%xmm3
-       pcmpgtd %xmm15,%xmm14
+       movq    %r10,%rax
+.byte  102,15,56,221,108,36,48
+.byte  102,15,56,221,116,36,64
+.byte  102,15,56,221,124,36,80
        pxor    %xmm9,%xmm15
 
-       xorps   32(%rsp),%xmm4
-       movups  %xmm2,0(%rsi)
-       xorps   48(%rsp),%xmm5
-       movups  %xmm3,16(%rsi)
-       xorps   64(%rsp),%xmm6
-       movups  %xmm4,32(%rsi)
-       xorps   80(%rsp),%xmm7
-       movups  %xmm5,48(%rsi)
-       movl    %r10d,%eax
-       movups  %xmm6,64(%rsi)
-       movups  %xmm7,80(%rsi)
        leaq    96(%rsi),%rsi
+       movups  %xmm2,-96(%rsi)
+       movups  %xmm3,-80(%rsi)
+       movups  %xmm4,-64(%rsi)
+       movups  %xmm5,-48(%rsi)
+       movups  %xmm6,-32(%rsi)
+       movups  %xmm7,-16(%rsi)
        subq    $96,%rdx
        jnc     .Lxts_enc_grandloop
 
-       leal    3(%rax,%rax,1),%eax
+       movl    $16+96,%eax
+       subl    %r10d,%eax
        movq    %r11,%rcx
-       movl    %eax,%r10d
+       shrl    $4,%eax
 
 .Lxts_enc_short:
+
+       movl    %eax,%r10d
+       pxor    %xmm0,%xmm10
        addq    $96,%rdx
        jz      .Lxts_enc_done
 
-       cmpq    $32,%rdx
+       pxor    %xmm0,%xmm11
+       cmpq    $0x20,%rdx
        jb      .Lxts_enc_one
+       pxor    %xmm0,%xmm12
        je      .Lxts_enc_two
 
-       cmpq    $64,%rdx
+       pxor    %xmm0,%xmm13
+       cmpq    $0x40,%rdx
        jb      .Lxts_enc_three
+       pxor    %xmm0,%xmm14
        je      .Lxts_enc_four
 
-       pshufd  $19,%xmm14,%xmm9
-       movdqa  %xmm15,%xmm14
-       paddq   %xmm15,%xmm15
        movdqu  (%rdi),%xmm2
-       pand    %xmm8,%xmm9
        movdqu  16(%rdi),%xmm3
-       pxor    %xmm9,%xmm15
-
        movdqu  32(%rdi),%xmm4
        pxor    %xmm10,%xmm2
        movdqu  48(%rdi),%xmm5
@@ -1380,6 +1858,7 @@ aesni_xts_encrypt:
        pxor    %xmm12,%xmm4
        pxor    %xmm13,%xmm5
        pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm7
 
        call    _aesni_encrypt6
 
@@ -1411,7 +1890,7 @@ aesni_xts_encrypt:
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_9    
+       jnz     .Loop_enc1_9
 .byte  102,15,56,221,209
        xorps   %xmm10,%xmm2
        movdqa  %xmm11,%xmm10
@@ -1427,7 +1906,7 @@ aesni_xts_encrypt:
        xorps   %xmm10,%xmm2
        xorps   %xmm11,%xmm3
 
-       call    _aesni_encrypt3
+       call    _aesni_encrypt2
 
        xorps   %xmm10,%xmm2
        movdqa  %xmm12,%xmm10
@@ -1473,15 +1952,15 @@ aesni_xts_encrypt:
 
        call    _aesni_encrypt4
 
-       xorps   %xmm10,%xmm2
-       movdqa  %xmm15,%xmm10
-       xorps   %xmm11,%xmm3
-       xorps   %xmm12,%xmm4
-       movups  %xmm2,(%rsi)
-       xorps   %xmm13,%xmm5
-       movups  %xmm3,16(%rsi)
-       movups  %xmm4,32(%rsi)
-       movups  %xmm5,48(%rsi)
+       pxor    %xmm10,%xmm2
+       movdqa  %xmm14,%xmm10
+       pxor    %xmm11,%xmm3
+       pxor    %xmm12,%xmm4
+       movdqu  %xmm2,(%rsi)
+       pxor    %xmm13,%xmm5
+       movdqu  %xmm3,16(%rsi)
+       movdqu  %xmm4,32(%rsi)
+       movdqu  %xmm5,48(%rsi)
        leaq    64(%rsi),%rsi
        jmp     .Lxts_enc_done
 
@@ -1516,13 +1995,37 @@ aesni_xts_encrypt:
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_10   
+       jnz     .Loop_enc1_10
 .byte  102,15,56,221,209
        xorps   %xmm10,%xmm2
        movups  %xmm2,-16(%rsi)
 
 .Lxts_enc_ret:
-       leaq    104(%rsp),%rsp
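+# New epilogue: all sixteen xmm registers and the 112-byte stack scratch
+# area are zeroed before returning, so neither tweaks nor expanded key
+# material survive the call.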
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       leaq    (%rbp),%rsp
+       popq    %rbp
 .Lxts_enc_epilogue:
        .byte   0xf3,0xc3
 .size  aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1530,249 +2033,293 @@ aesni_xts_encrypt:
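 # aesni_xts_decrypt is restructured the same way as the encrypt path:
 # identical tweak schedule and indexed round keys, with the decrypt
 # rounds (aesdec/aesdeclast, the ...222/...223 opcode bytes) in place of
 # the encrypt rounds.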
 .type  aesni_xts_decrypt,@function
 .align 16
 aesni_xts_decrypt:
-       leaq    -104(%rsp),%rsp
-       movups  (%r9),%xmm15
+       leaq    (%rsp),%rax
+       pushq   %rbp
+       subq    $112,%rsp
+       andq    $-16,%rsp
+       leaq    -8(%rax),%rbp
+       movups  (%r9),%xmm2
        movl    240(%r8),%eax
        movl    240(%rcx),%r10d
        movups  (%r8),%xmm0
        movups  16(%r8),%xmm1
        leaq    32(%r8),%r8
-       xorps   %xmm0,%xmm15
+       xorps   %xmm0,%xmm2
 .Loop_enc1_11:
-.byte  102,68,15,56,220,249
+.byte  102,15,56,220,209
        decl    %eax
        movups  (%r8),%xmm1
        leaq    16(%r8),%r8
-       jnz     .Loop_enc1_11   
-.byte  102,68,15,56,221,249
+       jnz     .Loop_enc1_11
+.byte  102,15,56,221,209
        xorl    %eax,%eax
        testq   $15,%rdx
        setnz   %al
        shlq    $4,%rax
        subq    %rax,%rdx
 
+       movups  (%rcx),%xmm0
        movq    %rcx,%r11
        movl    %r10d,%eax
+       shll    $4,%r10d
        movq    %rdx,%r9
        andq    $-16,%rdx
 
+       movups  16(%rcx,%r10,1),%xmm1
+
        movdqa  .Lxts_magic(%rip),%xmm8
-       pxor    %xmm14,%xmm14
-       pcmpgtd %xmm15,%xmm14
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       movdqa  %xmm2,%xmm15
+       pshufd  $0x5f,%xmm2,%xmm9
+       pxor    %xmm0,%xmm1
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm10
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm10
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm11
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm11
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm12
+       psrad   $31,%xmm14
        paddq   %xmm15,%xmm15
-       pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
-       pxor    %xmm9,%xmm15
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm12
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
        movdqa  %xmm15,%xmm13
+       psrad   $31,%xmm14
+       paddq   %xmm15,%xmm15
+       pand    %xmm8,%xmm14
+       pxor    %xmm0,%xmm13
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm15,%xmm14
+       psrad   $31,%xmm9
        paddq   %xmm15,%xmm15
        pand    %xmm8,%xmm9
-       pcmpgtd %xmm15,%xmm14
+       pxor    %xmm0,%xmm14
        pxor    %xmm9,%xmm15
+       movaps  %xmm1,96(%rsp)
+
        subq    $96,%rdx
        jc      .Lxts_dec_short
 
-       shrl    $1,%eax
-       subl    $1,%eax
-       movl    %eax,%r10d
+       movl    $16+96,%eax
+       leaq    32(%r11,%r10,1),%rcx
+       subq    %r10,%rax
+       movups  16(%r11),%xmm1
+       movq    %rax,%r10
+       leaq    .Lxts_magic(%rip),%r8
        jmp     .Lxts_dec_grandloop
 
-.align 16
+.align 32
 .Lxts_dec_grandloop:
-       pshufd  $19,%xmm14,%xmm9
-       movdqa  %xmm15,%xmm14
-       paddq   %xmm15,%xmm15
        movdqu  0(%rdi),%xmm2
-       pand    %xmm8,%xmm9
+       movdqa  %xmm0,%xmm8
        movdqu  16(%rdi),%xmm3
-       pxor    %xmm9,%xmm15
-
-       movdqu  32(%rdi),%xmm4
        pxor    %xmm10,%xmm2
-       movdqu  48(%rdi),%xmm5
+       movdqu  32(%rdi),%xmm4
        pxor    %xmm11,%xmm3
-       movdqu  64(%rdi),%xmm6
-       pxor    %xmm12,%xmm4
-       movdqu  80(%rdi),%xmm7
-       leaq    96(%rdi),%rdi
-       pxor    %xmm13,%xmm5
-       movups  (%r11),%xmm0
-       pxor    %xmm14,%xmm6
-       pxor    %xmm15,%xmm7
-
-
-
-       movups  16(%r11),%xmm1
-       pxor    %xmm0,%xmm2
-       pxor    %xmm0,%xmm3
-       movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,222,209
-       leaq    32(%r11),%rcx
-       pxor    %xmm0,%xmm4
-       movdqa  %xmm11,16(%rsp)
+       movdqu  48(%rdi),%xmm5
+       pxor    %xmm12,%xmm4
 .byte  102,15,56,222,217
-       pxor    %xmm0,%xmm5
-       movdqa  %xmm12,32(%rsp)
+       movdqu  64(%rdi),%xmm6
+       pxor    %xmm13,%xmm5
 .byte  102,15,56,222,225
-       pxor    %xmm0,%xmm6
-       movdqa  %xmm13,48(%rsp)
+       movdqu  80(%rdi),%xmm7
+       pxor    %xmm15,%xmm8
+       movdqa  96(%rsp),%xmm9
+       pxor    %xmm14,%xmm6
 .byte  102,15,56,222,233
-       pxor    %xmm0,%xmm7
-       movups  (%rcx),%xmm0
-       decl    %eax
-       movdqa  %xmm14,64(%rsp)
+       movups  32(%r11),%xmm0
+       leaq    96(%rdi),%rdi
+       pxor    %xmm8,%xmm7
+
+       pxor    %xmm9,%xmm10
 .byte  102,15,56,222,241
-       movdqa  %xmm15,80(%rsp)
+       pxor    %xmm9,%xmm11
+       movdqa  %xmm10,0(%rsp)
 .byte  102,15,56,222,249
-       pxor    %xmm14,%xmm14
-       pcmpgtd %xmm15,%xmm14
-       jmp     .Lxts_dec_loop6_enter
+       movups  48(%r11),%xmm1
+       pxor    %xmm9,%xmm12
 
-.align 16
+.byte  102,15,56,222,208
+       pxor    %xmm9,%xmm13
+       movdqa  %xmm11,16(%rsp)
+.byte  102,15,56,222,216
+       pxor    %xmm9,%xmm14
+       movdqa  %xmm12,32(%rsp)
+.byte  102,15,56,222,224
+.byte  102,15,56,222,232
+       pxor    %xmm9,%xmm8
+       movdqa  %xmm14,64(%rsp)
+.byte  102,15,56,222,240
+.byte  102,15,56,222,248
+       movups  64(%r11),%xmm0
+       movdqa  %xmm8,80(%rsp)
+       pshufd  $0x5f,%xmm15,%xmm9
+       jmp     .Lxts_dec_loop6
+.align 32
 .Lxts_dec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
-       decl    %eax
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.Lxts_dec_loop6_enter:
-       movups  16(%rcx),%xmm1
+       movups  -64(%rcx,%rax,1),%xmm1
+       addq    $32,%rax
+
 .byte  102,15,56,222,208
 .byte  102,15,56,222,216
-       leaq    32(%rcx),%rcx
 .byte  102,15,56,222,224
 .byte  102,15,56,222,232
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
-       movups  (%rcx),%xmm0
+       movups  -80(%rcx,%rax,1),%xmm0
        jnz     .Lxts_dec_loop6
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       paddq   %xmm15,%xmm15
+       movdqa  (%r8),%xmm8
+       movdqa  %xmm9,%xmm14
+       paddd   %xmm9,%xmm9
 .byte  102,15,56,222,209
-       pand    %xmm8,%xmm9
+       paddq   %xmm15,%xmm15
+       psrad   $31,%xmm14
 .byte  102,15,56,222,217
-       pcmpgtd %xmm15,%xmm14
+       pand    %xmm8,%xmm14
+       movups  (%r11),%xmm10
 .byte  102,15,56,222,225
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
+       pxor    %xmm14,%xmm15
+       movaps  %xmm10,%xmm11
 .byte  102,15,56,222,249
-       movups  16(%rcx),%xmm1
+       movups  -64(%rcx),%xmm1
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm10
-       paddq   %xmm15,%xmm15
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,222,208
-       pand    %xmm8,%xmm9
+       paddd   %xmm9,%xmm9
+       pxor    %xmm15,%xmm10
 .byte  102,15,56,222,216
-       pcmpgtd %xmm15,%xmm14
+       psrad   $31,%xmm14
+       paddq   %xmm15,%xmm15
 .byte  102,15,56,222,224
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,222,232
+       pand    %xmm8,%xmm14
+       movaps  %xmm11,%xmm12
 .byte  102,15,56,222,240
+       pxor    %xmm14,%xmm15
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,222,248
-       movups  32(%rcx),%xmm0
+       movups  -48(%rcx),%xmm0
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm11
-       paddq   %xmm15,%xmm15
+       paddd   %xmm9,%xmm9
 .byte  102,15,56,222,209
-       pand    %xmm8,%xmm9
+       pxor    %xmm15,%xmm11
+       psrad   $31,%xmm14
 .byte  102,15,56,222,217
-       pcmpgtd %xmm15,%xmm14
+       paddq   %xmm15,%xmm15
+       pand    %xmm8,%xmm14
 .byte  102,15,56,222,225
-       pxor    %xmm9,%xmm15
 .byte  102,15,56,222,233
+       movdqa  %xmm13,48(%rsp)
+       pxor    %xmm14,%xmm15
 .byte  102,15,56,222,241
+       movaps  %xmm12,%xmm13
+       movdqa  %xmm9,%xmm14
 .byte  102,15,56,222,249
+       movups  -32(%rcx),%xmm1
 
-       pshufd  $19,%xmm14,%xmm9
-       pxor    %xmm14,%xmm14
-       movdqa  %xmm15,%xmm12
+       paddd   %xmm9,%xmm9
+.byte  102,15,56,222,208
+       pxor    %xmm15,%xmm12
+       psrad   $31,%xmm14
+.byte  102,15,56,222,216
        paddq   %xmm15,%xmm15
-.byte  102,15,56,223,208
-       pand    %xmm8,%xmm9
-.byte  102,15,56,223,216
-       pcmpgtd %xmm15,%xmm14
-.byte  102,15,56,223,224
-       pxor    %xmm9,%xmm15