diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 4a17341a30..9edc42fb06 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -34,6 +34,7 @@ if(ARCH STREQUAL "x86_64") ghash-ssse3-x86_64.${ASM_EXT} ghash-x86_64.${ASM_EXT} md5-x86_64.${ASM_EXT} + md5-avx512.${ASM_EXT} p256-x86_64-asm.${ASM_EXT} p256_beeu-x86_64-asm.${ASM_EXT} rdrand-x86_64.${ASM_EXT} @@ -144,6 +145,7 @@ if(PERL_EXECUTABLE) perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl) perlasm(md5-armv8.${ASM_EXT} md5/asm/md5-armv8.pl) perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl) + perlasm(md5-avx512.${ASM_EXT} md5/asm/md5-avx512.pl) perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl) perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl) perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl) diff --git a/crypto/fipsmodule/md5/asm/md5-avx512.pl b/crypto/fipsmodule/md5/asm/md5-avx512.pl new file mode 100644 index 0000000000..1e056050c2 --- /dev/null +++ b/crypto/fipsmodule/md5/asm/md5-avx512.pl @@ -0,0 +1,304 @@ +#! /usr/bin/env perl +# Copyright (C) 2025 Intel Corporation + +if ($#ARGV < 1) { die "Not enough arguments provided. + Two arguments are necessary: the flavour and the output file path."; } + +$flavour = shift; +$output = shift; + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$avx512md5 = 1; +for (@ARGV) { $avx512md5 = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +#====================================================================== + +if ($avx512md5) { + + my $XMM_STORAGE = 16 * 5; + + my $state = "%rdi"; + my $data = "%rsi"; + my $num = "%rdx"; + + my $a = "%xmm1"; + my $b = "%xmm2"; + my $c = "%xmm3"; + my $d = "%xmm4"; + + my $pa = "%xmm5"; + my $pb = "%xmm6"; + my $pc = "%xmm7"; + my $pd = "%xmm8"; + + sub md5_step { + my ($src, $a, $b, $c, $d, $off, $rot, $t, $imm8) = @_; + + # TODO(pittma): At the cost of another register, we can add t and k + # together, and then combine results which may get us better ILP. 
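+    #
+    # Each call emits one MD5 step from RFC 1321 (informal sketch of the
+    # intent, not additional code):
+    #
+    #   a = b + rotl32(a + f(b, c, d) + k[off] + T[t], rot)
+    #
+    # where k[off] is the message word loaded from $off*4($src) and T is the
+    # constant table below. f() is computed with a single VPTERNLOGD whose
+    # imm8 is the truth table of the round function, indexed by the bits
+    # (b, c, d):
+    #   0xca -> F = (b & c) | (~b & d)
+    #   0xe4 -> G = (b & d) | (c & ~d)
+    #   0x96 -> H = b ^ c ^ d
+    #   0x39 -> I = c ^ (b | ~d)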
+ $code .= <<___; + vmovd .L_T+4*$t(%rip), %xmm10 + vpaddd $off*4($src), %xmm10, %xmm10 # T[i] + k[i] + vpaddd $a, %xmm10, %xmm10 # T[i] + k[i] + a + vmovdqa $b, %xmm9 # preserve b + vpternlogd $imm8, $d, $c, %xmm9 # f(b, c, d) + vpaddd %xmm9, %xmm10, %xmm9 # (T[i] + k[i]) + (f(b, c, d) + a) + vprold \$$rot, %xmm9, %xmm9 # tmp <<< s + vpaddd $b, %xmm9, $a # b + (tmp <<< s) +___ + } + + sub round1_op { + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; + + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0xca"); + } + + sub round2_op { + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; + + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0xe4"); + } + + sub round3_op { + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; + + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0x96"); + } + + sub round4_op { + my ($src, $a, $b, $c, $d, $off, $rot, $t) = @_; + + md5_step($src, $a, $b, $c, $d, $off, $rot, $t, "\$0x39"); + } + + sub one_round { + my ($src) = @_; + + $code .= <<___; + vmovdqa $a, $pa + vmovdqa $b, $pb + vmovdqa $c, $pc + vmovdqa $d, $pd +___ + + # Round 1 + # [ABCD 0 7 1] [DABC 1 12 2] [CDAB 2 17 3] [BCDA 3 22 4] + round1_op($src, $a, $b, $c, $d, 0, 7, 0); + round1_op($src, $d, $a, $b, $c, 1, 12, 1); + round1_op($src, $c, $d, $a, $b, 2, 17, 2); + round1_op($src, $b, $c, $d, $a, 3, 22, 3); + + # [ABCD 4 7 5] [DABC 5 12 6] [CDAB 6 17 7] [BCDA 7 22 8] + round1_op($src, $a, $b, $c, $d, 4, 7, 4); + round1_op($src, $d, $a, $b, $c, 5, 12, 5); + round1_op($src, $c, $d, $a, $b, 6, 17, 6); + round1_op($src, $b, $c, $d, $a, 7, 22, 7); + + # [ABCD 8 7 9] [DABC 9 12 10] [CDAB 10 17 11] [BCDA 11 22 12] + round1_op($src, $a, $b, $c, $d, 8, 7, 8); + round1_op($src, $d, $a, $b, $c, 9, 12, 9); + round1_op($src, $c, $d, $a, $b, 10, 17, 10); + round1_op($src, $b, $c, $d, $a, 11, 22, 11); + + # [ABCD 12 7 13] [DABC 13 12 14] [CDAB 14 17 15] [BCDA 15 22 16] + round1_op($src, $a, $b, $c, $d, 12, 7, 12); + round1_op($src, $d, $a, $b, $c, 13, 12, 13); + round1_op($src, $c, $d, $a, $b, 14, 17, 14); + round1_op($src, $b, $c, $d, $a, 15, 22, 15); + + # Round 2 + # [ABCD 1 5 17] [DABC 6 9 18] [CDAB 11 14 19] [BCDA 0 20 20] + round2_op($src, $a, $b, $c, $d, 1, 5, 16); + round2_op($src, $d, $a, $b, $c, 6, 9, 17); + round2_op($src, $c, $d, $a, $b, 11, 14, 18); + round2_op($src, $b, $c, $d, $a, 0, 20, 19); + + # [ABCD 5 5 21] [DABC 10 9 22] [CDAB 15 14 23] [BCDA 4 20 24] + round2_op($src, $a, $b, $c, $d, 5, 5, 20); + round2_op($src, $d, $a, $b, $c, 10, 9, 21); + round2_op($src, $c, $d, $a, $b, 15, 14, 22); + round2_op($src, $b, $c, $d, $a, 4, 20, 23); + + # [ABCD 9 5 25] [DABC 14 9 26] [CDAB 3 14 27] [BCDA 8 20 28] + round2_op($src, $a, $b, $c, $d, 9, 5, 24); + round2_op($src, $d, $a, $b, $c, 14, 9, 25); + round2_op($src, $c, $d, $a, $b, 3, 14, 26); + round2_op($src, $b, $c, $d, $a, 8, 20, 27); + + # [ABCD 13 5 29] [DABC 2 9 30] [CDAB 7 14 31] [BCDA 12 20 32] + round2_op($src, $a, $b, $c, $d, 13, 5, 28); + round2_op($src, $d, $a, $b, $c, 2, 9, 29); + round2_op($src, $c, $d, $a, $b, 7, 14, 30); + round2_op($src, $b, $c, $d, $a, 12, 20, 31); + + # Round 3 + # [ABCD 5 4 33] [DABC 8 11 34] [CDAB 11 16 35] [BCDA 14 23 36] + round3_op($src, $a, $b, $c, $d, 5, 4, 32); + round3_op($src, $d, $a, $b, $c, 8, 11, 33); + round3_op($src, $c, $d, $a, $b, 11, 16, 34); + round3_op($src, $b, $c, $d, $a, 14, 23, 35); + + # [ABCD 1 4 37] [DABC 4 11 38] [CDAB 7 16 39] [BCDA 10 23 40] + round3_op($src, $a, $b, $c, $d, 1, 4, 36); + round3_op($src, $d, $a, $b, $c, 4, 11, 37); + round3_op($src, $c, $d, $a, $b, 7, 16, 38); + round3_op($src, 
$b, $c, $d, $a, 10, 23, 39); + + # [ABCD 13 4 41] [DABC 0 11 42] [CDAB 3 16 43] [BCDA 6 23 44] + round3_op($src, $a, $b, $c, $d, 13, 4, 40); + round3_op($src, $d, $a, $b, $c, 0, 11, 41); + round3_op($src, $c, $d, $a, $b, 3, 16, 42); + round3_op($src, $b, $c, $d, $a, 6, 23, 43); + + # [ABCD 9 4 45] [DABC 12 11 46] [CDAB 15 16 47] [BCDA 2 23 48] + round3_op($src, $a, $b, $c, $d, 9, 4, 44); + round3_op($src, $d, $a, $b, $c, 12, 11, 45); + round3_op($src, $c, $d, $a, $b, 15, 16, 46); + round3_op($src, $b, $c, $d, $a, 2, 23, 47); + + # Round 4 + # [ABCD 0 6 49] [DABC 7 10 50] [CDAB 14 15 51] [BCDA 5 21 52] + round4_op($src, $a, $b, $c, $d, 0, 6, 48); + round4_op($src, $d, $a, $b, $c, 7, 10, 49); + round4_op($src, $c, $d, $a, $b, 14, 15, 50); + round4_op($src, $b, $c, $d, $a, 5, 21, 51); + + # [ABCD 12 6 53] [DABC 3 10 54] [CDAB 10 15 55] [BCDA 1 21 56] + round4_op($src, $a, $b, $c, $d, 12, 6, 52); + round4_op($src, $d, $a, $b, $c, 3, 10, 53); + round4_op($src, $c, $d, $a, $b, 10, 15, 54); + round4_op($src, $b, $c, $d, $a, 1, 21, 55); + + # [ABCD 8 6 57] [DABC 15 10 58] [CDAB 6 15 59] [BCDA 13 21 60] + round4_op($src, $a, $b, $c, $d, 8, 6, 56); + round4_op($src, $d, $a, $b, $c, 15, 10, 57); + round4_op($src, $c, $d, $a, $b, 6, 15, 58); + round4_op($src, $b, $c, $d, $a, 13, 21, 59); + + # [ABCD 4 6 61] [DABC 11 10 62] [CDAB 2 15 63] [BCDA 9 21 64] + round4_op($src, $a, $b, $c, $d, 4, 6, 60); + round4_op($src, $d, $a, $b, $c, 11, 10, 61); + round4_op($src, $c, $d, $a, $b, 2, 15, 62); + round4_op($src, $b, $c, $d, $a, 9, 21, 63); + + $code .= <<___; + vpaddd $pa, $a, $a + vpaddd $pb, $b, $b + vpaddd $pc, $c, $c + vpaddd $pd, $d, $d +___ + } + + # int md5_x86_64_avx512(const uint8_t *data, + # size_t len, + # uint8_t out[MD5_DIGEST_LENGTH]); + $code .= <<___; +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX + .text + + .globl md5_x86_64_avx512 + .type md5_x86_64_avx512,\@function,3 + .align 32 + md5_x86_64_avx512: + .cfi_startproc + endbranch +___ + if ($win64) { + $code .= <<___; + push %rbp + mov %rsp,%rbp + sub $XMM_STORAGE, %rsp + and \$0xfffffffffffffff0,%rsp + vmovdqa %xmm6, 16*0(%rsp) + vmovdqa %xmm7, 16*1(%rsp) + vmovdqa %xmm8, 16*2(%rsp) + vmovdqa %xmm9, 16*3(%rsp) + vmovdqa %xmm10, 16*4(%rsp) +___ + } + $code .= <<___; + vmovd 4*0($state), $a + vmovd 4*1($state), $b + vmovd 4*2($state), $c + vmovd 4*3($state), $d + + .align 32 + .L_main_loop: +___ + + one_round($data); + + $code .= <<___; + add \$64, $data + dec $num + jnz .L_main_loop + + .L_done: +___ + if ($win64) { + $code .= <<___; + vmovdqa 16*0(%rsp), %xmm6 + vmovdqa 16*1(%rsp), %xmm7 + vmovdqa 16*2(%rsp), %xmm8 + vmovdqa 16*3(%rsp), %xmm9 + vmovdqa 16*4(%rsp), %xmm10 + mov %rbp,%rsp + pop %rbp +___ + } + + $code .= <<___; + vmovd $a, 4*0($state) + vmovd $b, 4*1($state) + vmovd $c, 4*2($state) + vmovd $d, 4*3($state) + ret + .cfi_endproc + .size md5_x86_64_avx512, .-md5_x86_64_avx512 + + .section .rodata + .align 32 + .L_T: + .long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee + .long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 + .long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be + .long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + .long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa + .long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 + .long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed + .long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + .long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c + .long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 + .long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 + .long 0xd9d4d039, 
0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + .long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 + .long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 + .long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 + .long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +#endif +___ + +} else { + $code = <<___; + .text + .globl md5_x86_64_avx512 + md5_x86_64_avx512: + .byte 0x0f,0x0b # ud2 + ret + .size md5_x86_64_avx512, .-md5_x86_64_avx512 +___ +} + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/fipsmodule/md5/internal.h b/crypto/fipsmodule/md5/internal.h index e50b5582b8..5a85d37476 100644 --- a/crypto/fipsmodule/md5/internal.h +++ b/crypto/fipsmodule/md5/internal.h @@ -45,13 +45,23 @@ OPENSSL_EXPORT int MD5_get_state(MD5_CTX *ctx, uint8_t out_h[MD5_CHAINING_LENGTH], uint64_t *out_n); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || defined(OPENSSL_AARCH64)) +#if !defined(OPENSSL_NO_ASM) +// If building for x86_64 and we have a new enough assembler, we need both +// definitions for the case where we've built for AVX-512, but it is not +// available at runtime. +#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) +#define MD5_ASM_AVX512 +extern void md5_x86_64_avx512(uint32_t *state, const uint8_t *data, + size_t num); +#endif + +#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || \ + defined(OPENSSL_AARCH64) #define MD5_ASM extern void md5_block_asm_data_order(uint32_t *state, const uint8_t *data, size_t num); #endif - +#endif // !defined(OPENSSL_NO_ASM) #if defined(__cplusplus) } // extern "C" diff --git a/crypto/fipsmodule/md5/md5.c b/crypto/fipsmodule/md5/md5.c index bf53ce18f7..723e7b70f0 100644 --- a/crypto/fipsmodule/md5/md5.c +++ b/crypto/fipsmodule/md5/md5.c @@ -104,7 +104,15 @@ int MD5_Init_from_state(MD5_CTX *md5, const uint8_t h[MD5_CHAINING_LENGTH], return 1; } -#if defined(MD5_ASM) +// If MD5_ASM_AVX512 is set, then so is MD5_ASM; the inverse is not true. +// +// Here we handle for cases that we have built for AVX-512 and cases where we +// have not. If we've built for AVX-512 but it is not available at runtime, we +// fall back to the definition for md5_block_asm_data_order, as it is defined +// in both cases. 
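+//
+// In effect, every entry point below performs the following selection
+// (informal sketch, not the literal code):
+//
+//   block = (MD5_ASM_AVX512 built in && CRYPTO_is_AVX512_capable())
+//               ? md5_x86_64_avx512
+//               : md5_block_asm_data_order;  // or the C fallback without asm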
+#if defined(MD5_ASM_AVX512) +#define md5_block_data_order md5_x86_64_avx512 +#elif defined(MD5_ASM) #define md5_block_data_order md5_block_asm_data_order #else static void md5_block_data_order(uint32_t *state, const uint8_t *data, @@ -112,17 +120,46 @@ static void md5_block_data_order(uint32_t *state, const uint8_t *data, #endif void MD5_Transform(MD5_CTX *c, const uint8_t data[MD5_CBLOCK]) { - md5_block_data_order(c->h, data, 1); + void (*block_func)(uint32_t *state, const uint8_t *data, + size_t num); + block_func = md5_block_data_order; + +#if defined(MD5_ASM_AVX512) + if (!CRYPTO_is_AVX512_capable()) { + block_func = md5_block_asm_data_order; + } +#endif + block_func(c->h, data, 1); } int MD5_Update(MD5_CTX *c, const void *data, size_t len) { - crypto_md32_update(&md5_block_data_order, c->h, c->data, MD5_CBLOCK, &c->num, + void (*block_func)(uint32_t *state, const uint8_t *data, + size_t num); + block_func = md5_block_data_order; + +#if defined(MD5_ASM_AVX512) + if (!CRYPTO_is_AVX512_capable()) { + block_func = md5_block_asm_data_order; + } +#endif + + crypto_md32_update(block_func, c->h, c->data, MD5_CBLOCK, &c->num, &c->Nh, &c->Nl, data, len); return 1; } int MD5_Final(uint8_t out[MD5_DIGEST_LENGTH], MD5_CTX *c) { - crypto_md32_final(&md5_block_data_order, c->h, c->data, MD5_CBLOCK, &c->num, + void (*block_func)(uint32_t *state, const uint8_t *data, + size_t num); + block_func = md5_block_data_order; + +#if defined(MD5_ASM_AVX512) + if (!CRYPTO_is_AVX512_capable()) { + block_func = md5_block_asm_data_order; + } +#endif + + crypto_md32_final(block_func, c->h, c->data, MD5_CBLOCK, &c->num, c->Nh, c->Nl, /*is_big_endian=*/0); CRYPTO_store_u32_le(out, c->h[0]); diff --git a/crypto/fipsmodule/md5/md5_test.cc b/crypto/fipsmodule/md5/md5_test.cc index 7df5bb2595..b408827810 100644 --- a/crypto/fipsmodule/md5/md5_test.cc +++ b/crypto/fipsmodule/md5/md5_test.cc @@ -17,6 +17,7 @@ #include #include "internal.h" +#include "../cpucap/internal.h" #include "../../test/abi_test.h" @@ -30,5 +31,14 @@ TEST(MD5Test, ABI) { CHECK_ABI(md5_block_asm_data_order, ctx.h, kBuf, 2); CHECK_ABI(md5_block_asm_data_order, ctx.h, kBuf, 4); CHECK_ABI(md5_block_asm_data_order, ctx.h, kBuf, 8); + +#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) + if (CRYPTO_is_AVX512_capable()) { + CHECK_ABI(md5_x86_64_avx512, ctx.h, kBuf, 1); + CHECK_ABI(md5_x86_64_avx512, ctx.h, kBuf, 2); + CHECK_ABI(md5_x86_64_avx512, ctx.h, kBuf, 4); + CHECK_ABI(md5_x86_64_avx512, ctx.h, kBuf, 8); + } +#endif } #endif // MD5_ASM && SUPPORTS_ABI_TEST diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/md5-avx512.S b/generated-src/linux-x86_64/crypto/fipsmodule/md5-avx512.S new file mode 100644 index 0000000000..5d4139bad3 --- /dev/null +++ b/generated-src/linux-x86_64/crypto/fipsmodule/md5-avx512.S @@ -0,0 +1,577 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX +.text + +.globl md5_x86_64_avx512 +.hidden md5_x86_64_avx512 +.type md5_x86_64_avx512,@function +.align 32 +md5_x86_64_avx512: +.cfi_startproc +.byte 243,15,30,250 + vmovd 0(%rdi),%xmm1 + vmovd 4(%rdi),%xmm2 + vmovd 8(%rdi),%xmm3 + vmovd 12(%rdi),%xmm4 + +.align 32 +.L_main_loop: + vmovdqa %xmm1,%xmm5 + vmovdqa %xmm2,%xmm6 + vmovdqa %xmm3,%xmm7 + vmovdqa %xmm4,%xmm8 + vmovd .L_T+0(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+4(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+8(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+12(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+16(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+20(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+24(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+28(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+32(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+36(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+40(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+44(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+48(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+52(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd 
%xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+56(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+60(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+64(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+68(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+72(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+76(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+80(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+84(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+88(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+92(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+96(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+100(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+104(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+108(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+112(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold 
$5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+116(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+120(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+124(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+128(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+132(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+136(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+140(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+144(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+148(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+152(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+156(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+160(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+164(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+168(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+172(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+176(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + 
vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+180(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+184(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+188(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+192(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+196(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+200(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+204(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+208(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+212(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+216(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+220(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+224(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+228(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+232(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+236(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd 
%xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd .L_T+240(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd .L_T+244(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd .L_T+248(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd .L_T+252(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm2,%xmm2 + vpaddd %xmm7,%xmm3,%xmm3 + vpaddd %xmm8,%xmm4,%xmm4 + addq $64,%rsi + decq %rdx + jnz .L_main_loop + +.L_done: + vmovd %xmm1,0(%rdi) + vmovd %xmm2,4(%rdi) + vmovd %xmm3,8(%rdi) + vmovd %xmm4,12(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size md5_x86_64_avx512, .-md5_x86_64_avx512 + +.section .rodata +.align 32 +.L_T: +.long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee +.long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 +.long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be +.long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 +.long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa +.long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 +.long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed +.long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a +.long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c +.long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 +.long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 +.long 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 +.long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 +.long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 +.long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 +.long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +#endif +#endif diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/md5-avx512.S b/generated-src/mac-x86_64/crypto/fipsmodule/md5-avx512.S new file mode 100644 index 0000000000..ecb308199c --- /dev/null +++ b/generated-src/mac-x86_64/crypto/fipsmodule/md5-avx512.S @@ -0,0 +1,577 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX +.text + +.globl _md5_x86_64_avx512 +.private_extern _md5_x86_64_avx512 + +.p2align 5 +_md5_x86_64_avx512: + +.byte 243,15,30,250 + vmovd 0(%rdi),%xmm1 + vmovd 4(%rdi),%xmm2 + vmovd 8(%rdi),%xmm3 + vmovd 12(%rdi),%xmm4 + +.p2align 5 +L$_main_loop: + vmovdqa %xmm1,%xmm5 + vmovdqa %xmm2,%xmm6 + vmovdqa %xmm3,%xmm7 + vmovdqa %xmm4,%xmm8 + vmovd L$_T+0(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+4(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+8(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+12(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+16(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+20(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+24(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+28(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+32(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+36(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+40(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+44(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+48(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xca,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $7,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+52(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + 
vpternlogd $0xca,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $12,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+56(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xca,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $17,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+60(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xca,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $22,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+64(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+68(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+72(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+76(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+80(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+84(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+88(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+92(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+96(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+100(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+104(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+108(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+112(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0xe4,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $5,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd 
L$_T+116(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0xe4,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $9,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+120(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0xe4,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $14,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+124(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0xe4,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $20,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+128(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+132(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+136(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+140(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+144(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+148(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+152(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+156(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+160(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+164(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+168(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+172(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+176(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + 
vpternlogd $0x96,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $4,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+180(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x96,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $11,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+184(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x96,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $16,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+188(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x96,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $23,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+192(%rip),%xmm10 + vpaddd 0(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+196(%rip),%xmm10 + vpaddd 28(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+200(%rip),%xmm10 + vpaddd 56(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+204(%rip),%xmm10 + vpaddd 20(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+208(%rip),%xmm10 + vpaddd 48(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+212(%rip),%xmm10 + vpaddd 12(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+216(%rip),%xmm10 + vpaddd 40(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+220(%rip),%xmm10 + vpaddd 4(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vmovd L$_T+224(%rip),%xmm10 + vpaddd 32(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+228(%rip),%xmm10 + vpaddd 60(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+232(%rip),%xmm10 + vpaddd 24(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+236(%rip),%xmm10 + vpaddd 52(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd 
%xmm3,%xmm9,%xmm2 + vmovd L$_T+240(%rip),%xmm10 + vpaddd 16(%rsi),%xmm10,%xmm10 + vpaddd %xmm1,%xmm10,%xmm10 + vmovdqa %xmm2,%xmm9 + vpternlogd $0x39,%xmm4,%xmm3,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $6,%xmm9,%xmm9 + vpaddd %xmm2,%xmm9,%xmm1 + vmovd L$_T+244(%rip),%xmm10 + vpaddd 44(%rsi),%xmm10,%xmm10 + vpaddd %xmm4,%xmm10,%xmm10 + vmovdqa %xmm1,%xmm9 + vpternlogd $0x39,%xmm3,%xmm2,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $10,%xmm9,%xmm9 + vpaddd %xmm1,%xmm9,%xmm4 + vmovd L$_T+248(%rip),%xmm10 + vpaddd 8(%rsi),%xmm10,%xmm10 + vpaddd %xmm3,%xmm10,%xmm10 + vmovdqa %xmm4,%xmm9 + vpternlogd $0x39,%xmm2,%xmm1,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $15,%xmm9,%xmm9 + vpaddd %xmm4,%xmm9,%xmm3 + vmovd L$_T+252(%rip),%xmm10 + vpaddd 36(%rsi),%xmm10,%xmm10 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqa %xmm3,%xmm9 + vpternlogd $0x39,%xmm1,%xmm4,%xmm9 + vpaddd %xmm9,%xmm10,%xmm9 + vprold $21,%xmm9,%xmm9 + vpaddd %xmm3,%xmm9,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm2,%xmm2 + vpaddd %xmm7,%xmm3,%xmm3 + vpaddd %xmm8,%xmm4,%xmm4 + addq $64,%rsi + decq %rdx + jnz L$_main_loop + +L$_done: + vmovd %xmm1,0(%rdi) + vmovd %xmm2,4(%rdi) + vmovd %xmm3,8(%rdi) + vmovd %xmm4,12(%rdi) + .byte 0xf3,0xc3 + + + +.section __DATA,__const +.p2align 5 +L$_T: +.long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee +.long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 +.long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be +.long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 +.long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa +.long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 +.long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed +.long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a +.long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c +.long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 +.long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 +.long 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 +.long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 +.long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 +.long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 +.long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +#endif +#endif diff --git a/generated-src/win-x86_64/crypto/fipsmodule/md5-avx512.asm b/generated-src/win-x86_64/crypto/fipsmodule/md5-avx512.asm new file mode 100644 index 0000000000..073d3753f4 --- /dev/null +++ b/generated-src/win-x86_64/crypto/fipsmodule/md5-avx512.asm @@ -0,0 +1,612 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "openssl/boringssl_prefix_symbols_nasm.inc" +%ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX +section .text code align=64 + + +global md5_x86_64_avx512 + +ALIGN 32 +md5_x86_64_avx512: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_md5_x86_64_avx512: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +DB 243,15,30,250 + push rbp + mov rbp,rsp + sub rsp,80 + and rsp,0xfffffffffffffff0 + vmovdqa XMMWORD[rsp],xmm6 + vmovdqa XMMWORD[16+rsp],xmm7 + vmovdqa XMMWORD[32+rsp],xmm8 + vmovdqa XMMWORD[48+rsp],xmm9 + vmovdqa XMMWORD[64+rsp],xmm10 + vmovd xmm1,DWORD[rdi] + vmovd xmm2,DWORD[4+rdi] + vmovd xmm3,DWORD[8+rdi] + vmovd xmm4,DWORD[12+rdi] + +ALIGN 32 +$L$_main_loop: + vmovdqa xmm5,xmm1 + vmovdqa xmm6,xmm2 + vmovdqa xmm7,xmm3 + vmovdqa xmm8,xmm4 + vmovd xmm10,DWORD[(($L$_T+0))] + vpaddd xmm10,xmm10,XMMWORD[rsi] + vpaddd xmm10,xmm10,xmm1 + vmovdqa xmm9,xmm2 + vpternlogd xmm9,xmm3,xmm4,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,7 + vpaddd xmm1,xmm9,xmm2 + vmovd xmm10,DWORD[(($L$_T+4))] + vpaddd xmm10,xmm10,XMMWORD[4+rsi] + vpaddd xmm10,xmm10,xmm4 + vmovdqa xmm9,xmm1 + vpternlogd xmm9,xmm2,xmm3,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,12 + vpaddd xmm4,xmm9,xmm1 + vmovd xmm10,DWORD[(($L$_T+8))] + vpaddd xmm10,xmm10,XMMWORD[8+rsi] + vpaddd xmm10,xmm10,xmm3 + vmovdqa xmm9,xmm4 + vpternlogd xmm9,xmm1,xmm2,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,17 + vpaddd xmm3,xmm9,xmm4 + vmovd xmm10,DWORD[(($L$_T+12))] + vpaddd xmm10,xmm10,XMMWORD[12+rsi] + vpaddd xmm10,xmm10,xmm2 + vmovdqa xmm9,xmm3 + vpternlogd xmm9,xmm4,xmm1,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,22 + vpaddd xmm2,xmm9,xmm3 + vmovd xmm10,DWORD[(($L$_T+16))] + vpaddd xmm10,xmm10,XMMWORD[16+rsi] + vpaddd xmm10,xmm10,xmm1 + vmovdqa xmm9,xmm2 + vpternlogd xmm9,xmm3,xmm4,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,7 + vpaddd xmm1,xmm9,xmm2 + vmovd xmm10,DWORD[(($L$_T+20))] + vpaddd xmm10,xmm10,XMMWORD[20+rsi] + vpaddd xmm10,xmm10,xmm4 + vmovdqa xmm9,xmm1 + vpternlogd xmm9,xmm2,xmm3,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,12 + vpaddd xmm4,xmm9,xmm1 + vmovd xmm10,DWORD[(($L$_T+24))] + vpaddd xmm10,xmm10,XMMWORD[24+rsi] + vpaddd xmm10,xmm10,xmm3 + vmovdqa xmm9,xmm4 + vpternlogd xmm9,xmm1,xmm2,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,17 + vpaddd xmm3,xmm9,xmm4 + vmovd xmm10,DWORD[(($L$_T+28))] + vpaddd xmm10,xmm10,XMMWORD[28+rsi] + vpaddd xmm10,xmm10,xmm2 + vmovdqa xmm9,xmm3 + vpternlogd xmm9,xmm4,xmm1,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,22 + vpaddd xmm2,xmm9,xmm3 + vmovd xmm10,DWORD[(($L$_T+32))] + vpaddd xmm10,xmm10,XMMWORD[32+rsi] + vpaddd xmm10,xmm10,xmm1 + vmovdqa xmm9,xmm2 + vpternlogd xmm9,xmm3,xmm4,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,7 + vpaddd xmm1,xmm9,xmm2 + vmovd xmm10,DWORD[(($L$_T+36))] + vpaddd xmm10,xmm10,XMMWORD[36+rsi] + vpaddd xmm10,xmm10,xmm4 + vmovdqa xmm9,xmm1 + vpternlogd xmm9,xmm2,xmm3,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,12 + vpaddd xmm4,xmm9,xmm1 + vmovd xmm10,DWORD[(($L$_T+40))] + vpaddd xmm10,xmm10,XMMWORD[40+rsi] + vpaddd xmm10,xmm10,xmm3 + vmovdqa xmm9,xmm4 + vpternlogd xmm9,xmm1,xmm2,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,17 + vpaddd xmm3,xmm9,xmm4 + vmovd xmm10,DWORD[(($L$_T+44))] + vpaddd xmm10,xmm10,XMMWORD[44+rsi] + vpaddd xmm10,xmm10,xmm2 + vmovdqa xmm9,xmm3 + vpternlogd xmm9,xmm4,xmm1,0xca + vpaddd xmm9,xmm10,xmm9 + vprold xmm9,xmm9,22 + vpaddd 
xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+48))]
+ vpaddd xmm10,xmm10,XMMWORD[48+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0xca
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,7
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+52))]
+ vpaddd xmm10,xmm10,XMMWORD[52+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0xca
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,12
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+56))]
+ vpaddd xmm10,xmm10,XMMWORD[56+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0xca
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,17
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+60))]
+ vpaddd xmm10,xmm10,XMMWORD[60+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0xca
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,22
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+64))]
+ vpaddd xmm10,xmm10,XMMWORD[4+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,5
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+68))]
+ vpaddd xmm10,xmm10,XMMWORD[24+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,9
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+72))]
+ vpaddd xmm10,xmm10,XMMWORD[44+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,14
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+76))]
+ vpaddd xmm10,xmm10,XMMWORD[rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,20
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+80))]
+ vpaddd xmm10,xmm10,XMMWORD[20+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,5
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+84))]
+ vpaddd xmm10,xmm10,XMMWORD[40+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,9
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+88))]
+ vpaddd xmm10,xmm10,XMMWORD[60+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,14
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+92))]
+ vpaddd xmm10,xmm10,XMMWORD[16+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,20
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+96))]
+ vpaddd xmm10,xmm10,XMMWORD[36+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,5
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+100))]
+ vpaddd xmm10,xmm10,XMMWORD[56+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,9
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+104))]
+ vpaddd xmm10,xmm10,XMMWORD[12+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,14
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+108))]
+ vpaddd xmm10,xmm10,XMMWORD[32+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,20
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+112))]
+ vpaddd xmm10,xmm10,XMMWORD[52+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,5
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+116))]
+ vpaddd xmm10,xmm10,XMMWORD[8+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,9
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+120))]
+ vpaddd xmm10,xmm10,XMMWORD[28+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,14
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+124))]
+ vpaddd xmm10,xmm10,XMMWORD[48+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0xe4
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,20
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+128))]
+ vpaddd xmm10,xmm10,XMMWORD[20+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,4
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+132))]
+ vpaddd xmm10,xmm10,XMMWORD[32+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,11
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+136))]
+ vpaddd xmm10,xmm10,XMMWORD[44+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,16
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+140))]
+ vpaddd xmm10,xmm10,XMMWORD[56+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,23
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+144))]
+ vpaddd xmm10,xmm10,XMMWORD[4+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,4
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+148))]
+ vpaddd xmm10,xmm10,XMMWORD[16+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,11
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+152))]
+ vpaddd xmm10,xmm10,XMMWORD[28+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,16
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+156))]
+ vpaddd xmm10,xmm10,XMMWORD[40+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,23
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+160))]
+ vpaddd xmm10,xmm10,XMMWORD[52+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,4
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+164))]
+ vpaddd xmm10,xmm10,XMMWORD[rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,11
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+168))]
+ vpaddd xmm10,xmm10,XMMWORD[12+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,16
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+172))]
+ vpaddd xmm10,xmm10,XMMWORD[24+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,23
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+176))]
+ vpaddd xmm10,xmm10,XMMWORD[36+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,4
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+180))]
+ vpaddd xmm10,xmm10,XMMWORD[48+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,11
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+184))]
+ vpaddd xmm10,xmm10,XMMWORD[60+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,16
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+188))]
+ vpaddd xmm10,xmm10,XMMWORD[8+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x96
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,23
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+192))]
+ vpaddd xmm10,xmm10,XMMWORD[rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,6
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+196))]
+ vpaddd xmm10,xmm10,XMMWORD[28+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,10
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+200))]
+ vpaddd xmm10,xmm10,XMMWORD[56+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,15
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+204))]
+ vpaddd xmm10,xmm10,XMMWORD[20+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,21
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+208))]
+ vpaddd xmm10,xmm10,XMMWORD[48+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,6
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+212))]
+ vpaddd xmm10,xmm10,XMMWORD[12+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,10
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+216))]
+ vpaddd xmm10,xmm10,XMMWORD[40+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,15
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+220))]
+ vpaddd xmm10,xmm10,XMMWORD[4+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,21
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+224))]
+ vpaddd xmm10,xmm10,XMMWORD[32+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,6
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+228))]
+ vpaddd xmm10,xmm10,XMMWORD[60+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,10
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+232))]
+ vpaddd xmm10,xmm10,XMMWORD[24+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,15
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+236))]
+ vpaddd xmm10,xmm10,XMMWORD[52+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,21
+ vpaddd xmm2,xmm9,xmm3
+ vmovd xmm10,DWORD[(($L$_T+240))]
+ vpaddd xmm10,xmm10,XMMWORD[16+rsi]
+ vpaddd xmm10,xmm10,xmm1
+ vmovdqa xmm9,xmm2
+ vpternlogd xmm9,xmm3,xmm4,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,6
+ vpaddd xmm1,xmm9,xmm2
+ vmovd xmm10,DWORD[(($L$_T+244))]
+ vpaddd xmm10,xmm10,XMMWORD[44+rsi]
+ vpaddd xmm10,xmm10,xmm4
+ vmovdqa xmm9,xmm1
+ vpternlogd xmm9,xmm2,xmm3,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,10
+ vpaddd xmm4,xmm9,xmm1
+ vmovd xmm10,DWORD[(($L$_T+248))]
+ vpaddd xmm10,xmm10,XMMWORD[8+rsi]
+ vpaddd xmm10,xmm10,xmm3
+ vmovdqa xmm9,xmm4
+ vpternlogd xmm9,xmm1,xmm2,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,15
+ vpaddd xmm3,xmm9,xmm4
+ vmovd xmm10,DWORD[(($L$_T+252))]
+ vpaddd xmm10,xmm10,XMMWORD[36+rsi]
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqa xmm9,xmm3
+ vpternlogd xmm9,xmm4,xmm1,0x39
+ vpaddd xmm9,xmm10,xmm9
+ vprold xmm9,xmm9,21
+ vpaddd xmm2,xmm9,xmm3
+ vpaddd xmm1,xmm1,xmm5
+ vpaddd xmm2,xmm2,xmm6
+ vpaddd xmm3,xmm3,xmm7
+ vpaddd xmm4,xmm4,xmm8
+ add rsi,64
+ dec rdx
+ jnz NEAR $L$_main_loop
+
+$L$_done:
+ vmovdqa xmm6,XMMWORD[rsp]
+ vmovdqa xmm7,XMMWORD[16+rsp]
+ vmovdqa xmm8,XMMWORD[32+rsp]
+ vmovdqa xmm9,XMMWORD[48+rsp]
+ vmovdqa xmm10,XMMWORD[64+rsp]
+ mov rsp,rbp
+ pop rbp
+ vmovd DWORD[rdi],xmm1
+ vmovd DWORD[4+rdi],xmm2
+ vmovd DWORD[8+rdi],xmm3
+ vmovd DWORD[12+rdi],xmm4
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_md5_x86_64_avx512:
+
+section .rdata rdata align=8
+ALIGN 32
+$L$_T:
+ DD 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee
+ DD 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501
+ DD 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be
+ DD 0x6b901122,0xfd987193,0xa679438e,0x49b40821
+ DD 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa
+ DD 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8
+ DD 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed
+ DD 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a
+ DD 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c
+ DD 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70
+ DD 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05
+ DD 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665
+ DD 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039
+ DD 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1
+ DD 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1
+ DD 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391
+%endif
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif