diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index c6f263fcbc..713dbeb610 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -262,6 +262,7 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
       ${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519_alt.S
       ${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519base.S
       ${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519base_alt.S
+      ${S2N_BIGNUM_DIR}/sha3/sha3_keccak_f1600.S
     )
   elseif(ARCH STREQUAL "aarch64")
     # byte-level interface for aarch64 s2n-bignum x25519 are in
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 99afc11697..3e7db5bb26 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -342,7 +342,8 @@ void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *data,
                                   size_t num);
 #endif
 
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_AARCH64) || defined(OPENSSL_X86_64))
 #define KECCAK1600_ASM
 #if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
 #define KECCAK1600_S2N_BIGNUM_ASM
diff --git a/crypto/fipsmodule/sha/keccak1600.c b/crypto/fipsmodule/sha/keccak1600.c
index 3e90e6eed3..b2fdd1c3b5 100644
--- a/crypto/fipsmodule/sha/keccak1600.c
+++ b/crypto/fipsmodule/sha/keccak1600.c
@@ -12,45 +12,75 @@
 #include "../../internal.h"
 #include "../cpucap/internal.h"
 
-#if defined(__x86_64__) || defined(__aarch64__) || \
+#if !(defined(__x86_64__) || defined(__aarch64__) || \
     defined(__mips64) || defined(__ia64) || defined(__loongarch_lp64) || \
-    (defined(__VMS) && !defined(__vax))
+    (defined(__VMS) && !defined(__vax)))
 // These are available even in ILP32 flavours, but even then they are
 // capable of performing 64-bit operations as efficiently as in *P64.
 // Since it's not given that we can use sizeof(void *), just shunt it.
-# define BIT_INTERLEAVE (0)
-#else
 # define BIT_INTERLEAVE (sizeof(void *) < 8)
 #endif
 
+#if defined(BIT_INTERLEAVE)
+
+static const uint64_t iotas[] = {
+    0x0000000000000001ULL,
+    0x0000008900000000ULL,
+    0x8000008b00000000ULL,
+    0x8000808000000000ULL,
+    0x0000008b00000001ULL,
+    0x0000800000000001ULL,
+    0x8000808800000001ULL,
+    0x8000008200000001ULL,
+    0x0000000b00000000ULL,
+    0x0000000a00000000ULL,
+    0x0000808200000001ULL,
+    0x0000800300000000ULL,
+    0x0000808b00000001ULL,
+    0x8000000b00000001ULL,
+    0x8000008a00000001ULL,
+    0x8000008100000001ULL,
+    0x8000008100000000ULL,
+    0x8000000800000000ULL,
+    0x0000008300000000ULL,
+    0x8000800300000000ULL,
+    0x8000808800000001ULL,
+    0x8000008800000000ULL,
+    0x0000800000000001ULL,
+    0x8000808200000000ULL
+};
+
+#else
+
 static const uint64_t iotas[] = {
-    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
-    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
-    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
-    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
-    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
-    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
-    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
-    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
-    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
-    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
-    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
-    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
-    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
-    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
-    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
-    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
-    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
-    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
-    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
-    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
-    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
-    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
-    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
-    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL
 };
+
+#endif
+
 #if !defined(KECCAK1600_ASM)
 
 static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
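A note on the two tables: the first holds the same 24 round constants as the second, pre-converted to the bit-interleaved representation (even-indexed bits compacted into the low 32-bit word, odd-indexed bits into the high word), so no per-round conversion is needed. A minimal standalone sketch of that mapping; the interleave() helper is hypothetical and mirrors BitInterleave() further down in this file:

#include <assert.h>
#include <stdint.h>

// Hypothetical helper mirroring BitInterleave(): even-indexed bits go to
// the low 32-bit word, odd-indexed bits to the high word.
static uint64_t interleave(uint64_t v) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 32; i++) {
    lo |= ((v >> (2 * i)) & 1) << i;
    hi |= ((v >> (2 * i + 1)) & 1) << i;
  }
  return (hi << 32) | lo;
}

int main(void) {
  // Second round constant: 0x8082 has only odd-indexed bits set (1, 7, 15),
  // which compact to bits 0, 3 and 7 of the high word, i.e. 0x89.
  assert(interleave(0x0000000000008082ULL) == 0x0000008900000000ULL);
  // The first constant is bit 0 (even), so it stays at bit 0 of the low word.
  assert(interleave(0x0000000000000001ULL) == 0x0000000000000001ULL);
  return 0;
}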
@@ -73,27 +103,29 @@ static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
 
 static uint64_t ROL64(uint64_t val, int offset) {
+#ifdef BIT_INTERLEAVE
+  uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
+
+  if ((offset & 1) != 0) {
+    uint32_t tmp = hi;
+
+    offset >>= 1;
+    hi = ROL32(lo, offset);
+    lo = ROL32(tmp, offset + 1);
+  } else {
+    offset >>= 1;
+    lo = ROL32(lo, offset);
+    hi = ROL32(hi, offset);
+  }
+
+  return ((uint64_t)hi << 32) | lo;
+#else
   if (offset == 0) {
     return val;
-  } else if (!BIT_INTERLEAVE) {
-    return (val << offset) | (val >> (64-offset));
   } else {
-    uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
-
-    if ((offset & 1) != 0) {
-      uint32_t tmp = hi;
-
-      offset >>= 1;
-      hi = ROL32(lo, offset);
-      lo = ROL32(tmp, offset + 1);
-    } else {
-      offset >>= 1;
-      lo = ROL32(lo, offset);
-      hi = ROL32(hi, offset);
-    }
-
-    return ((uint64_t)hi << 32) | lo;
+    return (val << offset) | (val >> (64-offset));
   }
+#endif
 }
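The interleaved branch relies on a 64-bit rotation becoming two 32-bit rotations in the interleaved domain: rotating left by one sends every even-indexed bit into the odd slot of the same index and every odd-indexed bit into the even slot one index higher, so the two halves swap and only one of them rotates. A self-contained property check of that identity, using hypothetical interleave()/deinterleave() helpers that mirror BitInterleave()/BitDeinterleave() below:

#include <assert.h>
#include <stdint.h>

#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))

static uint64_t interleave(uint64_t v) {  // even bits -> low word, odd -> high
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 32; i++) {
    lo |= ((v >> (2 * i)) & 1) << i;
    hi |= ((v >> (2 * i + 1)) & 1) << i;
  }
  return (hi << 32) | lo;
}

static uint64_t deinterleave(uint64_t v) {  // inverse of interleave()
  uint64_t r = 0;
  for (int i = 0; i < 32; i++) {
    r |= ((v >> i) & 1) << (2 * i);
    r |= ((v >> (32 + i)) & 1) << (2 * i + 1);
  }
  return r;
}

// The interleaved rotation, exactly as in the #ifdef BIT_INTERLEAVE branch.
static uint64_t rol64_interleaved(uint64_t val, int offset) {
  uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
  if ((offset & 1) != 0) {
    uint32_t tmp = hi;
    offset >>= 1;
    hi = ROL32(lo, offset);
    lo = ROL32(tmp, offset + 1);
  } else {
    offset >>= 1;
    lo = ROL32(lo, offset);
    hi = ROL32(hi, offset);
  }
  return ((uint64_t)hi << 32) | lo;
}

int main(void) {
  uint64_t x = 0x0123456789abcdefULL;
  // Keccak's rho offsets never exceed 62, so stop short of 63 (an odd 63
  // would ask ROL32 for a rotation by 32).
  for (int k = 0; k <= 62; k++) {
    uint64_t want = (k == 0) ? x : (x << k) | (x >> (64 - k));
    assert(deinterleave(rol64_interleaved(interleave(x), k)) == want);
  }
  return 0;
}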
@@ -253,72 +285,71 @@ static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
 #endif  // !KECCAK1600_ASM
 
 static uint64_t BitInterleave(uint64_t Ai) {
-  if (BIT_INTERLEAVE) {
-    uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
-    uint32_t t0, t1;
-
-    t0 = lo & 0x55555555;
-    t0 |= t0 >> 1; t0 &= 0x33333333;
-    t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
-    t0 |= t0 >> 4; t0 &= 0x00ff00ff;
-    t0 |= t0 >> 8; t0 &= 0x0000ffff;
-
-    t1 = hi & 0x55555555;
-    t1 |= t1 >> 1; t1 &= 0x33333333;
-    t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
-    t1 |= t1 >> 4; t1 &= 0x00ff00ff;
-    t1 |= t1 >> 8; t1 <<= 16;
-
-    lo &= 0xaaaaaaaa;
-    lo |= lo << 1; lo &= 0xcccccccc;
-    lo |= lo << 2; lo &= 0xf0f0f0f0;
-    lo |= lo << 4; lo &= 0xff00ff00;
-    lo |= lo << 8; lo >>= 16;
-
-    hi &= 0xaaaaaaaa;
-    hi |= hi << 1; hi &= 0xcccccccc;
-    hi |= hi << 2; hi &= 0xf0f0f0f0;
-    hi |= hi << 4; hi &= 0xff00ff00;
-    hi |= hi << 8; hi &= 0xffff0000;
-
-    Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
-  }
+#ifdef BIT_INTERLEAVE
+  uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+  uint32_t t0, t1;
+
+  t0 = lo & 0x55555555;
+  t0 |= t0 >> 1; t0 &= 0x33333333;
+  t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
+  t0 |= t0 >> 4; t0 &= 0x00ff00ff;
+  t0 |= t0 >> 8; t0 &= 0x0000ffff;
+
+  t1 = hi & 0x55555555;
+  t1 |= t1 >> 1; t1 &= 0x33333333;
+  t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
+  t1 |= t1 >> 4; t1 &= 0x00ff00ff;
+  t1 |= t1 >> 8; t1 <<= 16;
+
+  lo &= 0xaaaaaaaa;
+  lo |= lo << 1; lo &= 0xcccccccc;
+  lo |= lo << 2; lo &= 0xf0f0f0f0;
+  lo |= lo << 4; lo &= 0xff00ff00;
+  lo |= lo << 8; lo >>= 16;
+
+  hi &= 0xaaaaaaaa;
+  hi |= hi << 1; hi &= 0xcccccccc;
+  hi |= hi << 2; hi &= 0xf0f0f0f0;
+  hi |= hi << 4; hi &= 0xff00ff00;
+  hi |= hi << 8; hi &= 0xffff0000;
+
+  Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+#endif
   return Ai;
 }
 
 static uint64_t BitDeinterleave(uint64_t Ai) {
-  if (BIT_INTERLEAVE) {
-    uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
-    uint32_t t0, t1;
-
-    t0 = lo & 0x0000ffff;
-    t0 |= t0 << 8; t0 &= 0x00ff00ff;
-    t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
-    t0 |= t0 << 2; t0 &= 0x33333333;
-    t0 |= t0 << 1; t0 &= 0x55555555;
-
-    t1 = hi << 16;
-    t1 |= t1 >> 8; t1 &= 0xff00ff00;
-    t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
-    t1 |= t1 >> 2; t1 &= 0xcccccccc;
-    t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
-
-    lo >>= 16;
-    lo |= lo << 8; lo &= 0x00ff00ff;
-    lo |= lo << 4; lo &= 0x0f0f0f0f;
-    lo |= lo << 2; lo &= 0x33333333;
-    lo |= lo << 1; lo &= 0x55555555;
-
-    hi &= 0xffff0000;
-    hi |= hi >> 8; hi &= 0xff00ff00;
-    hi |= hi >> 4; hi &= 0xf0f0f0f0;
-    hi |= hi >> 2; hi &= 0xcccccccc;
-    hi |= hi >> 1; hi &= 0xaaaaaaaa;
-
-    Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
-  }
-
+#ifdef BIT_INTERLEAVE
+  uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+  uint32_t t0, t1;
+
+  t0 = lo & 0x0000ffff;
+  t0 |= t0 << 8; t0 &= 0x00ff00ff;
+  t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
+  t0 |= t0 << 2; t0 &= 0x33333333;
+  t0 |= t0 << 1; t0 &= 0x55555555;
+
+  t1 = hi << 16;
+  t1 |= t1 >> 8; t1 &= 0xff00ff00;
+  t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
+  t1 |= t1 >> 2; t1 &= 0xcccccccc;
+  t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
+
+  lo >>= 16;
+  lo |= lo << 8; lo &= 0x00ff00ff;
+  lo |= lo << 4; lo &= 0x0f0f0f0f;
+  lo |= lo << 2; lo &= 0x33333333;
+  lo |= lo << 1; lo &= 0x55555555;
+
+  hi &= 0xffff0000;
+  hi |= hi >> 8; hi &= 0xff00ff00;
+  hi |= hi >> 4; hi &= 0xf0f0f0f0;
+  hi |= hi >> 2; hi &= 0xcccccccc;
+  hi |= hi >> 1; hi &= 0xaaaaaaaa;
+
+  Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+#endif
   return Ai;
 }
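The shift-and-mask networks in BitInterleave() and BitDeinterleave() are exact inverses, so lanes can be converted into and out of the interleaved representation losslessly. A hypothetical standalone round-trip check (loop-based helpers stand in for the static functions above, which are not externally callable):

#include <assert.h>
#include <stdint.h>

static uint64_t bit_interleave(uint64_t x) {  // even bits -> low word, odd -> high
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    r |= ((x >> i) & 1) << ((i >> 1) + ((i & 1) ? 32 : 0));
  }
  return r;
}

static uint64_t bit_deinterleave(uint64_t x) {  // the inverse mapping
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    r |= ((x >> i) & 1) << (2 * (i & 31) + (i >> 5));
  }
  return r;
}

int main(void) {
  uint64_t x = 0xdeadbeefcafef00dULL;
  assert(bit_deinterleave(bit_interleave(x)) == x);
  assert(bit_interleave(bit_deinterleave(x)) == x);
  return 0;
}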
@@ -419,18 +450,20 @@ void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *o
 #if defined(KECCAK1600_ASM)
 
 // Double-check that bit-interleaving is not used on AArch64
-#if BIT_INTERLEAVE != 0
+#if defined(BIT_INTERLEAVE) && defined(OPENSSL_AARCH64)
 #error Bit-interleaving of Keccak1600 states should be disabled for AArch64
 #endif
 
 // Scalar implementation from OpenSSL provided by keccak1600-armv8.pl
 extern void KeccakF1600_hw(uint64_t state[25]);
 
+#if defined(OPENSSL_AARCH64)
 static void keccak_log_dispatch(size_t id) {
 #if BORINGSSL_DISPATCH_TEST
   BORINGSSL_function_hit[id] = 1;
 #endif
 }
+#endif
 
 void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
   // Dispatch logic for Keccak-x1 on AArch64:
@@ -454,7 +487,7 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
   // Neoverse V1 and V2 do support SHA3 instructions, but they are only
   // implemented on 1/4 of Neon units, and are thus slower than a scalar
   // implementation.
-
+#if defined(OPENSSL_AARCH64)
 #if defined(KECCAK1600_S2N_BIGNUM_ASM)
   if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
     keccak_log_dispatch(10);  // kFlag_sha3_keccak_f1600
@@ -473,6 +506,11 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
 
   keccak_log_dispatch(9);  // kFlag_KeccakF1600_hw
   KeccakF1600_hw((uint64_t *) A);
+
+#elif defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) && \
+    defined(KECCAK1600_S2N_BIGNUM_ASM)
+  sha3_keccak_f1600((uint64_t *)A, iotas);
+#endif
 }
 
 #else  // KECCAK1600_ASM
@@ -524,8 +562,7 @@ static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
   // which is a straightforward implementation using the SHA3 extension.
   // - Otherwise, fall back to four times the 1-fold Keccak implementation
   //   (which has its own dispatch logic).
-
-#if defined(KECCAK1600_S2N_BIGNUM_ASM)
+#if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64)
   if (CRYPTO_is_Neoverse_N1()) {
     keccak_log_dispatch(13);  // kFlag_sha3_keccak4_f1600_alt
     sha3_keccak4_f1600_alt((uint64_t *)A, iotas);
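For the x86_64 path, note that BIT_INTERLEAVE is never defined (x86_64 is in the negated architecture list at the top of keccak1600.c), so the iotas passed to sha3_keccak_f1600() are the plain round constants. A minimal caller sketch against the assembly's documented interface; the extern declaration is local to this sketch:

#include <stdint.h>

// Interface documented in sha3/sha3_keccak_f1600.S below.
extern void sha3_keccak_f1600(uint64_t a[25], const uint64_t rc[24]);

// The 24 standard Keccak round constants, as in the non-interleaved
// iotas[] table above.
static const uint64_t rc[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
    0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL};

int main(void) {
  uint64_t state[25] = {0};
  sha3_keccak_f1600(state, rc);  // one full 24-round permutation, in place
  return 0;
}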
diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile
index 075ec11a61..d105903634 100644
--- a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile
+++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile
@@ -286,6 +286,7 @@ OBJ = curve25519/bignum_add_p25519.o \
       secp256k1/secp256k1_jdouble_alt.o \
       secp256k1/secp256k1_jmixadd.o \
       secp256k1/secp256k1_jmixadd_alt.o \
+      sha3/sha3_keccak_f1600.o \
       sm2/bignum_add_sm2.o \
       sm2/bignum_cmul_sm2.o \
       sm2/bignum_cmul_sm2_alt.o \
diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sha3/sha3_keccak_f1600.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sha3/sha3_keccak_f1600.S
new file mode 100644
index 0000000000..55db081e5b
--- /dev/null
+++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sha3/sha3_keccak_f1600.S
@@ -0,0 +1,461 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+
+// ----------------------------------------------------------------------------
+// Keccak-f1600 permutation for SHA3
+// Input a[25], rc[24]; output a[25]
+//
+// The Keccak-f1600 permutation is at the core of SHA3 and SHAKE and is
+// fully specified here:
+//
+// https://keccak.team/files/Keccak-reference-3.0.pdf
+//
+// extern void sha3_keccak_f1600(uint64_t a[25], const uint64_t rc[24]);
+//
+// Standard x86-64 ABI: RDI = a, RSI = rc
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(sha3_keccak_f1600)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(sha3_keccak_f1600)
+        .text
+
+S2N_BN_SYMBOL(sha3_keccak_f1600):
+        push    %rbx
+        push    %rbp
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+        sub     $0xd0,%rsp
+
+// Complement six lanes of |a| up front (the lane-complementing transform),
+// so that the chi step below needs fewer NOT instructions
+
+        notq    0x8(%rdi)
+        notq    0x10(%rdi)
+        notq    0x40(%rdi)
+        notq    0x60(%rdi)
+        notq    0x88(%rdi)
+        notq    0xa0(%rdi)
+        lea     (%rsp),%r15
+
+// Load the last row of |a| (lanes 20..24), kept in registers across rounds
+
+        mov     0xa0(%rdi),%rax
+        mov     0xa8(%rdi),%rbx
+        mov     0xb0(%rdi),%rcx
+        mov     0xb8(%rdi),%rdx
+        mov     0xc0(%rdi),%rbp
+
+// Initialize loop counter
+
+        mov     $0x0,%r8
+
+loop_keccak:
+        mov     %r8,0xc8(%rsp)
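+
+// Round body: theta (the five column parities are accumulated in
+// rax,rbx,rcx,rdx,rbp, then each is rotated by one and combined with a
+// neighbouring parity), followed by the rho rotations, the pi lane
+// permutation and chi, with the round constant xor'ed in from (%rsi);
+// the resulting state is written to the scratch area at (%r15)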
+        mov     (%rdi),%r8
+        mov     0x30(%rdi),%r9
+        mov     0x60(%rdi),%r10
+        mov     0x90(%rdi),%r11
+        xor     0x10(%rdi),%rcx
+        xor     0x18(%rdi),%rdx
+        xor     %r8,%rax
+        xor     0x8(%rdi),%rbx
+        xor     0x38(%rdi),%rcx
+        xor     0x28(%rdi),%rax
+        mov     %rbp,%r12
+        xor     0x20(%rdi),%rbp
+        xor     %r10,%rcx
+        xor     0x50(%rdi),%rax
+        xor     0x40(%rdi),%rdx
+        xor     %r9,%rbx
+        xor     0x48(%rdi),%rbp
+        xor     0x88(%rdi),%rcx
+        xor     0x78(%rdi),%rax
+        xor     0x68(%rdi),%rdx
+        xor     0x58(%rdi),%rbx
+        xor     0x70(%rdi),%rbp
+        mov     %rcx,%r13
+        rol     $1,%rcx
+        xor     %rax,%rcx
+        xor     %r11,%rdx
+        rol     $1,%rax
+        xor     %rdx,%rax
+        xor     0x80(%rdi),%rbx
+        rol     $1,%rdx
+        xor     %rbx,%rdx
+        xor     0x98(%rdi),%rbp
+        rol     $1,%rbx
+        xor     %rbp,%rbx
+        rol     $1,%rbp
+        xor     %r13,%rbp
+        xor     %rcx,%r9
+        xor     %rdx,%r10
+        rol     $0x2c,%r9
+        xor     %rbp,%r11
+        xor     %rax,%r12
+        rol     $0x2b,%r10
+        xor     %rbx,%r8
+        mov     %r9,%r13
+        rol     $0x15,%r11
+        or      %r10,%r9
+        xor     %r8,%r9
+        rol     $0xe,%r12
+        xor     (%rsi),%r9
+        mov     %r12,%r14
+        and     %r11,%r12
+        mov     %r9,(%r15)
+        xor     %r10,%r12
+        not     %r10
+        mov     %r12,0x10(%r15)
+        or      %r11,%r10
+        mov     0xb0(%rdi),%r12
+        xor     %r13,%r10
+        mov     %r10,0x8(%r15)
+        and     %r8,%r13
+        mov     0x48(%rdi),%r9
+        xor     %r14,%r13
+        mov     0x50(%rdi),%r10
+        mov     %r13,0x20(%r15)
+        or      %r8,%r14
+        mov     0x18(%rdi),%r8
+        xor     %r11,%r14
+        mov     0x80(%rdi),%r11
+        mov     %r14,0x18(%r15)
+        xor     %rbp,%r8
+        xor     %rdx,%r12
+        rol     $0x1c,%r8
+        xor     %rcx,%r11
+        xor     %rax,%r9
+        rol     $0x3d,%r12
+        rol     $0x2d,%r11
+        xor     %rbx,%r10
+        rol     $0x14,%r9
+        mov     %r8,%r13
+        or      %r12,%r8
+        rol     $0x3,%r10
+        xor     %r11,%r8
+        mov     %r8,0x40(%r15)
+        mov     %r9,%r14
+        and     %r13,%r9
+        mov     0x8(%rdi),%r8
+        xor     %r12,%r9
+        not     %r12
+        mov     %r9,0x48(%r15)
+        or      %r11,%r12
+        mov     0x38(%rdi),%r9
+        xor     %r10,%r12
+        mov     %r12,0x38(%r15)
+        and     %r10,%r11
+        mov     0xa0(%rdi),%r12
+        xor     %r14,%r11
+        mov     %r11,0x30(%r15)
+        or      %r10,%r14
+        mov     0x68(%rdi),%r10
+        xor     %r13,%r14
+        mov     0x98(%rdi),%r11
+        mov     %r14,0x28(%r15)
+        xor     %rbp,%r10
+        xor     %rax,%r11
+        rol     $0x19,%r10
+        xor     %rdx,%r9
+        rol     $0x8,%r11
+        xor     %rbx,%r12
+        rol     $0x6,%r9
+        xor     %rcx,%r8
+        rol     $0x12,%r12
+        mov     %r10,%r13
+        and     %r11,%r10
+        rol     $1,%r8
+        not     %r11
+        xor     %r9,%r10
+        mov     %r10,0x58(%r15)
+        mov     %r12,%r14
+        and     %r11,%r12
+        mov     0x58(%rdi),%r10
+        xor     %r13,%r12
+        mov     %r12,0x60(%r15)
+        or      %r9,%r13
+        mov     0xb8(%rdi),%r12
+        xor     %r8,%r13
+        mov     %r13,0x50(%r15)
+        and     %r8,%r9
+        xor     %r14,%r9
+        mov     %r9,0x70(%r15)
+        or      %r8,%r14
+        mov     0x28(%rdi),%r9
+        xor     %r11,%r14
+        mov     0x88(%rdi),%r11
+        mov     %r14,0x68(%r15)
+        mov     0x20(%rdi),%r8
+        xor     %rcx,%r10
+        xor     %rdx,%r11
+        rol     $0xa,%r10
+        xor     %rbx,%r9
+        rol     $0xf,%r11
+        xor     %rbp,%r12
+        rol     $0x24,%r9
+        xor     %rax,%r8
+        rol     $0x38,%r12
+        mov     %r10,%r13
+        or      %r11,%r10
+        rol     $0x1b,%r8
+        not     %r11
+        xor     %r9,%r10
+        mov     %r10,0x80(%r15)
+        mov     %r12,%r14
+        or      %r11,%r12
+        xor     %r13,%r12
+        mov     %r12,0x88(%r15)
+        and     %r9,%r13
+        xor     %r8,%r13
+        mov     %r13,0x78(%r15)
+        or      %r8,%r9
+        xor     %r14,%r9
+        mov     %r9,0x98(%r15)
+        and     %r14,%r8
+        xor     %r11,%r8
+        mov     %r8,0x90(%r15)
+        xor     0x10(%rdi),%rdx
+        xor     0x40(%rdi),%rbp
+        rol     $0x3e,%rdx
+        xor     0xa8(%rdi),%rcx
+        rol     $0x37,%rbp
+        xor     0x70(%rdi),%rax
+        rol     $0x2,%rcx
+        xor     0x78(%rdi),%rbx
+        xchg    %r15,%rdi
+        rol     $0x27,%rax
+        rol     $0x29,%rbx
+        mov     %rdx,%r13
+        and     %rbp,%rdx
+        not     %rbp
+        xor     %rcx,%rdx
+        mov     %rdx,0xc0(%rdi)
+        mov     %rax,%r14
+        and     %rbp,%rax
+        xor     %r13,%rax
+        mov     %rax,0xa0(%rdi)
+        or      %rcx,%r13
+        xor     %rbx,%r13
+        mov     %r13,0xb8(%rdi)
+        and     %rbx,%rcx
+        xor     %r14,%rcx
+        mov     %rcx,0xb0(%rdi)
+        or      %r14,%rbx
+        xor     %rbp,%rbx
+        mov     %rbx,0xa8(%rdi)
+        mov     %rdx,%rbp
+        mov     %r13,%rdx
+        lea     0x8(%rsi),%rsi
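+
+// Second unrolled round: the xchg above swapped %rdi and %r15, so this
+// copy of the round body reads the state just written to the scratch area
+// and writes its results back into |a|, using the next round constant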
+        mov     (%rdi),%r8
+        mov     0x30(%rdi),%r9
+        mov     0x60(%rdi),%r10
+        mov     0x90(%rdi),%r11
+        xor     0x10(%rdi),%rcx
+        xor     0x18(%rdi),%rdx
+        xor     %r8,%rax
+        xor     0x8(%rdi),%rbx
+        xor     0x38(%rdi),%rcx
+        xor     0x28(%rdi),%rax
+        mov     %rbp,%r12
+        xor     0x20(%rdi),%rbp
+        xor     %r10,%rcx
+        xor     0x50(%rdi),%rax
+        xor     0x40(%rdi),%rdx
+        xor     %r9,%rbx
+        xor     0x48(%rdi),%rbp
+        xor     0x88(%rdi),%rcx
+        xor     0x78(%rdi),%rax
+        xor     0x68(%rdi),%rdx
+        xor     0x58(%rdi),%rbx
+        xor     0x70(%rdi),%rbp
+        mov     %rcx,%r13
+        rol     $1,%rcx
+        xor     %rax,%rcx
+        xor     %r11,%rdx
+        rol     $1,%rax
+        xor     %rdx,%rax
+        xor     0x80(%rdi),%rbx
+        rol     $1,%rdx
+        xor     %rbx,%rdx
+        xor     0x98(%rdi),%rbp
+        rol     $1,%rbx
+        xor     %rbp,%rbx
+        rol     $1,%rbp
+        xor     %r13,%rbp
+        xor     %rcx,%r9
+        xor     %rdx,%r10
+        rol     $0x2c,%r9
+        xor     %rbp,%r11
+        xor     %rax,%r12
+        rol     $0x2b,%r10
+        xor     %rbx,%r8
+        mov     %r9,%r13
+        rol     $0x15,%r11
+        or      %r10,%r9
+        xor     %r8,%r9
+        rol     $0xe,%r12
+        xor     (%rsi),%r9
+        mov     %r12,%r14
+        and     %r11,%r12
+        mov     %r9,(%r15)
+        xor     %r10,%r12
+        not     %r10
+        mov     %r12,0x10(%r15)
+        or      %r11,%r10
+        mov     0xb0(%rdi),%r12
+        xor     %r13,%r10
+        mov     %r10,0x8(%r15)
+        and     %r8,%r13
+        mov     0x48(%rdi),%r9
+        xor     %r14,%r13
+        mov     0x50(%rdi),%r10
+        mov     %r13,0x20(%r15)
+        or      %r8,%r14
+        mov     0x18(%rdi),%r8
+        xor     %r11,%r14
+        mov     0x80(%rdi),%r11
+        mov     %r14,0x18(%r15)
+        xor     %rbp,%r8
+        xor     %rdx,%r12
+        rol     $0x1c,%r8
+        xor     %rcx,%r11
+        xor     %rax,%r9
+        rol     $0x3d,%r12
+        rol     $0x2d,%r11
+        xor     %rbx,%r10
+        rol     $0x14,%r9
+        mov     %r8,%r13
+        or      %r12,%r8
+        rol     $0x3,%r10
+        xor     %r11,%r8
+        mov     %r8,0x40(%r15)
+        mov     %r9,%r14
+        and     %r13,%r9
+        mov     0x8(%rdi),%r8
+        xor     %r12,%r9
+        not     %r12
+        mov     %r9,0x48(%r15)
+        or      %r11,%r12
+        mov     0x38(%rdi),%r9
+        xor     %r10,%r12
+        mov     %r12,0x38(%r15)
+        and     %r10,%r11
+        mov     0xa0(%rdi),%r12
+        xor     %r14,%r11
+        mov     %r11,0x30(%r15)
+        or      %r10,%r14
+        mov     0x68(%rdi),%r10
+        xor     %r13,%r14
+        mov     0x98(%rdi),%r11
+        mov     %r14,0x28(%r15)
+        xor     %rbp,%r10
+        xor     %rax,%r11
+        rol     $0x19,%r10
+        xor     %rdx,%r9
+        rol     $0x8,%r11
+        xor     %rbx,%r12
+        rol     $0x6,%r9
+        xor     %rcx,%r8
+        rol     $0x12,%r12
+        mov     %r10,%r13
+        and     %r11,%r10
+        rol     $1,%r8
+        not     %r11
+        xor     %r9,%r10
+        mov     %r10,0x58(%r15)
+        mov     %r12,%r14
+        and     %r11,%r12
+        mov     0x58(%rdi),%r10
+        xor     %r13,%r12
+        mov     %r12,0x60(%r15)
+        or      %r9,%r13
+        mov     0xb8(%rdi),%r12
+        xor     %r8,%r13
+        mov     %r13,0x50(%r15)
+        and     %r8,%r9
+        xor     %r14,%r9
+        mov     %r9,0x70(%r15)
+        or      %r8,%r14
+        mov     0x28(%rdi),%r9
+        xor     %r11,%r14
+        mov     0x88(%rdi),%r11
+        mov     %r14,0x68(%r15)
+        mov     0x20(%rdi),%r8
+        xor     %rcx,%r10
+        xor     %rdx,%r11
+        rol     $0xa,%r10
+        xor     %rbx,%r9
+        rol     $0xf,%r11
+        xor     %rbp,%r12
+        rol     $0x24,%r9
+        xor     %rax,%r8
+        rol     $0x38,%r12
+        mov     %r10,%r13
+        or      %r11,%r10
+        rol     $0x1b,%r8
+        not     %r11
+        xor     %r9,%r10
+        mov     %r10,0x80(%r15)
+        mov     %r12,%r14
+        or      %r11,%r12
+        xor     %r13,%r12
+        mov     %r12,0x88(%r15)
+        and     %r9,%r13
+        xor     %r8,%r13
+        mov     %r13,0x78(%r15)
+        or      %r8,%r9
+        xor     %r14,%r9
+        mov     %r9,0x98(%r15)
+        and     %r14,%r8
+        xor     %r11,%r8
+        mov     %r8,0x90(%r15)
+        xor     0x10(%rdi),%rdx
+        xor     0x40(%rdi),%rbp
+        rol     $0x3e,%rdx
+        xor     0xa8(%rdi),%rcx
+        rol     $0x37,%rbp
+        xor     0x70(%rdi),%rax
+        rol     $0x2,%rcx
+        xor     0x78(%rdi),%rbx
+        xchg    %r15,%rdi
+        rol     $0x27,%rax
+        rol     $0x29,%rbx
+        mov     %rdx,%r13
+        and     %rbp,%rdx
+        not     %rbp
+        xor     %rcx,%rdx
+        mov     %rdx,0xc0(%rdi)
+        mov     %rax,%r14
+        and     %rbp,%rax
+        xor     %r13,%rax
+        mov     %rax,0xa0(%rdi)
+        or      %rcx,%r13
+        xor     %rbx,%r13
+        mov     %r13,0xb8(%rdi)
+        and     %rbx,%rcx
+        xor     %r14,%rcx
+        mov     %rcx,0xb0(%rdi)
+        or      %r14,%rbx
+        xor     %rbp,%rbx
+        mov     %rbx,0xa8(%rdi)
+        mov     %rdx,%rbp
+        mov     %r13,%rdx
+        lea     0x8(%rsi),%rsi
+        mov     0xc8(%rsp),%r8
+        add     $0x2,%r8
+        cmp     $0x18,%r8
+        jne     loop_keccak
+
+// Rewind the round-constant pointer and undo the initial complementing of
+// the six selected lanes of |a|
+
+        lea     -0xc0(%rsi),%rsi
+        notq    0x8(%rdi)
+        notq    0x10(%rdi)
+        notq    0x40(%rdi)
+        notq    0x60(%rdi)
+        notq    0x88(%rdi)
+        notq    0xa0(%rdi)
+        add     $0xd0,%rsp
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+        pop     %rbp
+        pop     %rbx
+        ret
\ No newline at end of file
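A closing note on the complementing trick used by the assembly: chi computes a ^ (~b & c) per lane, and keeping selected lanes complemented turns the NOT-AND into a plain AND (or, when the output lane is itself produced in complemented form, into an OR), which is why each round body above gets by with only a few explicit not instructions. A small sketch of the underlying identities (the values are arbitrary):

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL,
           c = 0x0f1e2d3c4b5a6978ULL;
  uint64_t chi = a ^ (~b & c);
  // With b stored complemented, chi needs no NOT at all:
  uint64_t b_comp = ~b;
  assert(chi == (a ^ (b_comp & c)));
  // Producing the output lane in complemented form turns the AND into an OR:
  uint64_t c_comp = ~c;
  assert(~chi == (a ^ (b | c_comp)));
  return 0;
}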