Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crypto/fipsmodule/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519_alt.S
${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519base.S
${S2N_BIGNUM_DIR}/curve25519/curve25519_x25519base_alt.S
${S2N_BIGNUM_DIR}/sha3/sha3_keccak_f1600.S
)
elseif(ARCH STREQUAL "aarch64")
# byte-level interface for aarch64 s2n-bignum x25519 are in
Expand Down
3 changes: 2 additions & 1 deletion crypto/fipsmodule/sha/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,8 @@ void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *data,
size_t num);
#endif

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_AARCH64) || defined(OPENSSL_X86_64))
#define KECCAK1600_ASM
#if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
#define KECCAK1600_S2N_BIGNUM_ASM
Expand Down
257 changes: 147 additions & 110 deletions crypto/fipsmodule/sha/keccak1600.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,75 @@
#include "../../internal.h"
#include "../cpucap/internal.h"

#if defined(__x86_64__) || defined(__aarch64__) || \
#if !(defined(__x86_64__) || defined(__aarch64__) || \
defined(__mips64) || defined(__ia64) || defined(__loongarch_lp64) || \
(defined(__VMS) && !defined(__vax))
(defined(__VMS) && !defined(__vax)))

// These are available even in ILP32 flavours, but even then they are
// capable of performing 64-bit operations as efficiently as in *P64.
// Since it's not given that we can use sizeof(void *), just shunt it.
# define BIT_INTERLEAVE (0)
#else
# define BIT_INTERLEAVE (sizeof(void *) < 8)
#endif

#if defined(BIT_INTERLEAVE)

static const uint64_t iotas[] = {
0x0000000000000001ULL,
0x0000008900000000ULL,
0x8000008b00000000ULL,
0x8000808000000000ULL,
0x0000008b00000001ULL,
0x0000800000000001ULL,
0x8000808800000001ULL,
0x8000008200000001ULL,
0x0000000b00000000ULL,
0x0000000a00000000ULL,
0x0000808200000001ULL,
0x0000800300000000ULL,
0x0000808b00000001ULL,
0x8000000b00000001ULL,
0x8000008a00000001ULL,
0x8000008100000001ULL,
0x8000008100000000ULL,
0x8000000800000000ULL,
0x0000008300000000ULL,
0x8000800300000000ULL,
0x8000808800000001ULL,
0x8000008800000000ULL,
0x0000800000000001ULL,
0x8000808200000000ULL
};
#else

static const uint64_t iotas[] = {
BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL
};

#endif

#if !defined(KECCAK1600_ASM)

static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
Expand All @@ -73,27 +103,29 @@ static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))

static uint64_t ROL64(uint64_t val, int offset) {
#ifdef BIT_INTERLEAVE
uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;

if ((offset & 1) != 0) {
uint32_t tmp = hi;

offset >>= 1;
hi = ROL32(lo, offset);
lo = ROL32(tmp, offset + 1);
} else {
offset >>= 1;
lo = ROL32(lo, offset);
hi = ROL32(hi, offset);
}

return ((uint64_t)hi << 32) | lo;
#else
if (offset == 0) {
return val;
} else if (!BIT_INTERLEAVE) {
return (val << offset) | (val >> (64-offset));
} else {
uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;

if ((offset & 1) != 0) {
uint32_t tmp = hi;

offset >>= 1;
hi = ROL32(lo, offset);
lo = ROL32(tmp, offset + 1);
} else {
offset >>= 1;
lo = ROL32(lo, offset);
hi = ROL32(hi, offset);
}

return ((uint64_t)hi << 32) | lo;
return (val << offset) | (val >> (64-offset));
}
#endif
}

// KECCAK_2X:
Expand Down Expand Up @@ -253,72 +285,71 @@ static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
#endif // !KECCAK1600_ASM

static uint64_t BitInterleave(uint64_t Ai) {
if (BIT_INTERLEAVE) {
uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
uint32_t t0, t1;

t0 = lo & 0x55555555;
t0 |= t0 >> 1; t0 &= 0x33333333;
t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
t0 |= t0 >> 4; t0 &= 0x00ff00ff;
t0 |= t0 >> 8; t0 &= 0x0000ffff;

t1 = hi & 0x55555555;
t1 |= t1 >> 1; t1 &= 0x33333333;
t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
t1 |= t1 >> 4; t1 &= 0x00ff00ff;
t1 |= t1 >> 8; t1 <<= 16;

lo &= 0xaaaaaaaa;
lo |= lo << 1; lo &= 0xcccccccc;
lo |= lo << 2; lo &= 0xf0f0f0f0;
lo |= lo << 4; lo &= 0xff00ff00;
lo |= lo << 8; lo >>= 16;

hi &= 0xaaaaaaaa;
hi |= hi << 1; hi &= 0xcccccccc;
hi |= hi << 2; hi &= 0xf0f0f0f0;
hi |= hi << 4; hi &= 0xff00ff00;
hi |= hi << 8; hi &= 0xffff0000;

Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
}
#ifdef BIT_INTERLEAVE
uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
uint32_t t0, t1;

t0 = lo & 0x55555555;
t0 |= t0 >> 1; t0 &= 0x33333333;
t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
t0 |= t0 >> 4; t0 &= 0x00ff00ff;
t0 |= t0 >> 8; t0 &= 0x0000ffff;

t1 = hi & 0x55555555;
t1 |= t1 >> 1; t1 &= 0x33333333;
t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
t1 |= t1 >> 4; t1 &= 0x00ff00ff;
t1 |= t1 >> 8; t1 <<= 16;

lo &= 0xaaaaaaaa;
lo |= lo << 1; lo &= 0xcccccccc;
lo |= lo << 2; lo &= 0xf0f0f0f0;
lo |= lo << 4; lo &= 0xff00ff00;
lo |= lo << 8; lo >>= 16;

hi &= 0xaaaaaaaa;
hi |= hi << 1; hi &= 0xcccccccc;
hi |= hi << 2; hi &= 0xf0f0f0f0;
hi |= hi << 4; hi &= 0xff00ff00;
hi |= hi << 8; hi &= 0xffff0000;

Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
#endif

return Ai;
}

static uint64_t BitDeinterleave(uint64_t Ai) {
if (BIT_INTERLEAVE) {
uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
uint32_t t0, t1;

t0 = lo & 0x0000ffff;
t0 |= t0 << 8; t0 &= 0x00ff00ff;
t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
t0 |= t0 << 2; t0 &= 0x33333333;
t0 |= t0 << 1; t0 &= 0x55555555;

t1 = hi << 16;
t1 |= t1 >> 8; t1 &= 0xff00ff00;
t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
t1 |= t1 >> 2; t1 &= 0xcccccccc;
t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;

lo >>= 16;
lo |= lo << 8; lo &= 0x00ff00ff;
lo |= lo << 4; lo &= 0x0f0f0f0f;
lo |= lo << 2; lo &= 0x33333333;
lo |= lo << 1; lo &= 0x55555555;

hi &= 0xffff0000;
hi |= hi >> 8; hi &= 0xff00ff00;
hi |= hi >> 4; hi &= 0xf0f0f0f0;
hi |= hi >> 2; hi &= 0xcccccccc;
hi |= hi >> 1; hi &= 0xaaaaaaaa;

Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
}

#ifdef BIT_INTERLEAVE
uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
uint32_t t0, t1;

t0 = lo & 0x0000ffff;
t0 |= t0 << 8; t0 &= 0x00ff00ff;
t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
t0 |= t0 << 2; t0 &= 0x33333333;
t0 |= t0 << 1; t0 &= 0x55555555;

t1 = hi << 16;
t1 |= t1 >> 8; t1 &= 0xff00ff00;
t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
t1 |= t1 >> 2; t1 &= 0xcccccccc;
t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;

lo >>= 16;
lo |= lo << 8; lo &= 0x00ff00ff;
lo |= lo << 4; lo &= 0x0f0f0f0f;
lo |= lo << 2; lo &= 0x33333333;
lo |= lo << 1; lo &= 0x55555555;

hi &= 0xffff0000;
hi |= hi >> 8; hi &= 0xff00ff00;
hi |= hi >> 4; hi &= 0xf0f0f0f0;
hi |= hi >> 2; hi &= 0xcccccccc;
hi |= hi >> 1; hi &= 0xaaaaaaaa;

Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
#endif
return Ai;
}

Expand Down Expand Up @@ -419,18 +450,20 @@ void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *o
#if defined(KECCAK1600_ASM)

// Double-check that bit-interleaving is not used on AArch64
#if BIT_INTERLEAVE != 0
#if defined(BIT_INTERLEAVE) && defined(OPENSSL_AARCH64)
#error Bit-interleaving of Keccak1600 states should be disabled for AArch64
#endif

// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl
extern void KeccakF1600_hw(uint64_t state[25]);

#if defined(OPENSSL_AARCH64)
// keccak_log_dispatch marks dispatch slot |id| as taken by storing 1 into
// |BORINGSSL_function_hit[id]|. The store is compiled in only when
// BORINGSSL_DISPATCH_TEST evaluates to non-zero; in every other build the
// function has no effect.
static void keccak_log_dispatch(size_t id) {
#if !BORINGSSL_DISPATCH_TEST
  // Nothing to record in this configuration.
  (void)id;
#else
  BORINGSSL_function_hit[id] = 1;
#endif
}
#endif

void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
// Dispatch logic for Keccak-x1 on AArch64:
Expand All @@ -454,7 +487,7 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
// Neoverse V1 and V2 do support SHA3 instructions, but they are only
// implemented on 1/4 of Neon units, and are thus slower than a scalar
// implementation.

#if defined(OPENSSL_AARCH64)
#if defined(KECCAK1600_S2N_BIGNUM_ASM)
if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
keccak_log_dispatch(10); // kFlag_sha3_keccak_f1600
Expand All @@ -473,6 +506,11 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {

keccak_log_dispatch(9); // kFlag_KeccakF1600_hw
KeccakF1600_hw((uint64_t *) A);

#elif defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) && \
defined(KECCAK1600_S2N_BIGNUM_ASM)
sha3_keccak_f1600((uint64_t *)A, iotas);
#endif
}

#else // KECCAK1600_ASM
Expand Down Expand Up @@ -524,8 +562,7 @@ static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
// which is a straightforward implementation using the SHA3 extension.
// - Otherwise, fall back to four times the 1-fold Keccak implementation
// (which has its own dispatch logic).

#if defined(KECCAK1600_S2N_BIGNUM_ASM)
#if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64)
if (CRYPTO_is_Neoverse_N1()) {
keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt
sha3_keccak4_f1600_alt((uint64_t *)A, iotas);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ OBJ = curve25519/bignum_add_p25519.o \
secp256k1/secp256k1_jdouble_alt.o \
secp256k1/secp256k1_jmixadd.o \
secp256k1/secp256k1_jmixadd_alt.o \
sha3/sha3_keccak_f1600.o \
sm2/bignum_add_sm2.o \
sm2/bignum_cmul_sm2.o \
sm2/bignum_cmul_sm2_alt.o \
Expand Down
Loading
Loading