Skip to content

Commit 34f7f6c

Browse files
nhukcherbertx
authored andcommitted
crypto: x86/polyval - Add PCLMULQDQ accelerated implementation of POLYVAL
Add hardware accelerated version of POLYVAL for x86-64 CPUs with PCLMULQDQ support. This implementation is accelerated using PCLMULQDQ instructions to perform the finite field computations. For added efficiency, 8 blocks of the message are processed simultaneously by precomputing the first 8 powers of the key. Schoolbook multiplication is used instead of Karatsuba multiplication because it was found to be slightly faster on x86-64 machines. Montgomery reduction must be used instead of Barrett reduction due to the difference in modulus between POLYVAL's field and other finite fields. More information on POLYVAL can be found in the HCTR2 paper: "Length-preserving encryption with HCTR2": https://eprint.iacr.org/2021/1441.pdf Signed-off-by: Nathan Huckleberry <[email protected]> Reviewed-by: Ard Biesheuvel <[email protected]> Reviewed-by: Eric Biggers <[email protected]> Signed-off-by: Herbert Xu <[email protected]>
1 parent c0eb759 commit 34f7f6c

File tree

6 files changed

+581
-0
lines changed

6 files changed

+581
-0
lines changed

arch/x86/crypto/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
6969
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
7070
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
7171

72+
obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
73+
polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
74+
7275
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
7376
crc32c-intel-y := crc32c-intel_glue.o
7477
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Copyright 2021 Google LLC
4+
*/
5+
/*
6+
* This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
7+
* instructions. It works on 8 blocks at a time, by precomputing the first 8
8+
* keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
9+
* allows us to split finite field multiplication into two steps.
10+
*
11+
* In the first step, we consider h^i, m_i as normal polynomials of degree less
12+
* than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
13+
* is simply polynomial multiplication.
14+
*
15+
* In the second step, we compute the reduction of p(x) modulo the finite field
16+
* modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
17+
*
18+
* This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
19+
* multiplication is finite field multiplication. The advantage is that the
20+
* two-step process only requires 1 finite field reduction for every 8
21+
* polynomial multiplications. Further parallelism is gained by interleaving the
22+
* multiplications and polynomial reductions.
23+
*/
24+
25+
#include <linux/linkage.h>
26+
#include <asm/frame.h>
27+
28+
#define STRIDE_BLOCKS 8
29+
30+
#define GSTAR %xmm7
31+
#define PL %xmm8
32+
#define PH %xmm9
33+
#define TMP_XMM %xmm11
34+
#define LO %xmm12
35+
#define HI %xmm13
36+
#define MI %xmm14
37+
#define SUM %xmm15
38+
39+
#define KEY_POWERS %rdi
40+
#define MSG %rsi
41+
#define BLOCKS_LEFT %rdx
42+
#define ACCUMULATOR %rcx
43+
#define TMP %rax
44+
45+
.section .rodata.cst16.gstar, "aM", @progbits, 16
46+
.align 16
47+
48+
.Lgstar:
49+
.quad 0xc200000000000000, 0xc200000000000000
50+
51+
.text
52+
53+
/*
54+
* Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
55+
* count pointed to by MSG and KEY_POWERS.
56+
*/
57+
.macro schoolbook1 count
58+
.set i, 0
59+
.rept (\count)
60+
schoolbook1_iteration i 0
61+
.set i, (i +1)
62+
.endr
63+
.endm
64+
65+
/*
66+
* Computes the product of two 128-bit polynomials at the memory locations
67+
* specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
68+
* the 256-bit product into LO, MI, HI.
69+
*
70+
* Given:
71+
* X = [X_1 : X_0]
72+
* Y = [Y_1 : Y_0]
73+
*
74+
* We compute:
75+
* LO += X_0 * Y_0
76+
* MI += X_0 * Y_1 + X_1 * Y_0
77+
* HI += X_1 * Y_1
78+
*
79+
* Later, the 256-bit result can be extracted as:
80+
* [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
81+
* This step is done when computing the polynomial reduction for efficiency
82+
* reasons.
83+
*
84+
* If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
85+
* extra multiplication of SUM and h^8.
86+
*/
87+
.macro schoolbook1_iteration i xor_sum
88+
movups (16*\i)(MSG), %xmm0
89+
.if (\i == 0 && \xor_sum == 1)
90+
pxor SUM, %xmm0
91+
.endif
92+
vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
93+
vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
94+
vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
95+
vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
96+
vpxor %xmm2, MI, MI
97+
vpxor %xmm1, LO, LO
98+
vpxor %xmm4, HI, HI
99+
vpxor %xmm3, MI, MI
100+
.endm
101+
102+
/*
103+
* Performs the same computation as schoolbook1_iteration, except we expect the
104+
* arguments to already be loaded into xmm0 and xmm1 and we set the result
105+
* registers LO, MI, and HI directly rather than XOR'ing into them.
106+
*/
107+
.macro schoolbook1_noload
108+
vpclmulqdq $0x01, %xmm0, %xmm1, MI
109+
vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
110+
vpclmulqdq $0x00, %xmm0, %xmm1, LO
111+
vpclmulqdq $0x11, %xmm0, %xmm1, HI
112+
vpxor %xmm2, MI, MI
113+
.endm
114+
115+
/*
116+
* Computes the 256-bit polynomial represented by LO, HI, MI. Stores
117+
* the result in PL, PH.
118+
* [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
119+
*/
120+
.macro schoolbook2
121+
vpslldq $8, MI, PL
122+
vpsrldq $8, MI, PH
123+
pxor LO, PL
124+
pxor HI, PH
125+
.endm
126+
127+
/*
128+
* Computes the 128-bit reduction of PH : PL. Stores the result in dest.
129+
*
130+
* This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
131+
* x^128 + x^127 + x^126 + x^121 + 1.
132+
*
133+
* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
134+
* product of two 128-bit polynomials in Montgomery form. We need to reduce it
135+
* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
136+
* of x^128, this product has two extra factors of x^128. To get it back into
137+
* Montgomery form, we need to remove one of these factors by dividing by x^128.
138+
*
139+
* To accomplish both of these goals, we add multiples of g(x) that cancel out
140+
* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
141+
* bits are zero, the polynomial division by x^128 can be done by right shifting.
142+
*
143+
* Since the only nonzero term in the low 64 bits of g(x) is the constant term,
144+
* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
145+
* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
146+
* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
147+
* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
148+
* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
149+
*
150+
* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
151+
* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
152+
* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
153+
* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
154+
* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
155+
*
156+
* So our final computation is:
157+
* T = T_1 : T_0 = g*(x) * P_0
158+
* V = V_1 : V_0 = g*(x) * (P_1 + T_0)
159+
* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
160+
*
161+
* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
162+
* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
163+
* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
164+
*/
165+
.macro montgomery_reduction dest
166+
vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
167+
pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
168+
pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
169+
pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
170+
pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
171+
vpxor TMP_XMM, PH, \dest
172+
.endm
173+
174+
/*
175+
* Compute schoolbook multiplication for 8 blocks
176+
* m_0h^8 + ... + m_7h^1
177+
*
178+
* If reduce is set, also computes the montgomery reduction of the
179+
* previous full_stride call and XORs with the first message block.
180+
* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
181+
* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
182+
*/
183+
.macro full_stride reduce
184+
pxor LO, LO
185+
pxor HI, HI
186+
pxor MI, MI
187+
188+
schoolbook1_iteration 7 0
189+
.if \reduce
190+
vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
191+
.endif
192+
193+
schoolbook1_iteration 6 0
194+
.if \reduce
195+
pshufd $0b01001110, TMP_XMM, TMP_XMM
196+
.endif
197+
198+
schoolbook1_iteration 5 0
199+
.if \reduce
200+
pxor PL, TMP_XMM
201+
.endif
202+
203+
schoolbook1_iteration 4 0
204+
.if \reduce
205+
pxor TMP_XMM, PH
206+
.endif
207+
208+
schoolbook1_iteration 3 0
209+
.if \reduce
210+
pclmulqdq $0x11, GSTAR, TMP_XMM
211+
.endif
212+
213+
schoolbook1_iteration 2 0
214+
.if \reduce
215+
vpxor TMP_XMM, PH, SUM
216+
.endif
217+
218+
schoolbook1_iteration 1 0
219+
220+
schoolbook1_iteration 0 1
221+
222+
addq $(8*16), MSG
223+
schoolbook2
224+
.endm
225+
226+
/*
227+
* Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
228+
*/
229+
.macro partial_stride
230+
mov BLOCKS_LEFT, TMP
231+
shlq $4, TMP
232+
addq $(16*STRIDE_BLOCKS), KEY_POWERS
233+
subq TMP, KEY_POWERS
234+
235+
movups (MSG), %xmm0
236+
pxor SUM, %xmm0
237+
movaps (KEY_POWERS), %xmm1
238+
schoolbook1_noload
239+
dec BLOCKS_LEFT
240+
addq $16, MSG
241+
addq $16, KEY_POWERS
242+
243+
test $4, BLOCKS_LEFT
244+
jz .Lpartial4BlocksDone
245+
schoolbook1 4
246+
addq $(4*16), MSG
247+
addq $(4*16), KEY_POWERS
248+
.Lpartial4BlocksDone:
249+
test $2, BLOCKS_LEFT
250+
jz .Lpartial2BlocksDone
251+
schoolbook1 2
252+
addq $(2*16), MSG
253+
addq $(2*16), KEY_POWERS
254+
.Lpartial2BlocksDone:
255+
test $1, BLOCKS_LEFT
256+
jz .LpartialDone
257+
schoolbook1 1
258+
.LpartialDone:
259+
schoolbook2
260+
montgomery_reduction SUM
261+
.endm
262+
263+
/*
264+
* Perform montgomery multiplication in GF(2^128) and store result in op1.
265+
*
266+
* Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
267+
* If op1, op2 are in montgomery form, this computes the montgomery
268+
* form of op1*op2.
269+
*
270+
* void clmul_polyval_mul(u8 *op1, const u8 *op2);
271+
*/
272+
SYM_FUNC_START(clmul_polyval_mul)
273+
FRAME_BEGIN
274+
vmovdqa .Lgstar(%rip), GSTAR
275+
movups (%rdi), %xmm0
276+
movups (%rsi), %xmm1
277+
schoolbook1_noload
278+
schoolbook2
279+
montgomery_reduction SUM
280+
movups SUM, (%rdi)
281+
FRAME_END
282+
RET
283+
SYM_FUNC_END(clmul_polyval_mul)
284+
285+
/*
286+
* Perform polynomial evaluation as specified by POLYVAL. This computes:
287+
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
288+
* where n=nblocks, h is the hash key, and m_i are the message blocks.
289+
*
290+
* rdi - pointer to precomputed key powers h^8 ... h^1
291+
* rsi - pointer to message blocks
292+
* rdx - number of blocks to hash
293+
* rcx - pointer to the accumulator
294+
*
295+
* void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
296+
* const u8 *in, size_t nblocks, u8 *accumulator);
297+
*/
298+
SYM_FUNC_START(clmul_polyval_update)
299+
FRAME_BEGIN
300+
vmovdqa .Lgstar(%rip), GSTAR
301+
movups (ACCUMULATOR), SUM
302+
subq $STRIDE_BLOCKS, BLOCKS_LEFT
303+
js .LstrideLoopExit
304+
full_stride 0
305+
subq $STRIDE_BLOCKS, BLOCKS_LEFT
306+
js .LstrideLoopExitReduce
307+
.LstrideLoop:
308+
full_stride 1
309+
subq $STRIDE_BLOCKS, BLOCKS_LEFT
310+
jns .LstrideLoop
311+
.LstrideLoopExitReduce:
312+
montgomery_reduction SUM
313+
.LstrideLoopExit:
314+
add $STRIDE_BLOCKS, BLOCKS_LEFT
315+
jz .LskipPartial
316+
partial_stride
317+
.LskipPartial:
318+
movups SUM, (ACCUMULATOR)
319+
FRAME_END
320+
RET
321+
SYM_FUNC_END(clmul_polyval_update)

0 commit comments

Comments
 (0)