
Commit b9ce382

issue: 1557652 Improve memory copy for Blue Flame usage

Select the write-to-Blue-Flame (BF) method based on the CPU instruction set: the per-architecture COPY_64B_NT macro is replaced by a common memory_copy64, which on x86 uses non-temporal SIMD stores when CPUID reports SSE3 or later, and falls back to plain 64-bit copies otherwise.

Signed-off-by: Igor Ivanov <[email protected]>

1 parent b173291 commit b9ce382

5 files changed: +150 -38 lines

src/utils/asm-arm64.h

Lines changed: 0 additions & 12 deletions

@@ -37,22 +37,12 @@
 #include <stdint.h>
 #include <unistd.h>

-#define COPY_64B_NT(dst, src) \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++

 #define mb()     asm volatile("dsb sy" ::: "memory")
 #define rmb()    asm volatile("dsb ld" ::: "memory")
 #define wmb()    asm volatile("dsb st" ::: "memory")
 #define wc_wmb() wmb()

-
 /**
  * Read RDTSC register
  */
@@ -84,6 +74,4 @@ static inline void prefetch_range(void *addr, size_t len)
 		prefetch(cp);
 }

-
-
 #endif

src/utils/asm-ppc64.h

Lines changed: 0 additions & 9 deletions

@@ -37,15 +37,6 @@
 #include <stdint.h>
 #include <unistd.h>

-#define COPY_64B_NT(dst, src) \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++; \
-	*dst++ = *src++

 #define mb()  asm volatile("sync" ::: "memory")
 #define rmb() asm volatile("lwsync" ::: "memory")

src/utils/asm-x86.h

Lines changed: 131 additions & 15 deletions

@@ -43,21 +43,6 @@
 #define wmb()    asm volatile("" ::: "memory")
 #define wc_wmb() asm volatile("sfence" ::: "memory")

-#define COPY_64B_NT(dst, src) \
-	__asm__ __volatile__ ( \
-	" movdqa (%1),%%xmm0\n" \
-	" movdqa 16(%1),%%xmm1\n" \
-	" movdqa 32(%1),%%xmm2\n" \
-	" movdqa 48(%1),%%xmm3\n" \
-	" movntdq %%xmm0, (%0)\n" \
-	" movntdq %%xmm1, 16(%0)\n" \
-	" movntdq %%xmm2, 32(%0)\n" \
-	" movntdq %%xmm3, 48(%0)\n" \
-	: : "r" (dst), "r" (src) : "memory"); \
-	dst += 8; \
-	src += 8
-
-
 /**
  * Add to the atomic variable.
  * @param i integer value to add.
@@ -117,4 +102,135 @@ static inline void prefetch_range(void *addr, size_t len)
 		prefetch(cp);
 }

+enum {
+	CPU_FLAG_CMOV  = (1 << 0),
+	CPU_FLAG_MMX   = (1 << 1),
+	CPU_FLAG_MMX2  = (1 << 2),
+	CPU_FLAG_SSE   = (1 << 3),
+	CPU_FLAG_SSE2  = (1 << 4),
+	CPU_FLAG_SSE3  = (1 << 5),
+	CPU_FLAG_SSSE3 = (1 << 6),
+	CPU_FLAG_SSE41 = (1 << 7),
+	CPU_FLAG_SSE42 = (1 << 8),
+	CPU_FLAG_AVX   = (1 << 9),
+	CPU_FLAG_AVX2  = (1 << 10)
+};
+
+#define X86_CPUID_GET_MODEL      0x00000001u
+#define X86_CPUID_GET_BASE_VALUE 0x00000000u
+#define X86_CPUID_GET_EXTD_VALUE 0x00000007u
+#define X86_CPUID_GET_MAX_VALUE  0x80000000u
+
+VMA_ATTRIBUTE_OPTIMIZE_NONE
+static inline void __x86_cpuid(uint32_t level,
+		uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+	asm volatile ("cpuid\n\t"
+			: "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+			: "0" (level));
+}
+
+/* This allows the CPU detection to work with assemblers not supporting
+ * the xgetbv mnemonic.
+ */
+#define __x86_xgetbv(_index, _eax, _edx) \
+	asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index))
+
+/**
+ * Read CPU instruction set
+ */
+VMA_ATTRIBUTE_OPTIMIZE_NONE
+static inline int cpuid_flags()
+{
+	static int cpu_flag = -1;
+
+	if (cpu_flag < 0) {
+		uint32_t result = 0;
+		uint32_t base_value;
+		uint32_t _eax, _ebx, _ecx, _edx;
+
+		__x86_cpuid(X86_CPUID_GET_BASE_VALUE, &_eax, &_ebx, &_ecx, &_edx);
+		base_value = _eax;
+
+		if (base_value >= 1) {
+			__x86_cpuid(X86_CPUID_GET_MODEL, &_eax, &_ebx, &_ecx, &_edx);
+			if (_edx & (1 << 15)) {
+				result |= CPU_FLAG_CMOV;
+			}
+			if (_edx & (1 << 23)) {
+				result |= CPU_FLAG_MMX;
+			}
+			if (_edx & (1 << 25)) {
+				result |= CPU_FLAG_MMX2;
+			}
+			if (_edx & (1 << 25)) {
+				result |= CPU_FLAG_SSE;
+			}
+			if (_edx & (1 << 26)) {
+				result |= CPU_FLAG_SSE2;
+			}
+			if (_ecx & 1) {
+				result |= CPU_FLAG_SSE3;
+			}
+			if (_ecx & (1 << 9)) {
+				result |= CPU_FLAG_SSSE3;
+			}
+			if (_ecx & (1 << 19)) {
+				result |= CPU_FLAG_SSE41;
+			}
+			if (_ecx & (1 << 20)) {
+				result |= CPU_FLAG_SSE42;
+			}
+			if ((_ecx & 0x18000000) == 0x18000000) {
+				__x86_xgetbv(0, _eax, _edx);
+				if ((_eax & 0x6) == 0x6) {
+					result |= CPU_FLAG_AVX;
+				}
+			}
+		}
+		if (base_value >= 7) {
+			__x86_cpuid(X86_CPUID_GET_EXTD_VALUE, &_eax, &_ebx, &_ecx, &_edx);
+			if ((result & CPU_FLAG_AVX) && (_ebx & (1 << 5))) {
+				result |= CPU_FLAG_AVX2;
+			}
+		}
+		cpu_flag = result;
+	}
+
+	return cpu_flag;
+}
+
+#define __vma_memory_copy64(_dst, _src) \
+{ \
+	static int is_wc_simd = cpuid_flags() & \
+			(CPU_FLAG_SSE3 | CPU_FLAG_SSSE3 | \
+			 CPU_FLAG_SSE41 | CPU_FLAG_SSE42 | \
+			 CPU_FLAG_AVX | CPU_FLAG_AVX2); \
+\
+	if (is_wc_simd) { \
+		__asm__ __volatile__ ( \
+		" movdqa (%1), %%xmm0\n" \
+		" movdqa 16(%1), %%xmm1\n" \
+		" movdqa 32(%1), %%xmm2\n" \
+		" movdqa 48(%1), %%xmm3\n" \
+\
+		" movntdq %%xmm0, (%0)\n" \
+		" movntdq %%xmm1, 16(%0)\n" \
+		" movntdq %%xmm2, 32(%0)\n" \
+		" movntdq %%xmm3, 48(%0)\n" \
+		: : "r" (_dst), "r" (_src) : "memory"); \
+		_dst += 8; \
+		_src += 8; \
+	} else { \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+		*_dst++ = *_src++; \
+	} \
+}
+
 #endif
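For context, here is a minimal standalone sketch (not part of the commit) of the detection idea cpuid_flags() implements above: query CPUID leaf 1 and report the ECX feature bits that __vma_memory_copy64 tests before taking the non-temporal path. Note the patch keys CPU_FLAG_MMX2 off EDX bit 25, the same bit as SSE, presumably because the extended MMX instructions arrived together with SSE.

    #include <stdint.h>
    #include <stdio.h>

    /* Standalone sketch of the CPUID query used by cpuid_flags() above.
     * Clobbering EBX this way is only a problem on old 32-bit PIC builds. */
    static void cpuid(uint32_t level, uint32_t *a, uint32_t *b,
                      uint32_t *c, uint32_t *d)
    {
    	asm volatile("cpuid"
    	             : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
    	             : "0"(level));
    }

    int main(void)
    {
    	uint32_t eax, ebx, ecx, edx;

    	cpuid(0x00000001u, &eax, &ebx, &ecx, &edx);
    	/* Same ECX bits the patch checks for the SIMD copy path */
    	printf("SSE3:   %s\n", (ecx & (1u << 0))  ? "yes" : "no");
    	printf("SSSE3:  %s\n", (ecx & (1u << 9))  ? "yes" : "no");
    	printf("SSE4.1: %s\n", (ecx & (1u << 19)) ? "yes" : "no");
    	printf("SSE4.2: %s\n", (ecx & (1u << 20)) ? "yes" : "no");
    	return 0;
    }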

src/utils/asm.h

Lines changed: 16 additions & 0 deletions

@@ -34,6 +34,8 @@
 #ifndef ASM_H_
 #define ASM_H_

+#include "utils/compiler.h"
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
@@ -54,4 +56,18 @@ typedef atomic_int atomic_t;
 #error No architecture specific memory barrier definitions found!
 #endif

+#ifndef __vma_memory_copy64
+#define memory_copy64(dst, src) \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++; \
+	*dst++ = *src++
+#else
+#define memory_copy64 __vma_memory_copy64
+#endif /* __vma_memory_copy64 */
+
 #endif
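A hedged usage sketch (not from the commit; copy_blocks64 and its arguments are illustrative). memory_copy64 copies one 64-byte block and advances both pointers by eight uint64_t elements, so a caller loops once per block. On the x86 SIMD path both addresses must be 16-byte aligned, since movdqa and movntdq fault on unaligned operands; and because the x86 macro dynamically initializes a local static, it assumes a C++ translation unit, as in VMA itself.

    #include <stdint.h>
    #include "utils/asm.h"	/* memory_copy64; include path is illustrative */

    /* Copy nblocks 64-byte blocks; dst and src must be 16-byte aligned
     * in case the movdqa/movntdq path is selected at runtime. */
    static void copy_blocks64(uint64_t *dst, const uint64_t *src, int nblocks)
    {
    	while (nblocks--) {
    		memory_copy64(dst, src);	/* advances dst and src by 8 */
    	}
    }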

src/vma/dev/qp_mgr_eth_mlx5.cpp

Lines changed: 3 additions & 2 deletions

@@ -34,6 +34,7 @@
 #if defined(DEFINED_DIRECT_VERBS)

 #include <sys/mman.h>
+#include "utils/asm.h"
 #include "cq_mgr_mlx5.h"
 #include "vma/util/utils.h"
 #include "vlogger/vlogger.h"
@@ -322,11 +323,11 @@ inline void qp_mgr_eth_mlx5::ring_doorbell(uint64_t* wqe, int num_wqebb, int num
 	 * which do not guarantee order of copying.
 	 */
 	while (num_wqebb--) {
-		COPY_64B_NT(dst, src);
+		memory_copy64(dst, src);
 	}
 	src = (uint64_t*)m_sq_wqes;
 	while (num_wqebb_top--) {
-		COPY_64B_NT(dst, src);
+		memory_copy64(dst, src);
 	}
 	} else {
 		*dst = *src;