Skip to content

Commit d464291

Browse files
author
Marzieh Berenjkoub
committed
Fix minor issues found during multi-node testing and address the MSCCL failure
1 parent b327cf0 commit d464291

File tree

3 files changed

+15
-1
lines changed

3 files changed

+15
-1
lines changed

src/include/comm.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,10 @@ struct ncclComm {
737737
bool useNetPXN;
738738
bool useGdr;
739739
int splitCount;
740+
// gfx name from hipDeviceProp_t [RCCL]
741+
char* archName;
742+
// multiProcessorCount from hipDeviceProp_t [RCCL]
743+
int cuCount;
740744

741745
struct ncclDevrState devrState; // The symmetric runtime state
742746
struct ncclSymkState symkState; // The symmetric kernels state (built on previous)

src/include/msccl/msccl_kernel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// Builds the identifier of an MSCCL kernel entry point by token-pasting the
// four arguments: mscclKernel_<devredop>_<type>_<proto>_<fullOps>.
#define MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps) mscclKernel_##devredop##_##type##_##proto##_##fullOps
1010

1111
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, proto, fullOps) \
12-
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work);
12+
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work);
1313

1414
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
1515
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL, fullOps) \

src/misc/utils.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,16 @@ ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)
317317
return ncclSuccess;
318318
}
319319

320+
// Returns the system memory page size in bytes.
//
// The value is obtained from sysconf(_SC_PAGESIZE) on first use and cached in
// a function-local static. The cache is read and written with relaxed atomic
// builtins: concurrent first calls may each query sysconf, but the page size
// is presumably constant for the lifetime of the process, so every writer
// stores the same value.
//
// If sysconf reports a failure (non-positive result), a conventional 4096-byte
// fallback is returned and nothing is cached, so a later call retries the
// query.
size_t get_sc_page_size() {
  static size_t cached_page_size = 0;
  size_t ps = __atomic_load_n(&cached_page_size, __ATOMIC_RELAXED);
  if (ps == 0) {
    const long queried = sysconf(_SC_PAGESIZE);
    if (queried <= 0) {
      // Casting -1 to size_t would yield SIZE_MAX and break every alignment
      // computation built on the page size; use a sane default instead.
      return (size_t)4096;
    }
    ps = (size_t)queried;
    __atomic_store_n(&cached_page_size, ps, __ATOMIC_RELAXED);
  }
  return ps;
}
329+
320330
void get_aligned_ptr_and_size(const void *ptr, const size_t bufsize, void **aligned_ptr, size_t *aligned_size) {
321331
if (!aligned_ptr || !aligned_size) return;
322332
const size_t page_size = get_sc_page_size();

0 commit comments

Comments
 (0)