Skip to content

Commit dc59b78

Browse files
author
Marzieh Berenjkoub
committed
Fix minor issues found during testing
1 parent 93a340b commit dc59b78

File tree

5 files changed

+19
-14
lines changed

5 files changed

+19
-14
lines changed

src/ce_coll.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_
110110
return false;
111111
}
112112

113-
ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
113+
ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
114114
ncclResult_t ret = ncclSuccess;
115115

116116
uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
@@ -142,7 +142,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatc
142142
for (int r = 0; r < comm->nRanks; ++r) {
143143
if (r == comm->rank) continue;
144144
batchParams[*opIdx] = {};
145-
batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
145+
//batchParams[*opIdx].waitValue.operation = HIP_STREAM_MEM_OP_WAIT_VALUE_32;
146146
batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
147147
batchParams[*opIdx].waitValue.value = waitValue;
148148
batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
@@ -156,7 +156,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatc
156156
}
157157

158158
ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
159-
CUstreamBatchMemOpParams* batchParams,
159+
hipStreamBatchMemOpParams* batchParams,
160160
size_t* opIdx) {
161161
ncclResult_t ret = ncclSuccess;
162162

@@ -175,7 +175,7 @@ ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
175175
size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
176176
NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail);
177177
batchParams[*opIdx] = {};
178-
batchParams[*opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
178+
batchParams[*opIdx].writeValue.operation = hipStreamMemOpWriteValue32;
179179
batchParams[*opIdx].writeValue.address = (CUdeviceptr)peerDstPtr;
180180
batchParams[*opIdx].writeValue.value = waitValue;
181181
//batchParams[*opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
@@ -186,7 +186,7 @@ ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
186186
for (int r = 0; r < comm->nRanks; ++r) {
187187
if (r == comm->rank) continue;
188188
batchParams[*opIdx] = {};
189-
batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
189+
//batchParams[*opIdx].waitValue.operation = HIP_STREAM_MEM_OP_WAIT_VALUE_32;
190190
batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
191191
batchParams[*opIdx].waitValue.value = waitValue;
192192
batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
@@ -212,7 +212,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
212212
size_t opIdx = 0;
213213

214214
// Prepare batch memory operations for synchronization
215-
CUstreamBatchMemOpParams* batchParams = nullptr;
215+
hipStreamBatchMemOpParams* batchParams = nullptr;
216216
NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail);
217217

218218
if (comm->nvlsSupport) {
@@ -225,7 +225,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
225225
if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
226226
for (int i = 0; i < comm->nRanks; i++) {
227227
batchParams[opIdx] = {};
228-
batchParams[opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
228+
batchParams[opIdx].writeValue.operation = hipStreamMemOpWriteValue32;
229229
batchParams[opIdx].writeValue.address = (CUdeviceptr)(comm->ceColl.useCompletePtr ? (void*)&completePtrs[i] : (void*)&readyPtrs[i]);
230230
batchParams[opIdx].writeValue.value = 0;
231231
//batchParams[opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
@@ -234,7 +234,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
234234
}
235235

236236
// Execute all memory operations in a single batch
237-
CUDACHECKGOTO(cuStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);
237+
CUDACHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);
238238

239239
// Toggle the flag for next call
240240
comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;

src/dev_runtime.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
7070
CUmemAllocationProp memProp = {};
7171
memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
7272
memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
73-
memProp.requestedHandleTypes = ncclCuMemHandleType;
73+
memProp.requestedHandleType = ncclCuMemHandleType;
7474
memProp.location.id = comm->cudaDev;
7575
CUCHECKGOTO(cuMemGetAllocationGranularity(&devr->granularity, &memProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail_lsaRankList);
7676

@@ -745,7 +745,7 @@ ncclResult_t ncclDevrCommCreateInternal(
745745
CUmemAllocationProp memProp = {};
746746
memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
747747
memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
748-
memProp.requestedHandleTypes = ncclCuMemHandleType;
748+
memProp.requestedHandleType = ncclCuMemHandleType;
749749
memProp.location.id = comm->cudaDev;
750750

751751
CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail);

src/enqueue.cc

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2682,7 +2682,6 @@ static ncclResult_t p2pTaskAppend(
26822682
info->coll = coll;
26832683
// Set capturing graph. Called here so that profiler can emit a group API event with this information
26842684
NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
2685-
printf("p2pTaskAppend isGraphCaptured: %d\n", ncclCudaGraphValid(planner->capturingGraph));
26862685
bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph);
26872686
NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured));
26882687
NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
@@ -2759,7 +2758,6 @@ static ncclResult_t collTaskAppend(
27592758
// Set capturing graph. Called here so that profiler can emit a group API event with this information
27602759
NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
27612760
bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph);
2762-
printf("collTaskAppend isGraphCaptured: %d\n", isGraphCaptured);
27632761
NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured));
27642762
NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
27652763
NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured));
@@ -2790,7 +2788,6 @@ static ncclResult_t collTaskAppend(
27902788
ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
27912789

27922790
ncclProfilerStopCollApiEvent();
2793-
printf("ceCollTaskAppend enddddddddd: %d\n", ncclCudaGraphValid(planner->capturingGraph));
27942791
return ncclSuccess;
27952792
}
27962793

src/misc/shmutils.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void
119119
#if defined(HIP_HOST_UNCACHED_MEMORY)
120120
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped | hipExtHostRegisterUncached), ret, fail);
121121
#else
122-
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
123122
CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), ret, fail);
124123
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
125124
#endif

src/misc/utils.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,3 +316,12 @@ ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)
316316

317317
return ncclSuccess;
318318
}
319+
320+
// Computes a page-granular region that encloses the buffer [ptr, ptr + bufsize).
//
// Parameters:
//   ptr          - start address of the buffer; only its numeric value is used,
//                  it is never dereferenced here.
//   bufsize      - size of the buffer in bytes.
//   aligned_ptr  - out: receives ptr rounded down to a multiple of the page size.
//   aligned_size - out: receives the byte count from *aligned_ptr that covers the
//                  whole buffer, rounded up to a multiple of the page size.
//
// If either output pointer is null the function returns without writing anything.
// NOTE(review): the mask arithmetic below assumes get_sc_page_size() returns a
// power of two (presumably the system page size) -- confirm against its definition.
void get_aligned_ptr_and_size(const void *ptr, const size_t bufsize, void **aligned_ptr, size_t *aligned_size) {
321+
if (!aligned_ptr || !aligned_size) return;
322+
const size_t page_size = get_sc_page_size();
323+
// Clear the low bits of the address to round it down to a page_size boundary.
uintptr_t aligned_ptr_local = (uintptr_t)ptr & ~(page_size - 1);
324+
// Number of bytes between the rounded-down address and the original ptr.
size_t local_offset = (size_t)((uintptr_t)ptr - aligned_ptr_local);
325+
// Extend the size by that leading offset, then round up to a page_size multiple.
*aligned_size = (bufsize + local_offset + page_size - 1) & ~(page_size - 1);
326+
*aligned_ptr = (void *)aligned_ptr_local;
327+
}

0 commit comments

Comments
 (0)