Skip to content

Commit 2d93956

Browse files
author
xu.wang
committed
Fix: error when p2p and collectives are called in sequence
1 parent 8d26308 commit 2d93956

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

src/transport/p2p.cc

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -922,10 +922,12 @@ ncclResult_t ret = ncclSuccess;
922922
cudaStream_t hostStream, deviceStream;
923923
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
924924
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
925-
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
925+
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) {
926926
NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
927-
if (needUpdate)
928927
NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
928+
} else if (needUpdate) {
929+
NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail);
930+
}
929931
NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail);
930932
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);
931933
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail);

0 commit comments

Comments
 (0)