Skip to content
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ set(SRC_FILES
src/bootstrap.cc
src/channel.cc
src/collectives.cc
src/commDump.cc
src/debug.cc
src/enqueue.cc
src/group.cc
Expand Down
26 changes: 26 additions & 0 deletions src/commDump.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "nccl.h"
#include <cstring>
#include "comm.h"
#include "device.h"
#include "archinfo.h"

__attribute__ ((visibility("default")))
ncclResult_t ncclCommDump(
const ncclComm_t comm,
std::unordered_map<std::string, std::string>& map) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make const std::unordered_map<std::string, std::string>& map

Copy link
Contributor

@alex-breslow-amd alex-breslow-amd Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I may have missed it, but why not pass in ncclComm_t as a const reference rather than passing it by value?

Copy link
Contributor Author

@ahmd-k ahmd-k Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make const std::unordered_map<std::string, std::string>& map

In NCCLX and RCCLX, this map is not const because it is where we store some structured trace data that callers like PyTorch can use.

See the NCCLX API: https://github.com/meta-pytorch/torchcomms/blob/fe4e8116f2107b5aed0e38db10e072471ea95126/comms/ncclx/v2_27/meta/commDump.cc#L219

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I may have missed it, but why not pass in ncclComm_t as a const reference rather than passing it by value?

Good point, I was just following the NCCLX implementation, but I don't see why we copy the communicator here. @dmwu @YulunW any idea why the communicator is passed by value in ncclCommDump()?

// Best-effort dump: every missing piece is reported via WARN and the call
// still returns ncclSuccess, so a diagnostic request never fails the caller.
if (comm == nullptr) {
  WARN("ncclCommDump comm is null");
  return ncclSuccess;
}
// commFree() checks comm->proxyState before reading proxyTrace; guard it here
// as well so we never dereference a null proxyState.
if (comm->proxyState == nullptr) {
  WARN("ncclCommDump comm->proxyState is null");
  return ncclSuccess;
}
if (comm->proxyState->proxyTrace == nullptr) {
  WARN("ncclCommDump comm->proxyState->proxyTrace is null");
  return ncclSuccess;
}

// The trace is emitted through the logger only; `map` is left untouched.
WARN("ncclCommDump() ProxyTrace:");
WARN("%s", comm->proxyState->proxyTrace->dump().c_str());

return ncclSuccess;
}
14 changes: 7 additions & 7 deletions src/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
else {
return ncclSuccess;
}
#endif
#endif
}

static void initOnceFunc() {
Expand Down Expand Up @@ -439,7 +439,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->connectRecv);

if (rcclParamEnableProxyTrace()) {
WARN("ProxyTrace:");
WARN("commFree() ProxyTrace:");
if (comm->proxyState && comm->proxyState->proxyTrace){
WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
}
Expand Down Expand Up @@ -1454,7 +1454,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
}
// For single-node communicators that do not use the full xgmi links per gpu, i.e., nranks < 8
// Inflate the nChannels a bit to achieve higher b/w.
// Inflate the nChannels a bit to achieve higher b/w.
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
if (nranks == 2 && nNodes == 1){
allGather3Data[rank].nc = 16;
Expand Down Expand Up @@ -1817,8 +1817,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Compute time models for algorithm and protocol combinations
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);

INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);

if (comm->intraRank == 0) { // Load ncclParamLaunchMode
const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
Expand Down Expand Up @@ -2075,10 +2075,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm->cuCount = cuCount;

NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);

// Check if using host uncached mem correctly
NCCLCHECK(checkHostUncacheMemSetting(comm));

// RCCL: determine and set unroll factor for comm
NCCLCHECK(commSetUnrollFactor(comm));

Expand Down
12 changes: 12 additions & 0 deletions src/nccl.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -948,4 +948,16 @@ ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
} // end extern "C"
#endif

#ifdef __cplusplus
/* Defined only when the C++ ncclCommDump() declaration below is available;
 * presumably meant as a feature-test macro for callers -- TODO confirm. */
#define NCCL_COMM_DUMP

#include <unordered_map>
#include <string>
/* Dump NCCL's current internal state for a given communicator in a key-value store format.
 * Declared outside the extern "C" {} block because the signature uses a C++
 * template type (std::unordered_map).
 * `map` is taken by non-const reference so the implementation can write entries into it.
 * NOTE(review): the definition in src/commDump.cc currently only logs the proxy
 * trace via WARN and does not write to `map` -- confirm this is intended. */
ncclResult_t ncclCommDump(ncclComm_t comm, std::unordered_map<std::string, std::string>& map);
#else
/* NOTE(review): this diagnostic is emitted for every C translation unit that
 * includes this header -- verify that C consumers building with -Werror are unaffected. */
#warning "NCCL C++ API is disabled because C compiler is being used. Please use a C++ compiler to build NCCL."
#endif

#endif // end include guard
3 changes: 3 additions & 0 deletions src/transport/net.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1480,6 +1480,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
ncclProfilerStopProxyStepEvent(s, args, doneStepId);
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey,
facebook_rccl::ProxyCounterTypes::DONE, sub->done);
if (resources->shared == 0) {
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
*sendHead = sub->base + sub->done;
Expand Down Expand Up @@ -1822,6 +1824,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
int doneStepId = sub->done;
sub->done += args->sliceSteps;
ncclProfilerStopProxyStepEvent(s+i, args, doneStepId);
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey, facebook_rccl::ProxyCounterTypes::DONE, sub->done);
args->idle = 0;
if (sub->done == sub->nsteps) {
args->done++;
Expand Down