diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 85ea79ef7..4ac9c83d1 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -25,6 +25,7 @@ typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); +#include "net_v11.h" #include "net_v10.h" #include "net_v9.h" #include "net_v8.h" @@ -35,9 +36,10 @@ typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* p #include "net_v3.h" #include "net_v2.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; +typedef ncclNet_v11_t ncclNet_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; +typedef ncclNetPath_v11_t ncclNetPath_t; #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index d693101a3..8892a0221 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v11.h b/ext-net/example/nccl/net_v11.h new file mode 100644 index 000000000..e73257db5 --- /dev/null +++ b/ext-net/example/nccl/net_v11.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V11 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V11]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + uint64_t fabricId; // Fabric handle associated to the current device +} ncclNetProperties_v11_t; + +typedef enum { + NET_LOC_DCL0 = 0 /* same DC, hierarchy level 0*/, + NET_LOC_DCL1 = 1 /* different DC, hierarchy level 1 */, + NET_LOC_DISC = 2 /* disconnected*/ +} ncclNetLoc_v11_t; + +typedef struct { + ncclNetLoc_v11_t loc; +} ncclNetPath_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v11_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + + // topology API. getNetPath returns the path between two fabricIds. 
+ ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path); +} ncclNet_v11_t; + +#endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 97a29875d..9468d13c9 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -17,7 +17,6 @@ __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalE __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { // Below are default values, if unsure don't change. - props->name = "Example"; // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0 props->pciPath = NULL; @@ -48,6 +47,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { // maximum transfer sizes the plugin can handle props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES; + // all devs can connect to each other, fabric ID is 0 (any other value would work) + props->fabricId = 0; return ncclSuccess; } @@ -67,10 +68,12 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } +__hidden ncclResult_t pluginGetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { return ncclInternalError; } #define PLUGIN_NAME "Plugin" -const ncclNet_v10_t ncclNetPlugin_v10 = { + +const ncclNet_v11_t ncclNetPlugin_v11 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -91,6 +94,60 @@ const ncclNet_v10_t ncclNetPlugin_v10 = { .getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed, .makeVDevice = pluginMakeVDevice, + .getNetPath = pluginGetNetPath, +}; + +__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props_v10) { + ncclNetProperties_t props; + ncclResult_t ret = pluginGetProperties(dev, &props); + if (ret != ncclSuccess) return ret; + props_v10->name = props.name; + props_v10->pciPath = props.pciPath; + props_v10->guid = props.guid; + props_v10->ptrSupport = props.ptrSupport; + props_v10->regIsGlobal = props.regIsGlobal; + props_v10->forceFlush = props.forceFlush; + props_v10->speed = props.speed; + props_v10->port = props.port; + props_v10->latency = props.latency; // forward latency too, otherwise v10 callers read an unset field + props_v10->maxComms = props.maxComms; + props_v10->maxRecvs = props.maxRecvs; + props_v10->netDeviceType = props.netDeviceType; + props_v10->netDeviceVersion = props.netDeviceVersion; + props_v10->vProps.ndevs = props.vProps.ndevs; + memcpy(props_v10->vProps.devs, props.vProps.devs, sizeof(props.vProps.devs)); + props_v10->maxP2pBytes = props.maxP2pBytes; + props_v10->maxCollBytes = props.maxCollBytes; + return ncclSuccess; +} + +__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return pluginConnect(dev, (ncclNetCommConfig_t*)config, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; } + +const ncclNet_v10_t ncclNetPlugin_v10 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v10, + .listen = pluginListen, + .connect = pluginConnect_v10, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, +
.irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v10, }; __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 9e24faadf..ff9bc160d 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -479,8 +479,9 @@ static ncclResult_t getUDS(uint64_t* peerUDS) { return ncclSuccess; } #define MAX_OOB_DEVS 16 -static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { +static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, ncclNet_t** net, int* dev) { static int devOOB = -1; + static ncclNet_t* netOOB = NULL; if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { @@ -491,24 +492,28 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (searchNot) userIfEnv++; bool searchExact = userIfEnv && userIfEnv[0] == '='; if (searchExact) userIfEnv++; + int nUserIfs; struct netIf userIfs[MAX_OOB_DEVS]; - int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS); - // loop over the device and return the first one matching + NCCLCHECK(parseIfList(userIfEnv, userIfs, MAX_OOB_DEVS,&nUserIfs)); + // loop over all nets and all devices and return the first one matching int nDev = 0; - NCCLCHECK(comm->ncclNet->devices(&nDev)); - int devId = 0; - while (devId < nDev) { - ncclNetProperties_t props; - comm->ncclNet->getProperties(devId, &props); - // check against user specified HCAs/ports - if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) { - // All plain physical devices have been initialized at this point - devOOB = devId; - break; + for (int n = 0; n < comm->ncclNetCount && devOOB < 0; ++n) { // stop at the first match: the inner break only leaves the while loop + NCCLCHECK(comm->ncclNet[n]->devices(&nDev)); + int devId = 0; + while (devId < nDev) { + ncclNetProperties_t props; + comm->ncclNet[n]->getProperties(devId, &props); + // check against user specified HCAs/ports + if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) { + //
All plain physical devices have been initialized at this point + devOOB = devId; + netOOB = comm->ncclNet[n]; + break; + } + devId++; } - devId++; } - if (devOOB == -1) { + if (devOOB == -1 || !netOOB) { if (!searchNot) WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); else @@ -519,16 +524,18 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { } else { // default choice is device 0 devOOB = 0; + netOOB = comm->ncclNet[0]; } // display info on the chosen device ncclNetProperties_t props; - ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props); + ncclResult_t res = netOOB->getProperties(devOOB, &props); bool hasProp = res == ncclSuccess; INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1); } pthread_mutex_unlock(&bootstrapNetLock); } *dev = devOOB; + *net = netOOB; return ncclSuccess; } @@ -636,7 +643,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { state->nranks = nranks; state->cudaDev = comm->cudaDev; state->abortFlag = comm->abortFlag; - state->net = comm->ncclNet; + state->net = NULL; comm->bootstrap = state; comm->magic = state->magic = BOOTSTRAP_HANDLE(handles, 0)->magic; // state and comm magic set to the first magic ID @@ -651,7 +658,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_CREATE]); if (ncclParamBootstrapNetEnable()) { // Create net interface for other ranks to contact me (all gather) - NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev))); + NCCLCHECK(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev))); NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { @@ -789,7 +796,7 @@ 
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo state->nranks = nranks; state->cudaDev = comm->cudaDev; state->abortFlag = comm->abortFlag; - state->net = comm->ncclNet; + state->net = NULL; comm->bootstrap = state; comm->magic = state->magic = magic; @@ -798,7 +805,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo // create a handle for the others to reach out to me if (ncclParamBootstrapNetEnable()) { - NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail); + NCCLCHECKGOTO(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev)), ret, fail); NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 76b508c2d..a751d9639 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -65,10 +65,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs } } } - // Duplicate channels trees - struct ncclChannel* channel0 = comm->channels; - struct ncclChannel* channel1 = channel0+nChannels; - memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); // Get nvls heads and the number of heads. Duplicate head is not allowed. for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { @@ -90,22 +86,80 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs return ncclSuccess; } -static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { - int nChannels = comm->nChannels; - int nNodes = comm->nNodes; - for (int c=0; cnNodes; - int* send = ringSend+c*comm->nNodes; - int* prev = ringPrev+c*comm->nRanks; - int* next = ringNext+c*comm->nRanks; - for (int n=0; npeerInfo[0].nvmlDev + (ncclParamScatterXDc() ? 
channel : 0); + return mirrorBits(commId, pow2) % nNodes; +} + +static int getCrossNodeForRing(struct ncclComm* comm, struct ncclDcNode* dc, int c, int nChannels) { + return channelToNode(comm, dc->localNodes, c, nChannels); +} + + +#define NODE_FROM_DC(dc, i) (dc->localNodeToNode[((i) + dc->localNodes) % dc->localNodes]) +// connect the inter-node for nChannels rings. For each ring, ringPrev and ringNext store respectively the previous and the next rank in the ring for all the ranks. +// We close the rings inter-node using the search channels information: ringRecv and ringSend. +// For each search channel, ringRecv and ringSend contain the recv and send rank on each node. +static ncclResult_t connectRings(struct ncclComm* comm, int nChannels, struct ncclChannel* channels, int* ringPrev, int* ringNext, int nSearchChannels, int* ringRecv, int* ringSend) { + INFO(NCCL_GRAPH, "%s: comm 0x%lx connecting %d ring channels using %d search channels", __func__, comm->commHash, nChannels, nSearchChannels); + int nDc = comm->dcCount; + for (int c = 0; c < nChannels; c++) { + int* prev = ringPrev + c * comm->nRanks; + int* next = ringNext + c * comm->nRanks; + int* recv = ringRecv + (c % nSearchChannels) * comm->nNodes; + int* send = ringSend + (c % nSearchChannels) * comm->nNodes; + for (int dc = 0; dc < nDc; ++dc) { + struct ncclDcNode* dcNode = &comm->dcNode[dc]; + // we first connect all the nodes inside the same DC together + int nNodesInDc = dcNode->localNodes; + for (int iNode = 0; iNode < nNodesInDc; iNode++) { + int node = dcNode->localNodeToNode[iNode]; + // recv connects to the prev send + const int prevNode = dcNode->localNodeToNode[(iNode - 1 + nNodesInDc) % nNodesInDc]; + const int recvRank = recv[node]; + const int prevSendRank = send[prevNode]; + prev[recvRank] = prevSendRank; + // send connects to the next recv + const int nextNode = dcNode->localNodeToNode[(iNode + 1) % nNodesInDc]; + const int sendRank = send[node]; + const int nextRecvRank =
recv[nextNode]; + next[sendRank] = nextRecvRank; + } + } + // for each DC, we open the rings between node getCrossNodeForRing(dc,c) and getCrossNodeForRing(dc+1,c) + 1 + for (int dc = 0; dc < nDc; ++dc) { + struct ncclDcNode* currDc = &comm->dcNode[dc]; + struct ncclDcNode* nextDc = &comm->dcNode[(dc + 1) % nDc]; + struct ncclDcNode* prevDc = &comm->dcNode[(dc - 1 + nDc) % nDc]; + int crossNode = getCrossNodeForRing(comm, currDc, c, nChannels); + INFO(NCCL_GRAPH, "%s: crossNode on DC %d of ring[%d] is node %d", __func__, dc, c, crossNode); + // recv from the previous DC + int recvRank = recv[NODE_FROM_DC(currDc, crossNode + 1)]; + int prevSendRank = send[NODE_FROM_DC(prevDc, getCrossNodeForRing(comm, prevDc, c, nChannels))]; prev[recvRank] = prevSendRank; - int sendRank = send[n]; - int nextRecvRank = recv[(n+1)%nNodes]; + INFO(NCCL_GRAPH, "DC %d - ring %d: connecting rank %d (prev DC) -> rank %d", dc, c, prevSendRank, recvRank); + // send to the next recv + int sendRank = send[NODE_FROM_DC(currDc, crossNode)]; + int nextRecvRank = recv[NODE_FROM_DC(nextDc, getCrossNodeForRing(comm, nextDc, c, nChannels) + 1)]; next[sendRank] = nextRecvRank; + INFO(NCCL_GRAPH, "DC %d - ring %d: connecting rank %d -> rank %d (next DC)", dc, c, sendRank, nextRecvRank); } + channels[c].ring.prev = prev[comm->rank]; + channels[c].ring.next = next[comm->rank]; } return ncclSuccess; } @@ -133,41 +187,143 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { return ncclSuccess; } -static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { - const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; +static int rootNodeTree0(struct ncclComm* comm, struct ncclDcNode* dc, int tree0Id, int nTrees0) { + // Given a root for tree 0 (R0), the root of tree 1 will either be (R0+1)%nNodes (nNodes odd) or (R0+nNodes-1)%nNodes (nNodes even), see trees.cc + // Therefore, we 
distribute the roots of trees 0 on the even nodes only + int nNodes = dc->localNodes; + return 2 * channelToNode(comm, nNodes / 2, tree0Id, nTrees0); +} - // Compute tree depth. Not an exact value but a good approximation in most - // cases - int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); +static int shiftedNodeFromLocalNode(struct ncclComm* comm, struct ncclDcNode* dc, int localNode, int tree0Id, int nTrees0) { + int root = rootNodeTree0(comm, dc, tree0Id, nTrees0); + return (localNode - root + dc->localNodes) % dc->localNodes; +} - int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; - int* ttp, *ttc0, *ttc1; - NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); - for (int c=0; cchannels+c; - struct ncclChannel* channel1 = channel0+nChannels; - ttp = treeToParent+c*comm->nNodes; - ttc0 = treeToChild0+c*comm->nNodes; - ttc1 = treeToChild1+c*comm->nNodes; - if (comm->rank == ttp[node]) { - NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); - NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? 
ttc0 : ttc1, t1u)); - } - if (comm->rank == ttc0[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); - NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); - } - if (comm->rank == ttc1[node]) { - NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); - NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); - } - if (comm->rank == ttp[node] || - comm->rank == ttc0[node] || - comm->rank == ttc1[node]) { - INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); - INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); - } - channel0->tree.depth = channel1->tree.depth = depth; +static int globalNodeFromShiftedNode(struct ncclComm* comm, struct ncclDcNode* dc, int shiftedNode, int tree0Id, int nTrees0) { + if (shiftedNode == -1) return -1; + int root = rootNodeTree0(comm, dc, tree0Id, nTrees0); + int localNodeIndex = (shiftedNode + root) % dc->localNodes; + return NODE_FROM_DC(dc, localNodeIndex); +} + +static struct ncclDcNode* dcIdFromGlobalNode(struct ncclComm* comm, int node, int* dcId, int* localNode) { + *dcId = -1; + *localNode = -1; + for (int dc = 0; dc < comm->dcCount; ++dc) { + for (int n = 0; n < comm->dcNode[dc].localNodes; ++n) { + if (comm->dcNode[dc].localNodeToNode[n] == node) { // look up the requested node, not unconditionally comm->node + *dcId = dc; + *localNode = n; + break; + } + } + if (*dcId >= 0 && *localNode >= 0) break; + } + return &comm->dcNode[*dcId]; // NOTE(review): *dcId is still -1 here when node belongs to no DC; callers must guarantee membership +} + +// connect a total of nChannels trees (nChannels/2 primary trees and nChannels/2 dual trees) using the channels found in the search. Both trees will use the same search channel. +// For each search channel, treeToParent, treeToChild0, and treeToChild1, contain respectively the rank communicating with the parent, the child0, and the child 1 for each node.
+static ncclResult_t connectTrees(struct ncclComm* comm, const int nChannels, struct ncclChannel* channels, const int nSearchChannels, int* treeToParent, int* treeToChild0, + int* treeToChild1, int* treePatterns) { + int dcId = -1, localNode = -1; + const int nNodes = comm->nNodes, node = comm->node, nDc = comm->dcCount; + struct ncclDcNode* currDc = dcIdFromGlobalNode(comm, comm->node, &dcId, &localNode); + + // Compute tree depth. Not an exact value but a good approximation in most cases + int maxDepthDc = 0; + for (int d = 0; d < nDc; ++d) maxDepthDc = std::max((int)log2i(comm->dcNode[d].localNodes), maxDepthDc); + int depth = /*intraNode*/ (comm->nRanks / nNodes - 1) + /*intra-DC*/ maxDepthDc + /*inter-DC*/ (nDc - 1); + + int nTrees1 = nChannels / 2; + int nTrees0 = nTrees1 + (nChannels % 2); + INFO(NCCL_GRAPH, "%s: comm 0x%lx connecting %d tree channels (%d primal, %d dual) using %d search channels", __func__, comm->commHash, nChannels, nTrees0, nTrees1, nSearchChannels); + for (int c0 = 0; c0 < nTrees0; c0++) { + // primal and dual channels, if nChannels is odd, the last dual is not done + struct ncclChannel* channel0 = channels + c0; + struct ncclChannel* channel1 = (c0 < nTrees1) ? (channels + nTrees0 + c0) : NULL; + // dual channel (channel1) has to be the same as channel0. This could not be the case if the number of search channels is higher than the number of desired channels. + if (channel1) memcpy(&channel1->tree, &channel0->tree, sizeof(struct ncclTree)); + + int* ttp = treeToParent + (c0 % nSearchChannels) * comm->nNodes; + int* ttc0 = treeToChild0 + (c0 % nSearchChannels) * comm->nNodes; + int* ttc1 = treeToChild1 + (c0 % nSearchChannels) * comm->nNodes; + // intraDC tree: each primal/dual trees will get a different root. the shifted ID is the same for both the primal and the dual tree. 
+ int tt[6]; + int t0ChildType, t1ChildType; + int nodeShifted = shiftedNodeFromLocalNode(comm, currDc, localNode, c0, nTrees0); + NCCLCHECK(ncclGetDtree(comm->dcNode[dcId].localNodes, nodeShifted, tt + 0, tt + 1, tt + 2, &t0ChildType, tt + 3, tt + 4, tt + 5, &t1ChildType)); + // we need to restranslate the shifted local indexes into unshifted global index + int tu[2] = {globalNodeFromShiftedNode(comm, currDc, tt[0], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[3], c0, nTrees0)}; + int td0[2] = {globalNodeFromShiftedNode(comm, currDc, tt[1], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[4], c0, nTrees0)}; + int td1[2] = {globalNodeFromShiftedNode(comm, currDc, tt[2], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[5], c0, nTrees0)}; + if (comm->rank == ttp[node]) { + NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, tu[0])); + if (channel1) NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, tu[1])); + } + if (comm->rank == ttc0[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, td0[0])); + if (channel1) NCCLCHECK(setTreeDown(&channel1->tree, ttp, td0[1])); + } + if (comm->rank == ttc1[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, td1[0])); + if (channel1) NCCLCHECK(setTreeDown(&channel1->tree, ttp, td1[1])); + } + if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { + INFO(NCCL_GRAPH, "Tree %d : %d <-> %d <-> %d/%d/%d", c0, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); + if (channel1) + INFO(NCCL_GRAPH, "Tree %d : %d <-> %d <-> %d/%d/%d", c0 + nTrees0, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); + } + + /* chain DC roots together to create a single tree + Example with 3 DCs (R0, R1, and R2 represent the roots of each DC's tree; T0, T1, and T2 the rest of the corresponding tree) + R2 + / \ + R1 T2 + / \ + R0 T1 + \ + T0 + */ + const 
bool isNodeRoot[2] = {(tu[0] == -1), (tu[1] == -1)}; + if (isNodeRoot[0] || isNodeRoot[1]) { + int root0, root1; + // connect rank = ttp[root node of current DC] to the ttc0[root node of next DC] + if (comm->rank == ttp[node] && dcId < (nDc - 1)) { + NCCLCHECK(ncclGetDtreeRoots(comm->dcNode[dcId + 1].localNodes, &root0, &root1)); + int nodeR0 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId + 1], root0, c0, nTrees0); + int nodeR1 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId + 1], root1, c0, nTrees0); + if (isNodeRoot[0]) { + NCCLCHECK(setTreeUp(&channel0->tree, ttc0, nodeR0)); + INFO(NCCL_GRAPH, "%s: comm 0x%lx primal TREE %d/%d -> rank %d (node %d, DC %d) up to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees0, comm->rank, node, + dcId, ttc0[nodeR0], nodeR0, dcId + 1); + } + if (isNodeRoot[1] && channel1) { + NCCLCHECK(setTreeUp(&channel1->tree, ttc0, nodeR1)); + INFO(NCCL_GRAPH, "%s: comm 0x%lx dual TREE %d/%d -> rank %d (node %d, DC %d) up to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees1, comm->rank, node, + dcId, ttc0[nodeR1], nodeR1, dcId + 1); + } + } + // connect rank = ttc0[root node of current DC] to the ttp[root node of previous DC] + if (comm->rank == ttc0[node] && dcId > 0) { + // if I am the rank talking to the child 0, establish connection with the next DC root + NCCLCHECK(ncclGetDtreeRoots(comm->dcNode[dcId - 1].localNodes, &root0, &root1)); + int nodeR0 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId - 1], root0, c0, nTrees0); + int nodeR1 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId - 1], root1, c0, nTrees0); + if (isNodeRoot[0]) { + NCCLCHECK(setTreeDown(&channel0->tree, ttp, nodeR0)); + INFO(NCCL_GRAPH, "%s: comm 0x%lx primal TREE %d/%d -> rank %d (node %d, DC %d) down to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees0, comm->rank, node, + dcId, ttp[nodeR0], nodeR0, dcId - 1); + } + if (isNodeRoot[1] && channel1) { + NCCLCHECK(setTreeDown(&channel1->tree, ttp, 
nodeR1)); + INFO(NCCL_GRAPH, "%s: comm 0x%lx dual TREE %d/%d -> rank %d (node %d, DC %d) down to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees1, comm->rank, node, + dcId, ttp[nodeR1], nodeR1, dcId - 1); + } + } + } + channel0->tree.depth = depth; + if (channel1) channel1->tree.depth = depth; } return ncclSuccess; } @@ -356,8 +512,8 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev int nranks = comm->nRanks; int c; for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); } return c; @@ -377,24 +533,26 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL; int nranks = comm->nRanks; int nNodes = comm->nNodes; - int nChannels = comm->nChannels; + int nSearchChannels = comm->nChannels; + int maxChannels, minChannels; + int minHeadNum = INT_MAX; - int shared = parent && parent->nvlsSupport && parent->config.splitShare; - NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); - NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); + int shared = parent && parent->nvlsSupport && parent->config.splitShare; + NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks * MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks * MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&ringRecv, nNodes * nSearchChannels), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&ringSend, 
nNodes * nSearchChannels), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes * nSearchChannels), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes * nSearchChannels), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes * nSearchChannels), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes * MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nSearchChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings - for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); @@ -404,8 +562,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } } - for (int c=0; cringRecv[c]; ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c]; @@ -432,56 +590,47 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } } - // Connect rings and trees. This should also duplicate the channels. 
- NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail); - NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail); - - // Duplicate ringPrev/ringNext for ncclBuildRing - memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); - memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); - - // Set ring prev/next for my rank - for (int c=0; cchannels+c; - struct ncclChannel* channel1 = channel0+nChannels; - channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank]; - channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank]; - } - - // Duplication should be complete now - nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); + // each search channel gets two compute channels + comm->nChannels = std::min(MAXCHANNELS, nSearchChannels * 2); // Setup CollNet if (comm->collNetSupport == 1) { struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { - int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); - nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); + comm->nChannels = std::min(MAXCHANNELS, comm->nChannels + comm->nChannels / 2); } NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN - if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { - nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); + if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && comm->nChannels < 16) { + comm->nChannels = std::min(MAXCHANNELS, comm->nChannels * 2); 
} // Double the number of channels when using unpack networking (greater than 1 node) // We won't automatically double past 16 channels, users can specify 32 if they want - if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) { - nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); + if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && comm->nChannels < 16 && ncclParamUnpackDoubleNChannels()) { + comm->nChannels = std::min(MAXCHANNELS, comm->nChannels * 2); } - // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. - // We permit combining max, then min, to only use the first channels, then duplicate them. + // Honor NCCL_MIN/MAX_CTAS and NCCL_MIN/MAX_NCHANNELS + // child comm #channels cannot exceed top parent #channels. if (comm->sharedRes->owner != comm) { - /* child comm #channels cannot exceed top parent #channels. */ - nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels); - nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext); + minChannels = std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels); + maxChannels = std::min(std::min(ncclMaxNchannels(), comm->config.maxCTAs), comm->sharedRes->tpNChannels); } else { - nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs); - nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext); + minChannels = std::max(ncclMinNchannels(), comm->config.minCTAs); + maxChannels = std::min(ncclMaxNchannels(), comm->config.maxCTAs); + } + comm->nChannels = std::max(minChannels, std::min(comm->nChannels, maxChannels)); + if (comm->nChannels > 
nSearchChannels) comm->nChannels = copyChannels(comm, nSearchChannels, comm->nChannels, ringPrev, ringNext); + NCCLCHECKGOTO(connectRings(comm, comm->nChannels, comm->channels, ringPrev, ringNext, nSearchChannels, ringRecv, ringSend), ret, fail); + NCCLCHECKGOTO(connectTrees(comm, comm->nChannels, comm->channels, nSearchChannels, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail); + + // We permit combining max, then min, to only use the first max channels, then duplicate them. + if (maxChannels < minChannels) { + comm->nChannels = copyChannels(comm, maxChannels, minChannels, ringPrev, ringNext); } comm->collChannels = comm->nChannels; @@ -491,17 +640,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa comm->nvlsChannels = parent->nvlsResources->nChannels; } if (comm->nChannels < comm->nvlsChannels) { - nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); + comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); } NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail); #endif if (shared && comm->nChannels > parent->sharedRes->tpNChannels) { - nChannels = comm->nChannels = parent->sharedRes->tpNChannels; + comm->nChannels = parent->sharedRes->tpNChannels; comm->collChannels = std::min(comm->collChannels, comm->nChannels); } // Create rings array and check all is fine - NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail); + NCCLCHECKGOTO(ncclBuildRings(comm->nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail); exit: if (ringRecv) free(ringRecv); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index ace4476f6..b9b2804df 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -473,10 +473,10 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // 
Determine whether we need to flush the GDR recv buffers -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) { +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netIdx, int netDev, int rank, int* flush) { *flush = 1; ncclNetProperties_t props; - NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); + NCCLCHECK(comm->ncclNet[netIdx]->getProperties(netDev, &props)); if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; int g; struct ncclTopoSystem* system = comm->topo; @@ -530,6 +530,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank } ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) { + if (intermediateRank) *intermediateRank = -1; + if (netId == -1 || !intermediateRank) return ncclSuccess; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); @@ -561,12 +563,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && comm->ncclNetVer == 4) { - INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); - pxnDisable = 1; - } else { - pxnDisable = ncclParamPxnDisable(); - } + pxnDisable = ncclParamPxnDisable(); } return pxnDisable; } @@ -582,7 +579,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, for (int rank=0; ranknRanks; rank++) { int64_t netId; int proxyRank; - NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); + NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); diff --git a/src/graph/search.cc b/src/graph/search.cc index 15a01243f..a14b9e048 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -438,6 +438,7 @@ 
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop } // Build a sorted list of the NETs to try. +// The NETs returned are compatible with at least one element in the list of netDevs. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu // index when trying to get back to the NIC. @@ -446,7 +447,8 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. -ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { +static ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpuId, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet, int netDevCount, + struct ncclNetDev* netDevs) { ncclResult_t ret = ncclSuccess; int netCount = 0; int localNetCount; @@ -454,12 +456,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; + if (gpuId != -1 && gpuId != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL, /*pathType=*/NULL, netDevCount, netDevs)); + if (netId == -1) { + char msg[256]; + for (int i = 0; i < netDevCount; ++i) snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "%s %d:%lu", (i == 0) ? 
"" : ",", netDevs[i].netIdx, netDevs[i].fabricId); + WARN("Could not find any local path from gpu %d to net with%s", gpu->gpu.rank, msg); + return ncclInternalError; + } + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; @@ -476,12 +485,20 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; + if (gpuId != -1 && gpuId != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && nnodes[NET].nodes[n]; + // add if the type is right and the netNode is in the list of requested device + int listId = 0; + for (listId = 0; listId < netDevCount; listId++) { + ncclNetPath_t path = {.loc = NET_LOC_DISC}; + if (node->net.netIdx == netDevs[listId].netIdx) NCCLCHECK(node->net.getNetPath(node->net.fabricId, netDevs[listId].fabricId, &path)); + if (path.loc < NET_LOC_DISC) break; + } + if (paths[n].type == t && (netDevCount == 0 || listId < netDevCount)) localNets[localNetCount++] = n; } // Append NICs to list for (int i=0; iinter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; + // When coming back to the NIC, we must select a NIC compatible with the startNet. + // We compare two NICs on the same node, so they are either fast connected, or disconnected. A simple == check is enough. + // A more thorough check would require access to the comm's ncclNet array, which we don't have here. 
int netCount; - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); + struct ncclNetDev netDev = {.netIdx = startNet->net.netIdx, .fabricId = startNet->net.fabricId}; + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount, 1, &netDev)); for (int i=0; inodes[NET].nodes+n; - if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric + if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // enter and exit through the same NIC if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue; } else { @@ -597,12 +618,13 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } -ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { +static ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int bw = graph->bwInter; int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); + // when getting the first net dev, there is no restriction on the (netId, fabricId) + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount, /*netDevCount=*/0, /*netDevs=*/NULL)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -809,7 +831,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); 
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); - if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; + NCCLCHECK(xmlGetAttrFloatDefault(xmlGraph, "latencyinter", &graph->latencyInter, 0.0)); const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); @@ -1178,7 +1200,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru #include "comm.h" // NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head -ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) { +static ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) { ncclResult_t ret = ncclSuccess; int localRanks = comm->topo->nodes[GPU].count; int netNum = 0; @@ -1192,42 +1214,93 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i if (netNum) { *netId = net[channelId % netNum]; } else { + // in case of error, it means that the current rank is not an NVLS head + // the caller is responsible to determine if it's an error or not ret = ncclInternalError; - goto fail; } - -exit: return ret; -fail: - WARN("Could not find NIC for rank %d in NVLS graph", comm->rank); - goto exit; } // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); -ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank) { +static ncclResult_t ncclTopoPrintNetDev(int netDevCount, struct ncclNetDev* netDevs, char* msg, size_t len) { + snprintf(msg, len, "netDevs:"); + for (int i = 0; i < netDevCount; ++i) { + snprintf(msg + strlen(msg), len - strlen(msg), "%s %d:%lu", (i == 0) ? 
"" : ",", netDevs[i].netIdx, netDevs[i].fabricId); + } + return ncclSuccess; +} + +static ncclResult_t ncclTopoTryConnectToPeer(struct ncclComm* comm, int64_t netId, int peerRank, bool* successful) { + int netIndex; + int peerNetDevCount = comm->peerInfo[peerRank].netDevCount; + struct ncclNetDev* peerNetDevs = comm->peerInfo[peerRank].netDevs; + struct ncclTopoNode* netNode; + *successful = false; + if (netId == -1) goto exit; + + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &netIndex)); + netNode = comm->topo->nodes[NET].nodes + netIndex; + for (int i = 0; i < peerNetDevCount; ++i) { + ncclNetPath_t path = {.loc = NET_LOC_DISC}; + if (netNode->net.netIdx == peerNetDevs[i].netIdx) + NCCLCHECK(comm->ncclNet[netNode->net.netIdx]->getNetPath(netNode->net.fabricId, peerNetDevs[i].fabricId, &path)); + if (path.loc < NET_LOC_DISC) { + *successful = true; + goto exit; + } + } +exit: + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNetDevFromGraph(struct ncclComm* comm, int rank, int peerRank, struct ncclTopoGraph* graph, int channelId, int64_t* netId, int* netDev, int* netPathType) { + int64_t localNetId; + int channel = channelId % graph->nChannels; + int ngpus = comm->topo->nodes[GPU].count; + int index = graph->intra[channel * ngpus] == rank ? 0 : 1; + if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) { + localNetId = graph->inter[channel * 2 + index]; + } else { + // failing here means that the current rank is not an NVLS head + // the caller is responsible to determine if it's an error or not, we pass the error along + NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &localNetId)); + } + // verify if we can connect. 
if we cannot connect, search a new net and update the netdev info + bool canConnectToPeer = false; + NCCLCHECK(ncclTopoTryConnectToPeer(comm, localNetId, peerRank, &canConnectToPeer)); + if (canConnectToPeer && netDev) { + NCCLCHECK(ncclTopoIdToNetDev(comm->topo, localNetId, netDev)); + } else { + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &localNetId, netDev, netPathType, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs)); + } + if(netId) *netId = localNetId; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int pxnRank, int64_t* id, int* dev, int* proxyRank) { + ncclResult_t res = ncclSuccess; int64_t netId = -1; int netDev = -1; + + if (peerRank == -1) return ncclInternalError; + if (comm->topo->nodes[NET].count == 0) { + if (dev) *dev = -1; + if (id) *id = -1; + goto exit; + } + if (graph) { - // Honor the net device in the graph - int channel = channelId%graph->nChannels; - int ngpus = comm->topo->nodes[GPU].count; - int index = graph->intra[channel*ngpus] == rank ? 0 : 1; - if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) { - netId = graph->inter[channel*2+index]; - } else { - NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &netId)); - } - NCCLCHECK(ncclTopoIdToNetDev(comm->topo, netId, &netDev)); + NCCLCHECK(ncclTopoGetNetDevFromGraph(comm, rank, peerRank, graph, channelId, &netId, &netDev, /*netPathType=*/NULL)); + NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank)); + if(netId == -1 || netDev == -1) goto fail; if (dev) *dev = netDev; if (id) *id = netId; - NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank)); - } else if (peerRank == -1) { - return ncclInternalError; } else { - // Start with our local NIC and local Rank - NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev)); + // Start with our local NIC and local rank (NIC must connect to the peer). 
Not finding the net it is an error. + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev, /*pathType=*/NULL, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs)); + if (netId == -1) goto fail; if (dev) *dev = netDev; if (id) *id = netId; *proxyRank = rank; @@ -1235,11 +1308,13 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); // See whether we can use the remote rank preferred device. if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { - // Find local NIC number close to local nvmlDev - int nvmlDev = comm->peerInfo[peerRank].nvmlDev; + if(pxnRank == -1) return ncclInternalError; + // Find local NIC number close to local nvmlDev, not finding it, we stop here + int nvmlDev = comm->peerInfo[pxnRank].nvmlDev; int localRank; if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess; - NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev)); + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev, /*pathType=*/NULL, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs)); + if (netId == -1) return ncclSuccess; // Check that device exists on our node if (ncclParamCrossNic() == 0) { @@ -1273,6 +1348,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG } } } + if (netId == -1) goto fail; } - return ncclSuccess; +exit: + return res; +fail: + char msg[256]; + NCCLCHECK(ncclTopoPrintNetDev(comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs, msg, sizeof(msg))); + WARN("Could not find any netDev to communicate from rank %d to peer %d (peer netDev list = %s)", rank, peerRank, msg); + res = ncclInternalError; + goto exit; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9499f396d..a0cad332b 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -12,6 +12,8 @@ #include 
"net.h" #include "coll_net.h" #include "transport.h" +#include +#include #include #include #include "xml.h" @@ -343,6 +345,17 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* return ncclSuccess; } +ncclResult_t ncclTopoIdToNetIdx(struct ncclComm* comm, int64_t topoId, int* netIdx) { + int netTopoIndex; + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, topoId, &netTopoIndex)); + *netIdx = comm->topo->nodes[NET].nodes[netTopoIndex].net.netIdx; + if (*netIdx == -1) { + WARN("the chosen device has a network index of %d", *netIdx); + return ncclInternalError; + } + return ncclSuccess; +} + // We want the graph to be organized to ease/accelerate traversal : // 1. NVLinks (already the case) // 2. PCI down @@ -354,28 +367,33 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { } ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) { - int dev; + const char* str = NULL; + int dev, netIdx; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "netIdx", &netIdx, 0)); struct ncclTopoNode* net; - NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev))); + NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID_NIC(systemId, netIdx, dev))); + net->net.netIdx = netIdx; net->net.dev = dev; - const char* str; NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); if (str) sscanf(str, "0x%lx", &net->net.asic); else net->net.asic = dev; - ncclDebugNoWarn = NCCL_GRAPH; int mbps; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; - if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; + NCCLCHECK(xmlGetAttrFloatDefault(xmlNet, "latency", &net->net.latency,0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", 
&net->net.gdrSupport, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); - ncclDebugNoWarn = 0; + // get the fabricId if it exists. If not, we have bypassed ncclTopoNet and we should not use the netIdx value + NCCLCHECK(xmlGetAttr(xmlNet,"fabricId", &str)); + if(str) net->net.fabricId = strtoull(str, NULL, 0); + else net->net.netIdx = -1; + net->net.getNetPath = NULL; // updated as part of the PostProcessNet function NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); @@ -437,8 +455,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. struct ncclTopoNode* nicNode = NULL; - int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId); - int64_t id = NCCL_TOPO_ID(systemId, localNicId); + int64_t id = NCCL_TOPO_ID(systemId, busId); NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id)); @@ -533,8 +550,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; - int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0); - int64_t id = NCCL_TOPO_ID(systemId, localNicId); + int64_t id = NCCL_TOPO_ID(systemId, 0); NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id)); @@ -800,7 +816,7 @@ typedef struct xmlNodeStack { } xmlNodeStack; // 1. 
Find the common parent xmlNode between the given set of nodes -ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { +ncclResult_t ncclgetNetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { // Track a stack of parents per-net node being merged xmlNodeStack* parents; NCCLCHECK(ncclCalloc(&parents, nNodes)); @@ -990,8 +1006,9 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char char* semi = strtok_r(ncStr, ";", &semi_token); while (semi) { TRACE(NCCL_NET, "Fusing %s", semi); + int nUserIfs; struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; - int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); + NCCLCHECK(parseIfList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC, &nUserIfs)); if (nUserIfs == 0) { INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.", ncStr, semi); @@ -1053,7 +1070,7 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe nodes[0] = physNetNodes[i]; nodes[1] = physNetNodes[j]; struct ncclXmlNode* parent; - NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out); + NCCLCHECKGOTO(ncclgetNetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out); } } @@ -1135,7 +1152,7 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper } int path = PATH_LOC; - NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent)); + NCCLCHECK(ncclgetNetPath(physNetNodes, vProps->ndevs, &path, parent)); if (path == PATH_LOC) { *parent = NULL; } else if (parent && strcmp((*parent)->name, "pci") == 0) { @@ -1187,12 +1204,39 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ return res; } -static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, 
ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) { +static ncclResult_t ncclTopoPrintInfoNic(struct ncclXmlNode* netNode, const char* pciPath, const char* funcName) { + if (ncclDebugLevel <= NCCL_LOG_WARN) return ncclSuccess; + + char msg[8 * MAX_STR_LEN]; + snprintf(msg, sizeof(msg), "%s: filled", funcName); + const char* attr; + NCCLCHECK(xmlGetAttr(netNode, "name", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " %s in topo:", attr); + NCCLCHECK(xmlGetAttr(netNode, "type", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " type=%s", attr); + NCCLCHECK(xmlGetAttr(netNode, "netIdx", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", netIdx=%s", attr ? attr : "-1"); + NCCLCHECK(xmlGetAttr(netNode, "dev", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", dev=%s", attr); + NCCLCHECK(xmlGetAttr(netNode, "keep", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", keep=%s", attr); + NCCLCHECK(xmlGetAttr(netNode, "coll", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", coll=%s", attr ? 
attr : "0"); + NCCLCHECK(xmlGetAttr(netNode, "gdr", &attr)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", gdr=%s", attr); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", pci=%s", pciPath); + + INFO(NCCL_GRAPH, "%s", msg); + return ncclSuccess; +} + +static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, int coll, int keep, int virtualNics, int netIdx, const char* netName, + ncclResult_t (*getProperties)(int, ncclNetProperties_t*), + ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* loc), bool netDmaBufSupport, bool dumpXml) { for (int n = startIndex; n < endIndex; n++) { ncclNetProperties_t props; NCCLCHECK(getProperties(n, &props)); - struct ncclXmlNode* netNode = NULL; - struct ncclXmlNode* parent = NULL; + struct ncclXmlNode *netNode = NULL, *parent = NULL; if (virtualNics) { struct ncclXmlNode* net = NULL; NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name)); @@ -1200,88 +1244,133 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star // Only run this if the net doesn't exist locally - this may alter the XML state if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent)); } - NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent)); - const char* colAttr; + // If coll == 0 but the netNode is already tagged as coll, don't overwrite the keep value + const char* colAttr = NULL; NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); + if (colAttr == NULL || coll != 0 || strcmp(colAttr, "1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); - // If coll == 0 but the netNode is tagged as coll, don't update the keep value - if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); - NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); + if (coll) { + // there is a single collNet, so no conflict. 
update the dev index everytime + // need to overwrite the dev index if it was a physical and it becomes a virtual + NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); + NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + } else { + int netIdxAttr = -1; + const char* typeAttr = NULL; + NCCLCHECK(xmlGetAttr(netNode, "type", &typeAttr)); + NCCLCHECK(xmlGetAttrIntDefault(netNode, "netIdx", &netIdxAttr, -1)); + // the devices belongs to this network if: (1) netIdx is set to the current index, + // or (2) the device has a netType and the type equals the name of the current network + if (netIdxAttr == netIdx || (netIdxAttr == -1 && (!typeAttr || (typeAttr && strcmp(typeAttr, netName))))) { + // need to overwrite the dev index: if it was a physical device and it becomes a virtual device, the index is changed + NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); + NCCLCHECK(xmlSetAttr(netNode, "type", netName)); + // no need to dump the netIdx and other net-related info, they are local to the comm + if (!dumpXml) NCCLCHECK(xmlSetAttrInt(netNode, "netIdx", netIdx)); + } + } NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netDmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); - // Only set coll if it's not 0 - if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + NCCLCHECK(xmlInitAttrUint64(netNode, "fabricId", props.fabricId)); - const char* keepAttr; - NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); - NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr)); - INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s", - props.name, props.pciPath, keepAttr, colAttr); + NCCLCHECK(ncclTopoPrintInfoNic(netNode, props.pciPath, __func__)); } return ncclSuccess; } struct ncclTopoNetState { + int netPluginIdx; // -1 if uninitialized, shared by all communicators int nVirtualNics; int nPhysicalNics; - const char* name; + const char* name; // name of the net Plugin + ncclResult_t (*getProperties)(int, ncclNetProperties_t*); + ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*); + ncclResult_t (*devices)(int*); + ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* loc); }; // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. 
-static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) { - int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); - if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); - // Enumerate physical devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0)); - if (!usePhysicalDevices) { +static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, bool dumpXml, ncclTopoNetState* state, const int netCommIdx) { + bool dmaBufSupport = comm->dmaBufSupport[netCommIdx]; + int usePhysicalDevices = (dumpXml || state->makeVDevice == NULL); + if (state->nPhysicalNics == -1) NCCLCHECK(state->devices(&state->nPhysicalNics)); + if (usePhysicalDevices) { + // Enumerate physical devices, always keep them + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, coll, /*keep=*/1, /*virtual=*/0, netCommIdx, state->name, state->getProperties, state->getNetPath, + dmaBufSupport, dumpXml)); + } else { + // TopoMakeVNics creates all the virtual NICs needed (including for a single physical device). 
+ // Physical NICs need to be discovered first but will not be kept + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, coll, /*keep=*/0, /*virtual=*/0, netCommIdx, state->name, state->getProperties, state->getNetPath, + dmaBufSupport, dumpXml)); if (state->nVirtualNics == -1) { - NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics)); + NCCLCHECK(ncclTopoMakeVNics(comm, xml, state->makeVDevice, state->getProperties, state->nPhysicalNics)); int nDevs; - NCCLCHECK(devices(&nDevs)); + NCCLCHECK(state->devices(&nDevs)); state->nVirtualNics = nDevs - state->nPhysicalNics; } - // Remove keep=1 for physical collnets - if (state->nVirtualNics > 0) { - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0)); - // Populate new devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1)); - } + // all NICs needed by the topology are now virtual, fill the information and set them as "keep=1" + NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics + state->nVirtualNics, coll, /*keep=*/1, /*virtual=*/1, netCommIdx, state->name, + state->getProperties, state->getNetPath, dmaBufSupport, dumpXml)); } + return ncclSuccess; +} +// Bind network function and resources to the net topo nodes. 
+static ncclResult_t ncclTopoPostProcessNets(ncclCollNet_t* collNet, ncclNet_t* nets[], struct ncclTopoSystem* topoSystem) { + for(int n=0 ; nnodes[NET].count; ++n){ + struct ncclTopoNode* node = &topoSystem->nodes[NET].nodes[n]; + if(node->net.collSupport) node->net.getNetPath = collNet->getNetPath; + else node->net.getNetPath = nets[node->net.netIdx]->getNetPath; + } return ncclSuccess; } static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) { - INFO(NCCL_GRAPH, "Retrieving state for %s", name); - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) { - // Empty slot - if (states[i].name == NULL) { - states[i].nVirtualNics = -1; - states[i].nPhysicalNics = -1; - states[i].name = strdup(name); - *state = states + i; - INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name); - return ncclSuccess; - // Found my slot - } else if (strcmp(states[i].name, name) == 0) { - *state = states + i; - return ncclSuccess; - } +static ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; +static ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; + +static void ncclTopoSharedStateInitOnce() { + // make sure that initialized is 0 everywhere a the first call + for (int n = 0; n < NCCL_NET_MAX_PLUGINS; ++n) { + netStates[n].netPluginIdx = -1; + collNetStates[n].netPluginIdx = -1; } - WARN("NET/TOPO : Couldn't find net with name %s", name); - return ncclInternalError; +} + +// must be called within a lock to avoid conflict when initializing the states +static ncclResult_t ncclTopoInitSharedState(ncclTopoNetState* states, const int netPluginIdx, const char* name, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), + ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), + ncclResult_t (*getNetPath)(uint64_t 
fabricId0, uint64_t fabricId1, ncclNetPath_t* path), + ncclTopoNetState** state) { + // init the array of states only once + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, ncclTopoSharedStateInitOnce); + // get the state from the netPluginIdx + *state = states + netPluginIdx; + // setup the state if not already set by others + TRACE(NCCL_GRAPH, "Retrieving state for %s", name); + // Empty slot + if ((*state)->netPluginIdx == -1) { + (*state)->name = name; + (*state)->nVirtualNics = -1; + (*state)->nPhysicalNics = -1; + (*state)->getProperties = getProperties; + (*state)->makeVDevice = makeVDevice; + (*state)->devices = devices; + (*state)->getNetPath = getNetPath; + (*state)->netPluginIdx = netPluginIdx; + INFO(NCCL_GRAPH, "Initialized state %d for %s", netPluginIdx, name); + } + return ncclSuccess; } ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) { @@ -1321,21 +1410,23 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail); } - // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, - // so we start with collnet so that it has precedence. + // Auto-detect NICs if needed pthread_mutex_lock(&netLock); netLockHeld = 1; INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); - ncclTopoNetState* state; - state = NULL; + // net/collnet share the same xml/graph nodes, so we start with collnet so that it has precedence. 
if (collNetSupport(comm)) { - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state, - comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail); + ncclTopoNetState* state = NULL; + ncclCollNet_t* net = comm->ncclCollNet; + NCCLCHECKGOTO(ncclTopoInitSharedState(collNetStates, comm->ncclCollNetPluginIdx, net->name, net->getProperties, net->makeVDevice, net->devices, net->getNetPath, &state), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile != NULL, state, /*netCommIdx=*/0), ret, fail); + } + for (int n = 0; n < comm->ncclNetCount; ++n) { + ncclTopoNetState* state = NULL; + ncclNet_t* net = comm->ncclNet[n]; + NCCLCHECKGOTO(ncclTopoInitSharedState(netStates, comm->ncclNetPluginIdx[n], net->name, net->getProperties, net->makeVDevice, net->devices, net->getNetPath, &state), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile != NULL, state, /*netCommIdx=*/n), ret, fail); } - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail); pthread_mutex_unlock(&netLock); netLockHeld = 0; @@ -1387,7 +1478,10 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } // Only update our topo tracking structure if we aren't dumping (separate steps) - if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + if (dumpXmlFile == NULL){ + NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + NCCLCHECKGOTO(ncclTopoPostProcessNets(comm->ncclCollNet, comm->ncclNet, *system), ret, fail); + } exit: if 
(!comm->MNNVL && localRanks) free(localRanks); @@ -1399,14 +1493,22 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy goto exit; } -static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, - int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { +static ncclResult_t ncclTopoGetAllLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType, + int nExcl, int64_t* exclIds) { int minType = PATH_DIS; float maxBw = 0; int count = 0; struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { + // Exclude the node if in the list of excluded topo ids. Go over the list, if found, skip the node. + // If unfound (idExcl == nExcl) or the list is empty (nExcl ==0), test if we should add the node. + int idExcl = 0; + for (idExcl = 0; idExcl < nExcl; ++idExcl) { + if (system->nodes[resultType].nodes[i].id == exclIds[idExcl]) break; + } + if (nExcl > 0 && idExcl < nExcl) continue; + if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) { maxBw = paths[i].bw; minType = paths[i].type; @@ -1415,9 +1517,9 @@ static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, in } if (paths[i].bw == maxBw && paths[i].type == minType) { if (count == NCCL_TOPO_MAX_NODES) { - WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal." + WARN("Error : ran out of room to store found nodes in %s." 
" Filled %d of type %d, starting from index %d of type %d.", - NCCL_TOPO_MAX_NODES, resultType, index, type); + __func__, NCCL_TOPO_MAX_NODES, resultType, index, type); return ncclInternalError; } locals[count++] = i; @@ -1427,6 +1529,56 @@ static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, in return ncclSuccess; } +union ncclTopoGetLocalRestriction { + struct ncclNetDev net; +}; + +static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType, + int nRestrictions, union ncclTopoGetLocalRestriction* restrictions) { + if (resultType != NET) { + NCCLCHECK(ncclTopoGetAllLocal(system, type, index, resultType, locals, localCount, pathType, 0, NULL)); + } else { + int netCount = 0, nExcl = 0; + int64_t exclList[NCCL_TOPO_MAX_NODES]; + while (netCount == 0) { + NCCLCHECK(ncclTopoGetAllLocal(system, type, index, NET, locals, localCount, pathType, nExcl, exclList)); + if (*localCount == 0) break; + + int minNetIdx = NCCL_NET_MAX_PLUGINS + 1; + for (int i = 0; i < *localCount; ++i) { + struct ncclTopoNode* node = system->nodes[NET].nodes + locals[i]; + int netIdx = node->net.netIdx; + uint64_t fabricId = node->net.fabricId; + ncclResult_t (*getNetPath)(uint64_t, uint64_t, ncclNetPath_t*) = node->net.getNetPath; + + // apply restrictions: if not in the list of requirements, add to the list of excluded topo id for next call + int c = 0; + for (c = 0; c < nRestrictions; ++c) { + struct ncclNetDev* crit = &(restrictions[c].net); + ncclNetPath_t path = {.loc = NET_LOC_DISC}; + if (netIdx == crit->netIdx) NCCLCHECK(getNetPath(fabricId, crit->fabricId, &path)); + if (path.loc < NET_LOC_DISC) break; + } + if (nRestrictions > 0 && c == nRestrictions) { + exclList[nExcl++] = node->id; + continue; + } + + // return only the device with the lowest netIdx (that acts as a proxy for priority). 
+ if (netIdx < minNetIdx) { + minNetIdx = netIdx; + netCount = 0; + } + if (netIdx == minNetIdx) { + locals[netCount++] = locals[i]; + } + } + } + *localCount = netCount; + } + return ncclSuccess; +} + ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; int localNets[NCCL_TOPO_MAX_NODES]; @@ -1441,7 +1593,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } } - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL, 0, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } @@ -1450,21 +1602,21 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev, int* pathType, int netDevCount, struct ncclNetDev* netDevs) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); - - int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); - if (localNetCount==0) { - WARN("Could not find any local path from gpu %d to net.", gpu); - return ncclInternalError; + int localNets[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, pathType, netDevCount, (union ncclTopoGetLocalRestriction*)netDevs)); + if (localNetCount == 0) { + if(id) *id = -1; + if(dev) *dev = -1; + return ncclSuccess; } int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + 
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL, /*nConstrain=*/0, /*constrain=*/NULL)); int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); @@ -1481,7 +1633,7 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL, /*nConstrain=*/0, /*constrain=*/NULL)); int foundGpu = -1; for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); + // no need to give a list of fabricId/netId when searching for a specific net + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL, /*pathType=*/NULL, 0, NULL)); if (netId == id) { foundGpu = g; goto exit; diff --git a/src/graph/topo.h b/src/graph/topo.h index 921a7f5d6..ee4677ad5 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -112,10 +112,12 @@ struct ncclTopoLinkList { #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff +// access a TOPO_ID #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56) #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK) -#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)numaid << 56) + busid) -#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK)) +// create a TOPO_ID +#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + ((localid) & NCCL_TOPO_ID_LOCAL_ID_MASK)) +#define NCCL_TOPO_ID_NIC(systemid, pluginid, localid) NCCL_TOPO_ID(systemid + pluginid, localid) struct ncclTopoNode { int type; @@ -129,6 +131,7 @@ struct ncclTopoNode { int gdrSupport; }gpu; struct { + int netIdx; // net index inside the comm net array int dev; // Plugin dev number 
uint64_t asic; int port; @@ -137,6 +140,8 @@ struct ncclTopoNode { int gdrSupport; int collSupport; int maxChannels; + uint64_t fabricId; + ncclResult_t (*getNetPath)(uint64_t, uint64_t, ncclNetPath_t*); // gives the locality between this device and another one }net; struct { int arch; @@ -189,14 +194,18 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax); +ncclResult_t ncclTopoIdToNetIdx(struct ncclComm* comm, int64_t topoId, int* netIdx); + static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) { *index = -1; + if(system->nodes[type].count == 0) return ncclSuccess; for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { *index = i; return ncclSuccess; } } + WARN("failed to find a topo node with id %ld", id); return ncclInternalError; } diff --git a/src/graph/trees.cc b/src/graph/trees.cc index 8e1e2ae85..817234496 100644 --- a/src/graph/trees.cc +++ b/src/graph/trees.cc @@ -107,3 +107,9 @@ ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, i } return ncclSuccess; } + +ncclResult_t ncclGetDtreeRoots(int nranks, int* r0, int* r1) { + if (r0) *r0 = 0; // primal tree root is always 0 + if (r1) *r1 = (nranks % 2 == 1 && nranks > 1) ? 
1 : (nranks - 1); + return ncclSuccess; +} diff --git a/src/graph/xml.h b/src/graph/xml.h index f06c0e68b..256cef407 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -124,6 +124,19 @@ static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrNam return ncclSuccess; } +static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrName, uint64_t* value) { + const char* str; + NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); + *value = strtoull(str, NULL, 0); + return ncclSuccess; +} + +static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? strtoull(str, NULL, 0) : defaultValue; + return ncclSuccess; +} static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; @@ -284,6 +297,19 @@ static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrNam return ncclSuccess; } +static ncclResult_t xmlSetAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; + } + snprintf(node->attrs[index].value, MAX_STR_LEN, "%#lx", value); + node->attrs[index].value[MAX_STR_LEN] = '\0'; + return ncclSuccess; +} + static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/include/comm.h b/src/include/comm.h index 409518713..09fc171c2 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -85,10 +85,16 @@ struct ncclUserRedOp { }; struct ncclNodeRanks { + int dcIndex; // index into the DC array int localRanks; int* localRankToRank; }; +struct ncclDcNode { + int localNodes; + 
int* localNodeToNode; +}; + struct cliqueInfo { int id; int size; @@ -421,10 +427,14 @@ struct ncclComm { struct ncclProxyConnector* gproxyConn; struct ncclIntruQueue legacyRegCleanupQueue; + int ncclNetCount; + int ncclDcNetIndex; int netPluginLoaded; - ncclNet_t* ncclNet; - int ncclNetVer; ncclNetDeviceType netDeviceType; + int ncclNetVer[NCCL_NET_MAX_PLUGINS]; + int ncclNetPluginIdx[NCCL_NET_MAX_PLUGINS]; + ncclNet_t* ncclNet[NCCL_NET_MAX_PLUGINS]; + int ncclCollNetPluginIdx; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup @@ -464,13 +474,16 @@ struct ncclComm { int* localRankToRank; // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; + // multi-DC support + int dcCount; + struct ncclDcNode* dcNode; // MNNVL: Multi-Node NVLink int MNNVL; // true when MNNVL is available struct cliqueInfo clique; // Our MNNVL clique information int cliqueRank; // Our rank within the MNNVL clique bool checkPointers; - bool dmaBufSupport; + bool dmaBufSupport[NCCL_NET_MAX_PLUGINS]; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; diff --git a/src/include/device.h b/src/include/device.h index 0763a579a..8016c0833 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -155,7 +155,7 @@ struct ncclRing { // The root of each tree only has one node down (+1 intra-node). -#define NCCL_MAX_TREE_ARITY_TOP 2 +#define NCCL_MAX_TREE_ARITY_TOP 3 // Nodes inside the binary tree can have to two nodes down (+1 intra-node). 
#define NCCL_MAX_TREE_ARITY 3 struct ncclTree { diff --git a/src/include/graph.h b/src/include/graph.h index b779773da..9af4dd094 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -9,6 +9,7 @@ #include "nccl.h" #include "device.h" +#include "nccl_net.h" #include #include #include @@ -33,7 +34,8 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); +ncclResult_t ncclTopoGetNetDevFromGraph(struct ncclComm* comm, int rank, int peerRank, struct ncclTopoGraph* graph, int channelId, int64_t* netId, int* netDev, int* netPathType); +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int pxnRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); enum ncclTopoGdrMode { @@ -43,7 +45,7 @@ enum ncclTopoGdrMode { ncclTopoGdrModeNum = 3 }; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netIdx, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); @@ -70,7 +72,7 @@ ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* 
vend ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev, int* pathType, int netDevCount, struct ncclNetDev* netDevs); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); diff --git a/src/include/net.h b/src/include/net.h index afc2d160e..174f958fb 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -12,6 +12,8 @@ #include "comm.h" #include "checks.h" +static NCCL_PARAM(AllNet,"ALLNET_ENABLE",0); + typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); diff --git a/src/include/net_device.h b/src/include/net_device.h index c3a79e35c..bdf281ec5 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h index d57aad5a9..e2f810033 100644 --- a/src/include/plugin/nccl_net.h +++ b/src/include/plugin/nccl_net.h @@ -33,22 +33,25 @@ // NCCL core profiler callback for network defined events instrumentation typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* 
extData); +#include "net/net_v11.h" #include "net/net_v10.h" #include "net/net_v9.h" #include "net/net_v8.h" #include "net/net_v7.h" #include "net/net_v6.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclCollNet_v10_t ncclCollNet_t; -typedef ncclNetSGE_v10_t ncclNetSGE_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; +typedef ncclNet_v11_t ncclNet_t; +typedef ncclCollNet_v11_t ncclCollNet_t; +typedef ncclNetSGE_v11_t ncclNetSGE_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; +typedef ncclNetPath_v11_t ncclNetPath_t; -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V11 -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 +#define NCCL_NET_PLUGIN_VERSION 11 +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v11 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v11 #endif // end include guard diff --git a/src/include/plugin/net/net_v11.h b/src/include/plugin/net/net_v11.h new file mode 100644 index 000000000..3a909ed1d --- /dev/null +++ b/src/include/plugin/net/net_v11.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +#include +#define NCCL_NET_MAX_DEVS_PER_NIC_V11 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V11]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. 
Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + uint64_t fabricId; // Fabric handle associated to the current device +} ncclNetProperties_v11_t; + +typedef enum { + NET_LOC_DCL0 = 0 /* same DC, hierarchy level 0*/, + NET_LOC_DCL1 = 1 /* different DC, hiearchy level 1 */, + NET_LOC_DISC = 2 /* disconnected*/ +} ncclNetLoc_v11_t; +static_assert(NET_LOC_DCL0 < NET_LOC_DCL1 && NET_LOC_DCL1 < NET_LOC_DISC, "Locality must go in increasing order"); + +typedef struct { + ncclNetLoc_v11_t loc; +} ncclNetPath_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v11_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + // DMA-BUF support + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + + // topology API. getNetPath returns the path between two fabricIds. + ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path); +} ncclNet_v11_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v11_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + // DMA-BUF support + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, + void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v11_t* recvParts, size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v11_t* sendParts, void* recvData, size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + + // topology API. getNetPath returns the path between two fabricIds. 
+ ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path); +} ncclCollNet_v11_t; + +#endif // end include guard diff --git a/src/include/proxy.h b/src/include/proxy.h index 225acb22d..2331e0ba3 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -299,8 +299,8 @@ struct ncclProxyState { int nChannels; int buffSizes[NCCL_NUM_PROTOCOLS]; bool allocP2pNetLLBuffers; - bool dmaBufSupport; - ncclNet_t* ncclNet; + bool dmaBufSupport[NCCL_NET_MAX_PLUGINS]; + ncclNet_t* ncclNet[NCCL_NET_MAX_PLUGINS]; ncclCollNet_t* ncclCollNet; uint32_t* abortFlag; bool directMode; diff --git a/src/include/transport.h b/src/include/transport.h index c563fbbd6..e7afbb25d 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -35,6 +35,13 @@ struct ncclRing; struct ncclConnector; struct ncclComm; +#define PEERINFO_NETDEV_MAXCOUNT (MAXCHANNELS + 2) + +struct ncclNetDev { + int netIdx; + uint64_t fabricId; +}; + struct ncclPeerInfo { int rank; int cudaDev; @@ -50,6 +57,9 @@ struct ncclPeerInfo { nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; int version; + // multi-DC support + int netDevCount; + ncclNetDev netDevs[PEERINFO_NETDEV_MAXCOUNT]; }; #define CONNECT_SIZE 256 diff --git a/src/include/trees.h b/src/include/trees.h index ded84a667..10df339b3 100644 --- a/src/include/trees.h +++ b/src/include/trees.h @@ -9,5 +9,6 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); +ncclResult_t ncclGetDtreeRoots(int nranks, int* r0, int* r1); #endif diff --git a/src/include/utils.h b/src/include/utils.h index 383f678c8..3f3a5f668 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -34,9 +34,13 @@ ncclResult_t getRandomData(void* buffer, size_t bytes); struct netIf { char prefix[64]; int port; + int64_t fabricId; // -1 is 
undefined }; -int parseStringList(const char* string, struct netIf* ifList, int maxList); +#define NCCL_IF_MAX_FABRICID (1L<<48) + +ncclResult_t parseIfList(const char* string, struct netIf* ifList, int maxList, int *ifCount); +bool indexIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact, int* index); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); static long log2i(long n) { diff --git a/src/init.cc b/src/init.cc index 46b02e65e..29345b11f 100644 --- a/src/init.cc +++ b/src/init.cc @@ -201,6 +201,10 @@ static ncclResult_t commFree(ncclComm_t comm) { for (int n=0; nnNodes; n++) free(comm->nodeRanks[n].localRankToRank); free(comm->nodeRanks); } + if(comm->dcNode){ + for (int dc = 0; dc < comm->dcCount; ++dc) free(comm->dcNode[dc].localNodeToNode); + free(comm->dcNode); + } free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); @@ -278,22 +282,22 @@ enum ncclLaunchMode ncclParamLaunchMode; NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); // Detect DMA-BUF support -static ncclResult_t dmaBufSupported(struct ncclComm* comm) { - if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || ncclCudaLibraryInit() != ncclSuccess) return ncclInternalError; +static ncclResult_t dmaBufSupportedByCuda(struct ncclComm* comm) { + if (ncclParamDmaBufEnable() == 0 || ncclCudaLibraryInit() != ncclSuccess) return ncclInvalidUsage; #if CUDA_VERSION >= 11070 int flag = 0; CUdevice dev; int cudaDriverVersion; CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion)); - if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError; + if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInvalidUsage; CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); // Query device to see if DMA-BUF support is available (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); - if (flag == 0) return ncclInternalError; + if 
(flag == 0) return ncclInvalidUsage; INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev); return ncclSuccess; #endif - return ncclInternalError; + return ncclInvalidUsage; } ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { @@ -315,6 +319,15 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { return ret; } +static ncclResult_t commNetName(struct ncclComm* comm, char* netName, size_t len) { + snprintf(netName, len, "%s", comm->ncclNet[0]->name); + for (int n = 1; n < comm->ncclNetCount; n++) { + size_t offset = strlen(netName); + snprintf(netName + offset, len - offset, "+%s", comm->ncclNet[n]->name); + } + return ncclSuccess; +} + static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) { if (ndev < 1) { WARN("invalid device count (%d) requested", ndev); @@ -334,13 +347,16 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); NCCLCHECK(ncclProfilerPluginInit(comm)); - INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); - if (parent && parent->config.splitShare) { - if (parent->ncclNet != comm->ncclNet) { - WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); - return ncclInvalidUsage; - } + char netName[128]; + NCCLCHECK(commNetName(comm,netName,sizeof(netName))); + INFO(NCCL_INIT, "Using network %s", netName); + + if (parent && parent->config.splitShare && (parent->config.netName && comm->config.netName && strcmp(parent->config.netName, comm->config.netName) != 0)) { + char parentNetName[128]; + NCCLCHECK(commNetName(parent, parentNetName, sizeof(parentNetName))); + WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parentNetName, netName); + return ncclInvalidUsage; } // Try to create a CUDA object right away. 
If there is something wrong with // the device we're on (failure cause #1) , better know it early. @@ -359,7 +375,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; - comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; + + bool dmaBufCuda = dmaBufSupportedByCuda(comm) == ncclSuccess; + for (int n = 0; n < comm->ncclNetCount; ++n) { + comm->dmaBufSupport[n] = dmaBufCuda && (comm->ncclNet[n]->regMrDmaBuf != NULL); + } comm->collNetSupport = 0; memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); @@ -560,6 +580,37 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->comm = comm; info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap; + // cross-DC support, build the list of unique fabric Id on this node. + // Packing of info->netDevs must happen with increasing net index. 
+ info->netDevCount = 0; + for (int n = 0; n < comm->ncclNetCount; ++n) { + ncclNet_t* net = comm->ncclNet[n]; + int nDevs = 0; + NCCLCHECK(net->devices(&nDevs)); + for (int d = 0; d < nDevs; ++d) { + ncclNetProperties_t props; + NCCLCHECK(net->getProperties(d, &props)); + // look for a similar fabricID + int sameId = -1; + for (int j = 0; j < info->netDevCount; ++j) { + if (n == info->netDevs[j].netIdx && props.fabricId == info->netDevs[j].fabricId) { + sameId = j; + break; + } + } + // if we haven't found the id already, add it + if (sameId == -1) { + if (info->netDevCount == PEERINFO_NETDEV_MAXCOUNT) { + WARN("Node cannot have more than %d fabric IDs (found %d)", PEERINFO_NETDEV_MAXCOUNT, info->netDevCount + 1); + return ncclInternalError; + } + info->netDevs[info->netDevCount].netIdx = n; + info->netDevs[info->netDevCount].fabricId = props.fabricId; + info->netDevCount++; + } + } + } + // MNNVL support { // MNNVL: Request the fabric UUID and partition info @@ -690,6 +741,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclTopoRanks topoRanks; int cpuArch; int cpuVendor; + int firstRankDc; }; int nChannelsOrig; @@ -705,7 +757,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root - NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); + NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo + rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); comm->cuMemSupport = 1; @@ -892,9 +944,48 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather, // we temporarily store the start time of the subsequent 
one in an as-of-yet unused CONNECT timer. timers[TIMER_INIT_CONNECT] = clockNano(); + // AllGather3 - begin NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail); + // allGather3Data for multiDC support + allGather3Data[rank].firstRankDc = -1; + int graphTypeInter [NCCL_NUM_ALGORITHMS]; + for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) graphTypeInter[a] = graphs[a]->typeInter; + for (int r = 0; r < comm->nRanks; ++r) { + // for each algorithm, gather the largest PATH between the GPU and the net that will be used to connect to the peer. + for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) { + for (int c = 0; c < graphs[a]->nChannels; ++c) { + int netPathType; + ncclResult_t res = ncclTopoGetNetDevFromGraph(comm, comm->rank, /*peerRank=*/r, graphs[a], c, NULL, NULL, &netPathType); + if (res != ncclSuccess && graphs[a]->pattern != NCCL_TOPO_PATTERN_NVLS) { + // not finding a nic for rank r is an error; + //Unless it's an NVLS graphs, then it's expected because not every rank is an NVLS head + WARN("Unable to find a net dev to connect to %d with channel %d in graph[%d]", r, c, a); + return ncclInternalError; + } + if(netPathType > graphTypeInter[a]) graphTypeInter[a] = netPathType; + } + } + struct ncclPeerInfo* rankInfo = &comm->peerInfo[rank]; + struct ncclPeerInfo* peerInfo = &comm->peerInfo[r]; + for (int d = 0; d < rankInfo->netDevCount; d++) { + int devNetIdx = rankInfo->netDevs[d].netIdx; + uint64_t devFabricId = rankInfo->netDevs[d].fabricId; + // skip if it's not the DC network or if we have found the first rank already + if (devNetIdx != comm->ncclDcNetIndex || allGather3Data[rank].firstRankDc != -1) continue; + for (int p = 0; p < peerInfo->netDevCount; p++) { + ncclNetPath_t path = {.loc = NET_LOC_DISC}; + if (peerInfo->netDevs[p].netIdx == devNetIdx) NCCLCHECK(comm->ncclNet[devNetIdx]->getNetPath(peerInfo->netDevs[p].fabricId, devFabricId, &path)); + if (path.loc <= NET_LOC_DCL0) { + allGather3Data[rank].firstRankDc = r; + break; + } + } + } + } + + for 
(int a=0; apattern; allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels; @@ -902,7 +993,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra; allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter; allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra; - allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter; + allGather3Data[rank].graphInfo[a].typeInter = graphTypeInter[a]; allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic; } @@ -975,6 +1066,47 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->localRank = comm->rankToLocalRank[rank]; comm->localRanks = comm->nodeRanks[comm->node].localRanks; + // multiDC support - obtain the DC ID for each node and create the DC rank list + do { + comm->dcCount = 0; + int* firstRankDcNode = NULL; + NCCLCHECKGOTO(ncclCalloc(&firstRankDcNode, comm->nNodes), ret, fail); + for (int n = 0; n < comm->nNodes; n++) { + struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n]; + if (nodeRank->localRanks > 0) { + firstRankDcNode[n] = allGather3Data[nodeRank->localRankToRank[0]].firstRankDc; + } + // now find previous nodes with the same firstRank, assign their DC index + int m = 0; + for (m = 0; m < n; ++m) { + if (firstRankDcNode[m] == firstRankDcNode[n]) { + nodeRank->dcIndex = comm->nodeRanks[m].dcIndex; + break; + } + } + // haven't found any, create a new DC + if (m == n) nodeRank->dcIndex = comm->dcCount++; + } + free(firstRankDcNode); + // then build information for each of them + NCCLCHECKGOTO(ncclCalloc(&comm->dcNode, comm->dcCount), ret, fail); + for (int n = 0; n < comm->nNodes; n++) { + struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n]; + comm->dcNode[nodeRank->dcIndex].localNodes++; + } + for (int dc = 0; dc < comm->dcCount; ++dc) { + struct ncclDcNode* dcRank = &comm->dcNode[dc]; + NCCLCHECKGOTO(ncclCalloc(&dcRank->localNodeToNode, 
dcRank->localNodes), ret, fail); + dcRank->localNodes = 0; // reset to 0 to be able to fill the arrays + } + // store the rank arrays inside the DC rank list + for (int n = 0; n < comm->nNodes; n++) { + struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n]; + struct ncclDcNode* dcNode = &comm->dcNode[nodeRank->dcIndex]; + dcNode->localNodeToNode[dcNode->localNodes++] = n; + } + } while (0); + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 731dbcee1..e32f83f85 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -124,12 +124,13 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif + int nUserIfs; struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; if (searchNot) prefixList++; bool searchExact = prefixList && prefixList[0] == '='; if (searchExact) prefixList++; - int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); + NCCLCHECK(parseIfList(prefixList, userIfs, MAX_IFS, &nUserIfs)); int found = 0; struct ifaddrs *interfaces, *interface; @@ -275,7 +276,9 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char if (!ipv6) { struct netIf ni; // parse : string, expect one pair - if (parseStringList(ip_port_pair, &ni, 1) != 1) { + int nIfs; + NCCLCHECK(parseIfList(ip_port_pair, &ni, 1, &nIfs)); + if (nIfs != 1) { WARN("Net : No valid : pair found"); return ncclInvalidArgument; } @@ -450,6 +453,9 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { return ncclSuccess; } +NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1); +NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1); + static ncclResult_t socketSetFlags(struct ncclSocket* 
sock) { const int one = 1; /* Set socket as non-blocking if async or if we need to be able to abort */ @@ -458,7 +464,11 @@ static ncclResult_t socketSetFlags(struct ncclSocket* sock) { SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } - SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY"); + // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1) + int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff(); + if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF"); + if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF"); return ncclSuccess; } diff --git a/src/misc/utils.cc b/src/misc/utils.cc index bb59947e4..ffddce30b 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -132,20 +132,34 @@ uint64_t getPidHash(void) { return getHash(pname, strlen(pname)); } -int parseStringList(const char* string, struct netIf* ifList, int maxList) { - if (!string) return 0; - +ncclResult_t parseIfList(const char* string, struct netIf* ifList, int maxList, int *ifCount) { const char* ptr = string; - - int ifNum = 0; - int ifC = 0; + int ifNum = 0, ifC = 0; char c; + if (!string) goto exit; do { c = *ptr; if (c == ':') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; - ifList[ifNum].port = atoi(ptr+1); + ifList[ifNum].port = -1; + ifList[ifNum].fabricId = -1; + char* next = NULL; + const char* start = ptr + 1; + long port = strtol(start, &next, 10); + if (next != start) ifList[ifNum].port = port; + if (*next == ':') { + start = next + 1; + uint64_t fabId = strtol(start, &next, 10); + if (next != start) { + if (fabId < 0 || fabId > 
NCCL_IF_MAX_FABRICID) { + WARN("fabric ID %ld must be between 0 and %ld.", fabId, NCCL_IF_MAX_FABRICID); + goto fail; + } + ifList[ifNum].fabricId = fabId; + } + } + INFO(NCCL_ENV | NCCL_NET, "found IF %s port %d fabricId %ld", ifList[ifNum].prefix, ifList[ifNum].port, ifList[ifNum].fabricId); ifNum++; ifC = 0; } while (c != ',' && c != '\0') c = *(++ptr); @@ -153,6 +167,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = -1; + ifList[ifNum].fabricId = -1; + INFO(NCCL_ENV | NCCL_NET, "found IF %s port %d fabricId %ld", ifList[ifNum].prefix, ifList[ifNum].port, ifList[ifNum].fabricId); ifNum++; ifC = 0; } } else { @@ -161,7 +177,12 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { } ptr++; } while (ifNum < maxList && c); - return ifNum; +exit: + if(ifCount) *ifCount = ifNum; + return ncclSuccess; +fail: + if(ifCount) *ifCount = ifNum; + return ncclInvalidUsage; } static bool matchIf(const char* string, const char* ref, bool matchExact) { @@ -178,18 +199,22 @@ static bool matchPort(const int port1, const int port2) { } -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { +bool indexIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact, int* index) { + if (index) *index = -1; // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; i #include -//#include -//#include -//#include extern ncclNet_t* getNcclNet_v6(void* netPluginLib); extern ncclNet_t* getNcclNet_v7(void* netPluginLib); extern ncclNet_t* getNcclNet_v8(void* netPluginLib); extern ncclNet_t* getNcclNet_v9(void* netPluginLib); extern ncclNet_t* getNcclNet_v10(void* netPluginLib); +extern ncclNet_t* getNcclNet_v11(void* netPluginLib); extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); extern ncclCollNet_t* getNcclCollNet_v7(void* 
netPluginLib); extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v11(void* netPluginLib); + +extern void* openNetPluginLib(const char* name); +extern void closePluginLib(void* handle); static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; +static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, NCCL_NET_PLUGIN_VERSION, NCCL_NET_PLUGIN_VERSION }; ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; enum ncclNetState { ncclNetStateInit = 0, @@ -72,8 +73,13 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { goto fail; } - ncclNets[0] = getNcclNet_v10(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 10; + ncclNets[0] = getNcclNet_v11(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 11; + if (ncclNets[0] == nullptr) { + // Try v10 plugin + ncclNets[0] = getNcclNet_v10(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 10; + } if (ncclNets[0] == nullptr) { // Try v9 plugin ncclNets[0] = getNcclNet_v9(netPluginLib); @@ -99,7 +105,10 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { } // Check for CollNet - ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); + ncclCollNets[0] = getNcclCollNet_v11(netPluginLib); + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); + } if (ncclCollNets[0] == nullptr) { ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); } @@ -123,6 +132,7 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { fail: if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); netPluginStatus = netPluginLoadFailed; + ncclNetsVer[0] = -1; goto exit; } @@ -199,33 +209,48 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { } 
ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; + // get the list of user-provided nets, reuse the code for interfaces parsing. + int nUserIfs; + const char* netName = comm->config.netName; + struct netIf userNets[NCCL_NET_MAX_PLUGINS]; + NCCLCHECK(parseIfList(netName, userNets, NCCL_NET_MAX_PLUGINS, &nUserIfs)); + + const char* dcNet = ncclGetEnv("NCCL_ALLNET_FASTNET"); + comm->ncclDcNetIndex = 0; + comm->ncclNetCount = 0; bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { + for (int i=0; iname) != 0) continue; + if (!matchIfList(ncclNets[i]->name, /*port=*/-1, userNets, nUserIfs, 1)) continue; if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { // Mismatched device plugin version continue; } - comm->ncclNet = ncclNets[i]; - comm->ncclNetVer = ncclNetsVer[i]; + comm->ncclNetPluginIdx[comm->ncclNetCount] = i; + comm->ncclNetVer[comm->ncclNetCount] = ncclNetsVer[i]; + comm->ncclNet[comm->ncclNetCount] = ncclNets[i]; + if (dcNet && strcmp(dcNet, ncclNets[i]->name) == 0) comm->ncclDcNetIndex = i; ok = true; - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; + // try to load the collNet if we don't use ALLNET and it's the first net to be loaded + if (comm->ncclNetCount == 0 && ncclCollNets[i]) { + if (ncclParamAllNet()) { + INFO(NCCL_INIT | NCCL_NET, "NCCL_ALLNET_ENABLE=1, ignore CollNet. 
Set NCCL_ALLNET_ENABLE=0 if CollNet is needed."); + } else { + NCCLCHECK(collNetGetState(i, &state)); + if (state == ncclNetStateEnabled) { + comm->ncclCollNetPluginIdx = i; + comm->ncclCollNet = ncclCollNets[i]; + } } } - break; + comm->ncclNetCount++; + + if (!ncclParamAllNet()) break; } if (!ok) { @@ -236,7 +261,9 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { } ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; + for (int n = 0; n < comm->ncclNetCount; ++n) { + comm->ncclNet[n] = nullptr; + } comm->ncclCollNet = nullptr; return ncclSuccess; } @@ -259,14 +286,22 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; if (gdrSupportMatrix[comm->cudaDev] == -1) { - int netDevs; - NCCLCHECK(comm->ncclNet->devices(&netDevs)); gdrSupportMatrix[comm->cudaDev] = 0; - for (int dev=0; devncclNet->getProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; + + // find the first GDR-capable device on any net (net stays NULL if there is none) + int dev = -1; + ncclNet_t * net = NULL; + for (int n = 0; net == NULL && n < comm->ncclNetCount; ++n) { + int netDevs; + NCCLCHECK(comm->ncclNet[n]->devices(&netDevs)); + for (dev = 0; dev < netDevs; dev++) { + ncclNetProperties_t props; + NCCLCHECK(comm->ncclNet[n]->getProperties(dev, &props)); + if (props.ptrSupport & NCCL_PTR_CUDA) { net = comm->ncclNet[n]; break; } + } + } + // no GDR-capable device on any net: keep the cached result at 0 and skip the probe + if (net == NULL) goto cleanup1; // Allocate memory on the GPU and try to register it on the NIC. 
void *lComm = NULL, *sComm = NULL, *rComm = NULL; @@ -275,45 +310,37 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); + NCCLCHECKGOTO(net->listen(dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; while (!connected) { - // If we're aborting now, skip to cleanup if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { goto cleanup2; } - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2); - + NCCLCHECKGOTO(net->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2); if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - + NCCLCHECKGOTO(net->accept(lComm, &rComm, NULL), ret, cleanup2); connected = (rComm != NULL) && (sComm != NULL); } NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); + if (net->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(net->deregMr(sComm, mHandle)); + NCCLCHECK(net->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(net->deregMr(rComm, mHandle)); gdrSupportMatrix[comm->cudaDev] = 1; } ncclDebugNoWarn = 0; NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } + cleanup2: + if (rComm != NULL) NCCLCHECK(net->closeRecv(rComm)); + if (sComm != NULL) NCCLCHECK(net->closeSend(sComm)); + 
NCCLCHECK(net->closeListen(lComm)); } +cleanup1: *gdrSupport = gdrSupportMatrix[comm->cudaDev]; return ncclSuccess; } diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc index 682f239f7..a464e2188 100644 --- a/src/plugin/net/net_v10.cc +++ b/src/plugin/net/net_v10.cc @@ -8,25 +8,162 @@ #include "net_device.h" #include "proxy.h" +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; static ncclNet_v10_t* ncclNet_v10; static ncclCollNet_v10_t* ncclCollNet_v10; +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t p10; + ncclResult_t ans = ncclNet_v10->getProperties(dev, &p10); + if (ans != ncclSuccess) return ans; + props->name = p10.name; + props->pciPath = p10.pciPath; + props->guid = p10.guid; + props->ptrSupport = p10.ptrSupport; + props->regIsGlobal = p10.regIsGlobal; + props->forceFlush = p10.forceFlush; + props->speed = p10.speed; + props->port = p10.port; + props->maxComms = p10.maxComms; + props->maxRecvs = p10.maxRecvs; + props->latency = p10.latency; + props->netDeviceType = p10.netDeviceType; + props->netDeviceVersion = p10.netDeviceVersion; + props->vProps.ndevs = p10.vProps.ndevs; + memcpy(props->vProps.devs, p10.vProps.devs, sizeof(p10.vProps.devs)); + props->maxP2pBytes = p10.maxP2pBytes; + props->maxCollBytes = p10.maxCollBytes; + props->fabricId = 0; // all devs are on the same rail if v10 + return ncclSuccess; +} + +static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t p10; + ncclResult_t ans = ncclCollNet_v10->getProperties(dev, &p10); + if (ans != ncclSuccess) return ans; + props->name = p10.name; + props->pciPath = p10.pciPath; + props->guid = p10.guid; + props->ptrSupport = p10.ptrSupport; + props->regIsGlobal = p10.regIsGlobal; + props->forceFlush = p10.forceFlush; + props->speed = p10.speed; + props->port = p10.port; + props->maxComms = p10.maxComms; + props->maxRecvs = p10.maxRecvs; + props->latency = p10.latency; + props->netDeviceType = p10.netDeviceType; + props->netDeviceVersion = p10.netDeviceVersion; + props->vProps.ndevs = p10.vProps.ndevs; + memcpy(props->vProps.devs, p10.vProps.devs, sizeof(p10.vProps.devs)); + props->maxP2pBytes = p10.maxP2pBytes; + props->maxCollBytes = p10.maxCollBytes; + props->fabricId = 0; // all devs are on the same rail if v10 + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v10->connect(dev, (ncclNetCommConfig_v10_t*)config, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t*)props); +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v10->init(logfn, proffn)); + ncclNet.devices = ncclNet_v10->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v10->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v10->accept; + ncclNet.regMr = ncclNet_v10->regMr; + ncclNet.regMrDmaBuf = ncclNet_v10->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v10->deregMr; + ncclNet.isend = ncclNet_v10->isend; + ncclNet.irecv = ncclNet_v10->irecv; + ncclNet.iflush = ncclNet_v10->iflush; + ncclNet.test = ncclNet_v10->test; + ncclNet.closeSend = ncclNet_v10->closeSend; + ncclNet.closeRecv = ncclNet_v10->closeRecv; + ncclNet.closeListen = ncclNet_v10->closeListen; + ncclNet.getDeviceMr = ncclNet_v10->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v10->irecvConsumed; + ncclNet.makeVDevice = ncclNet_v10->makeVDevice ? 
ncclNet_makeVDevice : nullptr; + ncclNet.getNetPath = ncclNet_getNetPath; + return ncclSuccess; +} + ncclNet_t* getNcclNet_v10(void* lib) { - ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); + ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); if (ncclNet_v10) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); - return ncclNet_v10; + ncclNet.name = ncclNet_v10->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); + return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); return nullptr; } +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { +return ncclCollNet_v10->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v10_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { +return ncclCollNet_v10->ireducescatter(collComm, nSendParts, (ncclNetSGE_v10_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v10->init(logfn)); + ncclCollNet.devices = ncclCollNet_v10->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v10->listen; + ncclCollNet.connect = ncclCollNet_v10->connect; + ncclCollNet.reduceSupport = ncclCollNet_v10->reduceSupport; + 
ncclCollNet.regMr = ncclCollNet_v10->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v10->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v10->deregMr; + ncclCollNet.iallreduce = ncclCollNet_v10->iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v10->iflush; + ncclCollNet.test = ncclCollNet_v10->test; + ncclCollNet.closeColl = ncclCollNet_v10->closeColl; + ncclCollNet.closeListen = ncclCollNet_v10->closeListen; + ncclCollNet.getNetPath = ncclCollNet_getNetPath; + return ncclSuccess; +} + ncclCollNet_t* getNcclCollNet_v10(void* lib) { ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); if (ncclCollNet_v10) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); - return ncclCollNet_v10; + ncclCollNet.name = ncclCollNet_v10->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name); + return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v11.cc b/src/plugin/net/net_v11.cc new file mode 100644 index 000000000..d1c8b7c36 --- /dev/null +++ b/src/plugin/net/net_v11.cc @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" + +static ncclNet_v11_t* ncclNet_v11; +static ncclCollNet_v11_t* ncclCollNet_v11; + +ncclNet_t* getNcclNet_v11(void* lib) { + ncclNet_v11 = (ncclNet_v11_t*)dlsym(lib, "ncclNetPlugin_v11"); + if (ncclNet_v11) { + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded net plugin %s (v11)", ncclNet_v11->name); + return ncclNet_v11; + } + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v11 symbol."); + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v11(void* lib) { + ncclCollNet_v11 = (ncclCollNet_v11_t*)dlsym(lib, "ncclCollNetPlugin_v11"); + if (ncclCollNet_v11) { + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v11)", ncclCollNet_v11->name); + return ncclCollNet_v11; + } + INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v11 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc index baff67935..f6f743f02 100644 --- a/src/plugin/net/net_v6.cc +++ b/src/plugin/net/net_v6.cc @@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -71,6 +72,12 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); @@ -92,6 +99,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -110,6 +118,12 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } +static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { NCCLCHECK(ncclNet_v6->init(logfn)); ncclNet.devices = ncclNet_v6->devices; diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc index 4bad5ec26..804a917db 100644 --- a/src/plugin/net/net_v7.cc +++ b/src/plugin/net/net_v7.cc @@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -67,6 +68,12 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); @@ -88,6 +95,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -106,6 +114,12 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } +static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { NCCLCHECK(ncclNet_v7->init(logfn)); ncclNet.devices = ncclNet_v7->devices; diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc index b43bb895e..c014d237b 100644 --- a/src/plugin/net/net_v8.cc +++ b/src/plugin/net/net_v8.cc @@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -61,6 +62,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); return ans; } +static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t * path) { + if(!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v8_t p8; @@ -83,6 +89,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->fabricId = 0; return ncclSuccess; } @@ -128,6 +135,12 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n return ans; } +static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { NCCLCHECK(ncclNet_v8->init(logfn)); ncclNet.devices = ncclNet_v8->devices; @@ -148,6 +161,7 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; ncclNet.makeVDevice = NULL; + ncclNet.getNetPath = ncclNet_getNetPath; return ncclSuccess; } @@ -180,6 +194,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v8->test; ncclCollNet.closeColl = ncclCollNet_v8->closeColl; ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + ncclCollNet.getNetPath = ncclCollNet_getNetPath; return ncclSuccess; } diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc index 34e039332..66c1cefbe 100644 --- a/src/plugin/net/net_v9.cc +++ b/src/plugin/net/net_v9.cc @@ -4,10 +4,11 @@ * See LICENSE.txt for license information ************************************************************************/ +#include "debug.h" #include "nccl_net.h" #include "net_device.h" -#include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t 
ncclCollNet; @@ -15,7 +16,28 @@ static ncclNet_v9_t* ncclNet_v9; static ncclCollNet_v9_t* ncclCollNet_v9; static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t p9; + ncclResult_t ans = ncclNet_v9->getProperties(dev, &p9); + if (ans != ncclSuccess) return ans; + props->name = p9.name; + props->pciPath = p9.pciPath; + props->guid = p9.guid; + props->ptrSupport = p9.ptrSupport; + props->regIsGlobal = p9.regIsGlobal; + props->forceFlush = p9.forceFlush; + props->speed = p9.speed; + props->port = p9.port; + props->maxComms = p9.maxComms; + props->maxRecvs = p9.maxRecvs; + props->latency = p9.latency; + props->netDeviceType = p9.netDeviceType; + props->netDeviceVersion = p9.netDeviceVersion; + props->vProps.ndevs = p9.vProps.ndevs; + memcpy(props->vProps.devs, p9.vProps.devs, sizeof(p9.vProps.devs)); + props->maxP2pBytes = p9.maxP2pBytes; + props->maxCollBytes = p9.maxCollBytes; + props->fabricId= 0; // all devs are on the same rail if v9 + return ncclSuccess; } static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { @@ -34,8 +56,35 @@ static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); } +static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t p9; + ncclResult_t ans = ncclCollNet_v9->getProperties(dev, &p9); + if (ans != ncclSuccess) return ans; + props->name = p9.name; + props->pciPath = p9.pciPath; + props->guid = p9.guid; + props->ptrSupport = p9.ptrSupport; + props->regIsGlobal = p9.regIsGlobal; + props->forceFlush = p9.forceFlush; + props->speed = p9.speed; + props->port = p9.port; + props->maxComms = p9.maxComms; + props->maxRecvs = p9.maxRecvs; + props->latency = p9.latency; + props->netDeviceType = p9.netDeviceType; + props->netDeviceVersion = p9.netDeviceVersion; + props->vProps.ndevs = p9.vProps.ndevs; + memcpy(props->vProps.devs, p9.vProps.devs, sizeof(p9.vProps.devs)); + props->maxP2pBytes = p9.maxP2pBytes; + props->maxCollBytes = p9.maxCollBytes; + props->fabricId= 0; // all devs are on the same rail if v9 + return ncclSuccess; } static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, @@ -52,6 +101,11 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank, windowOffset, windowBytes, dataType, redOp, recvMhandle, request); } +static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + path->loc = (fabricId0 == fabricId1) ? 
NET_LOC_DCL0 : NET_LOC_DISC; + return ncclSuccess; +} static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { NCCLCHECK(ncclNet_v9->init(logfn)); @@ -73,6 +127,7 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr; + ncclNet.getNetPath = ncclNet_getNetPath; return ncclSuccess; } @@ -105,6 +160,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v9->test; ncclCollNet.closeColl = ncclCollNet_v9->closeColl; ncclCollNet.closeListen = ncclCollNet_v9->closeListen; + ncclCollNet.getNetPath = ncclCollNet_getNetPath; return ncclSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index 7e8021e47..060521336 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1821,8 +1821,10 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { proxyState->p2pChunkSize = comm->p2pChunkSize; proxyState->nChannels = comm->nChannels; proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers; - proxyState->dmaBufSupport = comm->dmaBufSupport; - proxyState->ncclNet = comm->ncclNet; + for (int n = 0; n < comm->ncclNetCount; ++n) { + proxyState->dmaBufSupport[n] = comm->dmaBufSupport[n]; + proxyState->ncclNet[n] = comm->ncclNet[n]; + } proxyState->ncclCollNet = comm->ncclCollNet; proxyState->profilerContext = comm->profilerContext; proxyState->directMode = comm->directMode; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index c1ccfcaa8..1eaa87a97 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -168,7 +168,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; - NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoGetNetDev(comm, 
myInfo->rank, graph, channelId, peerInfo->rank, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; @@ -188,11 +188,11 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; - NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, /*netIdx*/ 0, req.netDev, myInfo->rank, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); @@ -330,7 +330,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[0] && (props.ptrSupport & NCCL_PTR_DMABUF); /* collective size limits*/ resources->maxCollBytes = props.maxCollBytes; if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { @@ -448,7 +448,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF 
support */ - resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[0] && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxCollBytes = props.maxCollBytes; if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \ diff --git a/src/transport/net.cc b/src/transport/net.cc index 40d334fa7..e72d5b3bb 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "net.h" #include "graph.h" +#include "graph/topo.h" #include "proxy.h" #include "collectives.h" #include "gdrwrap.h" @@ -109,6 +110,7 @@ struct sendNetResources { ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; size_t maxP2pBytes; + ncclNet_t* ncclNet; }; struct recvNetResources { @@ -142,6 +144,7 @@ struct recvNetResources { ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; size_t maxP2pBytes; + ncclNet_t* ncclNet; }; struct netRegInfo { @@ -167,6 +170,7 @@ struct setupReq { int tpLocalRank; int tpRemoteRank; int shared; + int netIdx; // net index int netDev; enum ncclTopoGdrMode useGdr; int needFlush; @@ -201,7 +205,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; - NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, /*pxnRank=*/peerInfo->rank, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoIdToNetIdx(comm, netId, &req.netIdx)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; if (!req.useGdr && connIndex == 0) comm->useGdr = 0; @@ -214,11 +219,11 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", req.shared ? 
"/Shared" : ""); @@ -244,13 +249,14 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph // Use myInfo->rank as the receiver uses its own NIC int proxyRank; int64_t netId; - NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, /*pxnRank=*/myInfo->rank, &netId, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoIdToNetIdx(comm, netId, &req.netIdx)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; if (!req.useGdr && connIndex == 0) comm->useGdr = 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netIdx, req.netDev, myInfo->rank, &req.needFlush)); // We don't support PXN on receive yet NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); @@ -260,7 +266,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? 
"(PCI)" : "", req.shared ? "/Shared" : ""); return ncclSuccess; @@ -613,6 +619,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; + resources->ncclNet = proxyState->ncclNet[req->netIdx]; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; @@ -622,9 +629,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); + NCCLCHECK(resources->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[req->netIdx] && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; @@ -653,6 +660,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; + resources->ncclNet = proxyState->ncclNet[req->netIdx]; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; @@ -663,9 +671,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; - NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); + NCCLCHECK(resources->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ - resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & 
NCCL_PTR_DMABUF); + resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[req->netIdx] && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; @@ -678,7 +686,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); + NCCLCHECK(resources->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; @@ -732,15 +740,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + if (comms->sendComm[resources->channelId] == NULL) ret = resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = 
resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -839,17 +847,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); - NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } // Copy the mhandle dptr, if implemented - if (resources->netDeviceHandle && proxyState->ncclNet->getDeviceMr) - NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p])); + if (resources->netDeviceHandle && resources->ncclNet->getDeviceMr) + NCCLCHECK(resources->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p])); } } @@ -886,15 +894,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; - if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); + if (comms->recvComm[resources->channelId] == NULL) ret = resources->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); + ret = resources->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); + ret = resources->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -912,7 +920,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str 
connection->needsProxyProgress = 1; } - NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm)); + NCCLCHECK(resources->ncclNet->closeListen(resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; @@ -983,17 +991,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); - NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { - NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } // Copy the mhandle dptr - if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && proxyState->ncclNet->getDeviceMr) - NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p])); + if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && resources->ncclNet->getDeviceMr) + NCCLCHECK(resources->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p])); } } @@ -1013,7 +1021,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -1035,12 +1043,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank; comms->sendRefCount[resources->channelId]--; - if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId])); + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(resources->ncclNet->closeSend(comms->sendComm[resources->channelId])); } else { - NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); + NCCLCHECK(resources->ncclNet->closeSend(resources->netSendComm)); } } else { - NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); + NCCLCHECK(resources->ncclNet->closeSend(resources->netSendComm)); } } @@ -1058,7 +1066,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { - 
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); + NCCLCHECK(resources->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; @@ -1076,12 +1084,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank; comms->recvRefCount[resources->channelId]--; - if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId])); + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(resources->ncclNet->closeRecv(comms->recvComm[resources->channelId])); } else { - NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); + NCCLCHECK(resources->ncclNet->closeRecv(resources->netRecvComm)); } } else { - NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); + NCCLCHECK(resources->ncclNet->closeRecv(resources->netRecvComm)); } } @@ -1193,7 +1201,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); + NCCLCHECK(resources->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle,sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; @@ -1212,7 +1220,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int done; int size; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; - NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size)); + NCCLCHECK(resources->ncclNet->test(sub->requests[buffSlot], &done, &size)); if (done) { // Make sure size is reset to -1 before we update the head. 
connFifo[buffSlot].size = -1; @@ -1353,7 +1361,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); + NCCLCHECK(resources->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1381,7 +1389,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; incclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes)); + struct recvNetResources* resources = (struct recvNetResources*)(subGroup->connection->transportResources); + NCCLCHECK(resources->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int needFlush = 0; int totalSize = 0; @@ -1437,7 +1446,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } } struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); - NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); + NCCLCHECK(resources->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; @@ -1452,7 +1461,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct uint64_t step = subGroup->transmitted; int done = 1; void* request = 
subGroup->requests[step%NCCL_STEPS]; - if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL)); + struct recvNetResources* resources = (struct recvNetResources*)(subGroup->connection->transportResources); + if (request) NCCLCHECK(resources->ncclNet->test(request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; @@ -1490,8 +1500,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->transmitted > sub->done) { if (subGroup->recvRequestsCache[sub->done%NCCL_STEPS]) { // the multirecv requests are only cached in the first sub. - if (proxyState->ncclNet->irecvConsumed) - NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); + if (resources->ncclNet->irecvConsumed) + NCCLCHECK(resources->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } sub->done += args->sliceSteps; @@ -1662,14 +1672,14 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s if (resources->useDmaBuf) { int dmabuf_fd; CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); - NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + NCCLCHECKGOTO(resources->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; } peermem: #endif if (needReg) { - NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), 
ret, fail); + NCCLCHECKGOTO(resources->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } exit: @@ -1696,14 +1706,14 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s if (resources->useDmaBuf) { int dmabuf_fd; CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); - NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + NCCLCHECKGOTO(resources->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; } peermem: #endif if (needReg) { - NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + NCCLCHECKGOTO(resources->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } exit: @@ -1721,7 +1731,7 @@ static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, assert(reqSize == sizeof(void*)); memcpy(&handle, reqBuff, sizeof(void*)); - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, handle)); + NCCLCHECK(resources->ncclNet->deregMr(resources->netSendComm, handle)); *done = 1; return ncclSuccess; } @@ -1732,7 +1742,7 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, assert(reqSize == sizeof(void*)); memcpy(&handle, reqBuff, sizeof(void*)); - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, handle)); + NCCLCHECK(resources->ncclNet->deregMr(resources->netRecvComm, handle)); *done = 1; return ncclSuccess; } diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bfff6e555..2fba4d69d 
100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -14,6 +14,7 @@ #include "profiler/net_ib.h" #include +#include #include #include #include @@ -77,6 +78,7 @@ struct alignas(64) ncclIbDev { struct ibv_port_attr portAttr; struct ncclIbStats stats; int dmaBufSupported; + uint64_t fid; }; #define MAX_IB_DEVS 32 @@ -543,6 +545,10 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } + if (dev->fid != dev0->fid) { + WARN("NET/IB : Trying to merge multiple devices together with different fabric ID properties %s -> %lu, %s -> %lu.", dev0->devName, dev0->fid, dev->devName, dev->fid); + return ncclInvalidUsage; + } } *d = ncclNMergedIbDevs++; @@ -559,6 +565,32 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { static ncclProfilerCallback_t ncclProfilerFunction; +// NCCL_IF_FABRICID_MAX is set to (1<<48), all the bits above are available for default values +NCCL_PARAM(IbDcMaxRail, "IB_FABRICID_MAXRAIL", (1L<<62)); +NCCL_PARAM(IbDefaultFabricId, "IB_FABRICID_DEFAULT", (1L << 60)); +NCCL_PARAM(RoceDefaultFabricId, "ROCE_FABRICID_DEFAULT", (1L << 61)); + +// Fabric Id are constructed as dcId * NCCL_IB_FABRICID_DC_MAXRAIL + railId. +// Two fabric Ids are connected if they have the same rail Id. +// If they share the same dcId they are connected with LOC_DCL0 (level 0), if not they are connected with LOC_DCL1 (level 1). +// Note: default fabricIds cannot be associated to a specific rail Ids or DC. By default, they correspond to their own DC. +ncclResult_t ncclIbgetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { + if (!path) return ncclInvalidArgument; + uint64_t maxRail0 = (fabricId0 == ncclParamIbDefaultFabricId() || fabricId0 == ncclParamRoceDefaultFabricId()) ? 
UINT64_MAX : ncclParamIbDcMaxRail(); + uint64_t maxRail1 = (fabricId1 == ncclParamIbDefaultFabricId() || fabricId1 == ncclParamRoceDefaultFabricId()) ? UINT64_MAX: ncclParamIbDcMaxRail(); + uint64_t dcId0 = fabricId0 / maxRail0; + uint64_t dcId1 = fabricId1 / maxRail1; + uint64_t railId0 = fabricId0 % maxRail0; + uint64_t railId1 = fabricId1 % maxRail1; + if (railId0 != railId1) + path->loc = NET_LOC_DISC; + else if (dcId0 == dcId1) /*railId0 ==railId1 */ + path->loc = NET_LOC_DCL0; + else /*railId0 == railId1 && dcId0 != dcId1*/ + path->loc = NET_LOC_DCL1; + return ncclSuccess; +} + ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; ncclProfilerFunction = profFunction; @@ -590,7 +622,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if (searchNot) userIbEnv++; bool searchExact = userIbEnv && userIbEnv[0] == '='; if (searchExact) userIbEnv++; - int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); + int nUserIfs; + NCCLCHECK(parseIfList(userIbEnv, userIfs, MAX_IB_DEVS, &nUserIfs)); if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } @@ -619,9 +652,22 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { - continue; + int indexUserIf = -1; + if (!(indexIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact, &indexUserIf) ^ searchNot)) continue; + + // create the default fabric ID, use the user provided one if available + uint64_t fabId = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
(uint64_t)ncclParamIbDefaultFabricId() : (uint64_t)ncclParamRoceDefaultFabricId(); + if (indexUserIf != -1) { + int64_t ifFabricId = userIfs[indexUserIf].fabricId; + if (ifFabricId == ncclParamIbDefaultFabricId() || ifFabricId == ncclParamRoceDefaultFabricId()) { + INFO(NCCL_NET, "Cannot use device %s because the associated fabric Id = %ld conflicts with the default IB = %ld or RoCE = %ld ones. " + "Please consider changing the value of NCCL_IB_FABRICID_DEFAULT and NCCL_IB_ROCE_FABRICID_DEFAULT to avoid conflicts.", + devices[d]->name, ifFabricId, ncclParamIbDefaultFabricId(), ncclParamRoceDefaultFabricId()); + continue; + } + if (ifFabricId >= 0) fabId = (uint64_t)ifFabricId; } + pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; @@ -632,6 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ncclIbDevs[ncclNIbDevs].context = context; ncclIbDevs[ncclNIbDevs].pdRefs = 0; ncclIbDevs[ncclNIbDevs].pd = NULL; + ncclIbDevs[ncclNIbDevs].fid = fabId; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; @@ -645,8 +692,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d fabricId=%lu", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar,ncclIbDevs[ncclNIbDevs].fabricId); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -786,6 +833,7 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->fabricId = ibDev->fid; pthread_mutex_unlock(&ibDev->lock); return ncclSuccess; } @@ -2485,7 +2533,8 @@ ncclNet_t ncclNetIb = { ncclIbCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */, - ncclIbMakeVDevice + ncclIbMakeVDevice, + ncclIbgetNetPath }; /* diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 8034d95fe..15e28e25a 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -116,18 +116,20 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->fabricId = 0; return ncclSuccess; } /* Communication functions */ -#define MAX_SOCKETS 64 
+#define MAX_SOCKETS 128
 #define MAX_THREADS 16
 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
-#define MIN_CHUNKSIZE (64*1024)
 
 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
 NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+NCCL_PARAM(SocketInlineSize, "SOCKET_INLINE", /*1 KiB=*/1 << 10);
+NCCL_PARAM(SocketMinTaskSize, "SOCKET_MIN_TASKSIZE", /*64 KiB=*/1 << 16);
 
 enum ncclNetSocketCommState {
   ncclNetSocketCommStateStart = 0,
@@ -171,6 +173,7 @@ struct ncclNetSocketRequest {
   int op;
   void* data;
   int size;
+  void* inlineData;
   struct ncclSocket* ctrlSock;
   int offset;
   int used;
@@ -211,6 +214,7 @@ struct ncclNetSocketComm {
   int nSocks;
   int nThreads;
   int nextSock;
+  void* inlineData;
   struct ncclNetSocketRequest requests[MAX_REQUESTS];
   pthread_t helperThread[MAX_THREADS];
   struct ncclNetSocketThreadResources threadResources[MAX_THREADS];
@@ -360,6 +364,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm)
   goto exit;
 }
 
+#define SOCKET_CTRL_SIZE (sizeof(int))
 ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
@@ -401,6 +406,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op
     NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done));
     if (done == 0) return ncclSuccess;
   }
+  NCCLCHECK(ncclCalloc(&comm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize())));
   *sendComm = comm;
   return ncclSuccess;
 }
@@ -449,6 +455,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic
     memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
     free(sock);
   }
+  NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize())));
   *recvComm = rComm;
 
   /* reset lComm state */
@@ -470,6 +477,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi
       r->used = 1;
       r->comm = comm;
       r->nSubs = 0;
+      r->inlineData = (uint8_t*)comm->inlineData + i * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize());
       *req = r;
       return ncclSuccess;
     }
@@ -520,6 +528,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro
   return ncclInternalError;
 }
 
+// Number of payload bytes sent inline with the size header: dataSize itself when it does not exceed NCCL_SOCKET_INLINE, 0 otherwise (no inlining, which avoids the extra copy).
+static int ncclNetSocketInlineSize(int dataSize) { return (dataSize <= ncclParamSocketInlineSize()) ? dataSize : 0; }
+
 ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
   *done = 0;
   struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request;
@@ -527,37 +538,56 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
     WARN("NET/Socket : test called with NULL request");
     return ncclInternalError;
   }
-  if (r->used == 1) { /* try to send/recv size */
-    int data = r->size;
+  if (r->used == 1) { /* try to send/recv size (+ inline data if any) */
+    int msgSize;
+    uint8_t* msg = (uint8_t*)r->inlineData;
+    if (r->op == NCCL_SOCKET_SEND) {
+      // sender knows the real size: stage the size header followed by the inline payload in the request's slot
+      int inlineSize = ncclNetSocketInlineSize(r->size);
+      msgSize = inlineSize + SOCKET_CTRL_SIZE;
+      memcpy(msg, &r->size, SOCKET_CTRL_SIZE); // memcpy: the slot stride depends on NCCL_SOCKET_INLINE, so it may not be int-aligned
+      if (inlineSize > 0) memcpy(msg + SOCKET_CTRL_SIZE, r->data, inlineSize);
+    } else {
+      int sizeOffset = 0, senderSize = 0;
+      while (sizeOffset < SOCKET_CTRL_SIZE) {
+        NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, SOCKET_CTRL_SIZE, &sizeOffset));
+        if (sizeOffset == 0) return ncclSuccess; /* not ready yet*/
+      }
+      memcpy(&senderSize, msg, SOCKET_CTRL_SIZE);
+      if (senderSize > r->size) {
+        char line[SOCKET_NAME_MAXLEN + 1];
+        union ncclSocketAddress addr;
+        NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
+        WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, "
+             "there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
+             ncclSocketToString(&addr, line), senderSize, r->size);
+        return ncclInvalidUsage;
+      }
+      // from the actual size, extract the remaining inline size to be received and redirect the msg buffer to the user data
+      r->size = senderSize;
+      msgSize = ncclNetSocketInlineSize(r->size);
+      msg = (uint8_t*)r->data;
+    }
     int offset = 0;
-    NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
-
-    if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
-
-    // Not sure we could ever receive less than 4 bytes, but just in case ...
-    if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
-
-    // Check size is less or equal to the size provided by the user
-    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
-      char line[SOCKET_NAME_MAXLEN+1];
-      union ncclSocketAddress addr;
-      NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
-      WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
-          there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
-          ncclSocketToString(&addr, line), data, r->size);
-      return ncclInvalidUsage;
+    if (r->op == NCCL_SOCKET_RECV && msgSize > 0) {
+      // The size header has already been consumed from the socket and r->size overwritten: returning "not ready"
+      // here would make the next call parse the inline payload as a new header, so wait for the payload instead.
+      NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, msg, msgSize, &offset));
+    }
+    while (offset < msgSize) {
+      NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, msgSize, &offset));
+      if (offset == 0) return ncclSuccess; /* not ready yet*/
     }
-    r->size = data;
-    r->offset = 0;
-    r->used = 2; // done exchanging size
-    // divide into subtasks
-    int chunkOffset = 0, i = 0;
+    // done exchanging sizes, r->size now contains the actual size
+    r->used = 2;
+    r->offset = ncclNetSocketInlineSize(r->size);
+    int chunkOffset = r->offset, i = 0;
     if (r->comm->nSocks > 0) {
-      // each request can be divided up to nSocks tasks
-      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
+      // each request can be divided up to nSocks tasks, we use the size left to transfer
+      int taskSize = std::max((int)ncclParamSocketMinTaskSize(), DIVUP(r->size - r->offset, r->comm->nSocks));
       while (chunkOffset < r->size) {
-        int chunkSize = std::min(taskSize, r->size-chunkOffset);
-        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        int chunkSize = std::min(taskSize, r->size - chunkOffset);
+        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + i++));
         chunkOffset += chunkSize;
       }
     }
@@ -673,11 +703,19 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
       NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready));
       if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i]));
     }
+    if(comm->inlineData) free(comm->inlineData);
     free(comm);
   }
   return ncclSuccess;
 }
 
+// Reports NET_LOC_DCL0 when both fabric ids are equal and NET_LOC_DISC otherwise; a NULL path yields ncclInvalidArgument.
+ncclResult_t ncclSocketgetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if(!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1)? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 ncclNet_t ncclNetSocket = {
   "Socket",
   ncclNetSocketInit,
@@ -698,5 +736,6 @@ ncclNet_t ncclNetSocket = {
   ncclNetSocketCloseListen,
   NULL /* getDeviceMr */,
   NULL /* irecvConsumed */,
-  NULL /* mergeDevices */
+  NULL /* mergeDevices */,
+  ncclSocketgetNetPath
 };