diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h
index 85ea79ef7..4ac9c83d1 100644
--- a/ext-net/example/nccl/net.h
+++ b/ext-net/example/nccl/net.h
@@ -25,6 +25,7 @@
 
 typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
 
+#include "net_v11.h"
 #include "net_v10.h"
 #include "net_v9.h"
 #include "net_v8.h"
@@ -35,9 +36,10 @@ typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* p
 #include "net_v3.h"
 #include "net_v2.h"
 
-typedef ncclNet_v10_t ncclNet_t;
-typedef ncclNetProperties_v10_t ncclNetProperties_t;
-typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
-typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+typedef ncclNet_v11_t ncclNet_t;
+typedef ncclNetProperties_v11_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
+typedef ncclNetPath_v11_t ncclNetPath_t;
 
 #endif // end include guard
diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h
index d693101a3..8892a0221 100644
--- a/ext-net/example/nccl/net_device.h
+++ b/ext-net/example/nccl/net_device.h
@@ -27,6 +27,7 @@ typedef struct {
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
 typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
-typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
+typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
 
 #endif
diff --git a/ext-net/example/nccl/net_v11.h b/ext-net/example/nccl/net_v11.h
new file mode 100644
index 000000000..e73257db5
--- /dev/null
+++ b/ext-net/example/nccl/net_v11.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V11_H_
+#define NET_V11_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V11 4
+
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V11];
+} ncclNetVDeviceProps_v11_t;
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v11_t;
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v11_t vProps;
+  size_t maxP2pBytes;  // Max transfer size for point-to-point operations
+  size_t maxCollBytes; // Max transfer size for collective operations
+  uint64_t fabricId;   // Fabric handle associated to the current device
+} ncclNetProperties_v11_t;
+
+typedef enum {
+  NET_LOC_DCL0 = 0 /* same DC, hierarchy level 0*/,
+  NET_LOC_DCL1 = 1 /* different DC, hierarchy level 1 */,
+  NET_LOC_DISC = 2 /* disconnected*/
+} ncclNetLoc_v11_t;
+
+typedef struct {
+  ncclNetLoc_v11_t loc;
+} ncclNetPath_v11_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v11_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
+
+  // topology API. getNetPath returns the path between two fabricIds.
+  ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path);
+} ncclNet_v11_t;
+
+#endif // end include guard
diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c
index 97a29875d..9468d13c9 100644
--- a/ext-net/example/plugin.c
+++ b/ext-net/example/plugin.c
@@ -17,7 +17,6 @@ __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalE
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
   // Below are default values, if unsure don't change.
-
   props->name = "Example";
   // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
   props->pciPath = NULL;
@@ -48,6 +47,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
   // maximum transfer sizes the plugin can handle
   props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
   props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+  // all devs can connect to each other, fabric ID is 0 (any other value would work)
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -67,10 +68,12 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
 __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
+__hidden ncclResult_t pluginGetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { return ncclInternalError; }
 
 #define PLUGIN_NAME "Plugin"
 
-const ncclNet_v10_t ncclNetPlugin_v10 = {
+
+const ncclNet_v11_t ncclNetPlugin_v11 = {
   .name = PLUGIN_NAME,
   .init = pluginInit,
   .devices = pluginDevices,
@@ -91,6 +94,59 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
   .getDeviceMr = pluginGetDeviceMr,
   .irecvConsumed = pluginIrecvConsumed,
   .makeVDevice   = pluginMakeVDevice,
+  .getNetPath = pluginGetNetPath,
+};
+
+// v10 getProperties shim: query the current-version properties and copy them into the v10 struct (fabricId is not copied).
+__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props_v10) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v10->name = props.name;
+  props_v10->pciPath = props.pciPath;
+  props_v10->guid = props.guid;
+  props_v10->ptrSupport = props.ptrSupport;
+  props_v10->regIsGlobal = props.regIsGlobal;
+  props_v10->forceFlush = props.forceFlush;
+  props_v10->speed = props.speed;
+  props_v10->port = props.port;
+  props_v10->latency = props.latency;  // previously never copied, leaving the caller's field uninitialized
+  props_v10->maxComms = props.maxComms; props_v10->maxRecvs = props.maxRecvs;
+  props_v10->netDeviceType = props.netDeviceType;
+  props_v10->netDeviceVersion = props.netDeviceVersion;
+  props_v10->vProps.ndevs = props.vProps.ndevs;
+  memcpy(props_v10->vProps.devs, props.vProps.devs, sizeof(props.vProps.devs));
+  props_v10->maxP2pBytes = props.maxP2pBytes; props_v10->maxCollBytes = props.maxCollBytes;
+  return ncclSuccess;
+}
+
+__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return pluginConnect(dev, (ncclNetCommConfig_t*)config, handle, sendComm, sendDevComm);
+}
+
+__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; }
+
+const ncclNet_v10_t ncclNetPlugin_v10 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v10,
+  .listen = pluginListen,
+  .connect = pluginConnect_v10,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v10,
 };
 
 __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 9e24faadf..ff9bc160d 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -479,8 +479,9 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
   return ncclSuccess;
 }
 #define MAX_OOB_DEVS 16
-static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
+static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, ncclNet_t** net, int* dev) {
   static int devOOB = -1;
+  static ncclNet_t* netOOB = NULL;
   if (devOOB < 0) {
     pthread_mutex_lock(&bootstrapNetLock);
     if (devOOB < 0) {
@@ -491,24 +492,28 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
         if (searchNot) userIfEnv++;
         bool searchExact = userIfEnv && userIfEnv[0] == '=';
         if (searchExact) userIfEnv++;
+        int nUserIfs;
         struct netIf userIfs[MAX_OOB_DEVS];
-        int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS);
-        // loop over the device and return the first one matching
+        NCCLCHECK(parseIfList(userIfEnv, userIfs, MAX_OOB_DEVS,&nUserIfs));
+        // loop over all nets and all devices and return the first one matching
         int nDev = 0;
-        NCCLCHECK(comm->ncclNet->devices(&nDev));
-        int devId = 0;
-        while (devId < nDev) {
-          ncclNetProperties_t props;
-          comm->ncclNet->getProperties(devId, &props);
-          // check against user specified HCAs/ports
-          if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
-            // All plain physical devices have been initialized at this point
-            devOOB = devId;
-            break;
+        for (int n = 0; n < comm->ncclNetCount && devOOB < 0; ++n) {  // stop at the first net that yields a match
+          NCCLCHECK(comm->ncclNet[n]->devices(&nDev));
+          int devId = 0;
+          while (devId < nDev) {
+            ncclNetProperties_t props;
+            comm->ncclNet[n]->getProperties(devId, &props);
+            // check against user specified HCAs/ports
+            if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
+              // All plain physical devices have been initialized at this point
+              devOOB = devId;
+              netOOB = comm->ncclNet[n];
+              break;
+            }
+            devId++;
           }
-          devId++;
         }
-        if (devOOB == -1) {
+        if (devOOB == -1 || !netOOB) {
           if (!searchNot)
             WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
           else
@@ -519,16 +524,18 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
       } else {
         // default choice is device 0
         devOOB = 0;
+        netOOB = comm->ncclNet[0];
       }
       // display info on the chosen device
       ncclNetProperties_t props;
-      ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props);
+      ncclResult_t res = netOOB->getProperties(devOOB, &props);
       bool hasProp = res == ncclSuccess;
       INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
     }
     pthread_mutex_unlock(&bootstrapNetLock);
   }
   *dev = devOOB;
+  *net = netOOB;
   return ncclSuccess;
 }
 
@@ -636,7 +643,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   state->nranks = nranks;
   state->cudaDev = comm->cudaDev;
   state->abortFlag = comm->abortFlag;
-  state->net = comm->ncclNet;
+  state->net = NULL;
   comm->bootstrap = state;
   comm->magic = state->magic = BOOTSTRAP_HANDLE(handles, 0)->magic; // state and comm magic set to the first magic ID
 
@@ -651,7 +658,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_CREATE]);
   if (ncclParamBootstrapNetEnable()) {
     // Create net interface for other ranks to contact me (all gather)
-    NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
+    NCCLCHECK(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev)));
     NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
     memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
   } else {
@@ -789,7 +796,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
   state->nranks = nranks;
   state->cudaDev = comm->cudaDev;
   state->abortFlag = comm->abortFlag;
-  state->net = comm->ncclNet;
+  state->net = NULL;
   comm->bootstrap = state;
   comm->magic = state->magic = magic;
 
@@ -798,7 +805,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
 
   // create a handle for the others to reach out to me
   if (ncclParamBootstrapNetEnable()) {
-    NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
+    NCCLCHECKGOTO(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev)), ret, fail);
     NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
     memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
   } else {
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 76b508c2d..a751d9639 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -65,10 +65,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
       }
     }
   }
-  // Duplicate channels trees
-  struct ncclChannel* channel0 = comm->channels;
-  struct ncclChannel* channel1 = channel0+nChannels;
-  memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
 
   // Get nvls heads and the number of heads. Duplicate head is not allowed.
   for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
@@ -90,22 +86,80 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
-static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
-  int nChannels = comm->nChannels;
-  int nNodes = comm->nNodes;
-  for (int c=0; c<nChannels; c++) {
-    int* recv = ringRecv+c*comm->nNodes;
-    int* send = ringSend+c*comm->nNodes;
-    int* prev = ringPrev+c*comm->nRanks;
-    int* next = ringNext+c*comm->nRanks;
-    for (int n=0; n<nNodes; n++) {
-      int recvRank = recv[n];
-      int prevSendRank = send[(n-1+nNodes)%nNodes];
+// ScatterXDc = 0 will disable the scattering of the channels on different NICs.
+// This way we ensure that all channels will follow the same rank order.
+NCCL_PARAM(ScatterXDc, "SCATTER_XDC", 1);
+
+// returns the node that will be used to cross-DC for a given channel (always 0 when nNodes <= 1)
+static int channelToNode(struct ncclComm* comm, int nNodes, int channel, int nChannels) {
+  if (nNodes <= 1) return 0;  // avoids the % 0 below: rootNodeTree0() passes localNodes / 2, i.e. 0 for a single-node DC
+  int pow2 = 1;
+  while ((pow2 << 1) <= nNodes) pow2 <<= 1;
+  // In the case of multiple communicators on the same node, they will all have the same value of c, which leads them to use the same node to cross the DC.
+  // To avoid this, we offset the channel index by the nvmlDev index on rank 0.
+  // For example with 4 GPUs/node and 16 channels/comm (so c = 0 - 15):
+  // comm 0 (GPU 0-3) will have channel index of 0-15, and comm 1 (GPU 4-7) will have channel index of 16-31.
+  // This way (GPU as the outer index), we make sure that if two GPUs need to share a node, it will not be adjacent GPUs.
+  // For example, if two out of 8 GPUs need to share node 0, it will be GPU 0 and 4.
+  int commId = nChannels * comm->peerInfo[0].nvmlDev + (ncclParamScatterXDc() ? channel : 0);
+  return mirrorBits(commId, pow2) % nNodes;
+}
+
+static int getCrossNodeForRing(struct ncclComm* comm, struct ncclDcNode* dc, int c, int nChannels) {
+  return channelToNode(comm, dc->localNodes, c, nChannels);
+}
+
+// Entry of dc->localNodeToNode for local index i; i is offset by localNodes and taken modulo localNodes, so i == -1 and i == localNodes wrap around.
+#define NODE_FROM_DC(dc, i) ((dc)->localNodeToNode[((i) + (dc)->localNodes) % (dc)->localNodes])
+// connect the inter-node for nChannels rings. For each ring, ringPrev and ringNext store respectively the previous and the next rank in the ring for all the ranks.
+// We close the rings inter-node using the search channels information: ringRecv and ringSend.
+// For each search channel, ringRecv and ringSend contains the recv and send rank on each node.
+static ncclResult_t connectRings(struct ncclComm* comm, int nChannels, struct ncclChannel* channels, int* ringPrev, int* ringNext, int nSearchChannels, int* ringRecv, int* ringSend) {
+  INFO(NCCL_GRAPH, "%s: comm 0x%lx connecting %d ring channels using %d search channels", __func__, comm->commHash, nChannels, nSearchChannels);
+  int nDc = comm->dcCount;
+  for (int c = 0; c < nChannels; c++) {
+    int* prev = ringPrev + c * comm->nRanks;
+    int* next = ringNext + c * comm->nRanks;
+    int* recv = ringRecv + (c % nSearchChannels) * comm->nNodes;
+    int* send = ringSend + (c % nSearchChannels) * comm->nNodes;
+    for (int dc = 0; dc < nDc; ++dc) {
+      struct ncclDcNode* dcNode = &comm->dcNode[dc];
+      // we first connect all the nodes inside the same DC together
+      int nNodesInDc = dcNode->localNodes;
+      for (int iNode = 0; iNode < nNodesInDc; iNode++) {
+        int node = dcNode->localNodeToNode[iNode];
+        // recv connects to the prev send
+        const int prevNode = dcNode->localNodeToNode[(iNode - 1 + nNodesInDc) % nNodesInDc];
+        const int recvRank = recv[node];
+        const int prevSendRank = send[prevNode];
+        prev[recvRank] = prevSendRank;
+        // send connect to the next recv
+        const int nextNode = dcNode->localNodeToNode[(iNode + 1) % nNodesInDc];
+        const int sendRank = send[node];
+        const int nextRecvRank = recv[nextNode];
+        next[sendRank] = nextRecvRank;
+      }
+    }
+    // for each DC, we open the rings between node getCrossNodeForRing(dc,c) and getCrossNodeForRing(dc+1,c) + 1
+    for (int dc = 0; dc < nDc; ++dc) {
+      struct ncclDcNode* currDc = &comm->dcNode[dc];
+      struct ncclDcNode* nextDc = &comm->dcNode[(dc + 1) % nDc];
+      struct ncclDcNode* prevDc = &comm->dcNode[(dc - 1 + nDc) % nDc];
+      int crossNode = getCrossNodeForRing(comm, currDc, c, nChannels);
+      INFO(NCCL_GRAPH, "%s: crossNode on DC %d of ring[%d] is node %d", __func__, dc, c, crossNode);
+      // recv from the previous DC
+      int recvRank = recv[NODE_FROM_DC(currDc, crossNode + 1)];
+      int prevSendRank = send[NODE_FROM_DC(prevDc, getCrossNodeForRing(comm, prevDc, c, nChannels))];
       prev[recvRank] = prevSendRank;
-      int sendRank = send[n];
-      int nextRecvRank = recv[(n+1)%nNodes];
+      INFO(NCCL_GRAPH, "DC %d - ring %d: connecting rank %d (prev DC) -> rank %d", dc, c, prevSendRank, recvRank);
+      // send to the next recv
+      int sendRank = send[NODE_FROM_DC(currDc, crossNode)];
+      int nextRecvRank = recv[NODE_FROM_DC(nextDc, getCrossNodeForRing(comm, nextDc, c, nChannels) + 1)];
       next[sendRank] = nextRecvRank;
+      INFO(NCCL_GRAPH, "DC %d - ring %d: connecting rank %d -> rank %d (next DC)", dc, c, sendRank, nextRecvRank);
     }
+    channels[c].ring.prev = prev[comm->rank];
+    channels[c].ring.next = next[comm->rank];
   }
   return ncclSuccess;
 }
@@ -133,41 +187,143 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
   return ncclSuccess;
 }
 
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
-  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+static int rootNodeTree0(struct ncclComm* comm, struct ncclDcNode* dc, int tree0Id, int nTrees0) {
+  // Per trees.cc, with R0 the root of tree 0, the root of tree 1 is (R0+1)%nNodes (nNodes odd) or (R0+nNodes-1)%nNodes (nNodes even).
+  // Roots of trees 0 are therefore spread over the even nodes only: choose among localNodes/2 slots, then double the slot index.
+  const int evenSlots = dc->localNodes / 2;
+  return channelToNode(comm, evenSlots, tree0Id, nTrees0) * 2;
+}
 
-  // Compute tree depth. Not an exact value but a good approximation in most
-  // cases
-  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+static int shiftedNodeFromLocalNode(struct ncclComm* comm, struct ncclDcNode* dc, int localNode, int tree0Id, int nTrees0) {
+  const int treeRoot = rootNodeTree0(comm, dc, tree0Id, nTrees0);  // root of primal tree tree0Id inside this DC
+  return (localNode + dc->localNodes - treeRoot) % dc->localNodes;  // local index expressed relative to that root, modulo localNodes
+}
 
-  int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
-  int* ttp, *ttc0, *ttc1;
-  NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
-  for (int c=0; c<nChannels; c++) {
-     struct ncclChannel* channel0 = comm->channels+c;
-     struct ncclChannel* channel1 = channel0+nChannels;
-     ttp = treeToParent+c*comm->nNodes;
-     ttc0 = treeToChild0+c*comm->nNodes;
-     ttc1 = treeToChild1+c*comm->nNodes;
-     if (comm->rank == ttp[node]) {
-       NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
-       NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
-     }
-     if (comm->rank == ttc0[node]) {
-       NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
-       NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
-     }
-     if (comm->rank == ttc1[node]) {
-       NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
-       NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
-     }
-     if (comm->rank == ttp[node] ||
-         comm->rank == ttc0[node] ||
-         comm->rank == ttc1[node]) {
-       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
-       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
-     }
-     channel0->tree.depth = channel1->tree.depth = depth;
+static int globalNodeFromShiftedNode(struct ncclComm* comm, struct ncclDcNode* dc, int shiftedNode, int tree0Id, int nTrees0) {
+  if (shiftedNode == -1) return -1;  // -1 is passed through untranslated
+  // add the tree root back (inverse of shiftedNodeFromLocalNode), then look up the node through NODE_FROM_DC
+  const int unshifted = (rootNodeTree0(comm, dc, tree0Id, nTrees0) + shiftedNode) % dc->localNodes;
+  return NODE_FROM_DC(dc, unshifted);
+}
+
+// Looks up `node` in every comm->dcNode[].localNodeToNode table. On a match, sets *dcId / *localNode and returns that dcNode entry.
+// When no table contains `node`, sets both outputs to -1 and returns NULL (previously &comm->dcNode[-1]); `node` used to be ignored in favor of comm->node.
+static struct ncclDcNode* dcIdFromGlobalNode(struct ncclComm* comm, int node, int* dcId, int* localNode) {
+  for (int dc = 0; dc < comm->dcCount; ++dc) {
+    for (int n = 0; n < comm->dcNode[dc].localNodes; ++n) {
+      if (comm->dcNode[dc].localNodeToNode[n] == node) {
+        *dcId = dc;
+        *localNode = n;
+        return &comm->dcNode[dc];
+      }
+    }
+  }
+  *dcId = *localNode = -1;
+  return NULL;
+}
+
+// connect a total of nChannels trees (nChannels/2 primary trees and nChannels/2 dual trees) using the channels found in the search. Both trees will use the same search channel.
+// For each search channel, treeToParent, treeToChild0, and treeToChild1, contain respectively the rank communicating with the parent, the child0, and the child 1 for each node.
+static ncclResult_t connectTrees(struct ncclComm* comm, const int nChannels, struct ncclChannel* channels, const int nSearchChannels, int* treeToParent, int* treeToChild0,
+                                 int* treeToChild1, int* treePatterns) {
+  int dcId = -1, localNode = -1;
+  const int nNodes = comm->nNodes, node = comm->node, nDc = comm->dcCount;
+  struct ncclDcNode* currDc = dcIdFromGlobalNode(comm, comm->node, &dcId, &localNode);
+
+  // Compute tree depth. Not an exact value but a good approximation in most cases
+  int maxDepthDc = 0;
+  for (int d = 0; d < nDc; ++d) maxDepthDc = std::max((int)log2i(comm->dcNode[d].localNodes), maxDepthDc);
+  int depth = /*intraNode*/ (comm->nRanks / nNodes - 1) + /*intra-DC*/ maxDepthDc + /*inter-DC*/ (nDc - 1);
+
+  int nTrees1 = nChannels / 2;
+  int nTrees0 = nTrees1 + (nChannels % 2);
+  INFO(NCCL_GRAPH, "%s: comm 0x%lx connecting %d tree channels (%d primal, %d dual) using %d search channels", __func__, comm->commHash, nChannels, nTrees0, nTrees1, nSearchChannels);
+  for (int c0 = 0; c0 < nTrees0; c0++) {
+    // primal and dual channels, if nChannels is odd, the last dual is not done
+    struct ncclChannel* channel0 = channels + c0;
+    struct ncclChannel* channel1 = (c0 < nTrees1) ? (channels + nTrees0 + c0) : NULL;
+    // dual channel (channel1) has to be the same as channel0. This might not be the case if the number of search channels is higher than the number of desired channels.
+    if (channel1) memcpy(&channel1->tree, &channel0->tree, sizeof(struct ncclTree));
+
+    int* ttp = treeToParent + (c0 % nSearchChannels) * comm->nNodes;
+    int* ttc0 = treeToChild0 + (c0 % nSearchChannels) * comm->nNodes;
+    int* ttc1 = treeToChild1 + (c0 % nSearchChannels) * comm->nNodes;
+    //  intraDC tree: each primal/dual trees will get a different root. the shifted ID is the same for both the primal and the dual tree.
+    int tt[6];
+    int t0ChildType, t1ChildType;
+    int nodeShifted = shiftedNodeFromLocalNode(comm, currDc, localNode, c0, nTrees0);
+    NCCLCHECK(ncclGetDtree(comm->dcNode[dcId].localNodes, nodeShifted, tt + 0, tt + 1, tt + 2, &t0ChildType, tt + 3, tt + 4, tt + 5, &t1ChildType));
+    // we need to retranslate the shifted local indexes into unshifted global index
+    int tu[2] = {globalNodeFromShiftedNode(comm, currDc, tt[0], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[3], c0, nTrees0)};
+    int td0[2] = {globalNodeFromShiftedNode(comm, currDc, tt[1], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[4], c0, nTrees0)};
+    int td1[2] = {globalNodeFromShiftedNode(comm, currDc, tt[2], c0, nTrees0), globalNodeFromShiftedNode(comm, currDc, tt[5], c0, nTrees0)};
+    if (comm->rank == ttp[node]) {
+      NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, tu[0]));
+      if (channel1) NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, tu[1]));
+    }
+    if (comm->rank == ttc0[node]) {
+      NCCLCHECK(setTreeDown(&channel0->tree, ttp, td0[0]));
+      if (channel1) NCCLCHECK(setTreeDown(&channel1->tree, ttp, td0[1]));
+    }
+    if (comm->rank == ttc1[node]) {
+      NCCLCHECK(setTreeDown(&channel0->tree, ttp, td1[0]));
+      if (channel1) NCCLCHECK(setTreeDown(&channel1->tree, ttp, td1[1]));
+    }
+    if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
+      INFO(NCCL_GRAPH, "Tree %d : %d <-> %d <-> %d/%d/%d", c0, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+      if (channel1)
+        INFO(NCCL_GRAPH, "Tree %d : %d <-> %d <-> %d/%d/%d", c0 + nTrees0, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+    }
+
+    /* chain DC roots together to create a single tree
+    Example with 3 DCs (R0, R1, and R2 represent the roots of each DC's tree; T0, T1, and T2 the rest of the corresponding tree)
+               R2
+              /  \
+           R1     T2
+         /   \
+      R0      T1
+       \
+        T0
+    */
+    const bool isNodeRoot[2] = {(tu[0] == -1), (tu[1] == -1)};
+    if (isNodeRoot[0] || isNodeRoot[1]) {
+      int root0, root1;
+      // connect rank = ttp[root node of current DC] to the ttc0[root node of next DC]
+      if (comm->rank == ttp[node] && dcId < (nDc - 1)) {
+        NCCLCHECK(ncclGetDtreeRoots(comm->dcNode[dcId + 1].localNodes, &root0, &root1));
+        int nodeR0 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId + 1], root0, c0, nTrees0);
+        int nodeR1 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId + 1], root1, c0, nTrees0);
+        if (isNodeRoot[0]) {
+          NCCLCHECK(setTreeUp(&channel0->tree, ttc0, nodeR0));
+          INFO(NCCL_GRAPH, "%s: comm 0x%lx primal TREE %d/%d -> rank %d (node %d, DC %d) up to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees0, comm->rank, node,
+               dcId, ttc0[nodeR0], nodeR0, dcId + 1);
+        }
+        if (isNodeRoot[1] && channel1) {
+          NCCLCHECK(setTreeUp(&channel1->tree, ttc0, nodeR1));
+          INFO(NCCL_GRAPH, "%s: comm 0x%lx dual TREE %d/%d -> rank %d (node %d, DC %d) up to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees1, comm->rank, node,
+               dcId, ttc0[nodeR1], nodeR1, dcId + 1);
+        }
+      }
+      // connect rank = ttc0[root node of current DC] to the ttp[root node of previous DC]
+      if (comm->rank == ttc0[node] && dcId > 0) {
+        // if I am the rank talking to the child 0, establish connection with the previous DC root
+        NCCLCHECK(ncclGetDtreeRoots(comm->dcNode[dcId - 1].localNodes, &root0, &root1));
+        int nodeR0 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId - 1], root0, c0, nTrees0);
+        int nodeR1 = globalNodeFromShiftedNode(comm, &comm->dcNode[dcId - 1], root1, c0, nTrees0);
+        if (isNodeRoot[0]) {
+          NCCLCHECK(setTreeDown(&channel0->tree, ttp, nodeR0));
+          INFO(NCCL_GRAPH, "%s: comm 0x%lx primal TREE %d/%d -> rank %d (node %d, DC %d) down to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees0, comm->rank, node,
+               dcId, ttp[nodeR0], nodeR0, dcId - 1);
+        }
+        if (isNodeRoot[1] && channel1) {
+          NCCLCHECK(setTreeDown(&channel1->tree, ttp, nodeR1));
+          INFO(NCCL_GRAPH, "%s: comm 0x%lx dual TREE %d/%d -> rank %d (node %d, DC %d) down to rank %d (node %d, DC %d) ", __func__, comm->commHash, c0, nTrees1, comm->rank, node,
+               dcId, ttp[nodeR1], nodeR1, dcId - 1);
+        }
+      }
+    }
+    channel0->tree.depth = depth;
+    if (channel1) channel1->tree.depth = depth;
   }
   return ncclSuccess;
 }
@@ -356,8 +512,8 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
   int nranks = comm->nRanks;
   int c;
   for (c=start; c<end; c++) {
-    memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
-    memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
+    memcpy(ringPrev+c*nranks, ringPrev+(c%start)*nranks, nranks*sizeof(int));
+    memcpy(ringNext+c*nranks, ringNext+(c%start)*nranks, nranks*sizeof(int));
     memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
   }
   return c;
@@ -377,24 +533,26 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
   int nranks = comm->nRanks;
   int nNodes = comm->nNodes;
-  int nChannels = comm->nChannels;
+  int nSearchChannels = comm->nChannels;
+  int maxChannels, minChannels;
+
   int minHeadNum = INT_MAX;
-  int shared = parent && parent->nvlsSupport  && parent->config.splitShare;
-  NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
-  NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
+  int shared = parent && parent->nvlsSupport && parent->config.splitShare;
+  NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks * MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks * MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ringRecv, nNodes * nSearchChannels), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes * nSearchChannels), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes * nSearchChannels), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes * nSearchChannels), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes * nSearchChannels), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes * MAXCHANNELS), ret, fail);
 
   // Alternate rings to avoid crossing rails
-  if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
+  if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nSearchChannels % 2) == 0) {
     for (int r=0; r<comm->nRanks; r++) {
       if (comm->rankToNode[r] % 2 == 1) {
         // Exchange rings
-        for (int c=0; c<nChannels; c+=2) {
+        for (int c=0; c<nSearchChannels; c+=2) {
           exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
           exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
           exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
@@ -404,8 +562,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     }
   }
 
-  for (int c=0; c<nChannels;c++) {
-    for (int n=0; n<nNodes; n++) {
+  for (int c = 0; c < nSearchChannels; c++) {
+    for (int n = 0; n < nNodes; n++) {
       int r = firstRanks[n];
       ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
       ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
@@ -432,56 +590,47 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     }
   }
 
-  // Connect rings and trees. This should also duplicate the channels.
-  NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
-  NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
-
-  // Duplicate ringPrev/ringNext for ncclBuildRing
-  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
-  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
-
-  // Set ring prev/next for my rank
-  for (int c=0; c<nChannels; c++) {
-    struct ncclChannel* channel0 = comm->channels+c;
-    struct ncclChannel* channel1 = channel0+nChannels;
-    channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
-    channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
-  }
-
-  // Duplication should be complete now
-  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+  // each search channel gets two compute channels
+  comm->nChannels = std::min(MAXCHANNELS, nSearchChannels * 2);
 
   // Setup CollNet
   if (comm->collNetSupport == 1) {
     struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
     // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
     if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
-      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
-      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
+      comm->nChannels = std::min(MAXCHANNELS, comm->nChannels + comm->nChannels / 2);
     }
     NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
   }
 
   // Use 4 compute channels per search channel to reach peak BW on <8 PPN
-  if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
-     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+  if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && comm->nChannels < 16) {
+    comm->nChannels = std::min(MAXCHANNELS, comm->nChannels * 2);
   }
 
   // Double the number of channels when using unpack networking (greater than 1 node)
   // We won't automatically double past 16 channels, users can specify 32 if they want
-  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
-     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && comm->nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
+    comm->nChannels = std::min(MAXCHANNELS, comm->nChannels * 2);
   }
 
-  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
-  // We permit combining max, then min, to only use the first channels, then duplicate them.
+  // Honor NCCL_MIN/MAX_CTAS and NCCL_MIN/MAX_NCHANNELS
+  // child comm #channels cannot exceed top parent #channels.
   if (comm->sharedRes->owner != comm) {
-    /* child comm #channels cannot exceed top parent #channels. */
-    nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
-    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
+    minChannels = std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels);
+    maxChannels = std::min(std::min(ncclMaxNchannels(), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
   } else {
-    nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
-    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
+    minChannels = std::max(ncclMinNchannels(), comm->config.minCTAs);
+    maxChannels = std::min(ncclMaxNchannels(), comm->config.maxCTAs);
+  }
+  comm->nChannels = std::max(minChannels, std::min(comm->nChannels, maxChannels));
+  if (comm->nChannels > nSearchChannels) comm->nChannels = copyChannels(comm, nSearchChannels, comm->nChannels, ringPrev, ringNext);
+  NCCLCHECKGOTO(connectRings(comm, comm->nChannels, comm->channels, ringPrev, ringNext, nSearchChannels, ringRecv, ringSend), ret, fail);
+  NCCLCHECKGOTO(connectTrees(comm, comm->nChannels, comm->channels, nSearchChannels, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
+
+  // We permit combining max, then min, to only use the first max channels, then duplicate them.
+  if (maxChannels < minChannels) {
+    comm->nChannels = copyChannels(comm, maxChannels, minChannels, ringPrev, ringNext);
   }
 
   comm->collChannels = comm->nChannels;
@@ -491,17 +640,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     comm->nvlsChannels = parent->nvlsResources->nChannels;
   }
   if (comm->nChannels < comm->nvlsChannels) {
-    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
+    comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
   }
   NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
 #endif
   if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
-    nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
+    comm->nChannels = parent->sharedRes->tpNChannels;
     comm->collChannels = std::min(comm->collChannels, comm->nChannels);
   }
 
   // Create rings array and check all is fine
-  NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
+  NCCLCHECKGOTO(ncclBuildRings(comm->nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
 
 exit:
   if (ringRecv) free(ringRecv);
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index ace4476f6..b9b2804df 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -473,10 +473,10 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a
 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
 
 // Determine whether we need to flush the GDR recv buffers
-ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netIdx, int netDev, int rank, int* flush) {
   *flush = 1;
   ncclNetProperties_t props;
-  NCCLCHECK(comm->ncclNet->getProperties(netDev, &props));
+  NCCLCHECK(comm->ncclNet[netIdx]->getProperties(netDev, &props));
   if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess;
   int g;
   struct ncclTopoSystem* system = comm->topo;
@@ -530,6 +530,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank
 }
 
 ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) {
+  if (intermediateRank) *intermediateRank = -1;
+  if (netId == -1 || !intermediateRank) return ncclSuccess;
   // Get GPU and NET
   int n, g;
   NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
@@ -561,12 +563,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0);
 int ncclPxnDisable(struct ncclComm* comm) {
   static int pxnDisable = -1;
   if (pxnDisable == -1) {
-    if (comm && comm->ncclNetVer == 4) {
-      INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
-      pxnDisable = 1;
-    } else {
-      pxnDisable = ncclParamPxnDisable();
-    }
+    pxnDisable = ncclParamPxnDisable(); // value is kept in the function-local static; comm is no longer inspected in this body
   }
   return pxnDisable;
 }
@@ -582,7 +579,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
   for (int rank=0; rank<comm->nRanks; rank++) {
     int64_t netId;
     int proxyRank;
-    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
+    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, rank, &netId, NULL, &proxyRank));
     if (proxyRank == comm->rank) continue;
     enum ncclTopoGdrMode useGdr;
     NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr));
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 15a01243f..a14b9e048 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -438,6 +438,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 }
 
 // Build a sorted list of the NETs to try.
+// The NETs returned are compatible with at least one element in the list of netDevs.
 //
 // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
 //  index when trying to get back to the NIC.
@@ -446,7 +447,8 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
 // 2. add other NETs satisfying typeInter but not already in the list.
 
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
+static ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpuId, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet, int netDevCount,
+                                       struct ncclNetDev* netDevs) {
   ncclResult_t ret = ncclSuccess;
   int netCount = 0;
   int localNetCount;
@@ -454,12 +456,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
 
   // First add the preferred NICs
   for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (gpu != -1 && gpu != g) continue;
+    if (gpuId != -1 && gpuId != g) continue;
     localNetCount = 0;
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     for (int c = 0; c<MAXCHANNELS; c++) {
       int64_t netId;
-      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL, /*pathType=*/NULL, netDevCount, netDevs));
+      if (netId == -1) { // no local NET was returned for this gpu/channel with the requested netDevs: report and fail
+        char msg[256] = ""; // must start as an empty string: the loop appends at msg + strlen(msg), and may not run at all
+        for (int i = 0; i < netDevCount; ++i) snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "%s %d:%lu", (i == 0) ? "" : ",", netDevs[i].netIdx, netDevs[i].fabricId);
+        WARN("Could not find any local path from gpu %d to net with%s", gpu->gpu.rank, msg);
+        return ncclInternalError;
+      }
+
       NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
       if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
       localNetCount++;
@@ -476,12 +485,20 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
   // Then add others satisfying typeInter
   for (int t=0; t <= typeInter; t++) {
     for (int g=0; g<system->nodes[GPU].count; g++) {
-      if (gpu != -1 && gpu != g) continue;
+      if (gpuId != -1 && gpuId != g) continue;
       localNetCount = 0;
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       struct ncclTopoLinkList* paths = gpu->paths[NET];
       for (int n=0; n<system->nodes[NET].count && n<MAXCHANNELS; n++) {
-        if (paths[n].type == t) localNets[localNetCount++] = n;
+        struct ncclTopoNode* node = &system->nodes[NET].nodes[n];
+        // add if the type is right and the net node is compatible with one of the requested devices (or no device list was given)
+        int listId = 0;
+        for (listId = 0; listId < netDevCount; listId++) {
+          ncclNetPath_t path = {.loc = NET_LOC_DISC};
+          if (node->net.netIdx == netDevs[listId].netIdx) NCCLCHECK(node->net.getNetPath(node->net.fabricId, netDevs[listId].fabricId, &path));
+          if (path.loc < NET_LOC_DISC) break;
+        }
+        if (paths[n].type == t && (netDevCount == 0 || listId < netDevCount)) localNets[localNetCount++] = n;
       }
       // Append NICs to list
       for (int i=0; i<localNetCount; i++) {
@@ -526,12 +543,16 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
+      // When coming back to the NIC, we must select a NIC compatible with the startNet.
+      // We compare two NICs on the same node, so they are either fast connected, or disconnected. A simple == check is enough.
+      // A more thorough check would require access to the comm's ncclNet array, which we don't have here.
       int netCount;
-      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
+      struct ncclNetDev netDev = {.netIdx = startNet->net.netIdx, .fabricId = startNet->net.fabricId};
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount, 1, &netDev));
       for (int i=0; i<netCount; i++) {
         int n = nets[i];
         struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
+        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // enter and exit through the same NIC
         if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
           if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
         } else {
@@ -597,12 +618,13 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
+static ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
   const int bw = graph->bwInter;
   int nets[NCCL_TOPO_MAX_NODES];
   int netCount;
   int graphFound = 0;
-  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
+  // when getting the first net dev, there is no restriction on the (netIdx, fabricId)
+  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount, /*netDevCount=*/0, /*netDevs=*/NULL));
   for (int i=0; i<netCount; i++) {
     if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
     int n = nets[(graph->nChannels+i)%netCount];
@@ -809,7 +831,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
   NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
   NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
   NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
-  if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
+  NCCLCHECK(xmlGetAttrFloatDefault(xmlGraph, "latencyinter", &graph->latencyInter, 0.0));
   const char* str;
   NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
   NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
@@ -1178,7 +1200,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
 
 #include "comm.h"
 // NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
-ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) {
+static ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) {
   ncclResult_t ret = ncclSuccess;
   int localRanks = comm->topo->nodes[GPU].count;
   int netNum = 0;
@@ -1192,42 +1214,93 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
   if (netNum) {
     *netId = net[channelId % netNum];
   } else {
+    // in case of error, it means that the current rank is not an NVLS head
+    // the caller is responsible to determine if it's an error or not
     ret = ncclInternalError;
-    goto fail;
   }
-
-exit:
   return ret;
-fail:
-  WARN("Could not find NIC for rank %d in NVLS graph", comm->rank);
-  goto exit;
 }
 
 // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
 NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
 
-ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank) {
+static ncclResult_t ncclTopoPrintNetDev(int netDevCount, struct ncclNetDev* netDevs, char* msg, size_t len) { // formats the first netDevCount entries of netDevs into msg (capacity len bytes)
+  snprintf(msg, len, "netDevs:"); // writes the prefix at the start of msg, so previous buffer content is not relied upon
+  for (int i = 0; i < netDevCount; ++i) {
+    snprintf(msg + strlen(msg), len - strlen(msg), "%s %d:%lu", (i == 0) ? "" : ",", netDevs[i].netIdx, netDevs[i].fabricId); // appends " <netIdx>:<fabricId>", preceded by "," after the first entry
+  }
+  return ncclSuccess; // no failure path: snprintf return values are not checked, so an over-long list is silently truncated
+}
+
+static ncclResult_t ncclTopoTryConnectToPeer(struct ncclComm* comm, int64_t netId, int peerRank, bool* successful) { // sets *successful when NET netId has a path to one of comm->peerInfo[peerRank].netDevs
+  int netIndex;
+  int peerNetDevCount = comm->peerInfo[peerRank].netDevCount;
+  struct ncclNetDev* peerNetDevs = comm->peerInfo[peerRank].netDevs;
+  struct ncclTopoNode* netNode;
+  *successful = false; // default answer; kept when netId is -1 or no peer device is reachable
+  if (netId == -1) goto exit; // -1 means "no NET": nothing to look up
+
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &netIndex));
+  netNode = comm->topo->nodes[NET].nodes + netIndex;
+  for (int i = 0; i < peerNetDevCount; ++i) {
+    ncclNetPath_t path = {.loc = NET_LOC_DISC}; // starts as disconnected; only overwritten when getNetPath is queried below
+    if (netNode->net.netIdx == peerNetDevs[i].netIdx) // only query when both sides refer to the same comm->ncclNet[] entry
+      NCCLCHECK(comm->ncclNet[netNode->net.netIdx]->getNetPath(netNode->net.fabricId, peerNetDevs[i].fabricId, &path));
+    if (path.loc < NET_LOC_DISC) { // any loc below NET_LOC_DISC is treated as reachable; the first match wins
+      *successful = true;
+      goto exit;
+    }
+  }
+exit:
+  return ncclSuccess; // the result is reported through *successful; errors from the NCCLCHECK calls return earlier
+}
+
+ncclResult_t ncclTopoGetNetDevFromGraph(struct ncclComm* comm, int rank, int peerRank, struct ncclTopoGraph* graph, int channelId, int64_t* netId, int* netDev, int* netPathType) { // NIC for (rank, channelId): the graph's NIC if it can reach peerRank, otherwise a local NIC found by ncclTopoGetLocalNet
+  int64_t localNetId;
+  int channel = channelId % graph->nChannels; // channelId may exceed the number of graph channels; wrap onto them
+  int ngpus = comm->topo->nodes[GPU].count;
+  int index = graph->intra[channel * ngpus] == rank ? 0 : 1; // inter[] holds two NET ids per channel; slot 0 is used when rank is the channel's first intra entry
+  if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
+    localNetId = graph->inter[channel * 2 + index];
+  } else {
+    // failing here means that the current rank is not an NVLS head
+    // the caller is responsible to determine if it's an error or not, we pass the error along
+    NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &localNetId));
+  }
+  // verify if we can connect. if we cannot connect, search a new net and update the netdev info
+  bool canConnectToPeer = false;
+  NCCLCHECK(ncclTopoTryConnectToPeer(comm, localNetId, peerRank, &canConnectToPeer));
+  if (canConnectToPeer) { // keep the graph's NIC whenever it reaches the peer, even if the caller passed netDev == NULL
+    if (netDev) NCCLCHECK(ncclTopoIdToNetDev(comm->topo, localNetId, netDev));
+  } else { // fallback search restricted to the peer's netDevs; this is the only path that passes netPathType on
+    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &localNetId, netDev, netPathType, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs));
+  }
+  if(netId) *netId = localNetId; // netId is optional
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int pxnRank, int64_t* id, int* dev, int* proxyRank) {
+  ncclResult_t res = ncclSuccess;
   int64_t netId = -1;
   int netDev = -1;
+
+  if (peerRank == -1) return ncclInternalError;
+  if (comm->topo->nodes[NET].count == 0) {
+    if (dev) *dev = -1;
+    if (id) *id = -1;
+    goto exit;
+  }
+
   if (graph) {
-    // Honor the net device in the graph
-    int channel = channelId%graph->nChannels;
-    int ngpus = comm->topo->nodes[GPU].count;
-    int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
-    if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
-      netId = graph->inter[channel*2+index];
-    } else {
-      NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &netId));
-    }
-    NCCLCHECK(ncclTopoIdToNetDev(comm->topo, netId, &netDev));
+    NCCLCHECK(ncclTopoGetNetDevFromGraph(comm, rank, peerRank, graph, channelId, &netId, &netDev, /*netPathType=*/NULL));
+    NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank));
+    if(netId == -1 || netDev == -1) goto fail;
     if (dev) *dev = netDev;
     if (id) *id = netId;
-    NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank));
-  } else if (peerRank == -1) {
-    return ncclInternalError;
   } else {
-    // Start with our local NIC and local Rank
-    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev));
+    // Start with our local NIC and local rank (NIC must connect to the peer). Not finding the net is an error.
+    NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev, /*pathType=*/NULL, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs));
+    if (netId == -1) goto fail;
     if (dev) *dev = netDev;
     if (id) *id = netId;
     *proxyRank = rank;
@@ -1235,11 +1308,13 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
     int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
     // See whether we can use the remote rank preferred device.
     if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
-      // Find local NIC number close to local nvmlDev
-      int nvmlDev = comm->peerInfo[peerRank].nvmlDev;
+      if(pxnRank == -1) return ncclInternalError;
+      // Find local NIC number close to local nvmlDev, not finding it, we stop here
+      int nvmlDev = comm->peerInfo[pxnRank].nvmlDev;
       int localRank;
       if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess;
-      NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev));
+      NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev, /*pathType=*/NULL, comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs));
+      if (netId == -1) return ncclSuccess;
 
       // Check that device exists on our node
       if (ncclParamCrossNic() == 0) {
@@ -1273,6 +1348,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
         }
       }
     }
+    if (netId == -1) goto fail;
   }
-  return ncclSuccess;
+exit:
+  return res;
+fail:
+  char msg[256];
+  NCCLCHECK(ncclTopoPrintNetDev(comm->peerInfo[peerRank].netDevCount, comm->peerInfo[peerRank].netDevs, msg, sizeof(msg)));
+  WARN("Could not find any netDev to communicate from rank %d to peer %d (peer netDev list = %s)", rank, peerRank, msg);
+  res = ncclInternalError;
+  goto exit;
 }
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 9499f396d..a0cad332b 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -12,6 +12,8 @@
 #include "net.h"
 #include "coll_net.h"
 #include "transport.h"
+#include <cstdint>
+#include <cstring>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include "xml.h"
@@ -343,6 +345,17 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode*
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoIdToNetIdx(struct ncclComm* comm, int64_t topoId, int* netIdx) { // looks up the NET topology node with id topoId and returns its net.netIdx in *netIdx
+  int netTopoIndex;
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, topoId, &netTopoIndex)); // topology id -> index into comm->topo->nodes[NET].nodes
+  *netIdx = comm->topo->nodes[NET].nodes[netTopoIndex].net.netIdx;
+  if (*netIdx == -1) { // -1 is rejected as unusable; NOTE(review): ncclTopoAddNet appears to store -1 when the XML node has no fabricId - confirm
+    WARN("the chosen device has a network index of %d", *netIdx);
+    return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
 // We want the graph to be organized to ease/accelerate traversal :
 // 1. NVLinks (already the case)
 // 2. PCI down
@@ -354,28 +367,33 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
 }
 
 ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
-  int dev;
+  const char* str = NULL;
+  int dev, netIdx;
   NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
+  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "netIdx", &netIdx, 0));
 
   struct ncclTopoNode* net;
-  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev)));
+  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID_NIC(systemId, netIdx, dev)));
+  net->net.netIdx = netIdx;
   net->net.dev = dev;
-  const char* str;
   NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
   if (str) sscanf(str, "0x%lx", &net->net.asic);
   else net->net.asic = dev;
 
-  ncclDebugNoWarn = NCCL_GRAPH;
   int mbps;
   NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
   if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
   net->net.bw = mbps / 8000.0;
-  if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
+  NCCLCHECK(xmlGetAttrFloatDefault(xmlNet, "latency", &net->net.latency,0));
   NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
   NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
   NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
   NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
-  ncclDebugNoWarn = 0;
+  // get the fabricId if it exists. If not, we have bypassed ncclTopoNet and we should not use the netIdx value
+  NCCLCHECK(xmlGetAttr(xmlNet,"fabricId", &str));
+  if(str) net->net.fabricId = strtoull(str, NULL, 0);
+  else net->net.netIdx = -1;
+  net->net.getNetPath = NULL; // updated as part of the PostProcessNet function
 
   NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw));
   NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw));
@@ -437,8 +455,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
     type = NIC;
     // Ignore sub device ID and merge multi-port NICs into one PCI device.
     struct ncclTopoNode* nicNode = NULL;
-    int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId);
-    int64_t id = NCCL_TOPO_ID(systemId, localNicId);
+    int64_t id = NCCL_TOPO_ID(systemId, busId);
     NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
     if (nicNode == NULL) {
       NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
@@ -533,8 +550,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
     if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId));
     if (strcmp(node->name, "nic") == 0) {
       struct ncclTopoNode* nic = NULL;
-      int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0);
-      int64_t id = NCCL_TOPO_ID(systemId, localNicId);
+      int64_t id = NCCL_TOPO_ID(systemId, 0);
       NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id));
       if (nic == NULL) {
         NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id));
@@ -800,7 +816,7 @@ typedef struct xmlNodeStack {
 } xmlNodeStack;
 
 // 1. Find the common parent xmlNode between the given set of nodes
-ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) {
+ncclResult_t ncclgetNetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) {
   // Track a stack of parents per-net node being merged
   xmlNodeStack* parents;
   NCCLCHECK(ncclCalloc(&parents, nNodes));
@@ -990,8 +1006,9 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char
   char* semi = strtok_r(ncStr, ";", &semi_token);
   while (semi) {
     TRACE(NCCL_NET, "Fusing %s", semi);
+    int nUserIfs;
     struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
-    int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
+    NCCLCHECK(parseIfList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC, &nUserIfs));
     if (nUserIfs == 0) {
       INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
         ncStr, semi);
@@ -1053,7 +1070,7 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
       nodes[0] = physNetNodes[i];
       nodes[1] = physNetNodes[j];
       struct ncclXmlNode* parent;
-      NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out);
+      NCCLCHECKGOTO(ncclgetNetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out);
     }
   }
 
@@ -1135,7 +1152,7 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper
   }
 
   int path = PATH_LOC;
-  NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent));
+  NCCLCHECK(ncclgetNetPath(physNetNodes, vProps->ndevs, &path, parent));
   if (path == PATH_LOC) {
     *parent = NULL;
   } else if (parent && strcmp((*parent)->name, "pci") == 0) {
@@ -1187,12 +1204,39 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_
   return res;
 }
 
-static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) {
+// Log at INFO/NCCL_GRAPH level a one-line summary of netNode's attributes; a missing attribute is printed as a placeholder instead of passing NULL to "%s".
+static ncclResult_t ncclTopoPrintInfoNic(struct ncclXmlNode* netNode, const char* pciPath, const char* funcName) {
+  if (ncclDebugLevel <= NCCL_LOG_WARN) return ncclSuccess; // nothing to build unless the debug level is above WARN
+  char msg[8 * MAX_STR_LEN];
+  snprintf(msg, sizeof(msg), "%s: filled", funcName);
+  const char* attr; // xmlGetAttr leaves this NULL when the attribute is absent, hence the guards below
+  NCCLCHECK(xmlGetAttr(netNode, "name", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " %s in topo:", attr ? attr : "(null)");
+  NCCLCHECK(xmlGetAttr(netNode, "type", &attr)); // the collNet branch of ncclTopoPopulateNics does not set "type"
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " type=%s", attr ? attr : "(null)");
+  NCCLCHECK(xmlGetAttr(netNode, "netIdx", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", netIdx=%s", attr ? attr : "-1");
+  NCCLCHECK(xmlGetAttr(netNode, "dev", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", dev=%s", attr ? attr : "(null)");
+  NCCLCHECK(xmlGetAttr(netNode, "keep", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", keep=%s", attr ? attr : "(null)");
+  NCCLCHECK(xmlGetAttr(netNode, "coll", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", coll=%s", attr ? attr : "0");
+  NCCLCHECK(xmlGetAttr(netNode, "gdr", &attr));
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", gdr=%s", attr ? attr : "(null)");
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ", pci=%s", pciPath ? pciPath : "(null)");
+
+  INFO(NCCL_GRAPH, "%s", msg);
+  return ncclSuccess;
+}
+
+// Fill (or refresh) the XML "net" node of every device in [startIndex, endIndex) of one plugin: keep flag, dev/type/netIdx ownership and the device properties.
+static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, int coll, int keep, int virtualNics, int netIdx, const char* netName,
+                                         ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* loc), bool netDmaBufSupport, bool dumpXml) {
   for (int n = startIndex; n < endIndex; n++) {
     ncclNetProperties_t props;
     NCCLCHECK(getProperties(n, &props));
-    struct ncclXmlNode* netNode = NULL;
-    struct ncclXmlNode* parent = NULL;
+    struct ncclXmlNode *netNode = NULL, *parent = NULL;
     if (virtualNics) {
       struct ncclXmlNode* net = NULL;
       NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name));
@@ -1200,88 +1244,133 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star
       // Only run this if the net doesn't exist locally - this may alter the XML state
       if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent));
     }
-
     NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent));
 
-    const char* colAttr;
+    // If coll == 0 but the netNode is already tagged as coll, don't overwrite the keep value
+    const char* colAttr = NULL;
     NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
+    if (colAttr == NULL || coll != 0 || strcmp(colAttr, "1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep));
 
-    // If coll == 0 but the netNode is tagged as coll, don't update the keep value
-    if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep));
-    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+    if (coll) {
+      // there is a single collNet, so no conflict. update the dev index everytime
+      // need to overwrite the dev index if it was a physical and it becomes a virtual
+      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+      NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
+    } else {
+      int netIdxAttr = -1;
+      const char* typeAttr = NULL;
+      NCCLCHECK(xmlGetAttr(netNode, "type", &typeAttr));
+      NCCLCHECK(xmlGetAttrIntDefault(netNode, "netIdx", &netIdxAttr, -1));
+      // the device belongs to this network if: (1) netIdx is already set to the current index, or
+      // (2) netIdx is unset and the device either has no type yet or its type equals the name of the current network
+      if (netIdxAttr == netIdx || (netIdxAttr == -1 && (typeAttr == NULL || strcmp(typeAttr, netName) == 0))) {
+        // need to overwrite the dev index: if it was a physical device and it becomes a virtual device, the index is changed
+        NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+        NCCLCHECK(xmlSetAttr(netNode, "type", netName));
+        // no need to dump the netIdx and other net-related info, they are local to the comm
+        if (!dumpXml) NCCLCHECK(xmlSetAttrInt(netNode, "netIdx", netIdx));
+      }
+    }
     NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency));
     NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
     NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
     NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
     NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
-    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netDmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
     NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
-    // Only set coll if it's not 0
-    if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
+    NCCLCHECK(xmlInitAttrUint64(netNode, "fabricId", props.fabricId));
 
-    const char* keepAttr;
-    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
-    NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr));
-    INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s",
-      props.name, props.pciPath, keepAttr, colAttr);
+    NCCLCHECK(ncclTopoPrintInfoNic(netNode, props.pciPath, __func__));
   }
 
   return ncclSuccess;
 }
 
 struct ncclTopoNetState {
+  int netPluginIdx; // slot index of the plugin; -1 while the slot is uninitialized. The state is shared by all communicators
   int nVirtualNics;
   int nPhysicalNics;
-  const char* name;
+  const char* name; // name of the net plugin; the pointer passed to ncclTopoInitSharedState is stored as-is, not copied
+  ncclResult_t (*getProperties)(int, ncclNetProperties_t*); // plugin callback: properties of the device with the given index
+  ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*); // plugin callback handed to ncclTopoMakeVNics; NULL means physical devices only
+  ncclResult_t (*devices)(int*); // plugin callback: current number of devices
+  ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* loc); // plugin callback: path between two fabric ids
 };
 
 // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock.
-static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) {
-  int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL);
-  if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics));
-  // Enumerate physical devices
-  NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0));
-  if (!usePhysicalDevices) {
+static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, bool dumpXml, ncclTopoNetState* state, const int netCommIdx) {
+  bool dmaBufSupport = comm->dmaBufSupport[netCommIdx]; // DMA-BUF flag recorded in the comm for this net index
+  int usePhysicalDevices = (dumpXml || state->makeVDevice == NULL); // XML dump, or a plugin without makeVDevice: no virtual NICs
+  if (state->nPhysicalNics == -1) NCCLCHECK(state->devices(&state->nPhysicalNics)); // queried once, then cached in the shared state
+  if (usePhysicalDevices) {
+    // Enumerate physical devices, always keep them
+    NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, coll, /*keep=*/1, /*virtual=*/0, netCommIdx, state->name, state->getProperties, state->getNetPath,
+                                   dmaBufSupport, dumpXml));
+  } else {
+    // TopoMakeVNics creates all the virtual NICs needed (including for a single physical device).
+    // Physical NICs need to be discovered first but will not be kept
+    NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, coll, /*keep=*/0, /*virtual=*/0, netCommIdx, state->name, state->getProperties, state->getNetPath,
+                                   dmaBufSupport, dumpXml));
     if (state->nVirtualNics == -1) {
-      NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics));
+      NCCLCHECK(ncclTopoMakeVNics(comm, xml, state->makeVDevice, state->getProperties, state->nPhysicalNics));
       int nDevs;
-      NCCLCHECK(devices(&nDevs));
+      NCCLCHECK(state->devices(&nDevs)); // device count after ncclTopoMakeVNics; the surplus over nPhysicalNics is cached as nVirtualNics
       state->nVirtualNics = nDevs - state->nPhysicalNics;
     }
-    // Remove keep=1 for physical collnets
-    if (state->nVirtualNics > 0) {
-      NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0));
-      // Populate new devices
-      NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1));
-    }
+    // all NICs needed by the topology are now virtual, fill the information and set them as "keep=1"
+    NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics + state->nVirtualNics, coll, /*keep=*/1, /*virtual=*/1, netCommIdx, state->name,
+                                   state->getProperties, state->getNetPath, dmaBufSupport, dumpXml));
   }
+  return ncclSuccess;
+}
 
+// Bind to every NET topo node the getNetPath callback of its plugin: the collNet one for collSupport nodes, nets[netIdx] otherwise.
+static ncclResult_t ncclTopoPostProcessNets(ncclCollNet_t* collNet, ncclNet_t* nets[], struct ncclTopoSystem* topoSystem) {
+  for(int n=0 ; n<topoSystem->nodes[NET].count; ++n){
+    struct ncclTopoNode* node = &topoSystem->nodes[NET].nodes[n];
+    if(node->net.collSupport) node->net.getNetPath = collNet->getNetPath; // NOTE(review): assumes collNet != NULL whenever a node has collSupport set - confirm
+    else node->net.getNetPath = nets[node->net.netIdx]->getNetPath; // NOTE(review): assumes netIdx is a valid index into nets (not -1) - confirm
+  }
   return ncclSuccess;
 }
 
 static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
-ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};
-ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {};
-ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) {
-  INFO(NCCL_GRAPH, "Retrieving state for %s", name);
-  for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) {
-    // Empty slot
-    if (states[i].name == NULL) {
-      states[i].nVirtualNics = -1;
-      states[i].nPhysicalNics = -1;
-      states[i].name = strdup(name);
-      *state = states + i;
-      INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name);
-      return ncclSuccess;
-    // Found my slot
-    } else if (strcmp(states[i].name, name) == 0) {
-      *state = states + i;
-      return ncclSuccess;
-    }
+static ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};
+static ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {};
+
+static void ncclTopoSharedStateInitOnce() {
+  // mark every net and collNet state slot as uninitialized (netPluginIdx == -1) before its first use
+  for (int n = 0; n < NCCL_NET_MAX_PLUGINS; ++n) {
+    netStates[n].netPluginIdx = -1;
+    collNetStates[n].netPluginIdx = -1;
   }
-  WARN("NET/TOPO : Couldn't find net with name %s", name);
-  return ncclInternalError;
+}
+
+// Return in *state the slot states[netPluginIdx], filling it on first use. Must be called within a lock to avoid conflict when initializing the states
+static ncclResult_t ncclTopoInitSharedState(ncclTopoNetState* states, const int netPluginIdx, const char* name, ncclResult_t (*getProperties)(int, ncclNetProperties_t*),
+                                            ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*),
+                                            ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path),
+                                            ncclTopoNetState** state) {
+  // reset the netStates/collNetStates arrays only once per process
+  static pthread_once_t once = PTHREAD_ONCE_INIT;
+  pthread_once(&once, ncclTopoSharedStateInitOnce);
+  // the slot is addressed directly by netPluginIdx. NOTE(review): the index is not range-checked here - confirm callers guarantee it
+  *state = states + netPluginIdx;
+  // setup the state if not already set by others
+  TRACE(NCCL_GRAPH, "Retrieving state for %s", name);
+  // Empty slot: record the plugin callbacks; the NIC counts stay at -1 until ncclTopoProcessNet queries them
+  if ((*state)->netPluginIdx == -1) {
+    (*state)->name = name; // pointer stored, not duplicated. NOTE(review): presumably name outlives the shared state - confirm
+    (*state)->nVirtualNics = -1;
+    (*state)->nPhysicalNics = -1;
+    (*state)->getProperties = getProperties;
+    (*state)->makeVDevice = makeVDevice;
+    (*state)->devices = devices;
+    (*state)->getNetPath = getNetPath;
+    (*state)->netPluginIdx = netPluginIdx;
+    INFO(NCCL_GRAPH, "Initialized state %d for %s", netPluginIdx, name);
+  }
+  return ncclSuccess;
 }
 
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) {
@@ -1321,21 +1410,23 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
   }
 
-  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
-  // so we start with collnet so that it has precedence.
+  // Auto-detect NICs if needed
   pthread_mutex_lock(&netLock);
   netLockHeld = 1;
   INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
-  ncclTopoNetState* state;
-  state = NULL;
+  // net/collnet share the same xml/graph nodes, so we start with collnet so that it has precedence.
   if (collNetSupport(comm)) {
-    NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
-    NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state,
-      comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail);
+    ncclTopoNetState* state = NULL;
+    ncclCollNet_t* net = comm->ncclCollNet;
+    NCCLCHECKGOTO(ncclTopoInitSharedState(collNetStates, comm->ncclCollNetPluginIdx, net->name, net->getProperties, net->makeVDevice, net->devices, net->getNetPath, &state), ret, fail);
+    NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile != NULL, state, /*netCommIdx=*/0), ret, fail);
+  }
+  for (int n = 0; n < comm->ncclNetCount; ++n) {
+    ncclTopoNetState* state = NULL;
+    ncclNet_t* net = comm->ncclNet[n];
+    NCCLCHECKGOTO(ncclTopoInitSharedState(netStates, comm->ncclNetPluginIdx[n], net->name, net->getProperties, net->makeVDevice, net->devices, net->getNetPath, &state), ret, fail);
+    NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile != NULL, state, /*netCommIdx=*/n), ret, fail);
   }
-  NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
-  NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state,
-    comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail);
   pthread_mutex_unlock(&netLock);
   netLockHeld = 0;
 
@@ -1387,7 +1478,10 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   }
 
   // Only update our topo tracking structure if we aren't dumping (separate steps)
-  if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+  if (dumpXmlFile == NULL){
+    NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPostProcessNets(comm->ncclCollNet, comm->ncclNet, *system), ret, fail);
+  }
 
 exit:
   if (!comm->MNNVL && localRanks) free(localRanks);
@@ -1399,14 +1493,22 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   goto exit;
 }
 
-static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType,
-                                     int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) {
+static ncclResult_t ncclTopoGetAllLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType,
+                                        int nExcl, int64_t* exclIds) {
   int minType = PATH_DIS;
   float maxBw = 0;
   int count = 0;
   struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
   if (paths == NULL) { *localCount = 0; return ncclSuccess; }
   for (int i=0; i<system->nodes[resultType].count; i++) {
+    // Exclude the node if in the list of excluded topo ids. Go over the list, if found, skip the node.
+    // If unfound (idExcl == nExcl) or the list is empty (nExcl ==0), test if we should add the node.
+    int idExcl = 0;
+    for (idExcl = 0; idExcl < nExcl; ++idExcl) {
+      if (system->nodes[resultType].nodes[i].id == exclIds[idExcl]) break;
+    }
+    if (nExcl > 0 && idExcl < nExcl) continue;
+
     if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
       maxBw = paths[i].bw;
       minType = paths[i].type;
@@ -1415,9 +1517,9 @@ static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, in
     }
     if (paths[i].bw == maxBw && paths[i].type == minType) {
       if (count == NCCL_TOPO_MAX_NODES) {
-        WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal."
+        WARN("Error : ran out of room to store found nodes in %s."
              " Filled %d of type %d, starting from index %d of type %d.",
-             NCCL_TOPO_MAX_NODES, resultType, index, type);
+             __func__, NCCL_TOPO_MAX_NODES, resultType, index, type);
         return ncclInternalError;
       }
       locals[count++] = i;
@@ -1427,6 +1529,56 @@ static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, in
   return ncclSuccess;
 }
 
+union ncclTopoGetLocalRestriction { // one restriction entry accepted by ncclTopoGetLocal
+  struct ncclNetDev net; // used for NET searches: ncclTopoGetLocal reads its netIdx and fabricId
+};
+
+static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType,
+                                     int nRestrictions, union ncclTopoGetLocalRestriction* restrictions) {
+  if (resultType != NET) { // non-NET results: the restrictions are not consulted, plain ncclTopoGetAllLocal search
+    NCCLCHECK(ncclTopoGetAllLocal(system, type, index, resultType, locals, localCount, pathType, 0, NULL));
+  } else {
+    int netCount = 0, nExcl = 0; // netCount: nets kept in locals[]; nExcl: topo ids rejected by the restrictions so far
+    int64_t exclList[NCCL_TOPO_MAX_NODES];
+    while (netCount == 0) { // search again with a growing exclusion list until one acceptable net is found
+      NCCLCHECK(ncclTopoGetAllLocal(system, type, index, NET, locals, localCount, pathType, nExcl, exclList));
+      if (*localCount == 0) break; // no candidate left: the result is empty
+
+      int minNetIdx = NCCL_NET_MAX_PLUGINS + 1;
+      for (int i = 0; i < *localCount; ++i) {
+        struct ncclTopoNode* node = system->nodes[NET].nodes + locals[i];
+        int netIdx = node->net.netIdx;
+        uint64_t fabricId = node->net.fabricId;
+        ncclResult_t (*getNetPath)(uint64_t, uint64_t, ncclNetPath_t*) = node->net.getNetPath; // NOTE(review): assumed non-NULL when a restriction matches netIdx - confirm
+
+        // apply restrictions: if not in the list of requirements, add to the list of excluded topo id for next call
+        int c = 0;
+        for (c = 0; c < nRestrictions; ++c) {
+          struct ncclNetDev* crit = &(restrictions[c].net);
+          ncclNetPath_t path = {.loc = NET_LOC_DISC}; // stays NET_LOC_DISC when the restriction targets another netIdx
+          if (netIdx == crit->netIdx) NCCLCHECK(getNetPath(fabricId, crit->fabricId, &path));
+          if (path.loc < NET_LOC_DISC) break; // the plugin reported a loc below NET_LOC_DISC: this restriction accepts the net
+        }
+        if (nRestrictions > 0 && c == nRestrictions) { // no restriction accepted the net
+          exclList[nExcl++] = node->id;
+          continue;
+        }
+
+        // return only the device with the lowest netIdx (that acts as a proxy for priority).
+        if (netIdx < minNetIdx) {
+          minNetIdx = netIdx;
+          netCount = 0; // a lower netIdx discards the nets kept so far
+        }
+        if (netIdx == minNetIdx) {
+          locals[netCount++] = locals[i]; // in-place compaction: netCount never exceeds i, so unread entries are not overwritten
+        }
+      }
+    }
+    *localCount = netCount; // number of entries of locals[] that passed the restriction and netIdx filters
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
   int localNetCount = 0, netCountByBw = 0;
   int localNets[NCCL_TOPO_MAX_NODES];
@@ -1441,7 +1593,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
     }
   }
 
-  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL, 0, NULL));
   for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
      totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
   }
@@ -1450,21 +1602,21 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev, int* pathType, int netDevCount, struct ncclNetDev* netDevs) {
   int gpu;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
-
-  int localNets[NCCL_TOPO_MAX_NODES];
   int localNetCount;
-  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
-  if (localNetCount==0) {
-    WARN("Could not find any local path from gpu %d to net.", gpu);
-    return ncclInternalError;
+  int localNets[NCCL_TOPO_MAX_NODES];
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, pathType, netDevCount, (union ncclTopoGetLocalRestriction*)netDevs));
+  if (localNetCount == 0) {
+    if(id) *id = -1;
+    if(dev) *dev = -1;
+    return ncclSuccess;
   }
 
   int localGpus[NCCL_TOPO_MAX_NODES];
   int localGpuCount;
-  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
+  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL, /*nConstrain=*/0, /*constrain=*/NULL));
 
   int net = system->nodes[GPU].nodes[gpu].gpu.dev;
   if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
@@ -1481,7 +1633,7 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i
 
   int localGpus[NCCL_TOPO_MAX_NODES];
   int localGpuCount;
-  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL));
+  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL, /*nConstrain=*/0, /*constrain=*/NULL));
 
   int foundGpu = -1;
   for (int c=0; c<MAXCHANNELS; c++) {
@@ -1489,7 +1641,8 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i
       int g = localGpus[lg];
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       int64_t id;
-      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
+      // no need to give a list of fabricId/netId when searching for a specific net
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL, /*pathType=*/NULL, 0, NULL));
       if (netId == id) {
         foundGpu = g;
         goto exit;
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 921a7f5d6..ee4677ad5 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -112,10 +112,12 @@ struct ncclTopoLinkList {
 #define NCCL_TOPO_UNDEF (-1)
 
 #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
+// access a TOPO_ID
 #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56)
 #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK)
-#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)numaid << 56) + busid)
-#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK))
+// create a TOPO_ID
+#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + ((localid) & NCCL_TOPO_ID_LOCAL_ID_MASK))
+#define NCCL_TOPO_ID_NIC(systemid, pluginid, localid) NCCL_TOPO_ID((systemid) + (pluginid), localid) // arguments parenthesized so expression arguments keep their own precedence
 
 struct ncclTopoNode {
   int type;
@@ -129,6 +131,7 @@ struct ncclTopoNode {
       int gdrSupport;
     }gpu;
     struct {
+      int netIdx; // net index inside the comm net array
       int dev; // Plugin dev number
       uint64_t asic;
       int port;
@@ -137,6 +140,8 @@ struct ncclTopoNode {
       int gdrSupport;
       int collSupport;
       int maxChannels;
+      uint64_t fabricId;
+      ncclResult_t (*getNetPath)(uint64_t, uint64_t, ncclNetPath_t*); // gives the locality between this device and another one
     }net;
     struct {
       int arch;
@@ -189,14 +194,18 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 
 ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
 
+ncclResult_t ncclTopoIdToNetIdx(struct ncclComm* comm, int64_t topoId, int* netIdx);
+
 static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
   *index = -1;
+  if(system->nodes[type].count == 0) return ncclSuccess; // no node of this type at all: succeed and leave *index at -1
   for (int i=0; i<system->nodes[type].count; i++) {
     if (system->nodes[type].nodes[i].id == id) {
       *index = i;
       return ncclSuccess;
     }
   }
+  WARN("failed to find a topo node with id %ld", id); // nodes of this type exist but none carries this id. NOTE(review): %ld assumes int64_t is long
   return ncclInternalError;
 }
 
diff --git a/src/graph/trees.cc b/src/graph/trees.cc
index 8e1e2ae85..817234496 100644
--- a/src/graph/trees.cc
+++ b/src/graph/trees.cc
@@ -107,3 +107,9 @@ ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, i
   }
   return ncclSuccess;
 }
+
+ncclResult_t ncclGetDtreeRoots(int nranks, int* r0, int* r1) {
+  if (r0 != NULL) *r0 = 0; // the first tree is always rooted at rank 0
+  if (r1 != NULL) *r1 = (nranks > 1 && (nranks & 1) != 0) ? 1 : nranks - 1; // second root: 1 for an odd rank count above one, nranks-1 otherwise
+  return ncclSuccess;
+}
diff --git a/src/graph/xml.h b/src/graph/xml.h
index f06c0e68b..256cef407 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -124,6 +124,19 @@ static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrNam
   return ncclSuccess;
 }
 
+static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrName, uint64_t* value) {
+  const char* text = NULL; // attribute string, filled by xmlGetAttrStr
+  NCCLCHECK(xmlGetAttrStr(node, attrName, &text));
+  *value = strtoull(text, /*endptr=*/NULL, /*base=*/0); // base 0: the prefix of the string selects decimal, octal or hexadecimal
+  return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) {
+  const char* text = NULL; // stays NULL when the attribute is absent
+  NCCLCHECK(xmlGetAttr(node, attrName, &text));
+  *value = (text == NULL) ? defaultValue : strtoull(text, /*endptr=*/NULL, /*base=*/0); // absent attribute: fall back to defaultValue
+  return ncclSuccess;
+}
 
 static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
   const char* str;
@@ -284,6 +297,19 @@ static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrNam
   return ncclSuccess;
 }
 
+static ncclResult_t xmlSetAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
+  int slot = -1; // position of attrName in node->attrs, -1 when the attribute does not exist yet
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
+  if (slot == -1) { // unknown attribute: append a new key/value entry
+    slot = node->nAttrs++;
+    strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
+    node->attrs[slot].key[MAX_STR_LEN] = '\0';
+  }
+  snprintf(node->attrs[slot].value, MAX_STR_LEN, "%#lx", value); // the value is stored in alternate-form hexadecimal
+  node->attrs[slot].value[MAX_STR_LEN] = '\0';
+  return ncclSuccess;
+}
+
 static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
   int index;
   NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
diff --git a/src/include/comm.h b/src/include/comm.h
index 409518713..09fc171c2 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -85,10 +85,16 @@ struct ncclUserRedOp {
 };
 
 struct ncclNodeRanks {
+  int dcIndex; // index into the DC array
   int localRanks;
   int* localRankToRank;
 };
 
+struct ncclDcNode { // per-DC entry of comm->dcNode (see the "multi-DC support" fields of ncclComm)
+  int localNodes; // presumably the number of nodes that belong to this DC - TODO confirm against the code filling dcNode
+  int* localNodeToNode; // presumably maps a DC-local node index to a node index, like localRankToRank does for ranks - TODO confirm
+};
+
 struct cliqueInfo {
   int id;
   int size;
@@ -421,10 +427,14 @@ struct ncclComm {
   struct ncclProxyConnector* gproxyConn;
   struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
 
+  int ncclNetCount;
+  int ncclDcNetIndex;
   int netPluginLoaded;
-  ncclNet_t* ncclNet;
-  int ncclNetVer;
   ncclNetDeviceType netDeviceType;
+  int ncclNetVer[NCCL_NET_MAX_PLUGINS];
+  int ncclNetPluginIdx[NCCL_NET_MAX_PLUGINS];
+  ncclNet_t* ncclNet[NCCL_NET_MAX_PLUGINS];
+  int ncclCollNetPluginIdx;
   ncclCollNet_t* ncclCollNet;
   void* bootstrap;
   // Bitmasks for ncclTransportP2pSetup
@@ -464,13 +474,16 @@ struct ncclComm {
   int* localRankToRank;
   // localRanks and localRanktoRank for all nodes
   struct ncclNodeRanks* nodeRanks;
+  // multi-DC support
+  int dcCount;
+  struct ncclDcNode* dcNode;
   // MNNVL: Multi-Node NVLink
   int MNNVL; // true when MNNVL is available
   struct cliqueInfo clique; // Our MNNVL clique information
   int cliqueRank; // Our rank within the MNNVL clique
 
   bool checkPointers;
-  bool dmaBufSupport;
+  bool dmaBufSupport[NCCL_NET_MAX_PLUGINS];
 
   // Counter for tracking CUDA launches (P2P and collectives included)
   uint64_t opCount;
diff --git a/src/include/device.h b/src/include/device.h
index 0763a579a..8016c0833 100644
--- a/src/include/device.h
+++ b/src/include/device.h
@@ -155,7 +155,7 @@ struct ncclRing {
 
 
 // The root of each tree only has one node down (+1 intra-node).
-#define NCCL_MAX_TREE_ARITY_TOP 2
+#define NCCL_MAX_TREE_ARITY_TOP 3
 // Nodes inside the binary tree can have to two nodes down (+1 intra-node).
 #define NCCL_MAX_TREE_ARITY 3
 struct ncclTree {
diff --git a/src/include/graph.h b/src/include/graph.h
index b779773da..9af4dd094 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -9,6 +9,7 @@
 
 #include "nccl.h"
 #include "device.h"
+#include "nccl_net.h"
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -33,7 +34,8 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink
 ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
 
 // Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
+ncclResult_t ncclTopoGetNetDevFromGraph(struct ncclComm* comm, int rank, int peerRank, struct ncclTopoGraph* graph, int channelId, int64_t* netId, int* netDev, int* netPathType);
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int pxnRank, int64_t* id, int* dev, int* proxyRank);
 ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
 enum ncclTopoGdrMode {
@@ -43,7 +45,7 @@ enum ncclTopoGdrMode {
   ncclTopoGdrModeNum = 3
 };
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode);
-ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush);
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netIdx, int netDev, int rank, int* flush);
 ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
@@ -70,7 +72,7 @@ ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vend
 ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev, int* pathType, int netDevCount, struct ncclNetDev* netDevs);
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
 ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
 
diff --git a/src/include/net.h b/src/include/net.h
index afc2d160e..174f958fb 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -12,6 +12,8 @@
 #include "comm.h"
 #include "checks.h"
 
+static NCCL_PARAM(AllNet,"ALLNET_ENABLE",0);
+
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
 ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
diff --git a/src/include/net_device.h b/src/include/net_device.h
index c3a79e35c..bdf281ec5 100644
--- a/src/include/net_device.h
+++ b/src/include/net_device.h
@@ -27,6 +27,7 @@ typedef struct {
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
 typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
-typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
+typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
 
 #endif
diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h
index d57aad5a9..e2f810033 100644
--- a/src/include/plugin/nccl_net.h
+++ b/src/include/plugin/nccl_net.h
@@ -33,22 +33,25 @@
 // NCCL core profiler callback for network defined events instrumentation
 typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
 
+#include "net/net_v11.h"
 #include "net/net_v10.h"
 #include "net/net_v9.h"
 #include "net/net_v8.h"
 #include "net/net_v7.h"
 #include "net/net_v6.h"
 
-typedef ncclNet_v10_t ncclNet_t;
-typedef ncclCollNet_v10_t ncclCollNet_t;
-typedef ncclNetSGE_v10_t ncclNetSGE_t;
-typedef ncclNetProperties_v10_t ncclNetProperties_t;
-typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
-typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+typedef ncclNet_v11_t ncclNet_t;
+typedef ncclCollNet_v11_t ncclCollNet_t;
+typedef ncclNetSGE_v11_t ncclNetSGE_t;
+typedef ncclNetProperties_v11_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
+typedef ncclNetPath_v11_t ncclNetPath_t;
 
-#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V11
 
-#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
+#define NCCL_NET_PLUGIN_VERSION 11
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v11
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v11
 
 #endif // end include guard
diff --git a/src/include/plugin/net/net_v11.h b/src/include/plugin/net/net_v11.h
new file mode 100644
index 000000000..3a909ed1d
--- /dev/null
+++ b/src/include/plugin/net/net_v11.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V11_H_
+#define NET_V11_H_
+
+#include <cstdint>
+#define NCCL_NET_MAX_DEVS_PER_NIC_V11 4
+
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V11];
+} ncclNetVDeviceProps_v11_t;
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v11_t;
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v11_t vProps;
+  size_t maxP2pBytes;  // Max transfer size for point-to-point operations
+  size_t maxCollBytes; // Max transfer size for collective operations
+  uint64_t fabricId;   // Fabric handle associated to the current device
+} ncclNetProperties_v11_t;
+
+typedef enum {
+  NET_LOC_DCL0 = 0 /* same DC, hierarchy level 0 */,
+  NET_LOC_DCL1 = 1 /* different DC, hierarchy level 1 */,
+  NET_LOC_DISC = 2 /* disconnected */
+} ncclNetLoc_v11_t;
+static_assert(NET_LOC_DCL0 < NET_LOC_DCL1 && NET_LOC_DCL1 < NET_LOC_DISC, "Locality must go in increasing order");
+
+typedef struct {
+  ncclNetLoc_v11_t loc; // Locality of the path between two fabric IDs, as filled in by getNetPath()
+} ncclNetPath_v11_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v11_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  // DMA-BUF support
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
+
+  // topology API. getNetPath returns the path between two fabricIds.
+  ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path);
+} ncclNet_v11_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  size_t size;
+} ncclNetSGE_v11_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  // DMA-BUF support
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,
+                             void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v11_t* recvParts, size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v11_t* sendParts, void* recvData, size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
+
+  // topology API. getNetPath returns the path between two fabricIds.
+  ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path);
+} ncclCollNet_v11_t;
+
+#endif // end include guard
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 225acb22d..2331e0ba3 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -299,8 +299,8 @@ struct ncclProxyState {
   int nChannels;
   int buffSizes[NCCL_NUM_PROTOCOLS];
   bool allocP2pNetLLBuffers;
-  bool dmaBufSupport;
-  ncclNet_t* ncclNet;
+  bool dmaBufSupport[NCCL_NET_MAX_PLUGINS];
+  ncclNet_t* ncclNet[NCCL_NET_MAX_PLUGINS];
   ncclCollNet_t* ncclCollNet;
   uint32_t* abortFlag;
   bool directMode;
diff --git a/src/include/transport.h b/src/include/transport.h
index c563fbbd6..e7afbb25d 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -35,6 +35,13 @@ struct ncclRing;
 struct ncclConnector;
 struct ncclComm;
 
+#define PEERINFO_NETDEV_MAXCOUNT (MAXCHANNELS + 2)
+
+struct ncclNetDev { // one (net, fabric ID) pair advertised by a rank in its ncclPeerInfo
+  int netIdx;        // index of the net this entry belongs to (position in comm->ncclNet[])
+  uint64_t fabricId; // fabricId reported in that net's device properties
+};
+
 struct ncclPeerInfo {
   int rank;
   int cudaDev;
@@ -50,6 +57,9 @@ struct ncclPeerInfo {
   nvmlGpuFabricInfoV_t fabricInfo;
   int cuMemSupport;
   int version;
+  // multi-DC support
+  int netDevCount;
+  ncclNetDev netDevs[PEERINFO_NETDEV_MAXCOUNT];
 };
 
 #define CONNECT_SIZE 256
diff --git a/src/include/trees.h b/src/include/trees.h
index ded84a667..10df339b3 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -9,5 +9,6 @@
 
 ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
 ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
+ncclResult_t ncclGetDtreeRoots(int nranks, int* r0, int* r1);
 
 #endif
diff --git a/src/include/utils.h b/src/include/utils.h
index 383f678c8..3f3a5f668 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -34,9 +34,13 @@ ncclResult_t getRandomData(void* buffer, size_t bytes);
 struct netIf {
   char prefix[64];
   int port;
+  int64_t fabricId; // -1 is undefined
 };
 
-int parseStringList(const char* string, struct netIf* ifList, int maxList);
+#define NCCL_IF_MAX_FABRICID (1L<<48)
+
+ncclResult_t parseIfList(const char* string, struct netIf* ifList, int maxList, int *ifCount);
+bool indexIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact, int* index);
 bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
 
 static long log2i(long n) {
diff --git a/src/init.cc b/src/init.cc
index 46b02e65e..29345b11f 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -201,6 +201,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
     for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
     free(comm->nodeRanks);
   }
+  if(comm->dcNode){
+    for (int dc = 0; dc < comm->dcCount; ++dc) free(comm->dcNode[dc].localNodeToNode);
+    free(comm->dcNode);
+  }
   free(comm->rankToNode);
   free(comm->rankToLocalRank);
   free(comm->collNetHeads);
@@ -278,22 +282,22 @@ enum ncclLaunchMode ncclParamLaunchMode;
 NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1);
 
 // Detect DMA-BUF support
-static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
-  if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || ncclCudaLibraryInit() != ncclSuccess) return ncclInternalError;
+static ncclResult_t dmaBufSupportedByCuda(struct ncclComm* comm) {
+  if (ncclParamDmaBufEnable() == 0 || ncclCudaLibraryInit() != ncclSuccess) return ncclInvalidUsage;
 #if CUDA_VERSION >= 11070
   int flag = 0;
   CUdevice dev;
   int cudaDriverVersion;
   CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
-  if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
+  if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInvalidUsage;
   CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
   // Query device to see if DMA-BUF support is available
   (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
-  if (flag == 0) return ncclInternalError;
+  if (flag == 0) return ncclInvalidUsage;
   INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev);
   return ncclSuccess;
 #endif
-  return ncclInternalError;
+  return ncclInvalidUsage;
 }
 
 ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
@@ -315,6 +319,15 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
   return ret;
 }
 
+// Writes the name of every net in comm->ncclNet[0 .. ncclNetCount) into netName, joined by '+'.
+// Each snprintf is bounded by the space left in the len-byte buffer, so long lists are truncated.
+static ncclResult_t commNetName(struct ncclComm* comm, char* netName, size_t len) {
+  snprintf(netName, len, "%s", comm->ncclNet[0]->name);
+  for (int idx = 1; idx < comm->ncclNetCount; ++idx)
+    snprintf(netName + strlen(netName), len - strlen(netName), "+%s", comm->ncclNet[idx]->name);
+  return ncclSuccess;
+}
+
 static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
   if (ndev < 1) {
     WARN("invalid device count (%d) requested", ndev);
@@ -334,13 +347,16 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   NCCLCHECK(ncclNetPluginLoad(comm));
   NCCLCHECK(ncclNetInit(comm));
   NCCLCHECK(ncclProfilerPluginInit(comm));
-  INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
 
-  if (parent && parent->config.splitShare) {
-    if (parent->ncclNet != comm->ncclNet) {
-      WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
-      return ncclInvalidUsage;
-    }
+  char netName[128];
+  NCCLCHECK(commNetName(comm,netName,sizeof(netName)));
+  INFO(NCCL_INIT, "Using network %s", netName);
+
+  if (parent && parent->config.splitShare && (parent->config.netName && comm->config.netName && strcmp(parent->config.netName, comm->config.netName) != 0)) {
+    char parentNetName[128];
+    NCCLCHECK(commNetName(parent, parentNetName, sizeof(parentNetName)));
+    WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parentNetName, netName);
+    return ncclInvalidUsage;
   }
   // Try to create a CUDA object right away. If there is something wrong with
   // the device we're on (failure cause #1) , better know it early.
@@ -359,7 +375,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
 
   comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-  comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
+
+  bool dmaBufCuda = dmaBufSupportedByCuda(comm) == ncclSuccess;
+  for (int n = 0; n < comm->ncclNetCount; ++n) {
+    comm->dmaBufSupport[n] = dmaBufCuda && (comm->ncclNet[n]->regMrDmaBuf != NULL);
+  }
 
   comm->collNetSupport = 0;
   memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
@@ -560,6 +580,37 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
   info->comm = comm;
   info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
 
+  // Cross-DC support: build the list of unique (net index, fabric ID) pairs reported by the devices of each net.
+  // Packing of info->netDevs must happen with increasing net index.
+  info->netDevCount = 0;
+  for (int n = 0; n < comm->ncclNetCount; ++n) {
+    ncclNet_t* net = comm->ncclNet[n];
+    int nDevs = 0;
+    NCCLCHECK(net->devices(&nDevs));
+    for (int d = 0; d < nDevs; ++d) {
+      ncclNetProperties_t props;
+      NCCLCHECK(net->getProperties(d, &props));
+      // look for an entry already recorded with the same net index and the same fabric ID
+      int sameId = -1;
+      for (int j = 0; j < info->netDevCount; ++j) {
+        if (n == info->netDevs[j].netIdx && props.fabricId == info->netDevs[j].fabricId) {
+          sameId = j;
+          break;
+        }
+      }
+      // if we haven't found the id already, append it (the array holds at most PEERINFO_NETDEV_MAXCOUNT entries)
+      if (sameId == -1) {
+        if (info->netDevCount == PEERINFO_NETDEV_MAXCOUNT) {
+          WARN("Node cannot have more than %d fabric IDs (found %d)", PEERINFO_NETDEV_MAXCOUNT, info->netDevCount + 1);
+          return ncclInternalError;
+        }
+        info->netDevs[info->netDevCount].netIdx = n;
+        info->netDevs[info->netDevCount].fabricId = props.fabricId;
+        info->netDevCount++;
+      }
+    }
+  }
+
   // MNNVL support
   {
     // MNNVL: Request the fabric UUID and partition info
@@ -690,6 +741,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     struct ncclTopoRanks topoRanks;
     int cpuArch;
     int cpuVendor;
+    int firstRankDc;
   };
 
   int nChannelsOrig;
@@ -705,7 +757,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   timers[TIMER_INIT_ALLGATHER] = clockNano();
   // AllGather1 - begin
   NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
-  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo + rank, comm->commHash), ret, fail);
   NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
 
   comm->cuMemSupport = 1;
@@ -892,9 +944,48 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
   // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
   timers[TIMER_INIT_CONNECT] = clockNano();
+
   // AllGather3 - begin
   NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
 
+  // allGather3Data for multiDC support: firstRankDc stays -1 until a peer with a DCL0 path is found below.
+  allGather3Data[rank].firstRankDc = -1;
+  int graphTypeInter [NCCL_NUM_ALGORITHMS];
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) graphTypeInter[a] = graphs[a]->typeInter;
+  for (int r = 0; r < comm->nRanks; ++r) {
+    // for each algorithm, gather the largest PATH between the GPU and the net that will be used to connect to the peer.
+    for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) {
+      for (int c = 0; c < graphs[a]->nChannels; ++c) {
+        int netPathType = -1; // defined value even if the lookup fails; only used when res == ncclSuccess
+        ncclResult_t res = ncclTopoGetNetDevFromGraph(comm, comm->rank, /*peerRank=*/r, graphs[a], c, NULL, NULL, &netPathType);
+        if (res != ncclSuccess && graphs[a]->pattern != NCCL_TOPO_PATTERN_NVLS) {
+          // Not finding a NIC for rank r is an error, unless this is an NVLS graph: there it is expected
+          // because not every rank is an NVLS head. Exit through ret/fail like the surrounding NCCLCHECKGOTO calls.
+          WARN("Unable to find a net dev to connect to %d with channel %d in graph[%d]", r, c, a);
+          ret = ncclInternalError; goto fail;
+        }
+        if (res == ncclSuccess && netPathType > graphTypeInter[a]) graphTypeInter[a] = netPathType;
+      }
+    }
+    struct ncclPeerInfo* rankInfo = &comm->peerInfo[rank];
+    struct ncclPeerInfo* peerInfo = &comm->peerInfo[r];
+    for (int d = 0; d < rankInfo->netDevCount; d++) {
+      int devNetIdx = rankInfo->netDevs[d].netIdx;
+      uint64_t devFabricId = rankInfo->netDevs[d].fabricId;
+      // skip if it's not the DC network or if we have found the first rank already
+      if (devNetIdx != comm->ncclDcNetIndex || allGather3Data[rank].firstRankDc != -1) continue;
+      for (int p = 0; p < peerInfo->netDevCount; p++) {
+        ncclNetPath_t path = {.loc = NET_LOC_DISC};
+        if (peerInfo->netDevs[p].netIdx == devNetIdx) NCCLCHECKGOTO(comm->ncclNet[devNetIdx]->getNetPath(peerInfo->netDevs[p].fabricId, devFabricId, &path), ret, fail);
+        if (path.loc <= NET_LOC_DCL0) {
+          allGather3Data[rank].firstRankDc = r;
+          break;
+        }
+      }
+    }
+  }
+
+
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
     allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
     allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
@@ -902,7 +993,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
     allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
     allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
-    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+    allGather3Data[rank].graphInfo[a].typeInter = graphTypeInter[a];
     allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
   }
 
@@ -975,6 +1066,47 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   comm->localRank = comm->rankToLocalRank[rank];
   comm->localRanks = comm->nodeRanks[comm->node].localRanks;
 
+  // multiDC support - assign a DC index to each node, then build the per-DC list of node indices
+  do {
+    comm->dcCount = 0;
+    int* firstRankDcNode = NULL; // per node: firstRankDc gathered from that node's first local rank
+    NCCLCHECKGOTO(ncclCalloc(&firstRankDcNode, comm->nNodes), ret, fail);
+    for (int n = 0; n < comm->nNodes; n++) {
+      struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n];
+      if (nodeRank->localRanks > 0) {
+        firstRankDcNode[n] = allGather3Data[nodeRank->localRankToRank[0]].firstRankDc;
+      }
+      // now find a previous node with the same firstRankDc and reuse its DC index
+      int m = 0;
+      for (m = 0; m < n; ++m) {
+        if (firstRankDcNode[m] == firstRankDcNode[n]) {
+          nodeRank->dcIndex = comm->nodeRanks[m].dcIndex;
+          break;
+        }
+      }
+      // haven't found any (the loop ran to m == n), create a new DC
+      if (m == n) nodeRank->dcIndex = comm->dcCount++;
+    }
+    free(firstRankDcNode);
+    // then build information for each DC: first count the nodes belonging to it
+    NCCLCHECKGOTO(ncclCalloc(&comm->dcNode, comm->dcCount), ret, fail);
+    for (int n = 0; n < comm->nNodes; n++) {
+      struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n];
+      comm->dcNode[nodeRank->dcIndex].localNodes++;
+    }
+    for (int dc = 0; dc < comm->dcCount; ++dc) {
+      struct ncclDcNode* dcRank = &comm->dcNode[dc];
+      NCCLCHECKGOTO(ncclCalloc(&dcRank->localNodeToNode, dcRank->localNodes), ret, fail);
+      dcRank->localNodes = 0; // reset to 0 to be able to fill the arrays
+    }
+    // store each node's index in the node list of its DC; localNodes is counted back up while filling
+    for (int n = 0; n < comm->nNodes; n++) {
+      struct ncclNodeRanks* nodeRank = &comm->nodeRanks[n];
+      struct ncclDcNode* dcNode = &comm->dcNode[nodeRank->dcIndex];
+      dcNode->localNodeToNode[dcNode->localNodes++] = n;
+    }
+  } while (0);
+
   TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
         rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
   if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 731dbcee1..e32f83f85 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -124,12 +124,13 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA
 #ifdef ENABLE_TRACE
   char line[SOCKET_NAME_MAXLEN+1];
 #endif
+  int nUserIfs;
   struct netIf userIfs[MAX_IFS];
   bool searchNot = prefixList && prefixList[0] == '^';
   if (searchNot) prefixList++;
   bool searchExact = prefixList && prefixList[0] == '=';
   if (searchExact) prefixList++;
-  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+  NCCLCHECK(parseIfList(prefixList, userIfs, MAX_IFS, &nUserIfs));
 
   int found = 0;
   struct ifaddrs *interfaces, *interface;
@@ -275,7 +276,9 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
   if (!ipv6) {
     struct netIf ni;
     // parse <ip_or_hostname>:<port> string, expect one pair
-    if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+    int nIfs;
+    NCCLCHECK(parseIfList(ip_port_pair, &ni, 1, &nIfs));
+    if (nIfs != 1) {
       WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
       return ncclInvalidArgument;
     }
@@ -450,6 +453,9 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
   return ncclSuccess;
 }
 
+NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1);
+NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1);
+
 static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
   const int one = 1;
   /* Set socket as non-blocking if async or if we need to be able to abort */
@@ -458,7 +464,11 @@ static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
     SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl");
     SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
   }
-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY");
+  // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1)
+  int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff();
+  if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF");
+  if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF");
   return ncclSuccess;
 }
 
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index bb59947e4..ffddce30b 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -132,20 +132,34 @@ uint64_t getPidHash(void) {
   return getHash(pname, strlen(pname));
 }
 
-int parseStringList(const char* string, struct netIf* ifList, int maxList) {
-  if (!string) return 0;
-
+ncclResult_t parseIfList(const char* string, struct netIf* ifList, int maxList, int *ifCount) {
   const char* ptr = string;
-
-  int ifNum = 0;
-  int ifC = 0;
+  int ifNum = 0, ifC = 0;
   char c;
+  if (!string) goto exit;
   do {
     c = *ptr;
     if (c == ':') {
       if (ifC > 0) {
         ifList[ifNum].prefix[ifC] = '\0';
-        ifList[ifNum].port = atoi(ptr+1);
+        ifList[ifNum].port = -1;
+        ifList[ifNum].fabricId = -1;
+        char* next = NULL;
+        const char* start = ptr + 1;
+        long port = strtol(start, &next, 10);
+        if (next != start) ifList[ifNum].port = port;
+        if (*next == ':') {
+          start = next + 1;
+          long fabId = strtol(start, &next, 10); // signed like strtol's result, so the < 0 check and %ld are valid
+          if (next != start) {
+            if (fabId < 0 || fabId > NCCL_IF_MAX_FABRICID) {
+              WARN("fabric ID %ld must be between 0 and  %ld.", fabId, NCCL_IF_MAX_FABRICID);
+              goto fail;
+            }
+            ifList[ifNum].fabricId = fabId;
+          }
+        }
+        INFO(NCCL_ENV | NCCL_NET, "found IF %s port %d fabricId %ld", ifList[ifNum].prefix, ifList[ifNum].port, ifList[ifNum].fabricId);
         ifNum++; ifC = 0;
       }
       while (c != ',' && c != '\0') c = *(++ptr);
@@ -153,6 +167,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
       if (ifC > 0) {
         ifList[ifNum].prefix[ifC] = '\0';
         ifList[ifNum].port = -1;
+        ifList[ifNum].fabricId = -1;
+        INFO(NCCL_ENV | NCCL_NET, "found IF %s port %d fabricId %ld", ifList[ifNum].prefix, ifList[ifNum].port, ifList[ifNum].fabricId);
         ifNum++; ifC = 0;
       }
     } else {
@@ -161,7 +177,12 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
     }
     ptr++;
   } while (ifNum < maxList && c);
-  return ifNum;
+exit:
+  if(ifCount) *ifCount = ifNum;
+  return ncclSuccess;
+fail:
+  if(ifCount) *ifCount = ifNum;
+  return ncclInvalidUsage;
 }
 
 static bool matchIf(const char* string, const char* ref, bool matchExact) {
@@ -178,18 +199,22 @@ static bool matchPort(const int port1, const int port2) {
 }
 
 
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
+bool indexIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact, int* index) {
+  if (index) *index = -1;
   // Make an exception for the case where no user list is defined
   if (listSize == 0) return true;
 
   for (int i=0; i<listSize; i++) {
-    if (matchIf(string, ifList[i].prefix, matchExact)
-        && matchPort(port, ifList[i].port)) {
+    if (matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
+      if(index) *index=i;
       return true;
     }
   }
   return false;
 }
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
+  return indexIfList(string,port,ifList,listSize,matchExact,NULL);
+}
 
 __thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
 
diff --git a/src/plugin/net.cc b/src/plugin/net.cc
index 9257d7786..d8998782f 100644
--- a/src/plugin/net.cc
+++ b/src/plugin/net.cc
@@ -5,31 +5,32 @@
  ************************************************************************/
 
 #include "net.h"
-#include "bootstrap.h"
 #include "checks.h"
 #include "plugin.h"
-
+#include "nccl_net.h"
 #include <string.h>
 #include <errno.h>
-//#include <sys/types.h>
-//#include <sys/stat.h>
-//#include <unistd.h>
 
 extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
 extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
 extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
 extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
 extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
+extern ncclNet_t* getNcclNet_v11(void* netPluginLib);
 
 extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
 extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
 extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
 extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
 extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
+extern ncclCollNet_t* getNcclCollNet_v11(void* netPluginLib);
+
+extern void* openNetPluginLib(const char* name);
+extern void closePluginLib(void* handle);
 
 static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
 ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
-static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
+static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, NCCL_NET_PLUGIN_VERSION, NCCL_NET_PLUGIN_VERSION };
 ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
 enum ncclNetState {
   ncclNetStateInit = 0,
@@ -72,8 +73,13 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
     goto fail;
   }
 
-  ncclNets[0] = getNcclNet_v10(netPluginLib);
-  if (ncclNets[0]) ncclNetsVer[0] = 10;
+  ncclNets[0] = getNcclNet_v11(netPluginLib);
+  if (ncclNets[0]) ncclNetsVer[0] = 11;
+  if (ncclNets[0] == nullptr) {
+    // Try v10 plugin
+    ncclNets[0] = getNcclNet_v10(netPluginLib);
+    if (ncclNets[0]) ncclNetsVer[0] = 10;
+  }
   if (ncclNets[0] == nullptr) {
     // Try v9 plugin
     ncclNets[0] = getNcclNet_v9(netPluginLib);
@@ -99,7 +105,10 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
   }
 
   // Check for CollNet
-  ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
+  ncclCollNets[0] = getNcclCollNet_v11(netPluginLib);
+  if (ncclCollNets[0] == nullptr) {
+    ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
+  }
   if (ncclCollNets[0] == nullptr) {
     ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
   }
@@ -123,6 +132,7 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
 fail:
   if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib));
   netPluginStatus = netPluginLoadFailed;
+  ncclNetsVer[0] = -1;
   goto exit;
 }
 
@@ -199,33 +209,48 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
 }
 
 ncclResult_t ncclNetInit(struct ncclComm* comm) {
-  // Initialize main communication network
-  const char* netName;
+  // get the list of user-provided nets, reuse the code for interfaces parsing.
+  int nUserIfs;
+  const char* netName = comm->config.netName;
+  struct netIf userNets[NCCL_NET_MAX_PLUGINS];
+  NCCLCHECK(parseIfList(netName, userNets, NCCL_NET_MAX_PLUGINS, &nUserIfs));
+
+  const char* dcNet = ncclGetEnv("NCCL_ALLNET_FASTNET");
+  comm->ncclDcNetIndex = 0;
+  comm->ncclNetCount = 0;
   bool ok = false;
-
-  netName = comm->config.netName;
-  for (int i=0; i<3; i++) {
+  for (int i=0; i<NCCL_NET_MAX_PLUGINS; i++) {
     if (ncclNets[i] == nullptr) continue;
     enum ncclNetState state;
     NCCLCHECK(netGetState(i, &state));
     if (state != ncclNetStateEnabled) continue;
-    if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
+    if (!matchIfList(ncclNets[i]->name, /*port=*/-1, userNets, nUserIfs, 1)) continue;
     if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
       // Mismatched device plugin version
       continue;
     }
 
-    comm->ncclNet = ncclNets[i];
-    comm->ncclNetVer = ncclNetsVer[i];
+    comm->ncclNetPluginIdx[comm->ncclNetCount] = i;
+    comm->ncclNetVer[comm->ncclNetCount] = ncclNetsVer[i];
+    comm->ncclNet[comm->ncclNetCount] = ncclNets[i];
+    if (dcNet && strcmp(dcNet, ncclNets[i]->name) == 0) comm->ncclDcNetIndex = i;
     ok = true;
 
-    if (ncclCollNets[i]) {
-      NCCLCHECK(collNetGetState(i, &state));
-      if (state == ncclNetStateEnabled) {
-        comm->ncclCollNet = ncclCollNets[i];
+    // try to load the collNet if we don't use ALLNET and it's the first net to be loaded
+    if (comm->ncclNetCount == 0 && ncclCollNets[i]) {
+      if (ncclParamAllNet()) {
+        INFO(NCCL_INIT | NCCL_NET, "NCCL_ALLNET_ENABLE=1, ignore CollNet. Set NCCL_ALLNET_ENABLE=0 if CollNet is needed.");
+      } else {
+        NCCLCHECK(collNetGetState(i, &state));
+        if (state == ncclNetStateEnabled) {
+          comm->ncclCollNetPluginIdx = i;
+          comm->ncclCollNet = ncclCollNets[i];
+        }
       }
     }
-    break;
+    comm->ncclNetCount++;
+
+    if (!ncclParamAllNet()) break;
   }
 
   if (!ok) {
@@ -236,7 +261,9 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
 }
 
 ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
-  comm->ncclNet = nullptr;
+  for (int n = 0; n < comm->ncclNetCount; ++n) {
+    comm->ncclNet[n] = nullptr;
+  }
   comm->ncclCollNet = nullptr;
   return ncclSuccess;
 }
@@ -259,14 +286,22 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
 	  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 	  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
   if (gdrSupportMatrix[comm->cudaDev] == -1) {
-    int netDevs;
-    NCCLCHECK(comm->ncclNet->devices(&netDevs));
     gdrSupportMatrix[comm->cudaDev] = 0;
-    for (int dev=0; dev<netDevs; dev++) {
-      // Find a net device which is GDR-capable
-      ncclNetProperties_t props;
-      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
-      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+    // Pick the FIRST (net, dev) pair that is GDR-capable; the outer loop stops once a net is selected.
+    int dev = -1;
+    ncclNet_t * net = NULL;
+    for (int n = 0; n < comm->ncclNetCount && net == NULL; ++n) {
+      int netDevs;
+      NCCLCHECK(comm->ncclNet[n]->devices(&netDevs));
+      for (dev = 0; dev < netDevs; dev++) {
+        ncclNetProperties_t props;
+        NCCLCHECK(comm->ncclNet[n]->getProperties(dev, &props));
+        // if cuda is supported, select the net and device, then stop searching
+        if (props.ptrSupport & NCCL_PTR_CUDA) { net = comm->ncclNet[n]; break; }
+      }
+    }
+    if (net == NULL) goto cleanup1; // no GDR-capable device on any net: keep gdrSupport at 0
 
     // Allocate memory on the GPU and try to register it on the NIC.
     void *lComm = NULL, *sComm = NULL, *rComm = NULL;
@@ -275,45 +310,37 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
     void* mHandle = NULL;
     ncclResult_t ret;
     ncclDebugNoWarn = NCCL_NET;
-    NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
+    NCCLCHECKGOTO(net->listen(dev, &handle, &lComm), ret, cleanup1);
 
     bool connected;
     connected = false;
     while (!connected) {
-
       // If we're aborting now, skip to cleanup
       if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
         goto cleanup2;
       }
-
       if (sComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
-
+        NCCLCHECKGOTO(net->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
       if (rComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
-
+        NCCLCHECKGOTO(net->accept(lComm, &rComm, NULL), ret, cleanup2);
       connected = (rComm != NULL) && (sComm != NULL);
     }
 
     NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
-    if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
-      NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
-      NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
-      NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+    if (net->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+      NCCLCHECK(net->deregMr(sComm, mHandle));
+      NCCLCHECK(net->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+      NCCLCHECK(net->deregMr(rComm, mHandle));
       gdrSupportMatrix[comm->cudaDev] = 1;
     }
     ncclDebugNoWarn = 0;
     NCCLCHECK(ncclCudaFree(gpuPtr));
-cleanup2:
-    if (rComm != NULL)
-      NCCLCHECK(comm->ncclNet->closeRecv(rComm));
-    if (sComm != NULL)
-      NCCLCHECK(comm->ncclNet->closeSend(sComm));
-    NCCLCHECK(comm->ncclNet->closeListen(lComm));
-cleanup1:
-      break;
-    }
+  cleanup2:
+    if (rComm != NULL) NCCLCHECK(net->closeRecv(rComm));
+    if (sComm != NULL) NCCLCHECK(net->closeSend(sComm));
+    NCCLCHECK(net->closeListen(lComm));
   }
+cleanup1:
   *gdrSupport = gdrSupportMatrix[comm->cudaDev];
   return ncclSuccess;
 }
diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc
index 682f239f7..a464e2188 100644
--- a/src/plugin/net/net_v10.cc
+++ b/src/plugin/net/net_v10.cc
@@ -8,25 +8,162 @@
 #include "net_device.h"
 #include "proxy.h"
 
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
 static ncclNet_v10_t* ncclNet_v10;
 static ncclCollNet_v10_t* ncclCollNet_v10;
 
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v10_t p10;
+  ncclResult_t ans = ncclNet_v10->getProperties(dev, &p10);
+  if (ans != ncclSuccess) return ans;
+  props->name = p10.name;
+  props->pciPath = p10.pciPath;
+  props->guid = p10.guid;
+  props->ptrSupport = p10.ptrSupport;
+  props->regIsGlobal = p10.regIsGlobal;
+  props->forceFlush = p10.forceFlush;
+  props->speed = p10.speed;
+  props->port = p10.port;
+  props->maxComms = p10.maxComms;
+  props->maxRecvs = p10.maxRecvs;
+  props->latency = p10.latency;
+  props->netDeviceType = p10.netDeviceType;
+  props->netDeviceVersion = p10.netDeviceVersion;
+  props->vProps.ndevs = p10.vProps.ndevs;
+  memcpy(props->vProps.devs, p10.vProps.devs, sizeof(p10.vProps.devs));
+  props->maxP2pBytes = p10.maxP2pBytes;
+  props->maxCollBytes = p10.maxCollBytes;
+  props->fabricId = 0; // all devs are on the same rail if v10
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v10_t p10;
+  ncclResult_t ans = ncclCollNet_v10->getProperties(dev, &p10);
+  if (ans != ncclSuccess) return ans;
+  props->name = p10.name;
+  props->pciPath = p10.pciPath;
+  props->guid = p10.guid;
+  props->ptrSupport = p10.ptrSupport;
+  props->regIsGlobal = p10.regIsGlobal;
+  props->forceFlush = p10.forceFlush;
+  props->speed = p10.speed;
+  props->port = p10.port;
+  props->maxComms = p10.maxComms;
+  props->maxRecvs = p10.maxRecvs;
+  props->latency = p10.latency;
+  props->netDeviceType = p10.netDeviceType;
+  props->netDeviceVersion = p10.netDeviceVersion;
+  props->vProps.ndevs = p10.vProps.ndevs;
+  memcpy(props->vProps.devs, p10.vProps.devs, sizeof(p10.vProps.devs));
+  props->maxP2pBytes = p10.maxP2pBytes;
+  props->maxCollBytes = p10.maxCollBytes;
+  props->fabricId = 0; // all devs are on the same rail if v10
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v10->connect(dev, (ncclNetCommConfig_v10_t*)config, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  return ncclNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t*)props);
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v10->init(logfn, proffn));
+  ncclNet.devices = ncclNet_v10->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v10->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_v10->accept;
+  ncclNet.regMr = ncclNet_v10->regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v10->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v10->deregMr;
+  ncclNet.isend = ncclNet_v10->isend;
+  ncclNet.irecv = ncclNet_v10->irecv;
+  ncclNet.iflush = ncclNet_v10->iflush;
+  ncclNet.test = ncclNet_v10->test;
+  ncclNet.closeSend = ncclNet_v10->closeSend;
+  ncclNet.closeRecv = ncclNet_v10->closeRecv;
+  ncclNet.closeListen = ncclNet_v10->closeListen;
+  ncclNet.getDeviceMr = ncclNet_v10->getDeviceMr;
+  ncclNet.irecvConsumed = ncclNet_v10->irecvConsumed;
+  ncclNet.makeVDevice = ncclNet_v10->makeVDevice ? ncclNet_makeVDevice : nullptr;
+  ncclNet.getNetPath = ncclNet_getNetPath;
+  return ncclSuccess;
+}
+
 ncclNet_t* getNcclNet_v10(void* lib) {
-  ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10");
+  ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); // symbol must match the v10 ABI this shim wraps
   if (ncclNet_v10) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name);
-    return ncclNet_v10;
+    ncclNet.name = ncclNet_v10->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name);
+    return &ncclNet;
   }
-  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol.");
+  INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol.");
   return nullptr;
 }
 
+static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
+  size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+  void* sendMhandle, void** request) {
+return ncclCollNet_v10->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v10_t*)recvParts, bytesPerRank,
+  windowOffset, windowBytes, sendMhandle, request);
+}
+
+static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData,
+      size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+      ncclDataType_t dataType, ncclRedOp_t redOp,
+      void* recvMhandle, void** request) {
+return ncclCollNet_v10->ireducescatter(collComm, nSendParts, (ncclNetSGE_v10_t*)sendParts, recvData, bytesPerRank,
+      windowOffset, windowBytes, dataType, redOp, recvMhandle, request);
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v10->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v10->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v10->listen;
+  ncclCollNet.connect = ncclCollNet_v10->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v10->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_v10->regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v10->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v10->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_v10->iallreduce;
+  ncclCollNet.iallgather = ncclCollNet_iallgather;
+  ncclCollNet.ireducescatter = ncclCollNet_ireducescatter;
+  ncclCollNet.iflush = ncclCollNet_v10->iflush;
+  ncclCollNet.test = ncclCollNet_v10->test;
+  ncclCollNet.closeColl = ncclCollNet_v10->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v10->closeListen;
+  ncclCollNet.getNetPath = ncclCollNet_getNetPath;
+  return ncclSuccess;
+}
+
 ncclCollNet_t* getNcclCollNet_v10(void* lib) {
   ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10");
   if (ncclCollNet_v10) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name);
-    return ncclCollNet_v10;
+    ncclCollNet.name = ncclCollNet_v10->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name);
+    return &ncclCollNet;
   }
-  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol.");
+  INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol.");
   return nullptr;
 }
diff --git a/src/plugin/net/net_v11.cc b/src/plugin/net/net_v11.cc
new file mode 100644
index 000000000..d1c8b7c36
--- /dev/null
+++ b/src/plugin/net/net_v11.cc
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+
+static ncclNet_v11_t* ncclNet_v11;
+static ncclCollNet_v11_t* ncclCollNet_v11;
+
+ncclNet_t* getNcclNet_v11(void* lib) {  // look up the v11 net plugin table in lib
+  ncclNet_v11 = (ncclNet_v11_t*)dlsym(lib, "ncclNetPlugin_v11");
+  if (ncclNet_v11 == nullptr) {  // symbol absent: report and let the caller try another version
+    INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v11 symbol.");
+    return nullptr;
+  }
+  INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded net plugin %s (v11)", ncclNet_v11->name);
+  return ncclNet_v11;  // returned as-is, without a wrapper table
+}
+
+ncclCollNet_t* getNcclCollNet_v11(void* lib) {  // look up the v11 collnet plugin table in lib
+  ncclCollNet_v11 = (ncclCollNet_v11_t*)dlsym(lib, "ncclCollNetPlugin_v11");
+  if (ncclCollNet_v11 == nullptr) {  // symbol absent: report and let the caller try another version
+    INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v11 symbol.");
+    return nullptr;
+  }
+  INFO(NCCL_INIT | NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v11)", ncclCollNet_v11->name);
+  return ncclCollNet_v11;  // returned as-is, without a wrapper table
+}
diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc
index baff67935..f6f743f02 100644
--- a/src/plugin/net/net_v6.cc
+++ b/src/plugin/net/net_v6.cc
@@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -71,6 +72,12 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si
   return ans;
 }
 
+static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
   ncclNetProperties_v6_t p6;
   ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
@@ -92,6 +99,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -110,6 +118,12 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void*
   return ans;
 }
 
+static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
   NCCLCHECK(ncclNet_v6->init(logfn));
   ncclNet.devices = ncclNet_v6->devices;
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
index 4bad5ec26..804a917db 100644
--- a/src/plugin/net/net_v7.cc
+++ b/src/plugin/net/net_v7.cc
@@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -67,6 +68,12 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si
   return ans;
 }
 
+static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
   ncclNetProperties_v7_t p7;
   ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7);
@@ -88,6 +95,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -106,6 +114,12 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void*
   return ans;
 }
 
+static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
   NCCLCHECK(ncclNet_v7->init(logfn));
   ncclNet.devices = ncclNet_v7->devices;
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
index b43bb895e..c014d237b 100644
--- a/src/plugin/net/net_v8.cc
+++ b/src/plugin/net/net_v8.cc
@@ -35,6 +35,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -61,6 +62,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si
   ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
   return ans;
 }
+static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t * path) {
+  if(!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
 
 static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
   ncclNetProperties_v8_t p8;
@@ -83,6 +89,7 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop
   props->vProps.devs[0] = dev;
   props->maxP2pBytes = MAX_NET_SIZE;
   props->maxCollBytes = MAX_COLLNET_SIZE;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
@@ -128,6 +135,12 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n
   return ans;
 }
 
+static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
   NCCLCHECK(ncclNet_v8->init(logfn));
   ncclNet.devices = ncclNet_v8->devices;
@@ -148,6 +161,7 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t
   ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr;
   ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed;
   ncclNet.makeVDevice   = NULL;
+  ncclNet.getNetPath = ncclNet_getNetPath;  // use the net helper (not the collnet one), matching net_v9.cc
   return ncclSuccess;
 }
 
@@ -180,6 +194,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
   ncclCollNet.test = ncclCollNet_v8->test;
   ncclCollNet.closeColl = ncclCollNet_v8->closeColl;
   ncclCollNet.closeListen = ncclCollNet_v8->closeListen;
+  ncclCollNet.getNetPath = ncclCollNet_getNetPath;
   return ncclSuccess;
 }
 
diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc
index 34e039332..66c1cefbe 100644
--- a/src/plugin/net/net_v9.cc
+++ b/src/plugin/net/net_v9.cc
@@ -4,10 +4,11 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#include "debug.h"
 #include "nccl_net.h"
 #include "net_device.h"
-#include "proxy.h"
 #include "checks.h"
+#include <dlfcn.h>
 
 static ncclNet_t ncclNet;
 static ncclCollNet_t ncclCollNet;
@@ -15,7 +16,28 @@ static ncclNet_v9_t* ncclNet_v9;
 static ncclCollNet_v9_t* ncclCollNet_v9;
 
 static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
-  return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+  ncclNetProperties_v9_t p9;
+  ncclResult_t ans = ncclNet_v9->getProperties(dev, &p9);
+  if (ans != ncclSuccess) return ans;
+  props->name = p9.name;
+  props->pciPath = p9.pciPath;
+  props->guid = p9.guid;
+  props->ptrSupport = p9.ptrSupport;
+  props->regIsGlobal = p9.regIsGlobal;
+  props->forceFlush = p9.forceFlush;
+  props->speed = p9.speed;
+  props->port = p9.port;
+  props->maxComms = p9.maxComms;
+  props->maxRecvs = p9.maxRecvs;
+  props->latency = p9.latency;
+  props->netDeviceType = p9.netDeviceType;
+  props->netDeviceVersion = p9.netDeviceVersion;
+  props->vProps.ndevs = p9.vProps.ndevs;
+  memcpy(props->vProps.devs, p9.vProps.devs, sizeof(p9.vProps.devs));
+  props->maxP2pBytes = p9.maxP2pBytes;
+  props->maxCollBytes = p9.maxCollBytes;
+  props->fabricId= 0; // all devs are on the same rail if v9
+  return ncclSuccess;
 }
 
 static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
@@ -34,8 +56,35 @@ static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) {
   return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props);
 }
 
+static ncclResult_t ncclNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
+
 static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
-  return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props);
+  ncclNetProperties_v9_t p9;
+  ncclResult_t ans = ncclCollNet_v9->getProperties(dev, &p9);
+  if (ans != ncclSuccess) return ans;
+  props->name = p9.name;
+  props->pciPath = p9.pciPath;
+  props->guid = p9.guid;
+  props->ptrSupport = p9.ptrSupport;
+  props->regIsGlobal = p9.regIsGlobal;
+  props->forceFlush = p9.forceFlush;
+  props->speed = p9.speed;
+  props->port = p9.port;
+  props->maxComms = p9.maxComms;
+  props->maxRecvs = p9.maxRecvs;
+  props->latency = p9.latency;
+  props->netDeviceType = p9.netDeviceType;
+  props->netDeviceVersion = p9.netDeviceVersion;
+  props->vProps.ndevs = p9.vProps.ndevs;
+  memcpy(props->vProps.devs, p9.vProps.devs, sizeof(p9.vProps.devs));
+  props->maxP2pBytes = p9.maxP2pBytes;
+  props->maxCollBytes = p9.maxCollBytes;
+  props->fabricId= 0; // all devs are on the same rail if v9
+  return ncclSuccess;
 }
 
 static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts,
@@ -52,6 +101,11 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n
   return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank,
                                  windowOffset, windowBytes, dataType, redOp, recvMhandle, request);
 }
+static ncclResult_t ncclCollNet_getNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  if (!path) return ncclInvalidArgument;
+  path->loc = (fabricId0 == fabricId1) ? NET_LOC_DCL0 : NET_LOC_DISC;
+  return ncclSuccess;
+}
 
 static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
   NCCLCHECK(ncclNet_v9->init(logfn));
@@ -73,6 +127,7 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t
   ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr;
   ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed;
   ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr;
+  ncclNet.getNetPath = ncclNet_getNetPath;
   return ncclSuccess;
 }
 
@@ -105,6 +160,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
   ncclCollNet.test = ncclCollNet_v9->test;
   ncclCollNet.closeColl = ncclCollNet_v9->closeColl;
   ncclCollNet.closeListen = ncclCollNet_v9->closeListen;
+  ncclCollNet.getNetPath = ncclCollNet_getNetPath;
   return ncclSuccess;
 }
 
diff --git a/src/proxy.cc b/src/proxy.cc
index 7e8021e47..060521336 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -1821,8 +1821,10 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
     proxyState->p2pChunkSize = comm->p2pChunkSize;
     proxyState->nChannels = comm->nChannels;
     proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers;
-    proxyState->dmaBufSupport = comm->dmaBufSupport;
-    proxyState->ncclNet = comm->ncclNet;
+    for (int n = 0; n < comm->ncclNetCount; ++n) {
+      proxyState->dmaBufSupport[n] = comm->dmaBufSupport[n];
+      proxyState->ncclNet[n] = comm->ncclNet[n];
+    }
     proxyState->ncclCollNet = comm->ncclCollNet;
     proxyState->profilerContext = comm->profilerContext;
     proxyState->directMode = comm->directMode;
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index c1ccfcaa8..1eaa87a97 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -168,7 +168,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   int proxyRank;
   int64_t netId;
-  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, -1, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
@@ -188,11 +188,11 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   int proxyRank;
   int64_t netId;
-  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, -1, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
   recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
   // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, /*netIdx*/ 0, req.netDev, myInfo->rank, &req.needFlush));
 
   recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
@@ -330,7 +330,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
   connection->collNet = req->collNet;
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[0] && (props.ptrSupport & NCCL_PTR_DMABUF);
   /* collective size limits*/
   resources->maxCollBytes = props.maxCollBytes;
   if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
@@ -448,7 +448,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props));
   connection->collNet = req->collNet;
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[0] && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxCollBytes = props.maxCollBytes;
   if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
     WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 40d334fa7..e72d5b3bb 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -7,6 +7,7 @@
 #include "comm.h"
 #include "net.h"
 #include "graph.h"
+#include "graph/topo.h"
 #include "proxy.h"
 #include "collectives.h"
 #include "gdrwrap.h"
@@ -109,6 +110,7 @@ struct sendNetResources {
   ncclNetDeviceType netDeviceType;
   ncclNetDeviceHandle_t* netDeviceHandle;
   size_t maxP2pBytes;
+  ncclNet_t* ncclNet;
 };
 
 struct recvNetResources {
@@ -142,6 +144,7 @@ struct recvNetResources {
   ncclNetDeviceType netDeviceType;
   ncclNetDeviceHandle_t* netDeviceHandle;
   size_t maxP2pBytes;
+  ncclNet_t* ncclNet;
 };
 
 struct netRegInfo {
@@ -167,6 +170,7 @@ struct setupReq {
   int tpLocalRank;
   int tpRemoteRank;
   int shared;
+  int netIdx; // net index
   int netDev;
   enum ncclTopoGdrMode useGdr;
   int needFlush;
@@ -201,7 +205,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   int proxyRank;
   int64_t netId;
-  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, /*pxnRank=*/peerInfo->rank, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoIdToNetIdx(comm, netId, &req.netIdx));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
   if (!req.useGdr && connIndex == 0) comm->useGdr = 0;
@@ -214,11 +219,11 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
 
   if (proxyRank == myInfo->rank) {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev,
         req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "",
         req.shared ? "/Shared" : "");
   } else {
-    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev,
+    INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev,
         proxyRank,
         req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "",
         req.shared ? "/Shared" : "");
@@ -244,13 +249,14 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   // Use myInfo->rank as the receiver uses its own NIC
   int proxyRank;
   int64_t netId;
-  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, /*pxnRank=*/myInfo->rank, &netId, &req.netDev, &proxyRank));
+  NCCLCHECK(ncclTopoIdToNetIdx(comm, netId, &req.netIdx));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
   recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
   if (!req.useGdr && connIndex == 0) comm->useGdr = 0;
 
   // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netIdx, req.netDev, myInfo->rank, &req.needFlush));
 
   // We don't support PXN on receive yet
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
@@ -260,7 +266,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
   memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int));
-  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet[req.netIdx]->name, req.netDev,
       req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "",
       req.shared ? "/Shared" : "");
   return ncclSuccess;
@@ -613,6 +619,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
+  resources->ncclNet = proxyState->ncclNet[req->netIdx];
   resources->tpRank = req->tpRank;
   resources->tpLocalRank = req->tpLocalRank;
   resources->tpRemoteRank = req->tpRemoteRank;
@@ -622,9 +629,9 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   resources->channelId = req->channelId;
   resources->connIndex = req->connIndex;
   ncclNetProperties_t props;
-  NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
+  NCCLCHECK(resources->ncclNet->getProperties(req->netDev, &props));
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[req->netIdx] && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
   resources->netDeviceVersion = props.netDeviceVersion;
   resources->netDeviceType = props.netDeviceType;
@@ -653,6 +660,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
+  resources->ncclNet = proxyState->ncclNet[req->netIdx];
   resources->tpRank = req->tpRank;
   resources->tpLocalRank = req->tpLocalRank;
   resources->tpRemoteRank = req->tpRemoteRank;
@@ -663,9 +671,9 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   resources->channelId = req->channelId;
   resources->connIndex = req->connIndex;
   ncclNetProperties_t props;
-  NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props));
+  NCCLCHECK(resources->ncclNet->getProperties(req->netDev, &props));
   /* DMA-BUF support */
-  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport[req->netIdx] && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
   resources->netDeviceVersion = props.netDeviceVersion;
   resources->netDeviceType = props.netDeviceType;
@@ -678,7 +686,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   }
 
   if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
-  NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
+  NCCLCHECK(resources->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
   *done = 1;
 
   return ncclSuccess;
@@ -732,15 +740,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
       struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
-      if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
+      if (comms->sendComm[resources->channelId] == NULL) ret = resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
       resources->netSendComm = comms->sendComm[resources->channelId];
       if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
     } else {
-      ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+      ret = resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
     }
   } else {
     // Connect to remote peer
-    ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+    ret = resources->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -839,17 +847,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
       if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
         int dmabuf_fd;
         CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
-        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
       } else // FALL-THROUGH to nv_peermem GDR path
 #endif
       {
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
 
       // Copy the mhandle dptr, if implemented
-      if (resources->netDeviceHandle && proxyState->ncclNet->getDeviceMr)
-        NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p]));
+      if (resources->netDeviceHandle && resources->ncclNet->getDeviceMr)
+        NCCLCHECK(resources->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p]));
     }
   }
 
@@ -886,15 +894,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
       struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
-      if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle);
+      if (comms->recvComm[resources->channelId] == NULL) ret = resources->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle);
       resources->netRecvComm = comms->recvComm[resources->channelId];
       if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
     } else {
-      ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
+      ret = resources->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
     }
   } else {
     // Connect to remote peer
-    ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
+    ret = resources->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -912,7 +920,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
     connection->needsProxyProgress = 1;
   }
 
-  NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm));
+  NCCLCHECK(resources->ncclNet->closeListen(resources->netListenComm));
 
   // Create structures
   struct connectMap* map = &resources->map;
@@ -983,17 +991,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
       if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
         int dmabuf_fd;
         CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
-        NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
         (void)close(dmabuf_fd);
       } else // FALL-THROUGH to nv_peermem GDR path
 #endif
       {
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
 
       // Copy the mhandle dptr
-      if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && proxyState->ncclNet->getDeviceMr)
-        NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p]));
+      if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && resources->ncclNet->getDeviceMr)
+        NCCLCHECK(resources->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p]));
     }
   }
 
@@ -1013,7 +1021,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
   if (connection->state == connConnected) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       if (resources->buffers[p]) {
-        NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p]));
       }
     }
     struct connectMapMem* mems = resources->map.mems;
@@ -1035,12 +1043,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
         struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank;
         comms->sendRefCount[resources->channelId]--;
-        if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId]));
+        if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(resources->ncclNet->closeSend(comms->sendComm[resources->channelId]));
       } else {
-        NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
+        NCCLCHECK(resources->ncclNet->closeSend(resources->netSendComm));
       }
     } else {
-      NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm));
+      NCCLCHECK(resources->ncclNet->closeSend(resources->netSendComm));
     }
   }
 
@@ -1058,7 +1066,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
   if (connection->state == connConnected) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       if (resources->buffers[p]) {
-        NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p]));
+        NCCLCHECK(resources->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p]));
       }
     }
     struct connectMapMem* mems = resources->map.mems;
@@ -1076,12 +1084,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
         struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank;
         comms->recvRefCount[resources->channelId]--;
-        if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId]));
+        if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(resources->ncclNet->closeRecv(comms->recvComm[resources->channelId]));
       } else {
-        NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
+        NCCLCHECK(resources->ncclNet->closeRecv(resources->netRecvComm));
       }
     } else {
-      NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm));
+      NCCLCHECK(resources->ncclNet->closeRecv(resources->netRecvComm));
     }
   }
 
@@ -1193,7 +1201,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
             // Coverity complains about the size here as pointing to an out-of-scope temporary.  Which is nonsense,
             // since size is a plain integer.
             // coverity[use_invalid:FALSE]
-            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot));
+            NCCLCHECK(resources->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle,sub, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
               TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle);
               sub->transSize += size;
@@ -1212,7 +1220,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
         int done;
         int size;
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
+        NCCLCHECK(resources->ncclNet->test(sub->requests[buffSlot], &done, &size));
         if (done) {
           // Make sure size is reset to -1 before we update the head.
           connFifo[buffSlot].size = -1;
@@ -1353,7 +1361,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
         bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1);
         if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION;
-        NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr));
+        NCCLCHECK(resources->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr));
         if (*requestPtr) {
           subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr;
           subGroup->recvRequestsSubCount = subCount;
@@ -1381,7 +1389,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         int sizes[NCCL_PROXY_MAX_SUBS];
         void* mhandles[NCCL_PROXY_MAX_SUBS];
         for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
-        NCCLCHECK(proxyState->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes));
+        struct recvNetResources* resources = (struct recvNetResources*)(subGroup->connection->transportResources);
+        NCCLCHECK(resources->ncclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes));
         if (done) {
           int needFlush = 0;
           int totalSize = 0;
@@ -1437,7 +1446,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
                 }
               }
               struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
-              NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
+              NCCLCHECK(resources->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
             }
           }
           args->idle = 0;
@@ -1452,7 +1461,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         uint64_t step = subGroup->transmitted;
         int done = 1;
         void* request = subGroup->requests[step%NCCL_STEPS];
-        if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL));
+        struct recvNetResources* resources = (struct recvNetResources*)(subGroup->connection->transportResources);
+        if (request) NCCLCHECK(resources->ncclNet->test(request, &done, NULL));
         if (done) {
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup + i;
@@ -1490,8 +1500,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               sub->transmitted > sub->done) {
             if (subGroup->recvRequestsCache[sub->done%NCCL_STEPS]) {
               // the multirecv requests are only cached in the first sub.
-              if (proxyState->ncclNet->irecvConsumed)
-                NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS]));
+              if (resources->ncclNet->irecvConsumed)
+                NCCLCHECK(resources->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS]));
               subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
             }
             sub->done += args->sliceSteps;
@@ -1662,14 +1672,14 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
   if (resources->useDmaBuf) {
     int dmabuf_fd;
     CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem);
-    NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
+    NCCLCHECKGOTO(resources->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
     (void)close(dmabuf_fd);
     needReg = false;
   }
 peermem:
 #endif
   if (needReg) {
-    NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+    NCCLCHECKGOTO(resources->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
   }
 
 exit:
@@ -1696,14 +1706,14 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s
   if (resources->useDmaBuf) {
     int dmabuf_fd;
     CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem);
-    NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
+    NCCLCHECKGOTO(resources->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
     (void)close(dmabuf_fd);
     needReg = false;
   }
 peermem:
 #endif
   if (needReg) {
-    NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+    NCCLCHECKGOTO(resources->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
   }
 
 exit:
@@ -1721,7 +1731,7 @@ static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection,
 
   assert(reqSize == sizeof(void*));
   memcpy(&handle, reqBuff, sizeof(void*));
-  NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, handle));
+  NCCLCHECK(resources->ncclNet->deregMr(resources->netSendComm, handle));
   *done = 1;
   return ncclSuccess;
 }
@@ -1732,7 +1742,7 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection,
 
   assert(reqSize == sizeof(void*));
   memcpy(&handle, reqBuff, sizeof(void*));
-  NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, handle));
+  NCCLCHECK(resources->ncclNet->deregMr(resources->netRecvComm, handle));
   *done = 1;
   return ncclSuccess;
 }
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index bfff6e555..2fba4d69d 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -14,6 +14,7 @@
 #include "profiler/net_ib.h"
 
 #include <assert.h>
+#include <cstdint>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -77,6 +78,7 @@ struct alignas(64) ncclIbDev {
   struct ibv_port_attr portAttr;
   struct ncclIbStats stats;
   int dmaBufSupported;
+  uint64_t fid;
 };
 
 #define MAX_IB_DEVS  32
@@ -543,6 +545,10 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
         props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link));
       return ncclInvalidUsage;
     }
+    if (dev->fid != dev0->fid) {
+      WARN("NET/IB : Trying to merge multiple devices together with different fabric ID properties %s -> %lu, %s -> %lu.", dev0->devName, dev0->fid, dev->devName, dev->fid);
+      return ncclInvalidUsage;
+    }
   }
 
   *d = ncclNMergedIbDevs++;
@@ -559,6 +565,32 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
 
 static ncclProfilerCallback_t ncclProfilerFunction;
 
+// NCCL_IF_FABRICID_MAX is set to (1<<48), all the bits above are available for default values
+NCCL_PARAM(IbDcMaxRail, "IB_FABRICID_MAXRAIL", (1L<<62)); // divisor splitting a fabric ID into dcId (quotient) and railId (remainder)
+NCCL_PARAM(IbDefaultFabricId, "IB_FABRICID_DEFAULT", (1L << 60)); // fabric ID given to InfiniBand ports that have no user-provided one
+NCCL_PARAM(RoceDefaultFabricId, "ROCE_FABRICID_DEFAULT", (1L << 61)); // fabric ID given to non-InfiniBand ports that have no user-provided one
+
+// Fabric IDs are constructed as dcId * NCCL_IB_FABRICID_MAXRAIL + railId.
+// Two fabric IDs are connected if they have the same rail ID.
+// If they share the same dcId they are connected with LOC_DCL0 (level 0); if not, they are connected with LOC_DCL1 (level 1).
+// Note: default fabric IDs cannot be associated with a specific rail ID or DC. By default, they correspond to their own DC.
+ncclResult_t ncclIbgetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
+  // Writes the link class between two fabric IDs to path->loc; ncclInvalidArgument on NULL path or a zero max-rail divisor.
+  if (!path) return ncclInvalidArgument;
+  // Default fabric IDs carry no dcId/railId encoding: dividing by UINT64_MAX yields dcId 0 and railId == fabricId.
+  uint64_t maxRail0 = (fabricId0 == ncclParamIbDefaultFabricId() || fabricId0 == ncclParamRoceDefaultFabricId()) ? UINT64_MAX : ncclParamIbDcMaxRail();
+  uint64_t maxRail1 = (fabricId1 == ncclParamIbDefaultFabricId() || fabricId1 == ncclParamRoceDefaultFabricId()) ? UINT64_MAX : ncclParamIbDcMaxRail();
+  if (maxRail0 == 0 || maxRail1 == 0) return ncclInvalidArgument; // NCCL_IB_FABRICID_MAXRAIL=0 would divide by zero below
+  uint64_t dcId0 = fabricId0 / maxRail0;
+  uint64_t dcId1 = fabricId1 / maxRail1;
+  uint64_t railId0 = fabricId0 % maxRail0;
+  uint64_t railId1 = fabricId1 % maxRail1;
+  if (railId0 != railId1) path->loc = NET_LOC_DISC; // different rails
+  else if (dcId0 == dcId1) path->loc = NET_LOC_DCL0; // same rail, same DC
+  else path->loc = NET_LOC_DCL1; // same rail, different DC
+  return ncclSuccess;
+}
+
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
   ncclResult_t ret = ncclSuccess;
   ncclProfilerFunction = profFunction;
@@ -590,7 +622,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
       if (searchNot) userIbEnv++;
       bool searchExact = userIbEnv && userIbEnv[0] == '=';
       if (searchExact) userIbEnv++;
-      int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
+      int nUserIfs;
+      NCCLCHECK(parseIfList(userIbEnv, userIfs, MAX_IB_DEVS, &nUserIfs));
 
       if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
 
@@ -619,9 +652,22 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
               && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
           // check against user specified HCAs/ports
-          if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) {
-            continue;
+          int indexUserIf = -1;
+          if (!(indexIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact, &indexUserIf) ^ searchNot)) continue;
+
+          // create the default fabric ID, use the user provided one if available
+          uint64_t fabId = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? (uint64_t)ncclParamIbDefaultFabricId() : (uint64_t)ncclParamRoceDefaultFabricId();
+          if (indexUserIf != -1) {
+            int64_t ifFabricId = userIfs[indexUserIf].fabricId;
+            if (ifFabricId == ncclParamIbDefaultFabricId() || ifFabricId == ncclParamRoceDefaultFabricId()) {
+              INFO(NCCL_NET, "Cannot use device %s because the associated fabric Id = %ld conflicts with the default IB = %ld or RoCE = %ld ones. "
+                   "Please consider changing the value of NCCL_IB_FABRICID_DEFAULT and NCCL_ROCE_FABRICID_DEFAULT to avoid conflicts.",
+                   devices[d]->name, ifFabricId, ncclParamIbDefaultFabricId(), ncclParamRoceDefaultFabricId());
+              continue; // skip this device: its user-provided fabric id collides with one of the default ids
+            }
+            if (ifFabricId >= 0) fabId = (uint64_t)ifFabricId;
           }
+
           pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
           ncclIbDevs[ncclNIbDevs].device = d;
           ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
@@ -632,6 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
           ncclIbDevs[ncclNIbDevs].context = context;
           ncclIbDevs[ncclNIbDevs].pdRefs = 0;
           ncclIbDevs[ncclNIbDevs].pd = NULL;
+          ncclIbDevs[ncclNIbDevs].fid = fabId;
           strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
           NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
           ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
@@ -645,8 +692,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
           ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
           if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
 
-          TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
-              NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
+          TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d fabricId=%lu", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
+              NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar, ncclIbDevs[ncclNIbDevs].fid); // fid: same field the device init assigns the fabric id to
 
           PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
           ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
@@ -786,6 +833,7 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) {
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->fabricId = ibDev->fid;
   pthread_mutex_unlock(&ibDev->lock);
   return ncclSuccess;
 }
@@ -2485,7 +2533,8 @@ ncclNet_t ncclNetIb = {
   ncclIbCloseListen,
   NULL /* getDeviceMr */,
   NULL /* irecvConsumed */,
-  ncclIbMakeVDevice
+  ncclIbMakeVDevice,
+  ncclIbgetNetPath
 };
 
 /*
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 8034d95fe..15e28e25a 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -116,18 +116,20 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->fabricId = 0;
   return ncclSuccess;
 }
 
 /* Communication functions */
 
-#define MAX_SOCKETS 64
+#define MAX_SOCKETS 128
 #define MAX_THREADS 16
 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
-#define MIN_CHUNKSIZE (64*1024)
 
 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
 NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+NCCL_PARAM(SocketInlineSize, "SOCKET_INLINE", /*1 kiB=*/1 << 10);
+NCCL_PARAM(SocketMinTaskSize, "SOCKET_MIN_TASKSIZE", /*64 kiB=*/1 << 16);
 
 enum ncclNetSocketCommState {
   ncclNetSocketCommStateStart = 0,
@@ -171,6 +173,7 @@ struct ncclNetSocketRequest {
   int op;
   void* data;
   int size;
+  void* inlineData;
   struct ncclSocket* ctrlSock;
   int offset;
   int used;
@@ -211,6 +214,7 @@ struct ncclNetSocketComm {
   int nSocks;
   int nThreads;
   int nextSock;
+  void* inlineData;
   struct ncclNetSocketRequest requests[MAX_REQUESTS];
   pthread_t helperThread[MAX_THREADS];
   struct ncclNetSocketThreadResources threadResources[MAX_THREADS];
@@ -360,6 +364,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm)
   goto exit;
 }
 
+#define SOCKET_CTRL_SIZE (sizeof(int))
 ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
@@ -401,6 +406,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op
     NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done));
     if (done == 0) return ncclSuccess;
   }
+  NCCLCHECK(ncclCalloc(&comm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize())));
   *sendComm = comm;
   return ncclSuccess;
 }
@@ -449,6 +455,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic
       memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
     free(sock);
   }
+  NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize())));
   *recvComm = rComm;
 
   /* reset lComm state */
@@ -470,6 +477,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi
       r->used = 1;
       r->comm = comm;
       r->nSubs = 0;
+      r->inlineData = (uint8_t*)comm->inlineData + i * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize());
       *req = r;
       return ncclSuccess;
     }
@@ -520,6 +528,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro
   return ncclInternalError;
 }
 
+// Number of payload bytes to inline: dataSize itself when it does not exceed ncclParamSocketInlineSize(), otherwise 0 so that larger payloads avoid the extra copy.
+static int ncclNetSocketInlineSize(int dataSize) { return (dataSize > ncclParamSocketInlineSize()) ? 0 : dataSize; }
+
 ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
   *done = 0;
   struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request;
@@ -527,37 +538,50 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
     WARN("NET/Socket : test called with NULL request");
     return ncclInternalError;
   }
-  if (r->used == 1) { /* try to send/recv size */
-    int data = r->size;
+  if (r->used == 1) { /* try to send/recv size (+ inline data if any) */
+    int msgSize;
+    uint8_t* msg = (uint8_t*)r->inlineData;
+    if (r->op == NCCL_SOCKET_SEND) {
+      int inlineSize = ncclNetSocketInlineSize(r->size);
+      msgSize = inlineSize + SOCKET_CTRL_SIZE;
+      ((int*)(msg))[0] = r->size;
+      if (inlineSize > 0) memcpy(msg + SOCKET_CTRL_SIZE, r->data, inlineSize);
+    } else {
+      int sizeOffset = 0;
+      while (sizeOffset < SOCKET_CTRL_SIZE) {
+        NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, SOCKET_CTRL_SIZE, &sizeOffset));
+        if (sizeOffset == 0) return ncclSuccess; /* not ready yet*/
+      }
+      int senderSize = ((int*)(msg))[0];
+      if (senderSize > r->size) {
+        char line[SOCKET_NAME_MAXLEN + 1];
+        union ncclSocketAddress addr;
+        NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
+        WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, "
+             "there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
+             ncclSocketToString(&addr, line), senderSize, r->size);
+        return ncclInvalidUsage;
+      }
+      // from the actual size, extract the remaining inline size to be received and redirect the msg buffer to the user data
+      r->size = senderSize;
+      msgSize = ncclNetSocketInlineSize(r->size);
+      msg = (uint8_t*)r->data;
+    }
     int offset = 0;
-    NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
-
-    if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
-
-    // Not sure we could ever receive less than 4 bytes, but just in case ...
-    if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
-
-    // Check size is less or equal to the size provided by the user
-    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
-      char line[SOCKET_NAME_MAXLEN+1];
-      union ncclSocketAddress addr;
-      NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
-      WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
-          there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
-          ncclSocketToString(&addr, line), data, r->size);
-      return ncclInvalidUsage;
+    while (offset < msgSize) { // a recv has already consumed the size header and overwritten r->size, so it must finish here; only a send that wrote nothing may be retried by a later call
+      NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, msgSize, &offset));
+      if (offset == 0 && r->op == NCCL_SOCKET_SEND) return ncclSuccess; /* nothing sent yet, retry later */
     }
-    r->size = data;
-    r->offset = 0;
-    r->used = 2; // done exchanging size
-    // divide into subtasks
-    int chunkOffset = 0, i = 0;
+    // done exchanging sizes, r->size now contains the actual size
+    r->used = 2;
+    r->offset = ncclNetSocketInlineSize(r->size);
+    int chunkOffset = r->offset, i = 0;
     if (r->comm->nSocks > 0) {
-      // each request can be divided up to nSocks tasks
-      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
+      // each request can be divided up to nSocks tasks, we use the size left to transfer
+      int taskSize = std::max((int)ncclParamSocketMinTaskSize(), DIVUP(r->size - r->offset, r->comm->nSocks));
       while (chunkOffset < r->size) {
-        int chunkSize = std::min(taskSize, r->size-chunkOffset);
-        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        int chunkSize = std::min(taskSize, r->size - chunkOffset);
+        NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + i++));
         chunkOffset += chunkSize;
       }
     }
@@ -673,11 +697,18 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
       NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready));
       if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i]));
     }
+    if(comm->inlineData) free(comm->inlineData);
     free(comm);
   }
   return ncclSuccess;
 }
 
+ncclResult_t ncclSocketgetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) { // equal fabric ids -> NET_LOC_DCL0, otherwise NET_LOC_DISC
+  if (path == NULL) return ncclInvalidArgument; // no output location to write to
+  path->loc = (fabricId0 != fabricId1) ? NET_LOC_DISC : NET_LOC_DCL0;
+  return ncclSuccess;
+}
+
 ncclNet_t ncclNetSocket = {
   "Socket",
   ncclNetSocketInit,
@@ -698,5 +729,6 @@ ncclNet_t ncclNetSocket = {
   ncclNetSocketCloseListen,
   NULL /* getDeviceMr */,
   NULL /* irecvConsumed */,
-  NULL /* mergeDevices */
+  NULL /* mergeDevices */,
+  ncclSocketgetNetPath
 };