Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions ext-net/example/nccl/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);

#include "net_v11.h"
#include "net_v10.h"
#include "net_v9.h"
#include "net_v8.h"
Expand All @@ -35,9 +36,10 @@ typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* p
#include "net_v3.h"
#include "net_v2.h"

typedef ncclNet_v10_t ncclNet_t;
typedef ncclNetProperties_v10_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
typedef ncclNet_v11_t ncclNet_t;
typedef ncclNetProperties_v11_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
typedef ncclNetPath_v11_t ncclNetPath_t;

#endif // end include guard
3 changes: 2 additions & 1 deletion ext-net/example/nccl/net_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef struct {
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;

#endif
115 changes: 115 additions & 0 deletions ext-net/example/nccl/net_v11.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NET_V11_H_
#define NET_V11_H_

// Capacity of the devs[] array in ncclNetVDeviceProps_v11_t.
#define NCCL_NET_MAX_DEVS_PER_NIC_V11 4

// Device list attached to a NIC; embedded in ncclNetProperties_v11_t (vProps)
// and passed to makeVDevice to describe the virtual NIC to create.
typedef struct {
// Number of entries in devs[] -- presumably only the first ndevs are meaningful; TODO confirm
int ndevs;
// Device indices, at most NCCL_NET_MAX_DEVS_PER_NIC_V11 of them.
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V11];
} ncclNetVDeviceProps_v11_t;

// NOTE(review): presumably the trafficClass value meaning "no traffic class
// requested" -- confirm against the callers that fill ncclNetCommConfig_v11_t.
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1

// Per-connection configuration handed to the plugin's connect() entry point.
typedef struct {
// Plugin-specific TC value
int trafficClass;
} ncclNetCommConfig_v11_t;

// Properties of one network device, filled in by the plugin's getProperties()
// entry point (v11 layout).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int forceFlush; // Force a flush on receives
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
ncclNetVDeviceProps_v11_t vProps; // Device list for this NIC (see ncclNetVDeviceProps_v11_t)
size_t maxP2pBytes; // Max transfer size for point-to-point operations
size_t maxCollBytes; // Max transfer size for collective operations
uint64_t fabricId; // Fabric handle associated to the current device; input to getNetPath()
} ncclNetProperties_v11_t;

// Relative location of two fabrics, as reported through ncclNetPath_v11_t.
// NOTE(review): "DC" presumably stands for data center -- confirm.
typedef enum {
NET_LOC_DCL0 = 0 /* same DC, hierarchy level 0 */,
NET_LOC_DCL1 = 1 /* different DC, hierarchy level 1 */,
NET_LOC_DISC = 2 /* disconnected */
} ncclNetLoc_v11_t;

// Output of getNetPath(): describes the path between two fabric IDs.
typedef struct {
// Location class of the path (see ncclNetLoc_v11_t).
ncclNetLoc_v11_t loc;
} ncclNetPath_v11_t;

// Version 11 of the network plugin interface: a table of function pointers a
// plugin fills in. Member order is part of the binary interface; do not reorder.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v11_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// NOTE(review): sizes is int* here and in test(), but size_t* in irecv() -- confirm the asymmetry is intended.
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);

// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);

// Topology API. getNetPath returns the path between two fabricIds
// (the fabricId values reported in ncclNetProperties_v11_t).
ncclResult_t (*getNetPath)(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_v11_t* path);
} ncclNet_v11_t;

#endif // end include guard
60 changes: 58 additions & 2 deletions ext-net/example/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalE
// Example stub: does not report any supported pointer types and always fails
// with ncclInternalError. Both arguments are ignored.
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
  (void)dev;
  (void)supportedTypes;
  return ncclInternalError;
}
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
// Below are default values, if unsure don't change.

props->name = "Example";
// Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
props->pciPath = NULL;
Expand Down Expand Up @@ -48,6 +47,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
// maximum transfer sizes the plugin can handle
props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
// all devs can connect to each other, fabric ID is 0 (any other value would work)
props->fabricId = 0;
return ncclSuccess;
}

Expand All @@ -67,10 +68,12 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
// Example stub for the irecvConsumed entry point: ignores its arguments and
// always fails with ncclInternalError.
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) {
  (void)recvComm;
  (void)n;
  (void)request;
  return ncclInternalError;
}
// Example stub for the getDeviceMr entry point: ignores its arguments, leaves
// *dptr_mhandle untouched and always fails with ncclInternalError.
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) {
  (void)comm;
  (void)mhandle;
  (void)dptr_mhandle;
  return ncclInternalError;
}
// Example stub for the makeVDevice entry point: creates nothing, leaves *d
// untouched and always fails with ncclInternalError.
__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
  (void)d;
  (void)props;
  return ncclInternalError;
}
// Example stub for the getNetPath entry point: computes no path, leaves *path
// untouched and always fails with ncclInternalError.
__hidden ncclResult_t pluginGetNetPath(uint64_t fabricId0, uint64_t fabricId1, ncclNetPath_t* path) {
  (void)fabricId0;
  (void)fabricId1;
  (void)path;
  return ncclInternalError;
}

#define PLUGIN_NAME "Plugin"

const ncclNet_v10_t ncclNetPlugin_v10 = {

const ncclNet_v11_t ncclNetPlugin_v11 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
Expand All @@ -91,6 +94,59 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice,
.getNetPath = pluginGetNetPath,
};

// v10 entry point for getProperties.
//
// Queries the current-version pluginGetProperties for device `dev` and copies
// the result field by field into the caller's v10 structure.
//
// Returns whatever pluginGetProperties returned when it fails (props_v10 is
// not written in that case), ncclSuccess otherwise.
__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props_v10) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;

  props_v10->name = props.name;
  props_v10->pciPath = props.pciPath;
  props_v10->guid = props.guid;
  props_v10->ptrSupport = props.ptrSupport;
  props_v10->regIsGlobal = props.regIsGlobal;
  props_v10->forceFlush = props.forceFlush;
  props_v10->speed = props.speed;
  props_v10->port = props.port;
  // latency used to be skipped here, which left the caller's field unset.
  props_v10->latency = props.latency;
  props_v10->maxComms = props.maxComms;
  props_v10->maxRecvs = props.maxRecvs;
  props_v10->netDeviceType = props.netDeviceType;
  props_v10->netDeviceVersion = props.netDeviceVersion;
  props_v10->vProps.ndevs = props.vProps.ndevs;
  // NOTE(review): the copy size is taken from the current-version array;
  // assumes the v10 devs[] array is at least that large -- confirm.
  memcpy(props_v10->vProps.devs, props.vProps.devs, sizeof(props.vProps.devs));
  props_v10->maxP2pBytes = props.maxP2pBytes;
  props_v10->maxCollBytes = props.maxCollBytes;
  // props.fabricId is not forwarded -- presumably it has no v10 counterpart.
  return ncclSuccess;
}

// v10 entry point for connect: reinterprets the v10 config pointer as the
// current config type and forwards every argument to pluginConnect, returning
// its result unchanged.
__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
  ncclNetCommConfig_t* currentConfig = (ncclNetCommConfig_t*)config;
  ncclResult_t status = pluginConnect(dev, currentConfig, handle, sendComm, sendDevComm);
  return status;
}

// v10 makeVDevice stub: creates nothing, leaves *d untouched and always fails
// with ncclInternalError.
__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) {
  (void)d;
  (void)props;
  return ncclInternalError;
}

// v10 function table exported by the example plugin. It reuses the
// current-version implementations except for getProperties, connect and
// makeVDevice, which go through the _v10 adapters.
const ncclNet_v10_t ncclNetPlugin_v10 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v10, // v10 adapter
.listen = pluginListen,
.connect = pluginConnect_v10, // v10 adapter
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice_v10, // v10 adapter
};

__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
Expand Down
47 changes: 27 additions & 20 deletions src/bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -479,8 +479,9 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
return ncclSuccess;
}
#define MAX_OOB_DEVS 16
static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, ncclNet_t** net, int* dev) {
static int devOOB = -1;
static ncclNet_t* netOOB = NULL;
if (devOOB < 0) {
pthread_mutex_lock(&bootstrapNetLock);
if (devOOB < 0) {
Expand All @@ -491,24 +492,28 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
if (searchNot) userIfEnv++;
bool searchExact = userIfEnv && userIfEnv[0] == '=';
if (searchExact) userIfEnv++;
int nUserIfs;
struct netIf userIfs[MAX_OOB_DEVS];
int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS);
// loop over the device and return the first one matching
NCCLCHECK(parseIfList(userIfEnv, userIfs, MAX_OOB_DEVS,&nUserIfs));
// loop over all nets and all devices and return the first one matching
int nDev = 0;
NCCLCHECK(comm->ncclNet->devices(&nDev));
int devId = 0;
while (devId < nDev) {
ncclNetProperties_t props;
comm->ncclNet->getProperties(devId, &props);
// check against user specified HCAs/ports
if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
// All plain physical devices have been initialized at this point
devOOB = devId;
break;
for (int n = 0; n < comm->ncclNetCount; ++n) {
NCCLCHECK(comm->ncclNet[n]->devices(&nDev));
int devId = 0;
while (devId < nDev) {
ncclNetProperties_t props;
comm->ncclNet[n]->getProperties(devId, &props);
// check against user specified HCAs/ports
if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
// All plain physical devices have been initialized at this point
devOOB = devId;
netOOB = comm->ncclNet[n];
break;
}
devId++;
}
devId++;
}
if (devOOB == -1) {
if (devOOB == -1 || !netOOB) {
if (!searchNot)
WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
else
Expand All @@ -519,16 +524,18 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
} else {
// default choice is device 0
devOOB = 0;
netOOB = comm->ncclNet[0];
}
// display info on the chosen device
ncclNetProperties_t props;
ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props);
ncclResult_t res = netOOB->getProperties(devOOB, &props);
bool hasProp = res == ncclSuccess;
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
}
pthread_mutex_unlock(&bootstrapNetLock);
}
*dev = devOOB;
*net = netOOB;
return ncclSuccess;
}

Expand Down Expand Up @@ -636,7 +643,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
state->nranks = nranks;
state->cudaDev = comm->cudaDev;
state->abortFlag = comm->abortFlag;
state->net = comm->ncclNet;
state->net = NULL;
comm->bootstrap = state;
comm->magic = state->magic = BOOTSTRAP_HANDLE(handles, 0)->magic; // state and comm magic set to the first magic ID

Expand All @@ -651,7 +658,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_CREATE]);
if (ncclParamBootstrapNetEnable()) {
// Create net interface for other ranks to contact me (all gather)
NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
NCCLCHECK(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev)));
NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
Expand Down Expand Up @@ -789,7 +796,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
state->nranks = nranks;
state->cudaDev = comm->cudaDev;
state->abortFlag = comm->abortFlag;
state->net = comm->ncclNet;
state->net = NULL;
comm->bootstrap = state;
comm->magic = state->magic = magic;

Expand All @@ -798,7 +805,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo

// create a handle for the others to reach out to me
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
NCCLCHECKGOTO(netGetDevice(rank, comm, &state->net, &STATE_LISTEN(state, net.dev)), ret, fail);
NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
Expand Down
Loading