Skip to content

Commit 178b6b7

Browse files
committed
2.22.3-1
Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spent in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. 
Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
1 parent 529ee69 commit 178b6b7

File tree

115 files changed

+8595
-4326
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

115 files changed

+8595
-4326
lines changed

ext-net/example/nccl/common.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef COMMON_H_
8+
#define COMMON_H_
9+
10+
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11+
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12+
13+
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14+
15+
#endif

ext-net/example/nccl/err.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
1111
ncclSystemError = 2,
1212
ncclInternalError = 3,
1313
ncclInvalidArgument = 4,
14+
ncclInvalidUsage = 5,
1415
ncclRemoteError = 6 } ncclResult_t;
1516

1617
#endif

ext-net/example/nccl/net.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <stdint.h>
99
#include <stdlib.h>
1010

11+
#include "common.h"
1112
#include "err.h"
1213

1314
#define NCCL_NET_HANDLE_MAXSIZE 128
@@ -19,11 +20,6 @@
1920
// Maximum number of requests per comm object
2021
#define NCCL_NET_MAX_REQUESTS 32
2122

22-
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
23-
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
24-
25-
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
26-
2723
#include "net_v8.h"
2824
#include "net_v7.h"
2925
#include "net_v6.h"

ext-net/example/nccl/types.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
33
*/
44

5-
#ifndef NCCL_ERR_H_
6-
#define NCCL_ERR_H_
5+
#ifndef NCCL_TYPES_H_
6+
#define NCCL_TYPES_H_
77

88
/* Data types */
99
typedef enum { ncclInt8 = 0, ncclChar = 0,

ext-tuner/example/nccl/common.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef COMMON_H_
8+
#define COMMON_H_
9+
10+
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11+
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12+
13+
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14+
15+
#endif

ext-tuner/example/nccl/err.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3+
*/
4+
5+
#ifndef NCCL_ERR_H_
6+
#define NCCL_ERR_H_
7+
8+
/* Error type for plugins */
9+
typedef enum { ncclSuccess = 0,
10+
ncclUnhandledCudaError = 1,
11+
ncclSystemError = 2,
12+
ncclInternalError = 3,
13+
ncclInvalidArgument = 4,
14+
ncclInvalidUsage = 5,
15+
ncclRemoteError = 6 } ncclResult_t;
16+
17+
#endif

ext-tuner/example/nccl/tuner.h

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,24 @@
88
#ifndef NCCL_TUNER_H_
99
#define NCCL_TUNER_H_
1010

11-
#include "nccl.h"
11+
#include <stdint.h>
12+
#include <stdlib.h>
1213

13-
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
14-
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
15-
16-
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14+
#include "common.h"
15+
#include "err.h"
1716

1817
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
19-
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
18+
typedef enum {
19+
ncclFuncBroadcast = 0,
20+
ncclFuncReduce = 1,
21+
ncclFuncAllGather = 2,
22+
ncclFuncReduceScatter = 3,
23+
ncclFuncAllReduce = 4,
24+
ncclFuncSendRecv = 5,
25+
ncclFuncSend = 6,
26+
ncclFuncRecv = 7,
27+
ncclNumFuncs = 8
28+
} ncclFunc_t;
2029

2130
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
2231
#define NCCL_ALGO_UNDEF -1
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
3342
#define NCCL_PROTO_LL128 1
3443
#define NCCL_PROTO_SIMPLE 2
3544

45+
#define NCCL_ALGO_PROTO_IGNORE -1.0
46+
3647
// API to be implemented by external tuner
3748
typedef struct {
3849
// Name of the tuner
@@ -52,31 +63,33 @@ typedef struct {
5263
// - context: tuner context object
5364
// - collType: collective type , e.g., allreduce, allgather…
5465
// - nBytes: collective size in bytes
55-
// - collNetSupport: whether collnet supports this type
56-
// - nvlsSupport: whether nvlink sharp supports this time
5766
// - numPipeOps: number of operations in the group
67+
// - numAlgo: number of algorithms in collCostTable
68+
// - numProto: number of protocols in collCostTable
5869
//
5970
// Outputs:
60-
// - algorithm: selected algorithm to be used for the given collective
61-
// - protocol: selected protocol to be used for the given collective
6271
// - nChannels: number of channels (hence SMs) to be used.
6372
//
73+
// InOut:
74+
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
75+
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
76+
//
6477
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
6578
// default tuning for the given collective.
6679
// Also, the plugin is allowed to not set any output, or set only the
6780
// algorithm and protocol, but not only the algorithm or only the protocol.
6881
// Unset fields will be set automatically by NCCL.
6982
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
70-
int collNetSupport, int nvlsSupport, int numPipeOps,
71-
int *algorithm, int *protocol, int* nChannels);
83+
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
84+
int* nChannels);
7285

7386
// Terminates the plugin and cleans up any resources that the plugin allocated.
7487
// context: tuner context object
7588
ncclResult_t (*destroy)(void* context);
76-
} ncclTuner_v2_t;
89+
} ncclTuner_v3_t;
7790

78-
typedef ncclTuner_v2_t ncclTuner_t;
91+
typedef ncclTuner_v3_t ncclTuner_t;
7992

80-
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
93+
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
8194

8295
#endif

ext-tuner/example/plugin.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,21 @@
1111
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
1212

1313
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
14-
int collNetSupport, int nvlsSupport, int numPipeOps,
15-
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
14+
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
15+
int* nChannels) {
16+
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
17+
if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
18+
collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
19+
}
20+
*nChannels = 1;
21+
return ncclSuccess;
22+
}
1623

1724
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
1825

1926
#define PLUGIN_NAME "Example"
2027

21-
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
28+
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
2229
.name = PLUGIN_NAME,
2330
.init = pluginInit,
2431
.getCollInfo = pluginGetCollInfo,

makefiles/version.mk

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
##### version
22
NCCL_MAJOR := 2
3-
NCCL_MINOR := 21
4-
NCCL_PATCH := 5
3+
NCCL_MINOR := 22
4+
NCCL_PATCH := 3
55
NCCL_SUFFIX :=
66
PKG_REVISION := 1

src/bootstrap.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom
201201

202202
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
203203
memset(handle, 0, sizeof(ncclBootstrapHandle));
204-
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
205204

206205
const char* env = ncclGetEnv("NCCL_COMM_ID");
207206
if (env) {
@@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
210209
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
211210
return ncclInvalidArgument;
212211
}
212+
handle->magic = NCCL_MAGIC;
213213
} else {
214+
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
214215
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
215216
NCCLCHECK(bootstrapCreateRoot(handle, false));
216217
}
@@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
626627
struct bootstrapState* state = (struct bootstrapState*)commState;
627628
if (state->unexpectedConnections != NULL) {
628629
unexpectedFree(state);
629-
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
630+
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
630631
WARN("Unexpected connections are not empty");
631632
return ncclInternalError;
632633
}

0 commit comments

Comments
 (0)