Skip to content

Commit 3ea7eed

Browse files
committed
NCCL 2.27.5-1
Improvements for GB200 systems
* Optimize the network performance by alternating the direction of the rings and the NIC to GPU assignment across communicators to limit unnecessary sharing.
* Fix the detection of C2C links in case GPU Direct RDMA is disabled between a GPU and a NIC.
* Fix PXN support on MNNVL systems, where NCCL would try (and fail) to share regular host memory across multiple nodes.
* Fix P2C (PXN over C2C), which is now preferred over regular PXN. This support is currently preliminary and is disabled by default; use NCCL_PXN_C2C=1 to enable.

Further reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs.

Optimize the network performance on DGX B200 systems by adjusting the bandwidths provided to the graph search algorithm.

Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8.

Restore the plugin name handling logic to make it possible to specify a path to the plugin (Issue #1732).

Restore the ability to change NCCL_COLLNET_ENABLE during execution (Issue #1741).

Add an example tuner plugin with CSV-based overrides.

Remove an x86 dependency from the example profiler.
1 parent 72d2432 commit 3ea7eed

File tree

33 files changed

+2740
-143
lines changed

33 files changed

+2740
-143
lines changed

ext-net/example/Makefile

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,20 @@
33
#
44
# See LICENSE.txt for license information
55
#
6-
NCCL_HOME:=../../build/
7-
CUDA_HOME:=/usr/local/cuda
8-
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
9-
PLUGIN_SO:=libnccl-net.so
6+
# Make `build` the default goal. `.DEFAULT_GOAL` is a GNU Make variable, so it
# must be assigned; written with a colon it only declares an unused rule.
.DEFAULT_GOAL := build
7+
include ../../makefiles/common.mk
8+
SRCDIR ?= $(abspath ../..)
9+
BUILDDIR ?= .
10+
NCCLDIR := $(BUILDDIR)
1011

11-
default: $(PLUGIN_SO)
12+
SRC_FILES := $(wildcard *.c)
1213

13-
$(PLUGIN_SO): plugin.c
14-
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14+
build: ${BUILDDIR}/libnccl-net-example.so
15+
16+
${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}
17+
@printf "Compiling %-35s > %s\n" $< $@
18+
@mkdir -p ${BUILDDIR}
19+
$(CC) -Inccl -fPIC -shared -o $@ $^
1520

1621
clean:
17-
rm -f $(PLUGIN_SO)
22+
rm -f ${BUILDDIR}/libnccl-net-example.so

ext-profiler/example/Makefile

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@
33
#
44
# See LICENSE.txt for license information
55
#
6-
NCCL_HOME := ../../build
7-
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
8-
PLUGIN_SO := libnccl-profiler.so
6+
# Make `build` the default goal. `.DEFAULT_GOAL` is a GNU Make variable, so it
# must be assigned; written with a colon it only declares an unused rule.
.DEFAULT_GOAL := build
7+
include ../../makefiles/common.mk
8+
SRCDIR ?= $(abspath ../..)
9+
BUILDDIR ?= .
10+
NCCLDIR := $(BUILDDIR)
911

10-
default: $(PLUGIN_SO)
12+
SRC_FILES := $(wildcard *.c)
1113

12-
$(PLUGIN_SO): plugin.c event.c print_event.c
13-
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14+
build: ${BUILDDIR}/libnccl-profiler-example.so
15+
16+
${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES}
17+
@printf "Compiling %-35s > %s\n" $< $@
18+
@mkdir -p ${BUILDDIR}
19+
$(CC) -Inccl -fPIC -shared -o $@ $^
1420

1521
clean:
16-
rm -f $(PLUGIN_SO)
22+
rm -f ${BUILDDIR}/libnccl-profiler-example.so

ext-profiler/example/plugin.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <sys/types.h>
1313
#include <sys/syscall.h>
1414
#include <unistd.h>
15-
#include <x86intrin.h>
15+
#include <time.h>
1616
#include "event.h"
1717
#include "print_event.h"
1818

@@ -41,22 +41,10 @@ static struct proxyOp* detachPool;
4141
ncclDebugLogger_t logFn;
4242
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
4343

44-
static double freq = -1;
45-
__hidden void calibrate() {
46-
struct timeval tv;
47-
gettimeofday(&tv, NULL);
48-
uint64_t timeCycles = __rdtsc();
49-
double time = - tv.tv_sec*1e6 - tv.tv_usec;
50-
uint64_t total = 0ULL;
51-
for (int i = 0; i < 10000; i++) total += __rdtsc();
52-
gettimeofday(&tv, NULL);
53-
timeCycles = __rdtsc() - timeCycles;
54-
time += tv.tv_sec*1e6 + tv.tv_usec;
55-
freq = timeCycles / time;
56-
}
57-
5844
__hidden double gettime(void) {
59-
return __rdtsc() / freq;
45+
struct timespec t;
46+
clock_gettime(CLOCK_MONOTONIC, &t);
47+
return (t.tv_sec*1e6 + (t.tv_nsec*1e-3));
6048
}
6149

6250
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
9886
// process address space.
9987
pid = getpid();
10088

101-
// calibrate and start timer
102-
calibrate();
10389
startTime = gettime();
10490
}
10591
pthread_mutex_unlock(&lock);

ext-tuner/basic/Makefile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# See LICENSE.txt for license information
5+
#
6+
# Make `build` the default goal. `.DEFAULT_GOAL` is a GNU Make variable, so it
# must be assigned; written with a colon it only declares an unused rule.
.DEFAULT_GOAL := build
7+
include ../../makefiles/common.mk
8+
SRCDIR ?= $(abspath ../..)
9+
BUILDDIR ?= .
10+
NCCLDIR := $(BUILDDIR)
11+
12+
SRC_FILES := $(wildcard *.c)
13+
DST_DIR := $(BUILDDIR)/test/unit/plugins
14+
15+
build: ${BUILDDIR}/libnccl-tuner-basic.so
16+
17+
${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES}
18+
@printf "Compiling %-35s > %s\n" $< $@
19+
@mkdir -p ${BUILDDIR}
20+
$(CC) -Inccl -fPIC -shared -o $@ $^
21+
22+
clean:
23+
rm -f ${BUILDDIR}/libnccl-tuner-basic.so

ext-tuner/basic/nccl/common.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef COMMON_H_
8+
#define COMMON_H_
9+
10+
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
11+
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
12+
13+
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
14+
15+
#endif

ext-tuner/basic/nccl/err.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3+
*/
4+
5+
#ifndef NCCL_ERR_H_
6+
#define NCCL_ERR_H_
7+
8+
/* Error type for plugins */
9+
typedef enum { ncclSuccess = 0,
10+
ncclUnhandledCudaError = 1,
11+
ncclSystemError = 2,
12+
ncclInternalError = 3,
13+
ncclInvalidArgument = 4,
14+
ncclInvalidUsage = 5,
15+
ncclRemoteError = 6 } ncclResult_t;
16+
17+
#endif

ext-tuner/basic/nccl/tuner.h

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*************************************************************************
2+
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
4+
*
5+
* See LICENSE.txt for license information
6+
************************************************************************/
7+
8+
#ifndef NCCL_TUNER_H_
9+
#define NCCL_TUNER_H_
10+
11+
#include <stdint.h>
12+
#include <stdlib.h>
13+
14+
#include "common.h"
15+
#include "err.h"
16+
17+
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
18+
typedef enum {
19+
ncclFuncBroadcast = 0,
20+
ncclFuncReduce = 1,
21+
ncclFuncAllGather = 2,
22+
ncclFuncReduceScatter = 3,
23+
ncclFuncAllReduce = 4,
24+
ncclFuncSendRecv = 5,
25+
ncclFuncSend = 6,
26+
ncclFuncRecv = 7,
27+
ncclNumFuncs = 8
28+
} ncclFunc_t;
29+
30+
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
31+
#define NCCL_ALGO_UNDEF -1
32+
#define NCCL_ALGO_TREE 0
33+
#define NCCL_ALGO_RING 1
34+
#define NCCL_ALGO_COLLNET_DIRECT 2
35+
#define NCCL_ALGO_COLLNET_CHAIN 3
36+
#define NCCL_ALGO_NVLS 4
37+
#define NCCL_ALGO_NVLS_TREE 5
38+
#define NCCL_ALGO_PAT 6
39+
40+
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
41+
#define NCCL_PROTO_UNDEF -1
42+
#define NCCL_PROTO_LL 0
43+
#define NCCL_PROTO_LL128 1
44+
#define NCCL_PROTO_SIMPLE 2
45+
46+
#define NCCL_ALGO_PROTO_IGNORE -1.0
47+
48+
// API to be implemented by external tuner
49+
typedef struct {
50+
// Name of the tuner
51+
const char* name;
52+
53+
// Initializes tuner states.
54+
// Inputs:
55+
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
56+
// - nNodes: number of nodes in current communicator.
57+
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
58+
// Outputs:
59+
// - context: tuner context object
60+
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
61+
62+
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
63+
// Inputs:
64+
// - context: tuner context object
65+
// - collType: collective type, e.g., allreduce, allgather…
66+
// - nBytes: collective size in bytes
67+
// - numPipeOps: number of operations in the group
68+
// - numAlgo: number of algorithms in collCostTable
69+
// - numProto: number of protocols in collCostTable
70+
// - regBuff: can register user buffer
71+
//
72+
// Outputs:
73+
// - nChannels: number of channels (hence SMs) to be used.
74+
//
75+
// InOut:
76+
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
77+
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
78+
//
79+
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
80+
// default tuning for the given collective.
81+
// Also, the plugin is allowed to not set any output, or set only the
82+
// algorithm and protocol, but not only the algorithm or only the protocol.
83+
// Unset fields will be set automatically by NCCL.
84+
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
85+
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
86+
int regBuff, int* nChannels);
87+
88+
// Terminates the plugin and cleans up any resources that the plugin allocated.
89+
// context: tuner context object
90+
ncclResult_t (*destroy)(void* context);
91+
} ncclTuner_v4_t;
92+
93+
typedef ncclTuner_v4_t ncclTuner_t;
94+
95+
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
96+
97+
#endif

ext-tuner/basic/plugin.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*************************************************************************
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#include "tuner.h"
8+
9+
#define __hidden __attribute__ ((visibility("hidden")))
10+
11+
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
12+
13+
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
14+
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
15+
int regBuff, int* nChannels) {
16+
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
17+
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
18+
if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
19+
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
20+
}
21+
*nChannels = 1;
22+
return ncclSuccess;
23+
}
24+
25+
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
26+
27+
#define PLUGIN_NAME "Basic"
28+
29+
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
30+
.name = PLUGIN_NAME,
31+
.init = pluginInit,
32+
.getCollInfo = pluginGetCollInfo,
33+
.destroy = pluginDestroy
34+
};

ext-tuner/example/Makefile

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,53 @@
33
#
44
# See LICENSE.txt for license information
55
#
6-
NCCL_HOME:=../../build/
7-
CUDA_HOME:=/usr/local/cuda
8-
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
9-
PLUGIN_SO:=libnccl-tuner.so
106

11-
default: $(PLUGIN_SO)
7+
# Make `build` the default goal. `.DEFAULT_GOAL` is a GNU Make variable, so it
# must be assigned; written with a colon it only declares an unused rule.
.DEFAULT_GOAL := build
8+
PLUGIN_SO:=libnccl-tuner-example.so
9+
include ../../makefiles/common.mk
10+
SRCDIR ?= $(abspath ../..)
11+
BUILDDIR ?= .
12+
NCCLDIR := $(BUILDDIR)
1213

13-
$(PLUGIN_SO): plugin.c
14-
$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
14+
SRC_FILES := $(wildcard *.c)
15+
DST_DIR := $(BUILDDIR)/test/unit/plugins
16+
17+
default: ${BUILDDIR}/$(PLUGIN_SO)
18+
19+
build: ${BUILDDIR}/$(PLUGIN_SO)
20+
21+
${BUILDDIR}/$(PLUGIN_SO): plugin.c
22+
@printf "Compiling %-35s > %s\n" $< $@
23+
@mkdir -p ${BUILDDIR}
24+
$(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
25+
26+
# Test targets - delegate to test directory
27+
test:
28+
$(MAKE) -C test test TEST_CASE=$(TEST_CASE)
29+
30+
test-verbose:
31+
$(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE)
32+
33+
# Build tests
34+
test-build:
35+
$(MAKE) -C test all
36+
37+
# Optimize configurations from performance data
38+
optimize-config:
39+
@if [ -z "$(CSV_FILE)" ]; then \
40+
echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \
41+
echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \
42+
exit 1; \
43+
fi
44+
python3 scripts/optimize_config.py $(CSV_FILE) \
45+
$(if $(OUTPUT),-o $(OUTPUT)) \
46+
$(if $(METRIC),-m $(METRIC)) \
47+
$(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \
48+
$(if $(DRY_RUN),--dry-run) \
49+
$(if $(NO_HEADER),--no-header)
1550

1651
clean:
17-
rm -f $(PLUGIN_SO)
52+
rm -f ${BUILDDIR}/$(PLUGIN_SO)
53+
$(MAKE) -C test clean
54+
55+
# Every target that names a command rather than a file is phony, so a stray
# file or directory with the same name (e.g. `build`) cannot mask it.
.PHONY: default build test test-verbose test-build optimize-config clean

0 commit comments

Comments
 (0)