5 changes: 4 additions & 1 deletion Makefile
@@ -9,7 +9,7 @@ default: src.build
install: src.install
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
TARGETS := src pkg nccl4py
clean: ${TARGETS:%=%.clean}
examples.build: src.build
LICENSE_FILES := LICENSE.txt
@@ -30,5 +30,8 @@ examples: src.build
pkg.%:
${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}

nccl4py.%:
${MAKE} -C nccl4py $* BUILDDIR=${ABSBUILDDIR}

pkg.debian.prep: lic
pkg.txz.prep: lic
21 changes: 21 additions & 0 deletions nccl4py/.gitignore
@@ -0,0 +1,21 @@
# Build artifacts
build/
dist/
*.egg-info/

# Cython compiled outputs
nccl/bindings/**/*.cpp
nccl/bindings/**/*.so

# Generated package files
nccl/_version.py

# Stamps
**/.stamp

__pycache__

# Tool caches
.pytest_cache/
.ruff_cache/
.mypy_cache/
16 changes: 16 additions & 0 deletions nccl4py/MANIFEST.in
@@ -0,0 +1,16 @@
include pyproject.toml
include setup.py
include README.md

# Include Cython sources and headers for sdist only
include nccl/bindings/_internal/*.pyx
include nccl/bindings/_internal/*.pxd
include nccl/bindings/*.pxd
include nccl/bindings/*.pyx

# Typing markers and stubs
include nccl/bindings/*.pyi

# Exclude build artifacts and cache
prune nccl/__pycache__
global-exclude __pycache__/* *.pyc *.pyo *.stamp *.cpp *.c
70 changes: 70 additions & 0 deletions nccl4py/Makefile
@@ -0,0 +1,70 @@
UV ?= uv

# Root Makefile passes BUILDDIR; default to ../build if not set
MKFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
BUILDDIR ?= $(abspath $(MKFILE_DIR)/../build)
NCCL_DIR := $(abspath $(MKFILE_DIR)/../src)
NCCL4PY_DIR := $(abspath $(MKFILE_DIR))
NCCL4PY_BINDINGS_DIR := $(NCCL4PY_DIR)/nccl/bindings

EXTERNALS_DIR := $(BUILDDIR)/externals
EXTERNALS_VENV_DIR := $(EXTERNALS_DIR)/venvs
DIST_DIR := $(BUILDDIR)/dist

# CUDA home is only used to provide headers for cython
CUDA_HOME ?= /usr/local/cuda
CUDA_VARIANTS := 12 13
PYTHON_VERSIONS := 3.10 3.11 3.12 3.13 3.14

# manylinux repair configuration
AUDITWHEEL_PLAT ?= manylinux_2_34_$(shell uname -m)

.DEFAULT_GOAL := build
.PHONY: all build dev clean

all: build

dev:
@cd $(NCCL4PY_DIR) && $(UV) sync --extra dev && $(UV) pip install -e .[dev]


NCCL4PY_FILES := \
nccl/__init__.py \
nccl/core/*.py \
nccl/core/interop/*.py \
nccl/bindings/*.pyx \
nccl/bindings/*.pxd \
nccl/bindings/*.py \
nccl/bindings/_internal/*.pyx \
nccl/bindings/_internal/*.pxd \
nccl/bindings/_internal/*.py

BUILD_DIR = $(BUILDDIR)/nccl4py_cu$*
OUT_DIR = $(DIST_DIR)/nccl4py_cu$*

$(CUDA_VARIANTS:%=$(DIST_DIR)/nccl4py_cu%/makefile.stamp): $(DIST_DIR)/nccl4py_cu%/makefile.stamp : $(wildcard $(NCCL4PY_DIR)/nccl/core/*.py)
mkdir -p "$(BUILD_DIR)"
cp "$(NCCL4PY_DIR)/nccl4py_cu$*/pyproject.toml" "$(NCCL4PY_DIR)/nccl4py_cu$*/setup.py" "$(NCCL4PY_DIR)/MANIFEST.in" "$(NCCL4PY_DIR)/README.md" "$(BUILD_DIR)/"
cd "$(NCCL4PY_DIR)"; cp --parents $(NCCL4PY_FILES) "$(BUILD_DIR)/"
CUDA_HOME="$(CUDA_HOME)" $(UV) build --sdist -p "[email protected]" --out-dir "$(OUT_DIR)" "$(BUILD_DIR)"
for v in $(PYTHON_VERSIONS); do \
CUDA_HOME="$(CUDA_HOME)" $(UV) build --wheel -p "cpython@$${v}" --out-dir "$(OUT_DIR)" "$(OUT_DIR)"/nccl4py_cu$*-*.tar.gz; \
done;
find "$(OUT_DIR)" -maxdepth 1 -type f -name '*.whl' ! -name '*manylinux*' -print0 | \
xargs -0 -r $(UV) run --isolated --no-project -p "[email protected]" --with auditwheel --with patchelf \
-m auditwheel repair --plat "$(AUDITWHEEL_PLAT)" -w "$(OUT_DIR)"; \
touch "$@"

$(CUDA_VARIANTS:%=nccl4py_cu%): nccl4py_cu%: $(DIST_DIR)/nccl4py_cu%/makefile.stamp

build: $(CUDA_VARIANTS:%=nccl4py_cu%)

clean:
@rm -rf $(DIST_DIR) \
$(BUILDDIR)/nccl4py_cu* \
$(NCCL4PY_DIR)/*.egg-info \
$(NCCL4PY_DIR)/build \
$(NCCL4PY_DIR)/nccl/_version.py
@find $(NCCL4PY_DIR) -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
@find $(NCCL4PY_DIR) -type f -name "*.pyc" -delete 2>/dev/null || true
@find $(NCCL4PY_BINDINGS_DIR) -type f \( -name "*.so" -o -name "*.cpp" -o -name "*.c" \) -delete 2>/dev/null || true
3 changes: 3 additions & 0 deletions nccl4py/README.md
@@ -0,0 +1,3 @@
# NCCL4Py Overview

NCCL4Py is a Python package that provides a Pythonic interface to NCCL.
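
As a quick orientation before the example files below, here is a minimal sketch of the API this PR exercises, condensed from examples/01_basic/01_allreduce.py. It assumes mpi4py and PyTorch are available and uses only names that appear in that example (nccl.get_unique_id, nccl.Communicator.init, all_reduce, nccl.SUM, destroy), so treat it as illustrative rather than a definitive API reference.

# Condensed from examples/01_basic/01_allreduce.py; run with: mpirun -np 4 python demo.py
from mpi4py import MPI
import torch
import nccl.core as nccl

comm_mpi = MPI.COMM_WORLD
rank, nranks = comm_mpi.Get_rank(), comm_mpi.Get_size()
torch.cuda.set_device(rank % torch.cuda.device_count())

# Rank 0 creates the NCCL unique ID; MPI broadcasts it to every rank.
unique_id = nccl.get_unique_id() if rank == 0 else None
unique_id = comm_mpi.bcast(unique_id, root=0)
comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

# In-place sum of each rank's value across all ranks.
data = torch.tensor([float(rank)], dtype=torch.float32, device="cuda")
comm.all_reduce(data, data, nccl.SUM)
torch.cuda.synchronize()

comm.destroy()  # collective call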
84 changes: 84 additions & 0 deletions nccl4py/examples/01_basic/01_allreduce.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

NCCL4Py Basic Example: AllReduce
==================================

The simplest possible example showing how to use NCCL4Py with AllReduce.
Each rank contributes its rank number, and all ranks receive the sum.

USAGE:
mpirun -np 4 python 01_allreduce.py
"""

import sys

try:
from mpi4py import MPI
except ImportError:
print("ERROR: mpi4py required. Install with: pip install mpi4py")
sys.exit(1)

try:
import torch
except ImportError:
print("ERROR: PyTorch required. Install with: pip install torch")
sys.exit(1)

import nccl.core as nccl


def main():
# Initialize MPI
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
nranks = comm_mpi.Get_size()

# Set rank 0 as the root
root = 0

# Assign GPU to each process
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
torch.cuda.set_device(device)

# [NCCL4Py] Generate unique ID on the root rank
unique_id = nccl.get_unique_id() if rank == root else None

# Broadcast unique ID to all ranks
unique_id = comm_mpi.bcast(unique_id, root=root)

# [NCCL4Py] Initialize NCCL communicator
nccl_comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

if rank == root:
print(f"Running AllReduce with {nranks} ranks...")

# Create PyTorch tensor with rank value
data = torch.tensor([float(rank)], dtype=torch.float32, device=device)

# [NCCL4Py] AllReduce: Sum all rank values
nccl_comm.all_reduce(data, data, nccl.SUM)

torch.cuda.synchronize()

# Verify result
expected = float(nranks * (nranks - 1) // 2)
actual = float(data[0].item())

print(f"Rank {rank}: AllReduce result = {actual:.0f} (expected {expected:.0f})")

# [NCCL4Py] Destroy NCCL communicator (collective call)
nccl_comm.destroy()

if rank == root:
if actual == expected:
print("SUCCESS!")
else:
print("FAILED!")

return 0 if actual == expected else 1


if __name__ == "__main__":
sys.exit(main())
104 changes: 104 additions & 0 deletions nccl4py/examples/01_basic/02_send_recv.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

NCCL4Py Basic Example: Send/Recv
==================================

Simple point-to-point communication example showing two ranks exchanging data.
Demonstrates the use of group() to avoid deadlocks.

USAGE:
mpirun -np 2 python 02_send_recv.py
"""

import sys

try:
from mpi4py import MPI
except ImportError:
print("ERROR: mpi4py required. Install with: pip install mpi4py")
sys.exit(1)

try:
import torch
except ImportError:
print("ERROR: PyTorch required. Install with: pip install torch")
sys.exit(1)

import nccl.core as nccl


def main():
# Initialize MPI
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
nranks = comm_mpi.Get_size()

# Set rank 0 as the root
root = 0

if nranks != 2:
if rank == root:
print("ERROR: This example requires exactly 2 ranks")
print("Usage: mpirun -np 2 python 02_send_recv.py")
sys.exit(1)

# Assign GPU to each process
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
torch.cuda.set_device(device)

# [NCCL4Py] Generate unique ID on the root rank
unique_id = nccl.get_unique_id() if rank == root else None

# Broadcast unique ID to all ranks
unique_id = comm_mpi.bcast(unique_id, root=root)

# [NCCL4Py] Initialize NCCL communicator
nccl_comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

if rank == root:
print("Running Send/Recv between 2 ranks...")

# Create tensors
send_data = torch.tensor([float(100 + rank)], dtype=torch.float32, device=device)
recv_data = torch.zeros(1, dtype=torch.float32, device=device)

other_rank = 1 - rank # Rank 0 <-> Rank 1

# Exchange data using group() to avoid deadlock
# IMPORTANT: Using group() allows send() and recv() to be called in any order.
# Without group(), you must ensure send() and recv() are ordered carefully to avoid deadlock.
# For example: even ranks send then recv while odd ranks recv then send,
# so each send is matched by a recv that the peer issues at the same time.

# [NCCL4Py] Use group() to avoid deadlock
with nccl.group():
# [NCCL4Py] Send data to the other rank
nccl_comm.send(send_data, peer=other_rank)
# [NCCL4Py] Receive data from the other rank
nccl_comm.recv(recv_data, peer=other_rank)

torch.cuda.synchronize()

# Verify result
expected = float(100 + (1 - rank))
actual = float(recv_data[0].item())

print(f"Rank {rank}: Sent {100 + rank:.0f}, Received {actual:.0f} (expected {expected:.0f})")

# [NCCL4Py] Destroy NCCL communicator (collective call)
nccl_comm.destroy()

if rank == root:
if actual == expected:
print("SUCCESS!")
else:
print("FAILED!")

return 0 if actual == expected else 1


if __name__ == "__main__":
sys.exit(main())
25 changes: 25 additions & 0 deletions nccl4py/nccl/__init__.py
@@ -0,0 +1,25 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See LICENSE.txt for license information

"""
NCCL4Py: Python bindings for NVIDIA Collective Communications Library (NCCL).

NCCL4Py provides Pythonic access to NCCL for efficient multi-GPU and multi-node
communication. It supports all NCCL collective operations, point-to-point
communication, and advanced features like buffer registration and custom reduction
operators.
"""

from nccl._version import __version__, __version_tuple__

__all__ = [
"__version__",
"__version_tuple__",
]
1 change: 1 addition & 0 deletions nccl4py/nccl/bindings/__init__.py
@@ -0,0 +1 @@
from .nccl import *
39 changes: 39 additions & 0 deletions nccl4py/nccl/bindings/_internal/__init__.py
@@ -0,0 +1,39 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See LICENSE.txt for license information

"""
Internal bindings implementation.

This module preloads the NCCL library using cuda-pathfinder if available,
which provides better library discovery across different environments
(conda, system installs, custom CUDA paths, etc.).

If cuda-pathfinder is not available or fails to find NCCL, the Cython
bindings will fall back to direct dlopen("libnccl.so.2") which works
if the library is in standard system paths.

See documentation for cuda-pathfinder including the search order at:
https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/generated/cuda.pathfinder.load_nvidia_dynamic_lib.html#cuda.pathfinder.load_nvidia_dynamic_lib
"""

# Optional: Preload NCCL library for better discovery
# This runs before the Cython extensions are loaded, allowing
# dlsym(RTLD_DEFAULT, ...) to find the already-loaded library
try:
from cuda.pathfinder import load_nvidia_dynamic_lib

load_nvidia_dynamic_lib("nccl")
except ImportError:
# cuda-python not installed, fall back to Cython's dlopen
pass
except Exception:
# Library not found by pathfinder or other error
# Fall back to Cython's dlopen - it will provide the error message if needed
pass