5 changes: 4 additions & 1 deletion Makefile
@@ -9,7 +9,7 @@ default: src.build
install: src.install
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
TARGETS := src pkg nccl4py
clean: ${TARGETS:%=%.clean}
examples.build: src.build
LICENSE_FILES := LICENSE.txt
@@ -30,5 +30,8 @@ examples: src.build
pkg.%:
${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}

nccl4py.%:
${MAKE} -C nccl4py $* BUILDDIR=${ABSBUILDDIR}

pkg.debian.prep: lic
pkg.txz.prep: lic
21 changes: 21 additions & 0 deletions nccl4py/.gitignore
@@ -0,0 +1,21 @@
# Build artifacts
build/
dist/
*.egg-info/

# Cython compiled outputs
nccl/bindings/**/*.cpp
nccl/bindings/**/*.so

# Generated package files
nccl/_version.py

# Stamps
**/.stamp

__pycache__

# Tool caches
.pytest_cache/
.ruff_cache/
.mypy_cache/
16 changes: 16 additions & 0 deletions nccl4py/MANIFEST.in
@@ -0,0 +1,16 @@
include pyproject.toml
include setup.py
include README.md

# Include Cython sources and headers for sdist only
include nccl/bindings/_internal/*.pyx
include nccl/bindings/_internal/*.pxd
include nccl/bindings/*.pxd
include nccl/bindings/*.pyx

# Typing markers and stubs
include nccl/bindings/*.pyi

# Exclude build artifacts and cache
prune nccl/__pycache__
global-exclude __pycache__/* *.pyc *.pyo *.stamp *.cpp *.c
70 changes: 70 additions & 0 deletions nccl4py/Makefile
@@ -0,0 +1,70 @@
UV ?= uv

# Root Makefile passes BUILDDIR; default to ../build if not set
MKFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
BUILDDIR ?= $(abspath $(MKFILE_DIR)/../build)
NCCL_DIR := $(abspath $(MKFILE_DIR)/../src)
NCCL4PY_DIR := $(abspath $(MKFILE_DIR))
NCCL4PY_BINDINGS_DIR := $(NCCL4PY_DIR)/nccl/bindings

EXTERNALS_DIR := $(BUILDDIR)/externals
EXTERNALS_VENV_DIR := $(EXTERNALS_DIR)/venvs
DIST_DIR := $(BUILDDIR)/dist

# CUDA home is only used to provide headers for cython
CUDA_HOME ?= /usr/local/cuda
CUDA_VARIANTS := 12 13
PYTHON_VERSIONS := 3.10 3.11 3.12 3.13 3.14

# manylinux repair configuration
AUDITWHEEL_PLAT ?= manylinux_2_34_$(shell uname -m)

.DEFAULT_GOAL := build
.PHONY: all build dev clean

all: build

dev:
@cd $(NCCL4PY_DIR) && $(UV) sync --extra dev && $(UV) pip install -e .[dev]


NCCL4PY_FILES := \
nccl/__init__.py \
nccl/core/*.py \
nccl/core/interop/*.py \
nccl/bindings/*.pyx \
nccl/bindings/*.pxd \
nccl/bindings/*.py \
nccl/bindings/_internal/*.pyx \
nccl/bindings/_internal/*.pxd \
nccl/bindings/_internal/*.py

BUILD_DIR = $(BUILDDIR)/nccl4py_cu$*
OUT_DIR = $(DIST_DIR)/nccl4py_cu$*

$(CUDA_VARIANTS:%=$(DIST_DIR)/nccl4py_cu%/makefile.stamp): $(DIST_DIR)/nccl4py_cu%/makefile.stamp : $(wildcard $(NCCL4PY_DIR)/nccl/core/*.py)
mkdir -p "$(BUILD_DIR)"
cp "$(NCCL4PY_DIR)/nccl4py_cu$*/pyproject.toml" "$(NCCL4PY_DIR)/nccl4py_cu$*/setup.py" "$(NCCL4PY_DIR)/MANIFEST.in" "$(NCCL4PY_DIR)/README.md" "$(BUILD_DIR)/"
cd "$(NCCL4PY_DIR)"; cp --parents $(NCCL4PY_FILES) "$(BUILD_DIR)/"
CUDA_HOME="$(CUDA_HOME)" $(UV) build --sdist -p "[email protected]" --out-dir "$(OUT_DIR)" "$(BUILD_DIR)"
for v in $(PYTHON_VERSIONS); do \
CUDA_HOME="$(CUDA_HOME)" $(UV) build --wheel -p "cpython@$${v}" --out-dir "$(OUT_DIR)" "$(OUT_DIR)"/nccl4py_cu$*-*.tar.gz; \
done;
find "$(OUT_DIR)" -maxdepth 1 -type f -name '*.whl' ! -name '*manylinux*' -print0 | \
xargs -0 -r $(UV) run --isolated --no-project -p "[email protected]" --with auditwheel --with patchelf \
-m auditwheel repair --plat "$(AUDITWHEEL_PLAT)" -w "$(OUT_DIR)"; \
touch "$@"

$(CUDA_VARIANTS:%=nccl4py_cu%): nccl4py_cu%: $(DIST_DIR)/nccl4py_cu%/makefile.stamp

build: $(CUDA_VARIANTS:%=nccl4py_cu%)

clean:
@rm -rf $(DIST_DIR) \
$(BUILDDIR)/nccl4py_cu* \
$(NCCL4PY_DIR)/*.egg-info \
$(NCCL4PY_DIR)/build \
$(NCCL4PY_DIR)/nccl/_version.py
@find $(NCCL4PY_DIR) -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
@find $(NCCL4PY_DIR) -type f -name "*.pyc" -delete 2>/dev/null || true
@find $(NCCL4PY_BINDINGS_DIR) -type f \( -name "*.so" -o -name "*.cpp" -o -name "*.c" \) -delete 2>/dev/null || true
3 changes: 3 additions & 0 deletions nccl4py/README.md
@@ -0,0 +1,3 @@
# NCCL4Py Overview

NCCL4Py is a Python package that provides a Pythonic interface to NCCL.
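
As a quick orientation before the example files below, here is a minimal sketch of the API this PR exercises, condensed from examples/01_basic/01_allreduce.py. It assumes mpi4py and PyTorch are available and uses only names that appear in that example (nccl.get_unique_id, nccl.Communicator.init, all_reduce, nccl.SUM, destroy), so treat it as illustrative rather than a definitive API reference.

# Condensed from examples/01_basic/01_allreduce.py; run with: mpirun -np 4 python demo.py
from mpi4py import MPI
import torch
import nccl.core as nccl

comm_mpi = MPI.COMM_WORLD
rank, nranks = comm_mpi.Get_rank(), comm_mpi.Get_size()
torch.cuda.set_device(rank % torch.cuda.device_count())

# Rank 0 creates the NCCL unique ID; MPI broadcasts it to every rank.
unique_id = nccl.get_unique_id() if rank == 0 else None
unique_id = comm_mpi.bcast(unique_id, root=0)
comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

# In-place sum of each rank's value across all ranks.
data = torch.tensor([float(rank)], dtype=torch.float32, device="cuda")
comm.all_reduce(data, data, nccl.SUM)
torch.cuda.synchronize()

comm.destroy()  # collective call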
84 changes: 84 additions & 0 deletions nccl4py/examples/01_basic/01_allreduce.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

NCCL4Py Basic Example: AllReduce
==================================

The simplest possible example showing how to use NCCL4Py with AllReduce.
Each rank contributes its rank number, and all ranks receive the sum.

USAGE:
mpirun -np 4 python 01_allreduce.py
"""

import sys

try:
from mpi4py import MPI
except ImportError:
print("ERROR: mpi4py required. Install with: pip install mpi4py")
sys.exit(1)

try:
import torch
except ImportError:
print("ERROR: PyTorch required. Install with: pip install torch")
sys.exit(1)

import nccl.core as nccl


def main():
# Initialize MPI
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
nranks = comm_mpi.Get_size()

# Set rank 0 as the root
root = 0

# Assign GPU to each process
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
torch.cuda.set_device(device)

# [NCCL4Py] Generate unique ID on the root rank
unique_id = nccl.get_unique_id() if rank == root else None

# Broadcast unique ID to all ranks
unique_id = comm_mpi.bcast(unique_id, root=root)

# [NCCL4Py] Initialize NCCL communicator
nccl_comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

if rank == root:
print(f"Running AllReduce with {nranks} ranks...")

# Create PyTorch tensor with rank value
data = torch.tensor([float(rank)], dtype=torch.float32, device=device)

# [NCCL4Py] AllReduce: Sum all rank values
nccl_comm.all_reduce(data, data, nccl.SUM)

torch.cuda.synchronize()

# Verify result
expected = float(nranks * (nranks - 1) // 2)
actual = float(data[0].item())

print(f"Rank {rank}: AllReduce result = {actual:.0f} (expected {expected:.0f})")

# [NCCL4Py] Destroy NCCL communicator (collective call)
nccl_comm.destroy()

if rank == root:
if actual == expected:
print("SUCCESS!")
else:
print("FAILED!")

return 0 if actual == expected else 1


if __name__ == "__main__":
sys.exit(main())
104 changes: 104 additions & 0 deletions nccl4py/examples/01_basic/02_send_recv.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

NCCL4Py Basic Example: Send/Recv
==================================

Simple point-to-point communication example showing two ranks exchanging data.
Demonstrates the use of group() to avoid deadlocks.

USAGE:
mpirun -np 2 python 02_send_recv.py
"""

import sys

try:
from mpi4py import MPI
except ImportError:
print("ERROR: mpi4py required. Install with: pip install mpi4py")
sys.exit(1)

try:
import torch
except ImportError:
print("ERROR: PyTorch required. Install with: pip install torch")
sys.exit(1)

import nccl.core as nccl


def main():
# Initialize MPI
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
nranks = comm_mpi.Get_size()

# Set rank 0 as the root
root = 0

if nranks != 2:
if rank == root:
print("ERROR: This example requires exactly 2 ranks")
print("Usage: mpirun -np 2 python 02_send_recv.py")
sys.exit(1)

# Assign GPU to each process
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
torch.cuda.set_device(device)

# [NCCL4Py] Generate unique ID on the root rank
unique_id = nccl.get_unique_id() if rank == root else None

# Broadcast unique ID to all ranks
unique_id = comm_mpi.bcast(unique_id, root=root)

# [NCCL4Py] Initialize NCCL communicator
nccl_comm = nccl.Communicator.init(nranks=nranks, rank=rank, unique_id=unique_id)

if rank == root:
print("Running Send/Recv between 2 ranks...")

# Create tensors
send_data = torch.tensor([float(100 + rank)], dtype=torch.float32, device=device)
recv_data = torch.zeros(1, dtype=torch.float32, device=device)

other_rank = 1 - rank # Rank 0 <-> Rank 1

# Exchange data using group() to avoid deadlock
# IMPORTANT: Using group() allows send() and recv() to be called in any order.
# Without group(), you must ensure send() and recv() are ordered carefully to avoid deadlock.
# For example: even ranks send then recv while odd ranks recv then send,
# so each send is matched by a recv that the peer issues at the same time.

# [NCCL4Py] Use group() to avoid deadlock
with nccl.group():
# [NCCL4Py] Send data to the other rank
nccl_comm.send(send_data, peer=other_rank)
# [NCCL4Py] Receive data from the other rank
nccl_comm.recv(recv_data, peer=other_rank)

torch.cuda.synchronize()

# Verify result
expected = float(100 + (1 - rank))
actual = float(recv_data[0].item())

print(f"Rank {rank}: Sent {100 + rank:.0f}, Received {actual:.0f} (expected {expected:.0f})")

# [NCCL4Py] Destroy NCCL communicator (collective call)
nccl_comm.destroy()

if rank == root:
if actual == expected:
print("SUCCESS!")
else:
print("FAILED!")

return 0 if actual == expected else 1


if __name__ == "__main__":
sys.exit(main())
25 changes: 25 additions & 0 deletions nccl4py/nccl/__init__.py
@@ -0,0 +1,25 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See LICENSE.txt for license information

"""
NCCL4Py: Python bindings for NVIDIA Collective Communications Library (NCCL).

NCCL4Py provides Pythonic access to NCCL for efficient multi-GPU and multi-node
communication. It supports all NCCL collective operations, point-to-point
communication, and advanced features like buffer registration and custom reduction
operators.
"""

from nccl._version import __version__, __version_tuple__

__all__ = [
"__version__",
"__version_tuple__",
]
1 change: 1 addition & 0 deletions nccl4py/nccl/bindings/__init__.py
@@ -0,0 +1 @@
from .nccl import *
39 changes: 39 additions & 0 deletions nccl4py/nccl/bindings/_internal/__init__.py
@@ -0,0 +1,39 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See LICENSE.txt for license information

"""
Internal bindings implementation.

This module preloads the NCCL library using cuda-pathfinder if available,
which provides better library discovery across different environments
(conda, system installs, custom CUDA paths, etc.).

If cuda-pathfinder is not available or fails to find NCCL, the Cython
bindings will fall back to direct dlopen("libnccl.so.2") which works
if the library is in standard system paths.

See documentation for cuda-pathfinder including the search order at:
https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/generated/cuda.pathfinder.load_nvidia_dynamic_lib.html#cuda.pathfinder.load_nvidia_dynamic_lib
"""

# Optional: Preload NCCL library for better discovery
# This runs before the Cython extensions are loaded, allowing
# dlsym(RTLD_DEFAULT, ...) to find the already-loaded library
try:
from cuda.pathfinder import load_nvidia_dynamic_lib

load_nvidia_dynamic_lib("nccl")
except ImportError:
# cuda-python not installed, fall back to Cython's dlopen
pass
except Exception:
# Library not found by pathfinder or other error
# Fall back to Cython's dlopen - it will provide the error message if needed
pass