77 changes: 77 additions & 0 deletions .github/ISSUE_TEMPLATE/ISSUE.yaml
@@ -0,0 +1,77 @@
name: NCCL issue or bug
description: Report an issue or failure when running NCCL code
title: "[Issue]: "
labels: ["triage"]

body:
  - type: markdown
    attributes:
      value: |
        Thanks for reaching out! Before reporting a new issue, please search the existing issues for similar behavior. If you find a matching issue that is already closed, or you are unsure, open a new issue and reference the old one from it.
        You can also check out the [troubleshooting section](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html) in our user guide.

        ---

        To ensure we can assist you quickly and accurately, we often need the following information:
  - type: dropdown
    id: type
    attributes:
      label: How is this issue impacting you?
      description: What best describes your issue?
      options:
        - Lower performance than expected
        - Application crash
        - Data corruption
        - Application hang
    validations:
      required: true

  - type: textarea
    id: log
    attributes:
      label: Share Your Debug Logs
      description: |
        Debug logs and topology files are a great tool for pinning down issues. You can create them by setting these environment variables before the run:
        * `NCCL_DEBUG=INFO` and `NCCL_DEBUG_FILE=ncclDebug.%h.%p` to produce one file per rank
        * `NCCL_TOPO_DUMP_FILE=ncclSystem.txt`

  - type: textarea
    id: repro
    attributes:
      label: Steps to Reproduce the Issue
      description: |
        * **Minimal Steps**: Please provide a simple way to recreate the issue (see [Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports) for inspiration).
        * **Environment Details**: Include software versions and relevant settings.
        * **Intermittency**: Is this a sporadic issue? If so, how often does it occur?
        * **Previous Success**: Did this work with an older NCCL version?

        The easier the issue is to reproduce on our side, the more likely we are to resolve it in a timely manner.

  - type: input
    id: nccl_version
    attributes:
      label: NCCL Version
      description: |
        NCCL reports its version string in the debug logs.
        If you know which library was used, you can also determine the version by running `strings libnccl.so | grep 'NCCL version'`.
      placeholder: "e.g. 2.27.1+cuda12.8"
    validations:
      required: true

  - type: textarea
    id: platform
    attributes:
      label: Your platform details
      description: |
        * **GPU & Network**: Share your architecture and topology (e.g., from `nvidia-smi`, `nvidia-smi topo -m`, `ibstatus`).
        * **Environment**: Bare-metal, containers, or cloud?
        * **Scalability**: Does this issue occur with a specific number of ranks/nodes?

  - type: textarea
    id: issue-description
    attributes:
      label: Error Message & Behavior
      description: |
        * **First Error**: What was the initial `NCCL WARN` message in your logs?
        * **Expected vs. Actual**: Briefly describe the anticipated behavior versus what you're seeing.
15 changes: 15 additions & 0 deletions .github/ISSUE_TEMPLATE/QUESTION.yaml
@@ -0,0 +1,15 @@
name: NCCL question
description: Ask the NCCL team a question
title: "[Question]: "
labels: ["question"]

body:
  - type: markdown
    attributes:
      value: |
        Thanks for reaching out! You may find an answer in the [user guide](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), in particular its troubleshooting section, or in the [release notes](https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html).
        ---
  - type: textarea
    id: question
    attributes:
      label: Question
22 changes: 22 additions & 0 deletions .github/ISSUE_TEMPLATE/RFE.yaml
@@ -0,0 +1,22 @@
name: NCCL request for enhancement
description: Request for enhancement
title: "[RFE]: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for your feedback! Before filing a new RFE, please take a quick look at our [existing requests](https://github.com/NVIDIA/nccl/issues?q=sort%3Aupdated-desc%20is%3Aissue%20is%3Aopen%20label%3Aenhancement) to check whether it has already been proposed.

        ---
  - type: textarea
    id: rfe-description
    attributes:
      label: Please provide the details below so we can understand your needs
      description: |
        * What is the goal of this request?
        * Who will benefit from this feature?
        * Is this request for a specific GPU architecture or network infrastructure?
        * How will this feature improve current workflows or processes?
        * What is the priority level of this request?
1 change: 1 addition & 0 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
blank_issues_enabled: false
79 changes: 79 additions & 0 deletions .github/workflows/close-old-issues.js
@@ -0,0 +1,79 @@
const { Octokit } = require("@octokit/rest");

const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN });

const owner = process.env.REPO_OWNER;
const repo = process.env.REPO_NAME.split('/').pop(); // Handles the "owner/repo" format

const now = new Date();
const sixMonthsAgo = new Date(now);
sixMonthsAgo.setMonth(now.getMonth() - 6);
const oneMonthAgo = new Date(now);
oneMonthAgo.setMonth(now.getMonth() - 1);

async function closeOldIssues() {
  let page = 1;
  let closedCount = 0;

  // Comment posted on each issue before it is closed:
  let body = `### Issue Cleanup: Helping Us Focus on Current Challenges

We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue hasn't seen updates in over 6 months, we'll be closing it for now.

*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., \"Still relevant on [version=X]\").
Thanks for your understanding and for contributing to NCCL.`;

  while (true) {
    const { data: issues } = await octokit.issues.listForRepo({
      owner,
      repo,
      state: "open",
      per_page: 100,
      page,
    });

    if (issues.length === 0) break;

    for (const issue of issues) {
      // Ignore PRs
      if (issue.pull_request) continue;

      // Ignore issues carrying the "ongoing" label
      if (issue.labels.some(label => label.name === "ongoing")) continue;

      const createdAt = new Date(issue.created_at);
      const updatedAt = new Date(issue.updated_at);

      if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) {
        // Add a comment before closing
        await octokit.issues.createComment({
          owner,
          repo,
          issue_number: issue.number,
          body: body,
        });

        await octokit.issues.update({
          owner,
          repo,
          issue_number: issue.number,
          state: "closed",
          state_reason: "not_planned",
        });
        closedCount++;
        console.log(`Closed issue #${issue.number}`);

        // Stop once we have closed 100 issues
        if (closedCount >= 100) {
          console.log("Closed 100 issues, stopping.");
          return;
        }
      }
    }
    page++;
  }
  console.log(`Total closed: ${closedCount}`);
}

closeOldIssues().catch(console.error);
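The heart of the script above is the staleness cutoff: an issue is closed only when both its `created_at` and `updated_at` dates fall more than six months before the current date. A minimal sketch of that check in isolation (the `isStale` and `monthsAgo` helper names are ours, not part of the script):

```javascript
// Sketch of the staleness check used by the cleanup script: an issue is
// considered stale only when BOTH its creation date and its last-update
// date are more than `months` months old.
function monthsAgo(now, months) {
  const d = new Date(now);
  d.setMonth(d.getMonth() - months);
  return d;
}

function isStale(issue, now = new Date()) {
  const cutoff = monthsAgo(now, 6);
  return new Date(issue.created_at) < cutoff && new Date(issue.updated_at) < cutoff;
}

const now = new Date("2025-06-01T00:00:00Z");
console.log(isStale({ created_at: "2024-01-01", updated_at: "2024-02-01" }, now)); // true
console.log(isStale({ created_at: "2024-01-01", updated_at: "2025-05-20" }, now)); // false
```

Pinning `now` to a fixed date makes the six-month boundary easy to reason about independently of the GitHub API.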
31 changes: 31 additions & 0 deletions .github/workflows/close_old_issues.yaml
@@ -0,0 +1,31 @@
name: Close Old Issues

on:
  schedule:
    - cron: '30 2 * * *' # Runs daily at 02:30 UTC
  workflow_dispatch:

permissions:
  issues: write

jobs:
  close-old-issues:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 20

      - name: Install dependencies
        run: npm install @octokit/[email protected]

      - name: Run close-old-issues script
        run: node .github/workflows/close-old-issues.js
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name || github.repository }}
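One subtlety the workflow and script share: on some trigger types `github.event.repository.name` is unavailable, so `REPO_NAME` may arrive either as a bare name or as the full `owner/repo` slug from `github.repository`. The script's `split('/').pop()` normalization can be sketched as (the `repoName` helper name is ours, for illustration):

```javascript
// REPO_NAME may be "nccl" or "NVIDIA/nccl"; keep only the last path segment.
const repoName = (name) => name.split('/').pop();

console.log(repoName("NVIDIA/nccl")); // "nccl"
console.log(repoName("nccl"));        // "nccl"
```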
21 changes: 13 additions & 8 deletions ext-net/example/Makefile
@@ -3,15 +3,20 @@
#
# See LICENSE.txt for license information
#
NCCL_HOME:=../../build/
CUDA_HOME:=/usr/local/cuda
INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO:=libnccl-net.so
.DEFAULT_GOAL: build
include ../../makefiles/common.mk
SRCDIR ?= $(abspath ../..)
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)

default: $(PLUGIN_SO)
SRC_FILES := $(wildcard *.c)

$(PLUGIN_SO): plugin.c
	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
build: ${BUILDDIR}/libnccl-net-example.so

${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}
	@printf "Compiling %-35s > %s\n" $< $@
	@mkdir -p ${BUILDDIR}
	$(CC) -Inccl -fPIC -shared -o $@ $^

clean:
	rm -f $(PLUGIN_SO)
	rm -f ${BUILDDIR}/libnccl-net-example.so
6 changes: 6 additions & 0 deletions ext-net/example/nccl/common.h
@@ -7,9 +7,15 @@
#ifndef COMMON_H_
#define COMMON_H_

#include <stdint.h>

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop };

typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);

#endif
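The `ncclDebugLogSubSys` values in the header above are powers of two, so a logger can combine them into a mask and filter messages with a bitwise AND; this is conceptually how subsystem selection via `NCCL_DEBUG_SUBSYS` works. A small JavaScript sketch mirroring a few of the C enum's flags (the `SubSys`, `mask`, and `enabled` names are our illustration, not an NCCL API):

```javascript
// Mirror of a few ncclDebugLogSubSys flags from the C header above.
const SubSys = { INIT: 1, COLL: 2, P2P: 4, SHM: 8, NET: 16, GRAPH: 32 };

// e.g. the conceptual equivalent of NCCL_DEBUG_SUBSYS=INIT,NET
const mask = SubSys.INIT | SubSys.NET;
const enabled = (flags) => (flags & mask) !== 0;

console.log(enabled(SubSys.NET));  // true
console.log(enabled(SubSys.COLL)); // false
```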
4 changes: 1 addition & 3 deletions ext-net/example/nccl/net.h
@@ -8,9 +8,9 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"
#include "err.h"
#include "net_device.h"
#include "common.h"

#define NCCL_NET_HANDLE_MAXSIZE 128
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
@@ -23,8 +23,6 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32

typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);

#include "net_v10.h"
#include "net_v9.h"
#include "net_v8.h"