Commit c22f2f6

Merge remote-tracking branch 'gh/main' into tde/asyncio_queue

2 parents: 5efacac + 29eed5d

67 files changed: +2173 / -602 lines


.github/workflows/install-test.yml
Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ jobs:
       env:
         GH_TOKEN: ${{ github.token }}
         RUN_ID: ${{ github.run_id }}
-        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
       run: |
         FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
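
For orientation only (not part of this commit): the `--jq` filter above counts completed jobs whose conclusion is not `success`. A rough Python equivalent over the JSON printed by `gh run view <run-id> --json jobs`; the helper name and the local `gh` invocation are assumptions for illustration.

```python
# Illustration only: count failed jobs the same way the workflow's jq filter does.
import json
import subprocess

def count_failed_jobs(run_id: str) -> int:
    out = subprocess.run(
        ["gh", "run", "view", run_id, "--json", "jobs"],
        capture_output=True, text=True, check=True,
    ).stdout
    jobs = json.loads(out)["jobs"]
    # Same predicate as the jq filter: completed but not successful.
    return sum(
        1 for job in jobs
        if job.get("status") == "completed" and job.get("conclusion") != "success"
    )
```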

README.md
Lines changed: 5 additions & 5 deletions

@@ -99,7 +99,7 @@ pip install --no-build-isolation .[mlm,dev]
 
 ```
 Megatron-LM/
-├── megatron/
+├── megatron/
 │   ├── core/                  # Megatron Core (kernels, parallelism, building blocks)
 │   │   ├── models/            # Transformer models
 │   │   ├── transformer/       # Transformer building blocks
@@ -128,7 +128,7 @@ Megatron-LM/
 
 - **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
 - **Research teams** exploring new architectures and training techniques
-- **Learning distributed training** concepts and best practices
+- **Learning distributed training** concepts and best practices
 - **Quick experimentation** with proven model configurations
 
 **What you get:**
@@ -137,7 +137,7 @@ Megatron-LM/
 - End-to-end examples from data prep to evaluation
 - Research-focused tools and utilities
 
-### Megatron Core: Composable Library
+### Megatron Core: Composable Library
 
 **Composable library** with GPU-optimized building blocks for custom training frameworks.
 
@@ -170,7 +170,7 @@ Megatron-LM/
 - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
 - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
 - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
-- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
+- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Check out end-to-end examples in [examples/post_training/modelopt](./examples/post_training/modelopt/).
 
 **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
@@ -257,7 +257,7 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousan
 **Benchmark Configuration:**
 
 - **Vocabulary size**: 131,072 tokens
-- **Sequence length**: 4096 tokens
+- **Sequence length**: 4096 tokens
 - **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts
 - **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)

docker/Dockerfile.ci.dev
Lines changed: 2 additions & 2 deletions

@@ -59,9 +59,9 @@ RUN bash -ex <<"EOF"
     ln -s libnvshmem_host.so.3 libnvshmem_host.so
     popd
 
-    git clone https://github.com/deepseek-ai/DeepEP.git
+    git clone --branch tongliu/inter_node https://github.com/Autumn1998/DeepEP.git
     pushd DeepEP
-    git checkout ef73fd9a4c098e09929e48dfd696505ddc8e2043
+    git checkout 0fa8568c5923fcfc87f49ef0c3761dc013375a67
     patch -p1 < /workspace/deepep.patch
     popd
     TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.

examples/inference/gpt/gpt_dynamic_inference.py
Lines changed: 128 additions & 64 deletions

@@ -1,6 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import hashlib
+import io
 import json
 import math
 import os
@@ -13,14 +14,23 @@
 from tqdm import tqdm
 from typing import Dict, List, Tuple, Optional
 
-import torch
-from tqdm import tqdm
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
 
+import megatron
+from examples.inference.gpt.utils import (
+    Request,
+    add_common_inference_args,
+    build_dynamic_engine_setup_prefix,
+    build_requests,
+    get_curr_time,
+)
 from megatron.core.inference.contexts.dynamic_context import (
     ContextOverflowError,
     DynamicInferenceContext,
 )
-from megatron.core.inference.engines import DynamicInferenceEngine
+from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError
 from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
     GPTInferenceWrapper,
 )
@@ -53,14 +63,14 @@
     build_requests,
     get_curr_time,
 )
-from megatron.training import get_args
-from megatron.training import get_model as _get_model
-from megatron.training import get_tokenizer, initialize_megatron
 from megatron.training.checkpointing import load_checkpoint
 
-import torch
-import io
-import megatron
+from model_provider import model_provider
+from gpt_builders import gpt_builder
+
+torch.serialization.add_safe_globals([io.BytesIO])
+torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState])
+torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic])
 
 
 def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
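
For context (not part of this commit): `torch.serialization.add_safe_globals` allow-lists classes for PyTorch's `weights_only` unpickler, which is why the three registrations above are needed before loading checkpoints that contain those objects. A minimal, self-contained sketch with a stand-in enum; the `RunPhase` class and the in-memory buffer are assumptions for illustration only.

```python
# Illustration only: add_safe_globals() allow-lists a class so that
# torch.load(..., weights_only=True) will unpickle instances of it.
import io
from enum import Enum

import torch

class RunPhase(Enum):  # stand-in for a class embedded in a checkpoint
    WARMUP = 0
    STEADY = 1

torch.serialization.add_safe_globals([RunPhase])

buf = io.BytesIO()
torch.save({"phase": RunPhase.STEADY, "step": 7}, buf)
buf.seek(0)
state = torch.load(buf, weights_only=True)  # rejected without the allow-listing above
print(state["phase"], state["step"])
```
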
@@ -76,7 +86,13 @@ def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
     )
     group.add_argument(
         "--termination-id", type=int, default=None,
-        help="Termination ID that overrides `tokenizer.eod`."
+        help="Termination ID that overrides `tokenizer.eod`.",
+    )
+    group.add_argument(
+        "--suspend-resume-interval", type=int, default=None,
+        help="Suspend and resume the dynamic engine every "
+        "`suspend_resume_interval` steps. This is used to test the suspend/resume "
+        "system.",
     )
     group.add_argument('--inference-repeat-n', type=int, default=1, help="Repeat inference iterations N times for benchmarking.")

@@ -248,12 +264,12 @@ def run_inference(
     num_requests_total = len(requests)
     num_requests_added = 0
     num_requests_finished = 0
-    step_id = 0
     step_times = {"prefill": [], "decode": []}
     add_times = []
     output_times = []
     tbar = tqdm(total=num_requests_total)
     total_output_tokens = 0
+    attempted_step_count = 0
     if args.cuda_graph_impl == "local":
         cuda_graph_request_count_map = {r:0 for r in engine.context.cuda_graph_request_counts}
     else:
@@ -296,36 +312,75 @@ def _add_request():
 
         # Step inference engine (i.e., generate a token for each active request).
        # Before step, we haven't done the scheduling, so we cannot know the is_decode_only
-        result = engine.step_modern(verbose=True)
+        try:
+            result = engine.step_modern(verbose=True)
+        except EngineSuspendedError as e:
+            result = e
+            pass  # ignore error in order to call 'engine.resume()' below.
+        attempted_step_count += 1
+
         # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine
         is_decode_only = engine.is_decode_only
-        step_id += 1
+
+        # Test suspending and resuming engine.
+        if args.suspend_resume_interval is not None:
+
+            # Suspend.
+            if attempted_step_count % args.suspend_resume_interval == 0:
+                print("**** step %d/%d ... suspend." % (engine.step_count, attempted_step_count))
+                engine.suspend()
+
+            # Resume, 0+ attempted steps later.
+            if (
+                attempted_step_count > 0
+                and
+                (attempted_step_count - args.suspend_resume_interval // 2)
+                % args.suspend_resume_interval == 0
+            ):
+                print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count))
+                engine.resume()
+
+        # If engine suspended, continue to next iter.
+        if isinstance(result, EngineSuspendedError):
+            continue
 
         # Record cuda_graph_request_count.
         cuda_graph_request_count = result["cuda_graph_request_count"]
         if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None:
             cuda_graph_request_count_map[cuda_graph_request_count] += 1
 
         # Update requests.
-        active_requests = result["active_requests"]
-        finished_requests = result["finished_requests"]
+        active_request_ids = result["active_request_ids"]
+        finished_request_records = result["finished_request_records"]
         step_time = result["step_time"]
-        if len(active_requests) > 0 or len(finished_requests) > 0:
+        if len(active_request_ids) > 0 or len(finished_request_records) > 0:
             if is_decode_only:
                 step_times["decode"].append(step_time)
             else:
                 step_times["prefill"].append(step_time)
 
         # Append output tokens.
         output_start = get_curr_time()
-        for finished_request in finished_requests:
+        for finished_request_record in finished_request_records:
+
+            finished_request = finished_request_record.merge(engine.controller.tokenizer)
+
+            # Update local request object.
             request = requests[finished_request.request_id]
-            request.output_tokens = finished_request.generated_tokens
-            total_output_tokens += len(request.output_tokens)
             request.time_end = get_curr_time()
-            request.output_text = finished_request.generated_text
             request.state = "finished"
             request.request_id = finished_request.request_id
+
+            # Update prompt, in case engine has been suspended and resumed.
+            request.prompt_tokens = finished_request.prompt_tokens
+            request.prompt_text = finished_request.prompt
+
+            # Get output tokens and text.
+            request.output_tokens = finished_request.generated_tokens
+            request.output_text = finished_request.generated_text
+            total_output_tokens += len(request.output_tokens)
+
+            # Log probs.
             if finished_request.sampling_params.return_log_probs:
                 request.log_probs = (
                     finished_request.prompt_log_probs + finished_request.generated_log_probs
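
To make the control flow above easier to follow in isolation, here is a toy sketch (not part of this commit) of the same suspend/resume cadence, with a stub engine standing in for `DynamicInferenceEngine`: every `interval` attempted steps the engine is suspended, half an interval later it is resumed, and steps attempted while suspended are skipped.

```python
# Illustration only: the suspend/resume cadence exercised above, with a stub engine.
class EngineSuspendedError(Exception):
    pass

class StubEngine:
    def __init__(self):
        self.suspended = False
        self.step_count = 0

    def step(self):
        if self.suspended:
            raise EngineSuspendedError()
        self.step_count += 1
        return {"step": self.step_count}

    def suspend(self):
        self.suspended = True

    def resume(self):
        self.suspended = False

def run(engine, interval=4, total_attempts=20):
    for attempted in range(1, total_attempts + 1):
        try:
            result = engine.step()
        except EngineSuspendedError as err:
            result = err  # keep looping so suspend()/resume() below still run
        if attempted % interval == 0:
            engine.suspend()
        if (attempted - interval // 2) % interval == 0:
            engine.resume()
        if isinstance(result, EngineSuspendedError):
            continue  # nothing was generated on this attempt
        # ... consume `result` here ...

run(StubEngine())
```
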
@@ -461,7 +516,9 @@ def escape_str(s):
         unique_prompt_map[request.prompt_text].append(request_idx)
 
     # Print unique prompts + outputs.
+    text_hashes = []
     for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
+
         # ---- Prompt summary line ----
         prompt_len = len(requests[request_idxs[0]].prompt_tokens)
         escaped_prompt_text = escape_str(prompt_text)
@@ -476,15 +533,20 @@ def escape_str(s):
         # ---- Print each unique output ----
         for output_text, output_request_idxs in output_map.items():
             if output_text is not None:
-                o_hash = hashlib.sha256(output_text.encode()).hexdigest()[:6]
+                # Use hash of prompt + generated text in case engine was
+                # suspended and resumed, which misaligns boundary between
+                # prompt and generated tokens.
+                o_hash = hashlib.sha256(
+                    (prompt_text + output_text).encode()
+                ).hexdigest()[:6]
                 o_len = len(requests[output_request_idxs[0]].output_tokens)
                 escaped_output_text = escape_str(output_text)
-                print(f" >>>> [n {len(output_request_idxs)}, l {o_len}, hash {o_hash}] {escaped_output_text}")
             else:
                 o_hash = "--"
                 o_len = 0
                 escaped_output_text = "--"
-                print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}")
+            print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}")
+            text_hashes.append(o_hash)
 
     # Write results to JSON. Primarily used for functional testing.
     if args.output_path:
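
As a small aside (not part of the commit), the 6-character digest printed above can be reproduced on its own; hashing prompt + output keeps the fingerprint stable even if the prompt/generation boundary shifts after a suspend/resume cycle. The helper name and sample strings below are illustrative only.

```python
# Illustration only: the short fingerprint used in the summary lines,
# computed over prompt + generated text.
import hashlib

def text_hash(prompt_text: str, output_text: str) -> str:
    return hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6]

print(text_hash("What is 2 + 2? ", "4"))  # deterministic 6-hex-digit tag
```
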
@@ -512,47 +574,49 @@ def escape_str(s):
     with open(args.output_path, "w") as fp:
         json.dump(json_results, fp, indent=1)
 
-    # Timing results.
-    print("~~~")
-    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
-    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
-
-    p_times = step_times["prefill"]
-    d_times = step_times["decode"]
-
-    p_total = sum(p_times)
-    d_total = sum(d_times)
-
-    p_count = len(p_times)
-    d_count = len(d_times)
-
-    p_mean = p_total / p_count
-    d_mean = d_total / d_count
-
-    # Commented out for now as the step/add/output times are not calculated correctly.
-    # print(
-    #     f"{setup_prefix} … "
-    #     f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
-    #     f"total time: {step_total:.3f}s … "
-    #     f"step time: total {step_total:.3f}s "
-    #     f"[ p {p_total:.3f}s, d {d_total:.3f}s ], "
-    #     f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], "
-    #     f"count [ p {p_count}, d {d_count} ]."
-    # )
-    capture_str = (
-        f"{engine.capture_stats['time']:.2f} sec"
-        if engine.capture_stats else
-        "--"
-    )
-    print(" … ".join((
-        f"{setup_prefix}",
-        f"throughput: {throughput:.3f} tok/s",
-        f"total time: {total_time:.3f}s",
-        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB",
-        f"steps: {engine.step_count:d}",
-        f"capture {capture_str}",
-    )))
-    print("~~~")
+    # Timing results.
+    stats = torch.cuda.memory_stats()
+    throughput = total_output_tokens / total_time
+    print("~~~")
+    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
+    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
+
+    p_times = step_times["prefill"]
+    d_times = step_times["decode"]
+
+    p_total = sum(p_times)
+    d_total = sum(d_times)
+
+    p_count = len(p_times)
+    d_count = len(d_times)
+
+    p_mean = p_total / p_count
+    d_mean = d_total / d_count if d_count != 0 else 0.
+
+    # Commented out for now as the step/add/output times are not calculated correctly.
+    # print(
+    #     f"{setup_prefix} … "
+    #     f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+    #     f"total time: {step_total:.3f}s … "
+    #     f"step time: total {step_total:.3f}s "
+    #     f"[ p {p_total:.3f}s, d {d_total:.3f}s ], "
+    #     f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], "
+    #     f"count [ p {p_count}, d {d_count} ]."
+    # )
+    capture_str = (
+        f"{engine.capture_stats['time']:.2f} sec"
+        if engine.capture_stats else
+        "--"
+    )
+    print(
+        f"{setup_prefix} … "
+        f"throughput: {throughput:.3f} tok/s",
+        f"total time: {total_time:.3f}s … "
+        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+        f"steps: {engine.step_count:d} … "
+        f"capture {capture_str} … "
+    )
+    print("~~~")
 
     # Stop Nsight profiler.
     if os.environ.get("NSIGHT_PREFIX"):

examples/inference/gpt/gpt_dynamic_inference_12b.sh
Lines changed: 4 additions & 2 deletions

@@ -26,9 +26,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 : ${ACTIVE_BUFFER_SIZE_GB=50.}
 
 # Cuda graphs.
-: ${CUDA_GRAPH_IMPL=local}
 : ${NUM_CUDA_GRAPHS=16}
-: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1}
 
 # Miscellaneous.
 : ${USE_COORDINATOR=0}
@@ -87,6 +85,10 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
     --cuda-graph-impl local \
     --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
 "
+else
+    ARGS+=" \
+    --cuda-graph-impl none \
+"
 fi
 
 # Prompts.
