Commit 3b83c3f

Revert "Dynamic engine suspend/resume via prefill. (#1982)"
This reverts commit 29eed5d.
1 parent 29eed5d · commit 3b83c3f

21 files changed: +414 -1189 lines
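
For context, the reverted change (#1982) let the example driver suspend and resume the dynamic engine between steps: it wrapped `engine.step_modern()` in a try/except for `EngineSuspendedError` and periodically called `engine.suspend()` and `engine.resume()`, driven by a `--suspend-resume-interval` flag. Below is a minimal, hedged sketch of that pattern, reconstructed from the removed lines in the first diff; `ToyEngine` and the literal interval value are hypothetical stand-ins added only so the sketch runs on its own and are not part of Megatron-LM.

    # Sketch of the suspend/resume driver loop removed by this revert.
    # `step_modern`, `suspend`, `resume`, and `EngineSuspendedError` mirror names
    # in the removed lines below; `ToyEngine` is a made-up stand-in.
    class EngineSuspendedError(Exception):
        """Raised when a step is attempted while the engine is suspended."""

    class ToyEngine:
        def __init__(self):
            self.suspended = False
            self.step_count = 0

        def step_modern(self, verbose=False):
            if self.suspended:
                raise EngineSuspendedError("engine is suspended")
            self.step_count += 1
            return {"step_time": 0.0}

        def suspend(self):
            self.suspended = True

        def resume(self):
            self.suspended = False

    engine = ToyEngine()
    suspend_resume_interval = 4  # stand-in for the removed --suspend-resume-interval flag
    attempted_step_count = 0

    for _ in range(16):
        # Attempt a step; a suspended engine raises instead of stepping.
        try:
            result = engine.step_modern(verbose=True)
        except EngineSuspendedError as e:
            result = e  # ignore the error so resume() below still runs
        attempted_step_count += 1

        # Periodically suspend, then resume half an interval later.
        if attempted_step_count % suspend_resume_interval == 0:
            engine.suspend()
        if (attempted_step_count - suspend_resume_interval // 2) % suspend_resume_interval == 0:
            engine.resume()

        # Skip per-step bookkeeping for steps attempted while suspended.
        if isinstance(result, EngineSuspendedError):
            continue

    print(f"completed {engine.step_count} of {attempted_step_count} attempted steps")

With this revert, the driver calls `engine.step_modern()` directly again, and the flag, the exception handling, and the suspend/resume bookkeeping are removed.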

examples/inference/gpt/gpt_dynamic_inference.py

Lines changed: 64 additions & 128 deletions
@@ -1,7 +1,6 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import hashlib
-import io
 import json
 import math
 import os
@@ -14,23 +13,14 @@
 from tqdm import tqdm
 from typing import Dict, List, Tuple, Optional
 
-sys.path.append(
-    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
-)
+import torch
+from tqdm import tqdm
 
-import megatron
-from examples.inference.gpt.utils import (
-    Request,
-    add_common_inference_args,
-    build_dynamic_engine_setup_prefix,
-    build_requests,
-    get_curr_time,
-)
 from megatron.core.inference.contexts.dynamic_context import (
     ContextOverflowError,
     DynamicInferenceContext,
 )
-from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError
+from megatron.core.inference.engines import DynamicInferenceEngine
 from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
     GPTInferenceWrapper,
 )
@@ -63,14 +53,14 @@
     build_requests,
     get_curr_time,
 )
+from megatron.training import get_args
+from megatron.training import get_model as _get_model
+from megatron.training import get_tokenizer, initialize_megatron
 from megatron.training.checkpointing import load_checkpoint
 
-from model_provider import model_provider
-from gpt_builders import gpt_builder
-
-torch.serialization.add_safe_globals([io.BytesIO])
-torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState])
-torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic])
+import torch
+import io
+import megatron
 
 
 def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
@@ -86,13 +76,7 @@ def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
     )
     group.add_argument(
         "--termination-id", type=int, default=None,
-        help="Termination ID that overrides `tokenizer.eod`.",
-    )
-    group.add_argument(
-        "--suspend-resume-interval", type=int, default=None,
-        help="Suspend and resume the dynamic engine every "
-        "`suspend_resume_interval` steps. This is used to tet the suspend/resume "
-        "system.",
+        help="Termination ID that overrides `tokenizer.eod`."
     )
     group.add_argument('--inference-repeat-n', type=int, default=1, help="Repeat inference iterations N times for benchmarking.")
 
@@ -264,12 +248,12 @@ def run_inference(
     num_requests_total = len(requests)
     num_requests_added = 0
     num_requests_finished = 0
+    step_id = 0
     step_times = {"prefill": [], "decode": []}
     add_times = []
     output_times = []
     tbar = tqdm(total=num_requests_total)
     total_output_tokens = 0
-    attempted_step_count = 0
     if args.cuda_graph_impl == "local":
         cuda_graph_request_count_map = {r:0 for r in engine.context.cuda_graph_request_counts}
     else:
@@ -312,75 +296,36 @@ def _add_request():
 
         # Step inference engine (i.e., generate a token for each active request).
         # Before step, we haven't done the scheduling, so we cannot know the is_decode_only
-        try:
-            result = engine.step_modern(verbose=True)
-        except EngineSuspendedError as e:
-            result = e
-            pass # ignore error in order to call 'engine.resume()' below.
-        attempted_step_count += 1
-
+        result = engine.step_modern(verbose=True)
         # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine
         is_decode_only = engine.is_decode_only
-
-        # Test suspending and resuming engine.
-        if args.suspend_resume_interval is not None:
-
-            # Suspend.
-            if attempted_step_count % args.suspend_resume_interval == 0:
-                print("**** step %d/%d ... suspend." % (engine.step_count, attempted_step_count))
-                engine.suspend()
-
-            # Resume, 0+ attempted steps later.
-            if (
-                attempted_step_count > 0
-                and
-                (attempted_step_count - args.suspend_resume_interval // 2)
-                % args.suspend_resume_interval == 0
-            ):
-                print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count))
-                engine.resume()
-
-        # If engine suspended, continue to next iter.
-        if isinstance(result, EngineSuspendedError):
-            continue
+        step_id += 1
 
         # Record cuda_graph_request_count.
         cuda_graph_request_count = result["cuda_graph_request_count"]
         if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None:
            cuda_graph_request_count_map[cuda_graph_request_count] += 1
 
         # Update requests.
-        active_request_ids = result["active_request_ids"]
-        finished_request_records = result["finished_request_records"]
+        active_requests = result["active_requests"]
+        finished_requests = result["finished_requests"]
         step_time = result["step_time"]
-        if len(active_request_ids) > 0 or len(finished_request_records) > 0:
+        if len(active_requests) > 0 or len(finished_requests) > 0:
             if is_decode_only:
                 step_times["decode"].append(step_time)
             else:
                 step_times["prefill"].append(step_time)
 
         # Append output tokens.
         output_start = get_curr_time()
-        for finished_request_record in finished_request_records:
-
-            finished_request = finished_request_record.merge(engine.controller.tokenizer)
-
-            # Update local request object.
+        for finished_request in finished_requests:
             request = requests[finished_request.request_id]
+            request.output_tokens = finished_request.generated_tokens
+            total_output_tokens += len(request.output_tokens)
             request.time_end = get_curr_time()
+            request.output_text = finished_request.generated_text
             request.state = "finished"
             request.request_id = finished_request.request_id
-
-            # Update prompt, in case engine has been suspended and resumed.
-            request.prompt_tokens = finished_request.prompt_tokens
-            request.prompt_text = finished_request.prompt
-
-            # Get output tokens and text.
-            request.output_tokens = finished_request.generated_tokens
-            request.output_text = finished_request.generated_text
-            total_output_tokens += len(request.output_tokens)
-
-            # Log probs.
             if finished_request.sampling_params.return_log_probs:
                 request.log_probs = (
                     finished_request.prompt_log_probs + finished_request.generated_log_probs
@@ -516,9 +461,7 @@ def escape_str(s):
         unique_prompt_map[request.prompt_text].append(request_idx)
 
     # Print unique prompts + outputs.
-    text_hashes = []
     for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
-
         # ---- Prompt summary line ----
         prompt_len = len(requests[request_idxs[0]].prompt_tokens)
         escaped_prompt_text = escape_str(prompt_text)
@@ -533,20 +476,15 @@ def escape_str(s):
         # ---- Print each unique output ----
         for output_text, output_request_idxs in output_map.items():
             if output_text is not None:
-                # Use hash of prompt + generated text in case engine was
-                # suspended and resumed, which misaligns boundary between
-                # prompt and generated tokens.
-                o_hash = hashlib.sha256(
-                    (prompt_text + output_text).encode()
-                ).hexdigest()[:6]
+                o_hash = hashlib.sha256(output_text.encode()).hexdigest()[:6]
                 o_len = len(requests[output_request_idxs[0]].output_tokens)
                 escaped_output_text = escape_str(output_text)
+                print(f" >>>> [n {len(output_request_idxs)}, l {o_len}, hash {o_hash}] {escaped_output_text}")
             else:
                 o_hash = "--"
                 o_len = 0
                 escaped_output_text = "--"
-            print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}")
-            text_hashes.append(o_hash)
+                print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}")
 
     # Write results to JSON. Primarily used for functional testing.
     if args.output_path:
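
The hash change in the hunk above is easier to see in isolation: the reverted code (removed lines) hashed the prompt concatenated with the generated text, so the short hash stayed stable even when a suspend/resume cycle shifted the prompt/generation boundary, while the restored code hashes the generated text alone. A small, self-contained illustration; the example strings are made up:

    import hashlib

    # Hypothetical example strings, only to show the difference in hashing.
    prompt_text = "What is 2 + 2?"
    output_text = " The answer is 4."

    # Restored behavior: hash only the generated text.
    hash_output_only = hashlib.sha256(output_text.encode()).hexdigest()[:6]

    # Reverted behavior: hash prompt + generated text, so the hash does not
    # change if tokens move between "prompt" and "generated" after a
    # suspend/resume cycle shifts that boundary.
    hash_prompt_and_output = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6]

    print(hash_output_only, hash_prompt_and_output)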
@@ -574,49 +512,47 @@ def escape_str(s):
         with open(args.output_path, "w") as fp:
             json.dump(json_results, fp, indent=1)
 
-    # Timing results.
-    stats = torch.cuda.memory_stats()
-    throughput = total_output_tokens / total_time
-    print("~~~")
-    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
-    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
-
-    p_times = step_times["prefill"]
-    d_times = step_times["decode"]
-
-    p_total = sum(p_times)
-    d_total = sum(d_times)
-
-    p_count = len(p_times)
-    d_count = len(d_times)
-
-    p_mean = p_total / p_count
-    d_mean = d_total / d_count if d_count != 0 else 0.
-
-    # Commented out for now as the step/add/output times are not calculated correctly.
-    # print(
-    #     f"{setup_prefix} … "
-    #     f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
-    #     f"total time: {step_total:.3f}s … "
-    #     f"step time: total {step_total:.3f}s "
-    #     f"[ p {p_total:.3f}s, d {d_total:.3f}s ], "
-    #     f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], "
-    #     f"count [ p {p_count}, d {d_count} ]."
-    # )
-    capture_str = (
-        f"{engine.capture_stats['time']:.2f} sec"
-        if engine.capture_stats else
-        "--"
-    )
-    print(
-        f"{setup_prefix} … "
-        f"throughput: {throughput:.3f} tok/s",
-        f"total time: {total_time:.3f}s … "
-        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
-        f"steps: {engine.step_count:d} … "
-        f"capture {capture_str} … "
-    )
-    print("~~~")
+    # Timing results.
+    print("~~~")
+    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
+    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
+
+    p_times = step_times["prefill"]
+    d_times = step_times["decode"]
+
+    p_total = sum(p_times)
+    d_total = sum(d_times)
+
+    p_count = len(p_times)
+    d_count = len(d_times)
+
+    p_mean = p_total / p_count
+    d_mean = d_total / d_count
+
+    # Commented out for now as the step/add/output times are not calculated correctly.
+    # print(
+    #     f"{setup_prefix} … "
+    #     f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+    #     f"total time: {step_total:.3f}s … "
+    #     f"step time: total {step_total:.3f}s "
+    #     f"[ p {p_total:.3f}s, d {d_total:.3f}s ], "
+    #     f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], "
+    #     f"count [ p {p_count}, d {d_count} ]."
+    # )
+    capture_str = (
+        f"{engine.capture_stats['time']:.2f} sec"
+        if engine.capture_stats else
+        "--"
+    )
+    print(" … ".join((
+        f"{setup_prefix}",
+        f"throughput: {throughput:.3f} tok/s",
+        f"total time: {total_time:.3f}s",
+        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB",
+        f"steps: {engine.step_count:d}",
+        f"capture {capture_str}",
+    )))
+    print("~~~")
 
     # Stop Nsight profiler.
     if os.environ.get("NSIGHT_PREFIX"):

examples/inference/gpt/gpt_dynamic_inference_12b.sh

Lines changed: 2 additions & 4 deletions
@@ -26,7 +26,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 : ${ACTIVE_BUFFER_SIZE_GB=50.}
 
 # Cuda graphs.
+: ${CUDA_GRAPH_IMPL=local}
 : ${NUM_CUDA_GRAPHS=16}
+: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1}
 
 # Miscellaneous.
 : ${USE_COORDINATOR=0}
@@ -85,10 +87,6 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
         --cuda-graph-impl local \
         --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
     "
-else
-    ARGS+=" \
-        --cuda-graph-impl none \
-    "
 fi
 
 # Prompts.

examples/inference/gpt/gpt_dynamic_inference_357m.sh

Lines changed: 2 additions & 4 deletions
@@ -27,7 +27,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 : ${ACTIVE_BUFFER_SIZE_GB=50.}
 
 # Cuda graphs.
+: ${CUDA_GRAPH_IMPL=local}
 : ${NUM_CUDA_GRAPHS=16}
+: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1}
 
 # Miscellaneous.
 : ${USE_COORDINATOR=0}
@@ -71,10 +73,6 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
         --cuda-graph-impl local \
         --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
     "
-else
-    ARGS+=" \
-        --cuda-graph-impl none \
-    "
 fi
 
 # Prompts.
