Commit bc52b6c
Feat: Support VLLM one-model eagle ckpt; Add unit tests; (#573)
## What does this PR do?

**Type of change:** New feature, new tests

**Overview:**

- Add a conversion script for eagle3 llm-compressor-style checkpoints
- Jira Ticket: https://jirasw.nvidia.com/browse/OMNIML-2866
- Add unit tests for `ar_validate.py`, `export_hf_checkpoint.py`, and `convert_to_vllm_ckpt.py`

## Usage

```bash
python scripts/convert_to_vllm_ckpt.py --input <eagle3 ckpt> --verifier <base model> --output <path>
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

---------

Signed-off-by: h-guo18 <[email protected]>
1 parent 1d0ee04 commit bc52b6c

File tree

8 files changed: +248 -7 lines changed

examples/speculative_decoding/README.md

Lines changed: 14 additions & 4 deletions
@@ -129,15 +129,15 @@ Once we finish dumping hidden states, launch offline training with an extra `--o
 For online training checkpoints, we can run in-framework evaluation on MT-bench:
 
 ```bash
-python ar_validate.py --model_path $ONLINE_CKPT
+python scripts/ar_validate.py --model_path $ONLINE_CKPT
 ```
 
 **Note**: In-framework evaluation is supported only for online training. For offline training checkpoints, please export the model and evaluate it using serving frameworks.
 
 ## Export
 
 ```bash
-python export_hf_checkpoint.py --model_path $OUTPUT_DIR --export_path $EXPORT_PATH
+python scripts/export_hf_checkpoint.py --model_path $OUTPUT_DIR --export_path $EXPORT_PATH
 ```
 
 This exports the model from a ModelOpt checkpoint to a deployment-compatible format.

@@ -175,6 +175,16 @@ kv_cache_config:
 
 Please refer to [TRT-LLM Doc: Speculative Decoding](https://nvidia.github.io/TensorRT-LLM/examples/llm_speculative_decoding.html) for detailed usage.
 
+### vLLM
+
+Please refer to [vLLM Doc: Speculative Decoding](https://docs.vllm.ai/en/latest/features/spec_decode/) for detailed usage.
+
+Optionally, you can convert the exported checkpoint to contain target model information, which is accepted by vLLM to simplify deployment:
+
+```bash
+python scripts/convert_to_vllm_ckpt.py --input <exported_ckpt> --verifier <target_model> --output <output_dir>
+```
+
 ### SGLang
 
 Please refer to [SGLang Doc: Speculative Decoding](https://docs.sglang.ai/advanced_features/speculative_decoding.html#EAGLE-3-Decoding) for detailed usage.

@@ -227,7 +237,7 @@ Note: Add `--quantization=modelopt` flag for quantized models.
 Then, we generate conversations with the base model using prompts from Daring-Anteater:
 
 ```bash
-python server_generate.py --data_path input_conversations/daring-anteater.jsonl --output_path synthetic/train.jsonl
+python scripts/server_generate.py --data_path input_conversations/daring-anteater.jsonl --output_path synthetic/train.jsonl
 ```
 
 To add a system prompt, use the `--system_prompt <system_prompt_text>` argument.

@@ -239,7 +249,7 @@ For large scale data generation, please see [SLURM prepare data](SLURM_prepare_d
 We can optionally use a smaller vocab size for the draft model for faster training and inference. For example, Llama3.2-1B has a vocab size of 128256. In this example, we construct a draft vocab mapping of size 32k by finding the most frequently occurring tokens in our training set:
 
 ```bash
-python calibrate_draft_vocab.py --model meta-llama/Llama-3.2-1B-Instruct --data input_conversations/daring-anteater.jsonl --draft_vocab_size 32000 --save_dir draft_vocab_cache
+python scripts/calibrate_draft_vocab.py --model meta-llama/Llama-3.2-1B-Instruct --data input_conversations/daring-anteater.jsonl --draft_vocab_size 32000 --save_dir draft_vocab_cache
 ```
 
 This will produce a `d2t.pt` file in `save_dir`, which is the mapping from draft tokens to target tokens. During inference, draft tokens can be mapped back to target tokens via `target_token = draft_token + d2t[draft_token]`.

examples/speculative_decoding/eagle_utils.py

Lines changed: 1 addition & 1 deletion
@@ -21,9 +21,9 @@
 import numpy as np
 import torch
 import transformers
-from ar_validate import validate_ar
 from datasets import load_dataset
 from PIL import Image
+from scripts.ar_validate import validate_ar
 from torch.utils.data import Dataset
 from transformers import AutoProcessor, Trainer, TrainerCallback
 from transformers.trainer_pt_utils import LabelSmoother
examples/speculative_decoding/scripts/convert_to_vllm_ckpt.py (new file)

Lines changed: 148 additions & 0 deletions

```python
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert a TRTLLM eagle checkpoint to a VLLM-compatible one-model checkpoint."""

import argparse
import json
import os
import shutil
from copy import deepcopy

VLLM_EAGLE3_ONE_CKPT_CFG_TEMPLATE = {
    "architectures": ["Eagle3Speculator"],
    "auto_map": {"": "eagle3.Eagle3SpeculatorConfig"},
    "draft_vocab_size": None,
    "has_no_defaults_at_init": False,
    "norm_before_residual": True,
    "speculators_config": {
        "algorithm": "eagle3",
        "default_proposal_method": "greedy",
        "proposal_methods": [
            {
                "accept_tolerance": 0.0,
                "proposal_type": "greedy",
                "speculative_tokens": 3,
                "verifier_accept_k": 1,
            }
        ],
        "verifier": {"architectures": [""], "name_or_path": ""},
    },
    "speculators_model_type": "eagle3",
    "speculators_version": "0.1.0.dev14",
    "target_hidden_size": None,
    "torch_dtype": None,
    "transformer_layer_config": {
        "attention_bias": None,
        "attention_dropout": None,
        "head_dim": None,
        "hidden_act": None,
        "hidden_size": None,
        "initializer_range": None,
        "intermediate_size": None,
        "max_position_embeddings": None,
        "mlp_bias": None,
        "model_type": None,
        "num_attention_heads": None,
        "num_hidden_layers": None,
        "num_key_value_heads": None,
        "pretraining_tp": None,
        "rms_norm_eps": None,
        "rope_scaling": None,
        "rope_theta": None,
        "use_cache": True,
        "vocab_size": None,
    },
    "transformers_version": None,
}


def convert_to_eagle3_speculator_config(
    draft_cfg,
    verifier_name_or_path,
    template_cfg=VLLM_EAGLE3_ONE_CKPT_CFG_TEMPLATE,
):
    """Convert a draft model config and a verifier model config to an Eagle3Speculator config."""
    verifier_config_path = os.path.join(verifier_name_or_path, "config.json")
    with open(verifier_config_path, encoding="utf-8") as verifier_cfg_file:
        verifier_cfg = json.load(verifier_cfg_file)

    speculator_config = deepcopy(template_cfg)

    try:
        # Update speculators_config separately to avoid type conflicts
        speculator_config["speculators_config"].update(
            {
                "verifier": {
                    "architectures": verifier_cfg["architectures"],
                    "name_or_path": verifier_name_or_path,
                },
            }
        )

        # Update other fields
        speculator_config.update(
            {
                "draft_vocab_size": draft_cfg["draft_vocab_size"],
                "target_hidden_size": verifier_cfg["hidden_size"],
                "torch_dtype": draft_cfg["torch_dtype"],
                "transformer_layer_config": {
                    k: draft_cfg[k] for k in template_cfg["transformer_layer_config"]
                },
                "transformers_version": draft_cfg["transformers_version"],
            }
        )
    except Exception as e:
        raise Exception(f"Error converting draft config: {e}")

    return speculator_config


def main():
    parser = argparse.ArgumentParser(
        description="Convert TRTLLM eagle checkpoint to VLLM compatible one-model checkpoint."
    )
    parser.add_argument("--input", help="Path to TRTLLM eagle checkpoint.")
    parser.add_argument("--verifier", help="Name or path to the verifier model.")
    parser.add_argument("--output", help="Save path for converted vllm one-model checkpoint.")

    args = parser.parse_args()

    with open(os.path.join(args.input, "config.json"), encoding="utf-8") as f:
        original_draft_cfg = json.load(f)

    converted_cfg = convert_to_eagle3_speculator_config(
        original_draft_cfg,
        args.verifier,
    )

    # Write the converted config to the output directory
    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "config.json"), "w", encoding="utf-8") as f:
        json.dump(converted_cfg, f, indent=2, ensure_ascii=False)

    # Copy the model.safetensors file from input dir to output dir
    input_model_path = os.path.join(args.input, "model.safetensors")
    output_model_path = os.path.join(args.output, "model.safetensors")
    shutil.copyfile(input_model_path, output_model_path)


if __name__ == "__main__":
    main()
```
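For illustration (not part of the commit), a sketch of how `convert_to_eagle3_speculator_config` fills the template, using toy draft and verifier configs; every value and path below is hypothetical, and the import assumes you run from the `scripts/` directory:

```python
# Exercise the converter with toy configs to show the field mapping.
import json
import tempfile
from pathlib import Path

from convert_to_vllm_ckpt import (
    VLLM_EAGLE3_ONE_CKPT_CFG_TEMPLATE,
    convert_to_eagle3_speculator_config,
)

# Hypothetical draft config: cover every transformer_layer_config key the
# template expects, plus the three top-level fields the converter reads.
draft_cfg = {k: 0 for k in VLLM_EAGLE3_ONE_CKPT_CFG_TEMPLATE["transformer_layer_config"]}
draft_cfg.update(
    {"draft_vocab_size": 32000, "torch_dtype": "bfloat16", "transformers_version": "4.45.0"}
)

with tempfile.TemporaryDirectory() as verifier_dir:
    # Hypothetical verifier config.json; only "architectures" and "hidden_size" are read.
    (Path(verifier_dir) / "config.json").write_text(
        json.dumps({"architectures": ["LlamaForCausalLM"], "hidden_size": 4096})
    )
    cfg = convert_to_eagle3_speculator_config(draft_cfg, verifier_dir)
    assert cfg["target_hidden_size"] == 4096
    assert cfg["speculators_config"]["verifier"]["name_or_path"] == verifier_dir
```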

examples/speculative_decoding/server_generate.py renamed to examples/speculative_decoding/scripts/server_generate.py

Lines changed: 15 additions & 0 deletions
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Adapted from: https://github.com/FasterDecoding/Medusa/blob/e2a5d20/data_generation/generate.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tests/examples/speculative_decoding/test_eagle.py

Lines changed: 70 additions & 2 deletions
@@ -15,11 +15,22 @@
 
 import json
 
+import pytest
+import safetensors.torch
 from _test_utils.examples.run_command import run_example_command
 
+from modelopt.torch.export.plugins.hf_spec_export import EAGLE_MODELOPT_TO_OFFICIAL
+
+
+@pytest.fixture(scope="module")
+def eagle_output_dir(tmp_path_factory):
+    """Eagle output directory shared in this module."""
+    return tmp_path_factory.mktemp("eagle_output_dir")
+
 
 # fmt: off
-def test_llama_eagle3(tiny_llama_path, num_gpus, tiny_daring_anteater_path, tmp_path):
+def test_llama_eagle3(tiny_llama_path, num_gpus, tiny_daring_anteater_path, tmp_path, eagle_output_dir):
+    """Test Eagle3 training with a tiny llama model."""
     # Create an ultra-tiny EAGLE config for testing to reduce memory usage
     tiny_eagle_config = {
         "max_position_embeddings": 128,

@@ -45,8 +56,65 @@ def test_llama_eagle3(tiny_llama_path, num_gpus, tiny_daring_anteater_path, tmp_
             "--num_gpu", str(num_gpus),
             "--mode", "eagle3",
             "--eagle_config", str(config_file),
-            "--output_dir", tmp_path / "eagle-tinyllama",
+            "--output_dir", eagle_output_dir / "eagle-tinyllama",
             "--training_seq_len", "128",  # Match max_position_embeddings
         ],
         "speculative_decoding",
     )
+
+
+def test_ar_validate(eagle_output_dir):
+    """Test in-framework AR evaluation."""
+    run_example_command(
+        [
+            "python", "./scripts/ar_validate.py",
+            "--model_path", eagle_output_dir / "eagle-tinyllama",
+            "--osl", "20",
+            "--num_samples", "10",
+            "--steps", "3",
+        ],
+        "speculative_decoding",
+    )
+
+
+def test_export_hf_checkpoint(eagle_output_dir):
+    """Test export of Eagle3 checkpoint."""
+    run_example_command(
+        [
+            "python", "./scripts/export_hf_checkpoint.py",
+            "--model_path", eagle_output_dir / "eagle-tinyllama",
+            "--export_path", eagle_output_dir / "eagle-tinyllama-export",
+        ],
+        "speculative_decoding",
+    )
+    # Check the exported checkpoint has the required keys
+    state_dict = safetensors.torch.load_file(eagle_output_dir / "eagle-tinyllama-export" / "model.safetensors")
+    for required_key in EAGLE_MODELOPT_TO_OFFICIAL["required"].values():
+        assert required_key in state_dict, f"Missing key '{required_key}' in state_dict"
+
+
+def test_convert_to_vllm_ckpt(tiny_llama_path, eagle_output_dir):
+    """Test conversion of Eagle3 checkpoint to VLLM one-model checkpoint."""
+    run_example_command(
+        [
+            "python", "./scripts/convert_to_vllm_ckpt.py",
+            "--input", eagle_output_dir / "eagle-tinyllama-export",
+            "--verifier", tiny_llama_path,
+            "--output", eagle_output_dir / "eagle-tinyllama-export-vllm-one-ckpt",
+        ],
+        "speculative_decoding",
+    )
+
+
+@pytest.mark.skip(reason="Needs dataset conversion to role-content format; consolidate data loading first.")
+def test_calibrate_draft_vocab(tiny_llama_path, tiny_daring_anteater_path, tmp_path):
+    """Test calibration of draft vocabulary."""
+    run_example_command(
+        [
+            "python", "./scripts/calibrate_draft_vocab.py",
+            "--model", tiny_llama_path,
+            "--data", tiny_daring_anteater_path,
+            "--draft_vocab_size", "100",
+            "--save_dir", tmp_path / "draft_vocab_cache",
+        ],
+        "speculative_decoding",
+    )
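A usage note on these tests (an observation about the structure, not stated in the commit): the export and conversion tests consume artifacts written by the training test through the module-scoped `eagle_output_dir` fixture, so the module is meant to run as a whole. A sketch of invoking it programmatically, assuming the repo's test environment is set up:

```python
# Run the whole test module so the module-scoped fixture is shared across tests;
# equivalent to `pytest -v tests/examples/speculative_decoding/test_eagle.py`.
import pytest

raise SystemExit(pytest.main(["-v", "tests/examples/speculative_decoding/test_eagle.py"]))
```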
