
Commit 33dbcd7

Support MistralLarge3 model
Signed-off-by: bhsueh <[email protected]>
1 parent 974ad56 commit 33dbcd7

30 files changed: +1672 −143 lines

examples/llm-api/quickstart_advanced.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -23,6 +23,10 @@ def add_llm_args(parser):
                         type=str,
                         nargs="+",
                         help="A single or a list of text prompts.")
+    parser.add_argument('--checkpoint_format',
+                        type=str,
+                        default=None,
+                        help="Model checkpoint format.")
     # Build config
     parser.add_argument("--max_seq_len",
                         type=int,
@@ -237,6 +241,7 @@ def setup_llm(args, **kwargs):
     llm = LLM(
         model=args.model_dir,
         backend='pytorch',
+        checkpoint_format=args.checkpoint_format,
         disable_overlap_scheduler=args.disable_overlap_scheduler,
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
```
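For readers who call the LLM API directly rather than through the quickstart script, the new flag simply forwards to the `LLM` constructor. A minimal sketch of equivalent direct usage, assuming the remaining constructor arguments keep their defaults (the checkpoint path below is a placeholder):

```python
from tensorrt_llm import LLM

# Placeholder path; checkpoint_format mirrors the new --checkpoint_format CLI flag.
llm = LLM(
    model="/path/to/mistral_large_3",
    backend="pytorch",
    checkpoint_format="mistral_large_3",
)
```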
Lines changed: 165 additions & 0 deletions
# Mistral Large V3

* Set up the model paths

```bash
export mistral_large_3_model_path=<mistral_large_3_model_path>
export mistral_large_3_eagle_model_path=<mistral_large_3_eagle_model_path>
```

## LLM-only run

* Run Mistral Large V3 with `quickstart_advanced.py`:

```bash
mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickstart_advanced.py \
    --model_dir ${mistral_large_3_model_path} \
    --tp_size 4 \
    --moe_ep_size 4 \
    --max_tokens 100 \
    --checkpoint_format mistral_large_3 \
    --kv_cache_fraction 0.25 \
    --moe_backend TRTLLM # optional
```

* Run Mistral Large V3 with `quickstart_advanced.py` and Eagle3 speculative decoding:

```bash
mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickstart_advanced.py \
    --model_dir ${mistral_large_3_model_path} \
    --tp_size 4 \
    --moe_ep_size 4 \
    --max_tokens 10 \
    --checkpoint_format mistral_large_3 \
    --kv_cache_fraction 0.25 \
    --disable_kv_cache_reuse \
    --spec_decode_algo EAGLE3 \
    --spec_decode_max_draft_len 1 \
    --use_one_model \
    --draft_model_dir ${mistral_large_3_eagle_model_path} \
    --moe_backend TRTLLM \
    --print_iter_log \
    2>&1 | tee debug.log
```

* Launch trtllm-serve and send a request:

```bash
echo "
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
kv_cache_config:
  free_gpu_memory_fraction: 0.25
  enable_block_reuse: true
checkpoint_format: mistral_large_3
" > serve.yml
mpirun -n 1 --allow-run-as-root --oversubscribe python3 -m tensorrt_llm.commands.serve serve \
    ${mistral_large_3_model_path} \
    --host localhost --port 8001 --backend pytorch \
    --extra_llm_api_options serve.yml \
    --tokenizer ${mistral_large_3_model_path} \
    2>&1 | tee serve_debug.log &

curl http://localhost:8001/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "${mistral_large_3_model_path}",
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "top_k": 16
    }'

# The result looks like:
{"id":"cmpl-7e342c1d722d4226a1bf3ed35d762c35","object":"text_completion","created":1764061351,"model":"${mistral_large_3_model_path}","choices":[{"index":0,"text":"The capital of France is **Paris**.\n\nParis is the largest city in France and","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":7,"total_tokens":23,"completion_tokens":16,"prompt_tokens_details":{"cached_tokens":1}},"prompt_token_ids":null}
```

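Since the endpoint is OpenAI-compatible, the same completion request can also be issued from Python. A minimal sketch, assuming the server above is running on port 8001 and the third-party `requests` package is installed (the model path is a placeholder and must match what the server was launched with):

```python
import requests

# Placeholder; use the same path that was passed to trtllm-serve.
model_path = "/path/to/mistral_large_3"

resp = requests.post(
    "http://localhost:8001/v1/completions",
    json={
        "model": model_path,
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "top_k": 16,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```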
* Launch trtllm-serve with Eagle3 and send a request:

```bash
echo "
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
kv_cache_config:
  free_gpu_memory_fraction: 0.25
  enable_block_reuse: true
checkpoint_format: mistral_large_3
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  speculative_model_dir: ${mistral_large_3_eagle_model_path}
  eagle3_one_model: true
" > serve.yml
mpirun -n 1 --allow-run-as-root --oversubscribe python3 -m tensorrt_llm.commands.serve serve \
    ${mistral_large_3_model_path} \
    --host localhost --port 8001 --backend pytorch \
    --extra_llm_api_options serve.yml \
    --tokenizer ${mistral_large_3_model_path} \
    2>&1 | tee serve_debug.log &

curl http://localhost:8001/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "${mistral_large_3_model_path}",
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "top_k": 16
    }'
```

## How to use the modules

The following sections show how to use the individual Mistral Large V3 modules directly.

```python
from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3ForCausalLM
from tensorrt_llm._torch.models.modeling_mistral import Mistral3VLM
from tensorrt_llm.llmapi.tokenizer import MistralTokenizer
from tensorrt_llm._torch.models.checkpoints.mistral.checkpoint_loader import MistralCheckpointLoader
from tensorrt_llm._torch.models.checkpoints.mistral.weight_mapper import MistralLarge3WeightMapper
from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import MistralConfigLoader
```

### Tokenizer

```python
mtok = MistralTokenizer.from_pretrained(TOKENIZER_DIR)
```

### Config and model instance

```python
config_loader = MistralConfigLoader()
config = config_loader.load(MODEL_DIR)

model = Mistral3VLM(model_config=config)
assert isinstance(model.llm, DeepseekV3ForCausalLM)
```

### Checkpoint loading

```python
weight_mapper = MistralLarge3WeightMapper()
loader = MistralCheckpointLoader(weight_mapper=weight_mapper)

weights_dict = loader.load_weights(MODEL_DIR)
```

### Weight loading

#### E2E

```python
model.load_weights(weights_dict, weight_mapper=weight_mapper)  # target usage
```

#### By module

```python
def _filter_weights(weights, prefix):
    return {
        name[len(prefix):]: weight
        for name, weight in weights.items() if name.startswith(prefix)
    }

llm_weights = weight_mapper.rename_by_params_map(
    params_map=weight_mapper.mistral_llm_mapping,
    weights=_filter_weights(weights_dict, "language_model."))
model.llm.load_weights(llm_weights, weight_mapper=weight_mapper)
```

requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -73,3 +73,4 @@ nvidia-cutlass-dsl==4.3.1; python_version >= "3.10"
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser
+mistral-common
```

tensorrt_llm/_torch/model_config.py

Lines changed: 61 additions & 36 deletions
```diff
@@ -312,21 +312,39 @@ def load_hf_quant_config(hf_quant_config, moe_backend):
     layer_quant_config = None

     # DeepSeek V3 FP8 ckpt
-    if hf_quant_config.get("quant_method") == "fp8" and hf_quant_config.get(
-            "weight_block_size", []):
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-        if moe_backend == 'TRTLLM':
-            # TODO: This is a hack. Remove after fp8 bmm is integrated.
-            quant_config.exclude_modules = [
-                "*kv_b_proj*", "*k_b_proj*", "*eh_proj"
-            ]
-        else:
-            quant_config.exclude_modules = ["*eh_proj"]
+    if hf_quant_config.get("quant_method") == "fp8":
+        if hf_quant_config.get("weight_block_size", []):
+            quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+            if moe_backend == 'TRTLLM':
+                # TODO: This is a hack. Remove after fp8 bmm is integrated.
+                quant_config.exclude_modules = [
+                    "*kv_b_proj*", "*k_b_proj*", "*eh_proj"
+                ]
+            else:
+                quant_config.exclude_modules = ["*eh_proj"]
+
+            block_size = hf_quant_config.get("weight_block_size", [])
+            assert tuple(block_size) == (
+                128,
+                128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
+            quant_config.group_size = block_size[0]
+
+        # DeepSeek V3 FP8 per tensor hack
+        elif hf_quant_config.get("activation_scheme", None) == "static":
+            logger.debug(f"Expanding weight scale to mimic DS FP8 recipe")
+            quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+            if moe_backend == 'TRTLLM':
+                # TODO: This is a hack. Remove after fp8 bmm is integrated.
+                quant_config.exclude_modules = [
+                    "*kv_b_proj*", "*k_b_proj*", "*eh_proj"
+                ]
+            else:
+                quant_config.exclude_modules = ["*eh_proj"]
+
+            block_size = (128, 128)
+            quant_config.group_size = block_size[0]
+        logger.info(f"quant_config: {quant_config}")

-        block_size = hf_quant_config.get("weight_block_size", [])
-        assert tuple(block_size) == (
-            128, 128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
-        quant_config.group_size = block_size[0]
     # MXFP4 checkpoints.
     elif hf_quant_config.get("quant_method") == "mxfp4":
         quant_config.quant_algo = ModelConfig.get_mxfp4_quant_algo(
@@ -394,44 +412,51 @@ def override_quant_algo():
     @classmethod
     def from_pretrained(cls,
                         checkpoint_dir: str,
+                        pretrained_hf_config=None,
                         trust_remote_code=False,
                         **kwargs):
         # Use file lock to prevent race conditions when multiple processes
         # try to import/cache the same remote model config file
         with config_file_lock():
             # When handling the case where model_format is TLLM_ENGINE
             # send cyclic requests to the NONE URL.
-            if checkpoint_dir is not None:
+            if checkpoint_dir is not None and pretrained_hf_config is not None:
+                logger.warning(
+                    f"Both checkpoint_dir and pretrained config specified. Using pretrained_config."
+                )
+
+            if pretrained_hf_config is not None:
+                pretrained_config = pretrained_hf_config
+            elif checkpoint_dir is not None:
                 pretrained_config = load_pretrained_config(
                     checkpoint_dir,
                     trust_remote_code=trust_remote_code,
                     **kwargs,
                 )
-                if pretrained_config.architectures[
-                        0] == "DeepseekV32ForCausalLM":
-                    sparse_attention_config = kwargs.get(
-                        'sparse_attention_config')
-                    if sparse_attention_config:
-                        index_n_heads = sparse_attention_config.index_n_heads or pretrained_config.index_n_heads
-                        index_head_dim = sparse_attention_config.index_head_dim or pretrained_config.index_head_dim
-                        index_topk = sparse_attention_config.index_topk or pretrained_config.index_topk
-                        indexer_max_chunk_size = sparse_attention_config.indexer_max_chunk_size
-                    else:
-                        index_n_heads = pretrained_config.index_n_heads
-                        index_head_dim = pretrained_config.index_head_dim
-                        index_topk = pretrained_config.index_topk
-                        indexer_max_chunk_size = None
-                    kwargs[
-                        'sparse_attention_config'] = DeepSeekSparseAttentionConfig(
-                            index_n_heads=index_n_heads,
-                            index_head_dim=index_head_dim,
-                            index_topk=index_topk,
-                            indexer_max_chunk_size=indexer_max_chunk_size)
             else:
                 raise ValueError(
-                    "checkpoint_dir is None. Cannot load model config without a valid checkpoint directory."
+                    "checkpoint_dir is None and pretrained config is not specified. Cannot load model config without a valid checkpoint directory or a pretrained config."
                 )

+            if pretrained_config.architectures[0] == "DeepseekV32ForCausalLM":
+                sparse_attention_config = kwargs.get('sparse_attention_config')
+                if sparse_attention_config:
+                    index_n_heads = sparse_attention_config.index_n_heads or pretrained_config.index_n_heads
+                    index_head_dim = sparse_attention_config.index_head_dim or pretrained_config.index_head_dim
+                    index_topk = sparse_attention_config.index_topk or pretrained_config.index_topk
+                    indexer_max_chunk_size = sparse_attention_config.indexer_max_chunk_size
+                else:
+                    index_n_heads = pretrained_config.index_n_heads
+                    index_head_dim = pretrained_config.index_head_dim
+                    index_topk = pretrained_config.index_topk
+                    indexer_max_chunk_size = None
+                kwargs[
+                    'sparse_attention_config'] = DeepSeekSparseAttentionConfig(
+                        index_n_heads=index_n_heads,
+                        index_head_dim=index_head_dim,
+                        index_topk=index_topk,
+                        indexer_max_chunk_size=indexer_max_chunk_size)
+
         # Get cached file from path or repo id, return None if not exists.
         def cached_file(path_or_repo_id, file_name):
             try:
```
tensorrt_llm/_torch/models/checkpoints/__init__.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -12,9 +12,16 @@
 from .hf.qwen3_next_weight_mapper import Qwen3NextHfWeightMapper
 from .hf.weight_loader import HfWeightLoader
 from .hf.weight_mapper import HfWeightMapper
+from .mistral.checkpoint_loader import (MistralCheckpointLoader,
+                                        MistralLarge3CheckpointLoader)
+from .mistral.config_loader import MistralConfigLoader
+from .mistral.weight_mapper import (MistralLarge3WeightMapper,
+                                    MistralWeightMapper)

 __all__ = [
     "HfConfigLoader", "HfWeightLoader", "HfWeightMapper",
+    "MistralLarge3CheckpointLoader", "MistralCheckpointLoader",
+    "MistralConfigLoader", "MistralWeightMapper", "MistralLarge3WeightMapper",
     "BaseCheckpointLoader", "HfCheckpointLoader", "NemotronHHfWeightMapper",
     "Gemma3HfWeightMapper", "MixtralHfWeightMapper", "Llama4HfWeightMapper",
     "Qwen2MoeHfWeightMapper", "Qwen3MoeHfWeightMapper", "Qwen2VLHfWeightMapper",
```

tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -19,6 +19,8 @@


 @register_checkpoint_weight_loader("HF")
+@register_checkpoint_weight_loader("mistral")
+@register_checkpoint_weight_loader("mistral_large_3")
 class HfWeightLoader(BaseWeightLoader):
     """
     Loads weights from SafeTensors/bin/pth files.
```
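These registrations are what let `--checkpoint_format mistral` or `mistral_large_3` resolve to `HfWeightLoader`. The internals of `register_checkpoint_weight_loader` are not part of this commit; the following is only a sketch of how such a string-keyed decorator registry typically works, with a hypothetical `_WEIGHT_LOADERS` dict:

```python
# Hypothetical sketch; the real register_checkpoint_weight_loader in
# TensorRT-LLM may be implemented differently.
_WEIGHT_LOADERS: dict[str, type] = {}


def register_checkpoint_weight_loader(fmt: str):
    def decorator(cls: type) -> type:
        _WEIGHT_LOADERS[fmt] = cls  # e.g. "mistral_large_3" -> HfWeightLoader
        return cls
    return decorator


@register_checkpoint_weight_loader("HF")
@register_checkpoint_weight_loader("mistral")
@register_checkpoint_weight_loader("mistral_large_3")
class HfWeightLoader:
    ...


assert _WEIGHT_LOADERS["mistral_large_3"] is HfWeightLoader
```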

tensorrt_llm/_torch/models/checkpoints/mistral/__init__.py

Whitespace-only changes.
