Add Qwen2-VL model implementation (#384)

vincentamato · awni · web-flow · commit cd9884dab3a6 · 2025-08-25T08:54:04.000-07:00
* Add Qwen2-VL + Qwen2.5-VL

* Fixed model sanitize method to handle both HF and MLX parameter formats

* Cleaned up MRoPE implementation

* Formatted code

* Added type casting in MRoPE

* Removed unused instance variables

* Removed unnecessary MRoPE implementation

* bump version

---------

Co-authored-by: Awni Hannun <awni@apple.com>
diff --git a/mlx_lm/_version.py b/mlx_lm/_version.py
@@ -1,3 +1,3 @@
 # Copyright © 2023-2025 Apple Inc.
 
-__version__ = "0.26.3"
+__version__ = "0.26.4"
diff --git a/mlx_lm/models/qwen2_vl.py b/mlx_lm/models/qwen2_vl.py
@@ -0,0 +1,59 @@
+# Copyright © 2025 Apple Inc.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten, tree_unflatten
+
+from . import qwen2
+from .base import BaseModelArgs
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    model_type: str
+    text_config: dict
+
+    @classmethod
+    def from_dict(cls, params):
+        if "text_config" not in params:
+            return cls(model_type=params["model_type"], text_config=params)
+        return cls(**params)
+
+
+class Model(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.language_model = qwen2.Model(qwen2.ModelArgs.from_dict(args.text_config))
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+        mask: Optional[mx.array] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ):
+        return self.language_model(
+            inputs, cache=cache, mask=mask, input_embeddings=input_embeddings
+        )
+
+    def sanitize(self, weights):
+        weights = tree_unflatten(list(weights.items()))
+        weights.pop("visual", None)
+        weights.pop("vision_tower", None)
+        weights = dict(tree_flatten(weights))
+
+        sanitized = {}
+        for key, value in weights.items():
+            if not key.startswith("language_model."):
+                key = "language_model." + key
+            sanitized[key] = value
+        return sanitized
+
+    @property
+    def layers(self):
+        return self.language_model.model.layers
diff --git a/mlx_lm/models/rope_utils.py b/mlx_lm/models/rope_utils.py
@@ -251,6 +251,11 @@ def initialize_rope(
             short_factor=scaling_config["short_factor"],
             long_factor=scaling_config["long_factor"],
         )
-
+    elif rope_type == "mrope":
+        mrope_section = scaling_config.get("mrope_section", [])
+        assert (
+            len(mrope_section) == 3
+        ), f"MRoPE currently only supports 3 sections, got {len(mrope_section)}."
+        return nn.RoPE(dims, traditional=traditional, base=base)
     else:
         raise ValueError(f"Unsupported RoPE type {rope_type}")
diff --git a/mlx_lm/utils.py b/mlx_lm/utils.py
@@ -45,6 +45,7 @@
     "phi-msft": "phixtral",
     "falcon_mamba": "mamba",
     "kimi_k2": "deepseek_v3",
+    "qwen2_5_vl": "qwen2_vl",
 }
 
 MAX_FILE_SIZE_GB = 5

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`# Copyright © 2023-2025 Apple Inc.`
`2`	`2`
`3`		`-__version__ = "0.26.3"`
	`3`	`+__version__ = "0.26.4"`
Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,7 @@`
`45`	`45`	`"phi-msft": "phixtral",`
`46`	`46`	`"falcon_mamba": "mamba",`
`47`	`47`	`"kimi_k2": "deepseek_v3",`
	`48`	`+ "qwen2_5_vl": "qwen2_vl",`
`48`	`49`	`}`
`49`	`50`
`50`	`51`	`MAX_FILE_SIZE_GB = 5`