Tencent
diff --git a/‎angelslim/compressor/speculative/train/data/data_utils.py‎
Lines changed: 37 additions & 5 deletions b/‎angelslim/compressor/speculative/train/data/data_utils.py‎
Lines changed: 37 additions & 5 deletions
diff --git a/‎angelslim/compressor/speculative/train/data/dataset_builder/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎angelslim/compressor/speculative/train/data/dataset_builder/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/data/dataset_builder/offline_vlm_dataset_builder.py‎
Lines changed: 224 additions & 0 deletions b/‎angelslim/compressor/speculative/train/data/dataset_builder/offline_vlm_dataset_builder.py‎
Lines changed: 224 additions & 0 deletions
diff --git a/‎angelslim/compressor/speculative/train/models/target/target_head.py‎
Lines changed: 10 additions & 1 deletion b/‎angelslim/compressor/speculative/train/models/target/target_head.py‎
Lines changed: 10 additions & 1 deletion
@@ -116,7 +116,18 @@ def paddingtensor2D(intensors, N):
     return outtensors
 
 
-def paddingtensor3D(tensor_list):
+def paddingtensor3D_CBN(tensor_list):
+    N = max(tensor.shape[-1] for tensor in tensor_list)
+    out_tensor_list = []
+    for tensor in tensor_list:
+        c, b, n = tensor.shape
+        outtensor = torch.zeros(c, b, N, dtype=tensor_list[0].dtype)
+        outtensor[:, :, :n] = tensor
+        out_tensor_list.append(outtensor)
+    return torch.cat(out_tensor_list, dim=1)
+
+
+def paddingtensor3D_BHW(tensor_list):
     max_h = max(tensor.shape[-2] for tensor in tensor_list)
     max_w = max(tensor.shape[-1] for tensor in tensor_list)
     out_tensor_list = []
@@ -183,23 +194,44 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
             "input_ids": batch_input_ids,
             "attention_mask": batch_attention_mask,
             "loss_mask": batch_loss_mask,
+            "hidden_states": None,
+            "target_hiddens": None,
+            "inputs_embeds": None,
+            "position_ids": None,
         }
 
         if "pixel_values" in features[0]:
-            batch["pixel_values"] = paddingtensor3D(
+            batch["pixel_values"] = paddingtensor3D_BHW(
                 [item["pixel_values"] for item in features]
             )
         if "video_pixel_values" in features[0]:
-            batch["video_pixel_values"] = paddingtensor3D(
+            batch["video_pixel_values"] = paddingtensor3D_BHW(
                 [item["video_pixel_values"] for item in features]
             )
         if "image_grid_thw" in features[0]:
-            batch["image_grid_thw"] = paddingtensor3D(
+            batch["image_grid_thw"] = paddingtensor3D_BHW(
                 [item["image_grid_thw"] for item in features]
             )
         if "video_grid_thw" in features[0]:
-            batch["video_grid_thw"] = paddingtensor3D(
+            batch["video_grid_thw"] = paddingtensor3D_BHW(
                 [item["video_grid_thw"] for item in features]
             )
 
+        # Check if both hidden_states and target_hiddens exist in all features
+        if all(
+            "hidden_states" in item and "target_hiddens" in item for item in features
+        ):
+            batch["hidden_states"] = torch.cat(
+                [paddingtensor(item["hidden_states"], max_length) for item in features]
+            )
+            batch["target_hiddens"] = torch.cat(
+                [paddingtensor(item["target_hiddens"], max_length) for item in features]
+            )
+            batch["inputs_embeds"] = torch.cat(
+                [paddingtensor(item["inputs_embeds"], max_length) for item in features]
+            )
+            batch["position_ids"] = paddingtensor3D_CBN(
+                [item["position_ids"] for item in features]
+            )
+
         return batch
@@ -14,12 +14,14 @@
 
 from .dataset_builder_factory import DatasetBuilderFactory
 from .offline_llm_dataset_builder import OfflineLLMDatasetBuilder
+from .offline_vlm_dataset_builder import OfflineVLMDatasetBuilder
 from .online_llm_dataset_builder import OnlineLLMDatasetBuilder
 from .online_vlm_dataset_builder import OnlineVLMDatasetBuilder
 
 __all__ = [
     "OnlineLLMDatasetBuilder",
     "OnlineVLMDatasetBuilder",
     "OfflineLLMDatasetBuilder",
+    "OfflineVLMDatasetBuilder",
     "DatasetBuilderFactory",
 ]
@@ -0,0 +1,224 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.utils.data import Dataset
+
+from angelslim.utils import rank0_print
+
+from ..data_utils import VLMDataCollatorWithPadding
+from .base_dataset_builder import DatasetBuilder
+from .dataset_builder_factory import DatasetBuilderFactory
+
+
+class OfflineVLMEagle3Dataset(Dataset):
+    """
+    Offline Dataset for EAGLE3 training.
+
+    Loads pre-computed hidden states, logits, and other data from .ckpt files.
+    Each .ckpt file contains a dictionary with keys: input_ids, target_logits,
+    hidden_states, and loss_mask.
+    """
+
+    def __init__(
+        self, data_dir: str, file_pattern: str = "*.ckpt", cache_in_memory: bool = False
+    ):
+        """
+        Initialize the OfflineVLMEagle3Dataset.
+
+        Args:
+            data_dir: Directory containing .ckpt files
+                (will search recursively in subdirectories)
+            file_pattern: Pattern to match checkpoint files (default: "*.ckpt")
+            cache_in_memory: Whether to cache all data in memory (default: False)
+        """
+        self.data_dir = Path(data_dir)
+        self.cache_in_memory = cache_in_memory
+
+        if not self.data_dir.exists():
+            raise ValueError(f"Data directory does not exist: {data_dir}")
+
+        # Recursively find all checkpoint files in subdirectories
+        self.ckpt_files = sorted(list(self.data_dir.rglob(file_pattern)))
+
+        if len(self.ckpt_files) == 0:
+            raise ValueError(
+                f"No checkpoint files found in {data_dir} "
+                f"(including subdirectories) with pattern {file_pattern}"
+            )
+
+        rank0_print(
+            f"Found {len(self.ckpt_files)} checkpoint files "
+            f"in {data_dir} (including subdirectories)"
+        )
+
+        # Track valid indices (files that can be loaded successfully)
+        self.valid_indices = list(range(len(self.ckpt_files)))
+
+        # Cache data in memory if requested
+        self.cached_data: Optional[List[Dict[str, torch.Tensor]]] = None
+        if self.cache_in_memory:
+            rank0_print("Caching all data in memory...")
+            self.cached_data = []
+            failed_count = 0
+            for i in range(len(self.ckpt_files)):
+                data = self._load_ckpt(i)
+                if data is not None:
+                    self.cached_data.append(data)
+                else:
+                    failed_count += 1
+
+            # Update valid indices based on successful loads
+            self.valid_indices = list(range(len(self.cached_data)))
+
+            if failed_count > 0:
+                rank0_print(
+                    f"Data caching completed. "
+                    f"Successfully loaded {len(self.cached_data)} files, "
+                    f"failed to load {failed_count} files"
+                )
+            else:
+                rank0_print("Data caching completed")
+
+    def _load_ckpt(self, idx: int) -> Optional[Dict[str, torch.Tensor]]:
+        """
+        Load a checkpoint file.
+
+        Args:
+            idx: Index of the checkpoint file
+
+        Returns:
+            Dictionary containing input_ids, target_hiddens,
+                hidden_states, and loss_mask, or None if loading fails
+        """
+        ckpt_path = self.ckpt_files[idx]
+
+        try:
+            data = torch.load(ckpt_path, map_location="cpu")
+        except Exception as e:
+            warnings.warn(
+                f"Failed to load checkpoint {ckpt_path}: {e}. Skipping this file.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            return None
+
+        # Validate required keys
+        required_keys = [
+            "input_ids",  # B, N
+            "target_hiddens",  # B, N, D
+            "hidden_states",  # B, N, 3*D
+            "loss_mask",  # B, N
+            "inputs_embeds",  # B, N, D
+            "position_ids",  # B, N
+        ]
+        missing_keys = [key for key in required_keys if key not in data]
+
+        if missing_keys:
+            warnings.warn(
+                f"Checkpoint {ckpt_path} is missing required keys: {missing_keys}. "
+                f"Skipping this file.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            return None
+
+        # Validate tensor types
+        for key in required_keys:
+            if not isinstance(data[key], torch.Tensor):
+                warnings.warn(
+                    f"Value for key '{key}' in {ckpt_path} is not a torch.Tensor. "
+                    f"Skipping this file.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+                return None
+
+        attention_mask = torch.ones_like(data["input_ids"])
+        data["attention_mask"] = attention_mask  # B, N
+        return data
+
+    def __len__(self) -> int:
+        """Return the number of valid samples in the dataset."""
+        if self.cached_data is not None:
+            return len(self.cached_data)
+        return len(self.valid_indices)
+
+    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+        """
+        Get a sample from the dataset.
+
+        Args:
+            idx: Index of the sample
+
+        Returns:
+            Dictionary containing:
+                - input_ids: Token IDs (torch.Tensor)
+                - target_logits: Pre-computed logits from target
+                    model (torch.Tensor)
+                - hidden_states: Pre-computed hidden states from
+                    target model (torch.Tensor)
+                - loss_mask: Mask for loss computation (torch.Tensor)
+        """
+        if self.cached_data is not None:
+            return self.cached_data[idx]
+        else:
+            # Try to load the checkpoint, retry with next valid index if fails
+            max_retries = len(self.valid_indices)
+            for _attempt in range(max_retries):
+                actual_idx = self.valid_indices[idx % len(self.valid_indices)]
+                data = self._load_ckpt(actual_idx)
+                if data is not None:
+                    return data
+                else:
+                    # Remove failed index from valid_indices
+                    self.valid_indices.remove(actual_idx)
+                    if len(self.valid_indices) == 0:
+                        raise RuntimeError(
+                            "All checkpoint files failed to load. "
+                            "Cannot continue training."
+                        )
+                    # Try next index
+                    idx += 1
+
+            # If all retries failed, raise error
+            raise RuntimeError(
+                f"Failed to load any valid checkpoint after {max_retries} attempts"
+            )
+
+
+@DatasetBuilderFactory.register("offline", "VLM")
+class OfflineVLMDatasetBuilder(DatasetBuilder):
+    def __init__(
+        self, file_pattern: str = "*.ckpt", cache_in_memory: bool = False, **kwargs: Any
+    ):
+        self.file_pattern = file_pattern
+        self.cache_in_memory = cache_in_memory
+
+    def build_dataset(self, datapath: str, **kwargs: Any) -> Dataset:
+        """
+        Create offline datasets from pre-computed .ckpt files.
+        """
+        return OfflineVLMEagle3Dataset(
+            data_dir=datapath,
+            file_pattern=self.file_pattern,
+            cache_in_memory=self.cache_in_memory,
+        )
+
+    def get_data_collator(self) -> Any:
+        return VLMDataCollatorWithPadding()
@@ -64,7 +64,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     @classmethod
     def from_pretrained(
-        cls, model_name_or_path: str, lm_head_key: str = "lm_head.weight"
+        cls,
+        model_name_or_path: str,
+        lm_head_key: str = "lm_head.weight",
+        sub_config_name=None,
     ):
         """
         Load TargetHead from a pretrained model efficiently.
@@ -82,6 +85,12 @@ def from_pretrained(
         """
         # Load model config to get architecture info
         config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
+        if hasattr(config, sub_config_name):
+            config = getattr(config, sub_config_name)
+        else:
+            raise ValueError(
+                f"Config {config} has no sub-config named {sub_config_name}"
+            )
 
         # Get model dimensions
         hidden_size = config.hidden_size