@@ -31,13 +31,14 @@
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -412,6 +415,13 @@ def decompress(self, model_path: str, model: Module):
 
         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights.
+        The variation between these methods is a result of the subtle differences between
+        the sparsity and quantization compressors. Specifically, quantization compressors
+        return not just the decompressed weight but also the quantization parameters
+        (e.g. scales, zero_point), whereas sparsity compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
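The docstring note above is the crux of why two replacement helpers exist. As a rough, hypothetical illustration only (toy generators and names, not the library's actual compressor classes), the two decompression paths yield differently shaped items:

```python
from typing import Dict, Iterator, Tuple

import torch


def toy_sparsity_decompress(
    compressed: Dict[str, torch.Tensor]
) -> Iterator[Tuple[str, torch.Tensor]]:
    # Sparsity compressors yield only the reconstructed dense weight per parameter name.
    for param_name, dense_weight in compressed.items():
        yield param_name, dense_weight


def toy_quantization_decompress(
    compressed: Dict[str, Dict[str, torch.Tensor]]
) -> Iterator[Tuple[str, Dict[str, torch.Tensor]]]:
    # Quantization compressors yield the dense weight together with its quantization
    # parameters (e.g. "weight_scale", "weight_zero_point"), keyed by module name.
    for module_name, params in compressed.items():
        yield module_name, params
```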
@@ -420,9 +430,16 @@ def decompress(self, model_path: str, model: Module):
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will also try to load any quantization parameters it finds;
+            # params_to_skip_load prevents those quantization params from being loaded here
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True
 
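For intuition only, the sketch below shows roughly what skipping quantization parameters during sparse decompression accomplishes; the real filtering happens inside `sparsity_compressor.decompress` via the `params_to_skip_load` argument, and the helper and names here are purely hypothetical:

```python
from typing import Dict, Iterable, Iterator, Tuple

import torch


def iter_sparse_params(
    state_dict: Dict[str, torch.Tensor], params_to_skip_load: Iterable[str]
) -> Iterator[Tuple[str, torch.Tensor]]:
    # Skip entries such as "weight_scale" / "weight_zero_point" so they are left
    # for load_pretrained_quantization_parameters instead of being loaded here.
    skip_suffixes = tuple(params_to_skip_load)
    for name, tensor in state_dict.items():
        if skip_suffixes and name.endswith(skip_suffixes):
            continue
        yield name, tensor
```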
@@ -431,13 +448,27 @@ def decompress(self, model_path: str, model: Module):
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp and any other quantization parameters.
+                # Weight quantization parameters are loaded only if we have a dense
+                # compressor or if a sparsity compressor has already been applied.
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the
+                    # compressor in a follow-up, including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )
 
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -446,6 +477,8 @@ def decompress(self, model_path: str, model: Module):
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
             self._replace_weights(dense_gen, model)
 
             def freeze_quantization_status(module):
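A hedged end-to-end usage sketch of the `decompress` entry point touched by this diff. The checkpoint path is hypothetical, and `ModelCompressor.from_pretrained` is assumed to be the usual way to build a compressor from a saved config; in practice the transformers compressed-tensors integration drives this call rather than user code.

```python
from compressed_tensors.compressors import ModelCompressor
from transformers import AutoModelForCausalLM

model_path = "./compressed-model"  # hypothetical local checkpoint

# Build the model skeleton; its weights are still in the compressed on-disk format.
model = AutoModelForCausalLM.from_pretrained(model_path)

compressor = ModelCompressor.from_pretrained(model_path)
if compressor is not None:
    # Runs sparse decompression (if configured), reloads quantization parameters,
    # then runs quantization decompression, replacing module weights in place.
    compressor.decompress(model_path, model)
```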
@@ -501,7 +534,7 @@ def update_config(self, save_directory: str):
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -516,11 +549,60 @@ def _replace_weights(self, dense_weight_generator, model: Module):
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-            if hasattr(module, param_name):
-                update_parameter_data(module, data, param_name)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is a module name and 'data' is
+            a dict mapping parameter names to their updated tensor data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, the weight will have an incorrect dtype for transformers >4.49
+                    # TODO: we could also skip initialization of scales/zp during decompression
+                    # in init, to stay consistent with the loading that happens later as well;
+                    # however, update_data does a useful shape check that should move to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Scales/zero-points should already be registered
+                        # to the correct device
+                        update_parameter_data(module, param_data, param_name)
 
 
 def map_modules_to_quant_args(
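For reference, the offload-aware replacement pattern shared by both helpers above can be distilled into a small standalone sketch. The function name `replace_module_param` is hypothetical; `has_offloaded_params` and `register_offload_parameter` are the utilities imported at the top of this diff.

```python
import torch

from compressed_tensors.utils import has_offloaded_params, register_offload_parameter


def replace_module_param(module: torch.nn.Module, param_name: str, data: torch.Tensor):
    # Offloaded modules keep their materialized tensors on CPU, so target CPU there;
    # otherwise follow whatever device the module's parameters currently live on.
    params_device = next(module.parameters()).device
    device = "cpu" if has_offloaded_params(module) else params_device

    # Only floating-point dtypes should remain trainable after decompression.
    requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)

    # Drop the stale (compressed) parameter before registering the decompressed one,
    # so its old dtype/shape cannot leak through.
    delattr(module, param_name)
    param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
    register_offload_parameter(module, param_name, param)
```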