import torch
from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import (
    SingleMMWeightTpl,
    DeepGemmFP8W8A8B128MMWeight,
    AWQMMWeightTpl,
)
from lightllm.common.quantization import Quantcfg
from lightllm.utils.dist_utils import get_current_device_id
from lightllm.common.quantization.quantize_method import QuantizationMethod
from typing import Dict, List, Optional
from .mm_slicer import ColSliceMixin, QuantizedRowSliceMixin, QuantizedColSliceMixin


class UnquantizedCOLMMWeight(SingleMMWeightTpl):
    """Column-TP matmul weight kept in its original (unquantized) dtype.

    The tensor-parallel slicing policy lives in ``self.param_slicer`` (a
    ``ColSliceMixin``): the removed inline ``_slice_weight`` sliced dim 1 of
    the weight, and divided the bias by ``tp_world_size`` so the post-matmul
    all-reduce sums the bias back to its full value.
    """

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        # NOTE(review): the data_type/bias_name/quant_method parameter lines were
        # elided in the reviewed diff hunk; defaults reconstructed from the
        # keyword super() call below — confirm against upstream.
        bias_name: Optional[str] = None,
        quant_method: QuantizationMethod = None,
        tp_rank: int = None,
        tp_world_size: int = None,
    ) -> None:
        super().__init__(
            weight_name=weight_name,
            data_type=data_type,
            bias_name=bias_name,
            quant_method=quant_method,
            tp_rank=tp_rank,
            tp_world_size=tp_world_size,
        )
        # Column split: each rank keeps a contiguous slice along dim 1.
        self.param_slicer = ColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)


class DeepGemmFP8W8A8B128COLMMWeight(DeepGemmFP8W8A8B128MMWeight):
    """Column-TP matmul weight for the deepgemm fp8 w8a8 (block-128) scheme.

    Scale-name generation, weight/scale loading and post-processing are
    inherited from ``DeepGemmFP8W8A8B128MMWeight``; this class only selects
    the quantized column slicer (weight and its block scales are both sliced
    along dim 1, as the removed inline ``_slice_weight``/``_slice_weight_scale``
    helpers did).
    """

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        # NOTE(review): the data_type/bias_name/quant_method parameter lines were
        # elided in the reviewed diff hunk; confirm defaults against upstream.
        bias_name: Optional[str] = None,
        quant_method: QuantizationMethod = None,
        tp_rank: int = None,
        tp_world_size: int = None,
    ) -> None:
        super().__init__(
            weight_name=weight_name,
            data_type=data_type,
            bias_name=bias_name,
            quant_method=quant_method,
            tp_rank=tp_rank,
            tp_world_size=tp_world_size,
        )
        self.param_slicer = QuantizedColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)


class AWQCOLMMWeight(AWQMMWeightTpl):
    """Column-TP matmul weight stored in AWQ quantized format."""

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        # NOTE(review): the parameter lines above `tp_rank` were elided in the
        # reviewed diff hunk; confirm defaults against upstream. Also note that
        # weight_name/bias_name are not forwarded to super() in the visible new
        # code — presumably the base class resolves names elsewhere; verify.
        bias_name: Optional[str] = None,
        quant_method: QuantizationMethod = None,
        tp_rank: int = None,
        tp_world_size: int = None,
    ) -> None:
        super().__init__(data_type, quant_method, tp_rank, tp_world_size)
        # This is not a mistake: AWQ stores the weight as (in x out), so a
        # column-TP split of this layer slices rows of the stored tensor.
        self.param_slicer = QuantizedRowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)


class AWQMARLINCOLMMWeight(AWQCOLMMWeight):
    """AWQ column-TP weight repacked for the Marlin kernel after loading."""

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        # NOTE(review): the parameter lines above `tp_rank` were elided in the
        # reviewed diff hunk; confirm defaults against upstream.
        bias_name: Optional[str] = None,
        quant_method: QuantizationMethod = None,
        tp_rank: int = None,
        tp_world_size: int = None,
    ) -> None:
        super().__init__(
            weight_name=weight_name,
            data_type=data_type,
            bias_name=bias_name,
            quant_method=quant_method,
            tp_rank=tp_rank,
            tp_world_size=tp_world_size,
        )

    def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
        # Marlin repacks the AWQ weight once it is resident on the current device.
        return self.quant_method._process_weight_after_loading(weight.cuda(get_current_device_id()))
    # NOTE(review): additional `_process_weight_scale` / `_process_weight_zero_point`
    # overrides of AWQMARLINCOLMMWeight were elided from this view (diff hunk
    # "@@ -168,7 +102,7"); restore them from the upstream file.


# Maps a quantization method name to the column-TP weight implementation
# that handles it (looked up by the factory when building layer weights).
COLBMM_WEIGHT_CLS_MAP = {
    "deepgemm-fp8w8a8-b128": DeepGemmFP8W8A8B128COLMMWeight,
    "awq": AWQCOLMMWeight,
    "awq_marlin": AWQMARLINCOLMMWeight,
}