
Commit 6bddb1c

JackWeiw authored, with yao-fengchen

[Dlinfer][Ascend] Optimize performance of 310P device (#3486)

* support 310P
* format code
* fix accuracy of eager mode
* update code
* [dlinfer]fix tp for Ascend310P device
* [dlinfer][ascend]lazy import torch_npu
* [ascend]use safe device check
* lint
* lint
* [dlinfer][ascend]convert linear weight to NZ at initial time
* [ascend]fix tp2 lm compile transdata
* [ascend]set transdata linear weight by default
* [dlinfer][ascend]fix Transdata linear weight device check

---------

Co-authored-by: yaofengchen <[email protected]>
Co-authored-by: JackWeiw <[email protected]>

1 parent 8e0c15d · commit 6bddb1c

File tree

3 files changed: +50 −9 lines

lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py (6 additions, 0 deletions)

```diff
@@ -49,6 +49,12 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
                                             backend='atbgraph')
         else:
             self.model = torch.compile(self.model, fullgraph=True, dynamic=True, backend='atbgraph')
+            if SocVersion.is_Ascend310P() and hasattr(self.model, 'get_logits'):
+                # Compile get_logits for Ascend310P to use ATB linear since we would convert weight to NZ format
+                self.model.get_logits = torch.compile(self.model.get_logits,
+                                                      fullgraph=True,
+                                                      dynamic=True,
+                                                      backend='atbgraph')
 
     def check_enable_graph(self):
         """check enable graph."""
```

lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py (35 additions, 8 deletions)

```diff
@@ -91,6 +91,8 @@ class AscendOpsBackend(DlinferOpsBackend):
     enable_graph = False
     half_negative_inf = torch.finfo(torch.float16).min
     total_slots = None
+    # Compiled ATB Transdata operation to convert tensors from ACL_FORMAT_ND to ACL_FORMAT_FRACTAL_NZ format.
+    transdata_func = None
 
     @staticmethod
     def get_name() -> str:
@@ -216,11 +218,13 @@ def get_total_slots():
                     single_attention_mask = torch.triu(single_attention_mask, diagonal=1)
                     attention_mask.append(single_attention_mask)
                 else:
+                    # Transdata needs dtype to be float16 or int8
                     single_attention_mask = torch.triu(
-                        torch.ones(max_q_seq_len, max_kv_seq_len).fill_(-float('inf')).cuda(),
+                        torch.ones(max_q_seq_len, max_kv_seq_len, dtype=torch.float16).fill_(-float('inf')).cuda(),
                         diagonal=max_kv_seq_len - max_q_seq_len + 1,
                     )
-                    attention_mask.append(single_attention_mask)
+                    # Convert to NZ format
+                    attention_mask.append(cls.get_transdata_func()(single_attention_mask, 2))
             else:
                 raise ValueError(f"dlinfer doesn't support {SocVersion.device_name()} device currently.")
         else:
```
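On the 310P branch the additive causal mask is built directly in float16 because the ATB Transdata op only accepts float16 or int8 inputs. A CPU-only sketch of just the mask construction, with toy lengths and the `.cuda()` call and NZ conversion dropped:

```python
import torch

max_q_seq_len, max_kv_seq_len = 4, 6

# Fill with -inf, then keep only the strictly-future positions above the band;
# everything else becomes 0, i.e. "allowed". float16 keeps it Transdata-legal.
mask = torch.triu(
    torch.ones(max_q_seq_len, max_kv_seq_len, dtype=torch.float16).fill_(-float('inf')),
    diagonal=max_kv_seq_len - max_q_seq_len + 1,
)
print(mask)
# tensor([[0., 0., 0., -inf, -inf, -inf],
#         [0., 0., 0., 0., -inf, -inf],
#         [0., 0., 0., 0., 0., -inf],
#         [0., 0., 0., 0., 0., 0.]], dtype=torch.float16)
```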
```diff
@@ -240,13 +244,21 @@ def get_total_slots():
         kv_seqlens = step_context.kv_seqlens.to(torch.int32)
         if not step_context.is_decoding:
             if is_unpaged_prefill:
-                attention_mask = [mask.half() for mask in attention_mask]
-                if SocVersion.is_Ascend310P():
-                    attention_mask = [torch.cat([mask.unsqueeze(0) for mask in attention_mask])]
+                if SocVersion.is_Ascend910B():
+                    attention_mask = [mask.half() for mask in attention_mask]
             else:
-                attention_mask = [
-                    torch.cat([mask.half() * cls.half_negative_inf for mask in attention_mask]).unsqueeze(1)
-                ]
+                if SocVersion.is_Ascend910B():
+                    attention_mask = [
+                        torch.cat([mask.half() * cls.half_negative_inf for mask in attention_mask]).unsqueeze(1)
+                    ]
+                elif SocVersion.is_Ascend310P():
+                    # Convert mask to NZ format.
+                    attention_mask = [
+                        cls.get_transdata_func()(torch.cat(
+                            [mask.half() * cls.half_negative_inf for mask in attention_mask]).unsqueeze(1), 2)
+                    ]
+                else:
+                    raise ValueError(f"dlinfer doesn't support {SocVersion.device_name()} device currently.")
             kv_seqlens = kv_seqlens.repeat_interleave(step_context.q_seqlens, 0)
         else:
             if step_context.is_decoding:
```
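`cls.half_negative_inf` is `torch.finfo(torch.float16).min`, so multiplying a 0/1 mask by it turns the boolean masks into additive attention biases before they are concatenated and, on 310P, passed through Transdata. A stand-alone sketch of that scaling and batching step, with toy masks:

```python
import torch

half_negative_inf = torch.finfo(torch.float16).min  # ~ -65504, float16's "minus infinity"

# Toy per-sequence masks: 1 marks a position the query must not attend to.
masks = [torch.triu(torch.ones(3, 3), diagonal=1) for _ in range(2)]

# 0 stays 0 (allowed); 1 becomes a huge negative bias (masked). Concatenate the
# sequences along dim 0 and insert a singleton head dim, as the 910B branch does.
attention_mask = torch.cat([m.half() * half_negative_inf for m in masks]).unsqueeze(1)
print(attention_mask.shape)  # torch.Size([6, 1, 3])
```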
```diff
@@ -302,6 +314,21 @@ def build_graph_runner(model: torch.nn.Module, model_config: ModelConfig, cache_
         AscendOpsBackend.enable_graph = ascend_graph_runner.enable_graph
         return ascend_graph_runner
 
+    @staticmethod
+    def get_transdata_func():
+        """get transdata function."""
+        if AscendOpsBackend.transdata_func is None:
+            import dlinfer
+            from dlinfer.ops import transdata
+            dlinfer.graph.config.enable_graph_mode = True
+            if torch.distributed.is_initialized():
+                torch._inductor.config.compile_threads = 1
+            AscendOpsBackend.transdata_func = torch.compile(transdata,
+                                                            fullgraph=True,
+                                                            dynamic=False,
+                                                            backend='atbgraph')
+        return AscendOpsBackend.transdata_func
+
     @staticmethod
     def init():
         """Initialize Ascend backend."""
```

lmdeploy/pytorch/backends/dlinfer/linear.py (9 additions, 1 deletion)

```diff
@@ -15,8 +15,16 @@ class DlinferLinearImpl(LinearImpl):
 
     def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
         """update weights."""
-        if os.getenv('DLINER_LINEAR_USE_NN_LAYOUT', '0') == '1':
+        if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1':
             weight = weight.data.t().contiguous()
+        if weight.device.type == 'npu':
+            from .ascend import SocVersion
+            if SocVersion.is_Ascend310P() and not os.getenv('DLINFER_DISABLE_LINEAR_NZ_FORMAT', '0') == '1':
+                # The Ascend 310P device needs the weight in NZ format, so Transdata it at init time.
+                # Linear weights are converted by default; if an error occurs, set
+                # DLINFER_DISABLE_LINEAR_NZ_FORMAT=1 to disable the conversion.
+                from .ascend import AscendOpsBackend
+                weight = AscendOpsBackend.get_transdata_func()(weight, 2)
         return weight, bias
 
     def forward(self,
```
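The conversion is on by default and opt-out through an environment variable (this hunk also fixes the `DLINER_` → `DLINFER_` typo in the NN-layout flag). A sketch of the same gating logic outside the class, where `to_nz` is a hypothetical stand-in for the compiled Transdata call and the SoC check is reduced to the device type:

```python
import os

import torch


def to_nz(weight: torch.Tensor) -> torch.Tensor:
    """Hypothetical stand-in for AscendOpsBackend.get_transdata_func()(weight, 2)."""
    return weight  # the real op rewrites the layout to ACL_FORMAT_FRACTAL_NZ


def update_weights(weight: torch.Tensor, bias=None):
    """Mirror the gating: convert by default, allow opting out via env var."""
    if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1':
        weight = weight.data.t().contiguous()
    on_ascend = weight.device.type == 'npu'  # plus SocVersion.is_Ascend310P() in the real code
    if on_ascend and os.getenv('DLINFER_DISABLE_LINEAR_NZ_FORMAT', '0') != '1':
        weight = to_nz(weight)
    return weight, bias
```

To fall back to the plain ND weight layout when debugging, export DLINFER_DISABLE_LINEAR_NZ_FORMAT=1 before launching lmdeploy.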
