Skip to content

Commit 21c22f0

Browse files
authored
fix qwen3-30-a3b lcb-code score (#4142)
* Calculate `inv_freq` on device
* Adapt for dlinfer attention
* Update code
* Fix dlinfer MoE parameter error
* Update CANN version
* Update code
1 parent 9dd57d3 commit 21c22f0

File tree

5 files changed

+11
-14
lines changed

5 files changed

+11
-14
lines changed

docker/Dockerfile_ascend_a2_300i

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
ARG ASCEND_DEVICE_TYPE=ascend_a2
55
ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub
66

7-
FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-910b-ubuntu22.04-py3.11 AS ascend_a2_base
8-
FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-310p-ubuntu22.04-py3.11 AS ascend_300i_base
7+
FROM ${ASCEND_HUB}/cann:8.3.rc1-910b-ubuntu22.04-py3.11 AS ascend_a2_base
8+
FROM ${ASCEND_HUB}/cann:8.3.rc1-310p-ubuntu22.04-py3.11 AS ascend_300i_base
99

1010
FROM ${ASCEND_DEVICE_TYPE}_base AS builder
1111
ENV DEBIAN_FRONTEND=noninteractive
@@ -23,6 +23,6 @@ ARG LMDEPLOY_TAG=main
2323
RUN --mount=type=cache,target=/root/.cache \
2424
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
2525
pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \
26-
pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0rc1 torchvision==0.23.0 && \
26+
pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \
2727
TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \
2828
LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG}

docker/Dockerfile_ascend_a3

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
ARG ASCEND_DEVICE_TYPE=ascend_a3
55
ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub
66

7-
FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-a3-openeuler24.03-py3.11 AS ascend_a3_base
7+
FROM ${ASCEND_HUB}/cann:8.3.rc1-a3-openeuler24.03-py3.11 AS ascend_a3_base
88

99
FROM ${ASCEND_DEVICE_TYPE}_base AS builder
1010
ENV DEBIAN_FRONTEND=noninteractive
@@ -22,6 +22,6 @@ ARG LMDEPLOY_TAG=main
2222
RUN --mount=type=cache,target=/root/.cache \
2323
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
2424
pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \
25-
pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0rc1 torchvision==0.23.0 && \
25+
pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \
2626
TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \
2727
LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG}

lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ def get_total_slots():
212212
elif is_unpaged_prefill:
213213
# prepare some params of unpaged_prefill attention stage.
214214
q_start_loc_cpu, kv_seqlens_cpu = None, None
215-
q_seqlens_cpu = step_context.q_seqlens.cpu()
215+
q_seqlens_cpu = step_context.q_seqlens.cpu().to(torch.int32)
216216
if SocVersion.is_Ascend910():
217217
single_attention_mask = torch.logical_not(
218218
torch.tril(
@@ -251,7 +251,7 @@ def get_total_slots():
251251
step_context.block_offsets = step_context.block_offsets\
252252
.repeat_interleave(step_context.q_seqlens, 0)
253253
dynamo.mark_dynamic(step_context.block_offsets, [0, 1])
254-
kv_seqlens = step_context.kv_seqlens.to(torch.int32)
254+
kv_seqlens = step_context.kv_seqlens.cpu().to(torch.int32)
255255
if not step_context.is_decoding:
256256
if is_unpaged_prefill:
257257
if SocVersion.is_Ascend910():
@@ -269,11 +269,9 @@ def get_total_slots():
269269
else:
270270
raise ValueError(f"dlinfer doesn't support {SocVersion.device_name()} device currently.")
271271
kv_seqlens = kv_seqlens.repeat_interleave(step_context.q_seqlens, 0)
272-
if not is_unpaged_prefill and AscendOpsBackend.enable_aclgraph():
273-
kv_seqlens = kv_seqlens.cpu().tolist()
274272
else:
275273
if step_context.is_decoding:
276-
kv_seqlens_cpu = step_context.kv_seqlens.cpu()
274+
kv_seqlens_cpu = step_context.kv_seqlens.cpu().to(torch.int32)
277275
elif is_unpaged_prefill:
278276
pass
279277
else:

lmdeploy/pytorch/backends/dlinfer/moe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder):
2727
"""Dlinfer softmax topk implementation builder."""
2828

2929
@staticmethod
30-
def build(top_k: int, dim: int = -1):
30+
def build(top_k: int, dim: int = -1, n_groups: int = -1):
3131
"""build."""
32-
return DlinferSoftmaxTopKImpl(top_k, dim)
32+
return DlinferSoftmaxTopKImpl(top_k, dim, n_groups)
3333

3434

3535
class DlinferFusedMoEImpl(FusedMoEImpl):

lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ def __init__(self, dim: int, base: int = 10000, scaling_factor: float = 1.0):
4646
self.dim = dim
4747
self.base = base
4848
# yapf: disable
49-
inv_freq = 1.0 / (self.base
50-
** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)).float().cuda()
49+
inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2, dtype=torch.float, device='cuda') / self.dim))
5150
# yapf: enable
5251
self.register_buffer('inv_freq', inv_freq, persistent=False)
5352

0 commit comments

Comments (0)