Commit 78a4b44

Fix dpsk-r1-fp4 tp8 by reverting two commits (#13162 and #13341) (#13348)
Co-authored-by: Kangyan-Zhou <[email protected]>
1 parent f969664 commit 78a4b44

5 files changed: +14 -17 lines changed

python/sglang/srt/eplb/expert_location.py

Lines changed: 9 additions & 13 deletions
@@ -284,17 +284,9 @@ def update(
     # -------------------------------- usage ------------------------------------
 
     def logical_to_all_physical(
-        self,
-        layer_id: int,
-        logical_expert_id: int,
-        require_global_experts: bool = False,
+        self, layer_id: int, logical_expert_id: int
     ) -> List[int]:
         # Use CPU copy to avoid GPU→CPU sync on every call, which is expensive in update weights scenario
-        if require_global_experts:
-            num_physical_experts = self.logical_to_all_physical_map_cpu[layer_id].shape[
-                -1
-            ]
-            return list(torch.arange(0, num_physical_experts))
         return [
             physical_expert_id
             for physical_expert_id in self.logical_to_all_physical_map_cpu[
@@ -363,10 +355,14 @@ def _compute_logical_to_all_physical_map(
             )
 
             # Replace by the nearest physical expert
-            if nearest_expert != -1:
-                logical_to_all_physical_map[layer_id][logical_expert_id] = [
-                    nearest_expert
-                ]
+            mapped_physical_experts = logical_to_all_physical_map[layer_id][
+                logical_expert_id
+            ]
+            if (
+                nearest_expert != -1
+                and nearest_expert not in mapped_physical_experts
+            ):
+                mapped_physical_experts[0] = nearest_expert
 
     logical_to_all_physical_map = _pad_nested_array(
         logical_to_all_physical_map, pad_value=-1

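Note: the restored logic only overwrites the first mapped slot with the nearest physical expert, and only when a nearest expert exists and is not already mapped; the reverted version replaced the whole mapping with a single-element list. A minimal, self-contained sketch of that step (simplified from the diff above; the nested-list layout and example values are illustrative assumptions, not the verbatim SGLang code):

from typing import List

def replace_with_nearest_expert(
    logical_to_all_physical_map: List[List[List[int]]],
    layer_id: int,
    logical_expert_id: int,
    nearest_expert: int,
) -> None:
    # Overwrite the first mapped physical expert with the nearest one,
    # but only if a nearest expert exists (!= -1) and is not already mapped.
    mapped_physical_experts = logical_to_all_physical_map[layer_id][logical_expert_id]
    if nearest_expert != -1 and nearest_expert not in mapped_physical_experts:
        mapped_physical_experts[0] = nearest_expert

# Example: logical expert 0 of layer 0 currently maps to physical experts [3, 7];
# a nearest expert of 5 replaces the first slot, while 7 would be a no-op.
example_map = [[[3, 7]]]
replace_with_nearest_expert(example_map, layer_id=0, logical_expert_id=0, nearest_expert=5)
assert example_map == [[[5, 7]]]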
python/sglang/srt/layers/moe/fused_moe_triton/layer.py

Lines changed: 2 additions & 4 deletions
@@ -539,12 +539,9 @@ def weight_loader(
             # This is a shared expert.
             physical_expert_ids = [expert_id]
         else:
-            require_global_experts = getattr(
-                param, "_sglang_require_global_experts", False
-            )
             physical_expert_ids = (
                 global_expert_location_metadata.logical_to_all_physical(
-                    self.layer_id, expert_id, require_global_experts
+                    self.layer_id, expert_id
                 )
             )
 
@@ -1129,6 +1126,7 @@ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
             local_expert_offset=self.moe_ep_rank * self.num_local_experts,
             local_num_experts=self.num_local_experts,
             routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
+            tile_tokens_dim=None,
             routing_method_type=RoutingMethodType.DeepSeekV3,
             do_finalize=True,
             output=symm_output,

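For reference, a sketch of the restored logical_to_all_physical lookup that the weight loader above now calls without the require_global_experts flag. The tensor layout and the -1 padding filter are assumptions inferred from the _pad_nested_array(..., pad_value=-1) call in expert_location.py; this is not the verbatim SGLang implementation:

from typing import List
import torch

def logical_to_all_physical(
    logical_to_all_physical_map_cpu: torch.Tensor,  # assumed shape: [num_layers, num_logical_experts, max_physical]
    layer_id: int,
    logical_expert_id: int,
) -> List[int]:
    # Read the CPU copy so the lookup avoids a GPU->CPU sync on every call.
    return [
        int(physical_expert_id)
        for physical_expert_id in logical_to_all_physical_map_cpu[layer_id, logical_expert_id]
        if physical_expert_id != -1  # skip padding slots
    ]

# Usage in the loader, as in the diff above:
# physical_expert_ids = global_expert_location_metadata.logical_to_all_physical(
#     self.layer_id, expert_id
# )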
python/sglang/srt/layers/quantization/fp8.py

Lines changed: 1 addition & 0 deletions
@@ -1245,6 +1245,7 @@ def apply_with_router_logits(
             routed_scaling_factor=(
                 routed_scaling_factor if routed_scaling_factor is not None else 1.0
             ),
+            tile_tokens_dim=None,
             routing_method_type=routing_method_type,
             use_shuffled_weight=False,
         )

python/sglang/srt/layers/quantization/modelopt_quant.py

Lines changed: 1 addition & 0 deletions
@@ -739,6 +739,7 @@ def apply(
                 else 1.0
             ),
             use_routing_scales_on_input=use_routing_scales_on_input,
+            tile_tokens_dim=None,
             routing_method_type=routing_method_type,
         )
 
python/sglang/srt/layers/quantization/mxfp4.py

Lines changed: 1 addition & 0 deletions
@@ -681,6 +681,7 @@ def apply(
             layer.moe_ep_rank * layer.num_local_experts,  # local_expert_offset
             layer.num_local_experts,  # local num experts
             None,
+            None,  # tile_tokens_dim
             1,  # routing_method_type, renormalize
             True,  # do finalize
             output=symm_output,

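The remaining hunks (fused_moe_triton/layer.py, fp8.py, modelopt_quant.py, and mxfp4.py above) all restore an explicit tile_tokens_dim argument, passed as None, to the fused MoE kernel calls. As a hedged illustration only (this helper is not part of the commit, and the kernel handle below is a placeholder), a call site can be kept compatible with kernels that may or may not accept the argument:

import inspect
from typing import Any, Callable

def call_moe_kernel(kernel: Callable[..., Any], **kwargs: Any) -> Any:
    # Drop tile_tokens_dim when the installed kernel no longer accepts it,
    # so the same call site works across kernel versions.
    if "tile_tokens_dim" not in inspect.signature(kernel).parameters:
        kwargs.pop("tile_tokens_dim", None)
    return kernel(**kwargs)

# Usage sketch with keyword names taken from the diffs above (fused_moe_kernel is hypothetical):
# out = call_moe_kernel(
#     fused_moe_kernel,
#     routed_scaling_factor=routed_scaling_factor,
#     tile_tokens_dim=None,      # restored by this commit
#     routing_method_type=routing_method_type,
# )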