     block_quant_dequant,
     block_quant_to_tensor_quant,
     channel_quant_to_tensor_quant,
+    inverse_transform_scale_ue8m0,
     normalize_e4m3fn_to_e4m3fnuz,
     quant_weight_ue8m0,
     requant_weight_ue8m0_inplace,
@@ -3270,6 +3271,8 @@ def __init__(
             }
         )
         self.capture_aux_hidden_states = False
+        self._executed_weight_requant_ue8m0 = False
+
         self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
         if self.nsa_enable_prefill_cp:
             self.cp_rank = get_attention_tp_rank()
@@ -3443,6 +3446,18 @@ def post_load_weights(self, is_nextn=False, weight_names=None):
                 weight = w
                 weight_scale = self_attn.kv_b_proj.weight_scale_inv

+                if (
+                    should_deepgemm_weight_requant_ue8m0(
+                        weight_block_size=getattr(
+                            self.quant_config, "weight_block_size", None
+                        )
+                    )
+                    and self._executed_weight_requant_ue8m0
+                ):
+                    weight_scale = inverse_transform_scale_ue8m0(
+                        weight_scale, mn=weight.shape[-2]
+                    )
+
                 if (
                     _is_cuda
                     and weight_block_size[0] == 128
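If `post_load_weights` runs again after the UE8M0 requant has already been applied (e.g. on a weight update), `kv_b_proj.weight_scale_inv` is already in the transformed UE8M0 layout, so it has to be converted back to plain FP32 block scales before the dequant below. For illustration, a minimal sketch of such an inverse, assuming the forward transform stores each per-block scale as a biased 8-bit exponent (E8M0 uses bias 127) padded along the block-row dimension; the real `inverse_transform_scale_ue8m0` helper may pack and pad differently:

```python
import torch

def inverse_transform_scale_ue8m0_sketch(packed: torch.Tensor, mn: int) -> torch.Tensor:
    # Hypothetical sketch, not the actual sglang helper.
    # Assumes: scales stored as biased 8-bit exponents (E8M0, bias 127),
    # padded along the block-row dimension, with 128x128 blocks as the
    # surrounding code checks via weight_block_size[0] == 128.
    num_block_rows = (mn + 127) // 128
    exponents = packed.to(torch.int32) - 127
    scales = torch.exp2(exponents.to(torch.float32))
    return scales[..., :num_block_rows, :].contiguous()
```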
@@ -3553,9 +3568,14 @@ def post_load_weights(self, is_nextn=False, weight_names=None):
             self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
             self_attn.use_deep_gemm_bmm = True

-        if not ENABLE_FLASHINFER_FP8_GEMM and should_deepgemm_weight_requant_ue8m0(
-            weight_block_size=getattr(self.quant_config, "weight_block_size", None)
+        if (
+            not ENABLE_FLASHINFER_FP8_GEMM
+            and should_deepgemm_weight_requant_ue8m0(
+                weight_block_size=getattr(self.quant_config, "weight_block_size", None)
+            )
+            and not self._executed_weight_requant_ue8m0
         ):
+            self._executed_weight_requant_ue8m0 = True
             self._weight_requant_ue8m0(is_nextn)

         # TODO can move weight_requant_ue8m0 and transform_scale_ue8m0 into Fp8LinearMethod.process_weights_after_loading
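The `_executed_weight_requant_ue8m0` flag makes the requant run at most once per model instance, so repeated `post_load_weights` calls do not re-apply it to already-requantized weights and already-transformed scales. For context, a minimal sketch of the rounding step such a requant performs, assuming `requant_weight_ue8m0_inplace` follows the usual power-of-two scheme; names and details here are illustrative, not the actual implementation:

```python
import torch

def requant_block_ue8m0_sketch(block_fp8: torch.Tensor, block_scale_inv: torch.Tensor):
    # block_fp8: one 128x128 FP8 tile; block_scale_inv: its scalar dequant scale.
    # Round the scale up to the nearest power of two so it becomes
    # representable in an exponent-only (E8M0) format.
    pow2 = torch.exp2(torch.ceil(torch.log2(block_scale_inv)))
    # Fold the rounding ratio (in (0.5, 1.0]) back into the tile so the
    # dequantized values are preserved: block_fp8 * block_scale_inv == adjusted * pow2.
    adjusted = (block_fp8.to(torch.float32) * (block_scale_inv / pow2)).to(block_fp8.dtype)
    return adjusted, pow2
```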
|