
Commit 7bc9946

lixiaolx authored and root committed
(beta) support context parallel with deepseekv3.2-DSA
1 parent efc5d8f commit 7bc9946

File tree

17 files changed: +1247 -54 lines changed


docs/advanced_features/server_arguments.md

Lines changed: 1 addition & 0 deletions
@@ -396,6 +396,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--numa-node` | Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess. | `None` | List[int] |
 | `--enable-layerwise-nvtx-marker` | Enable layerwise NVTX profiling annotations for the model. This adds NVTX markers to every layer for detailed per-layer performance analysis with Nsight Systems. | `False` | bool flag (set to enable) |
 | `--enable-attn-tp-input-scattered` | Allow input of attention to be scattered when only using tensor parallelism, to reduce the computational load of operations such as qkv latent. | `False` | bool flag (set to enable) |
+| `--enable-nsa-prefill-context-parallel` | Enable context parallelism for the long-sequence prefill phase of DeepSeek V3.2. | `False` | bool flag (set to enable) |
 
 ## Debug tensor dumps
 | Argument | Description | Defaults | Options |

docs/basic_usage/deepseek_v32.md

Lines changed: 20 additions & 0 deletions
@@ -142,3 +142,23 @@ The mean accuracy over 8 runs shows 0.797, which matches the number 79.9 in offici
 Repeat: 8, mean: 0.797
 Scores: ['0.808', '0.798', '0.808', '0.798', '0.783', '0.788', '0.803', '0.793']
 ```
+
+## DSA long-sequence context parallel optimization (experimental)
+
+Long-context accuracy can be benchmarked on the GPQA-Diamond dataset with long output tokens and thinking enabled.
+
+Example usage:
+```bash
+# Launch with EP + DP
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 2 --enable-dp-attention --enable-nsa-prefill-context-parallel --max-running-requests 32
+```
+### Context-parallel tips
+`CP_size` reuses `atten_tp_size`, which is equal to `TP_size / DP_size`.
+Some features are not supported yet:
+- **Multi-batch prefill**: only single-request processing is supported during the prefill phase.
+- **Disaggregation**: P/D disaggregation is not supported.
+- **Cross-machine support**: currently only tested on a single machine (TP=8, EP=8).
+- **Other args**: currently requires `moe_dense_tp_size=1`, `kv_cache_dtype="bf16"`, and `moe_a2a_backend="deepep"`.
+- **DP_size**: for CP to work correctly, `TP_size` must be divisible by `DP_size`, and `TP_size / DP_size` must be greater than 1 (so that `CP_size > 1`); see the sizing sketch after this list.
+- **Detailed design reference**: https://github.com/sgl-project/sglang/pull/12065
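As a quick check of the sizing rule above, here is a minimal sketch; the helper `validate_cp_config` is hypothetical, for illustration only, and not part of sglang:

```python
# Hypothetical helper illustrating the CP sizing rule; not part of sglang.
def validate_cp_config(tp_size: int, dp_size: int) -> int:
    assert tp_size % dp_size == 0, "TP_size must be divisible by DP_size"
    cp_size = tp_size // dp_size  # CP_size reuses atten_tp_size
    assert cp_size > 1, "need TP_size / DP_size > 1 so that CP_size > 1"
    return cp_size

# Matches the launch example: --tp 8 --dp 2 gives CP_size == 4.
print(validate_cp_config(tp_size=8, dp_size=2))  # 4
```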

python/sglang/srt/distributed/device_communicators/pynccl.py

Lines changed: 28 additions & 0 deletions
@@ -209,6 +209,34 @@ def all_gather(
             cudaStream_t(stream.cuda_stream),
         )
 
+    def cp_all_gather_into_tensor(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        """
+        Non-blocking all-gather built on pynccl; currently used mainly for
+        context parallelism.
+        """
+        # A NCCL communicator created on a specific device will only work on
+        # tensors on the same device; otherwise it causes an
+        # "illegal memory access".
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclAllGather(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
     def reduce_scatter(
         self,
         output_tensor: torch.Tensor,
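A hedged usage sketch of the method above, assuming an already initialized `PyNcclCommunicator` named `comm` on this rank's device (distributed setup omitted). The points to notice: the output buffer must hold every rank's shard, and the collective runs on a caller-chosen stream:

```python
# Sketch only: assumes `comm` is an initialized PyNcclCommunicator and that
# torch.distributed / NCCL setup has already happened on this rank.
import torch

side_stream = torch.cuda.Stream()
local = torch.randn(4096, 128, device="cuda", dtype=torch.bfloat16)
# The gathered output holds world_size shards concatenated along dim 0.
gathered = torch.empty(
    comm.world_size * local.shape[0], local.shape[1],
    device="cuda", dtype=local.dtype,
)
side_stream.wait_stream(torch.cuda.current_stream())  # order after writes to `local`
comm.cp_all_gather_into_tensor(gathered, local, stream=side_stream)
# Independent work on the default stream can overlap with the all-gather here.
torch.cuda.current_stream().wait_stream(side_stream)  # before reading `gathered`
```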

python/sglang/srt/distributed/parallel_state.py

Lines changed: 21 additions & 0 deletions
@@ -748,6 +748,27 @@ def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
             output, input, group_name=self.unique_name
         )
 
+    def cp_all_gather_into_tensor_async(
+        self, output: torch.Tensor, input: torch.Tensor, stream=None
+    ):
+        """
+        Perform an asynchronous all-gather on the specified stream. The
+        default `torch.distributed.all_gather_into_tensor` triggers event
+        synchronization, which blocks CPU-side kernel launches; the pynccl
+        interface used here drops that event-synchronization logic.
+        """
+        assert (
+            stream is not None
+        ), f"Invalid stream ({stream}); please specify the stream to use when calling cp_all_gather_into_tensor_async."
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None:
+            pynccl_comm.cp_all_gather_into_tensor(output, input, stream=stream)
+        else:
+            logger.warning(
+                "pynccl communicator unavailable; falling back to the blocking all_gather_into_tensor path"
+            )
+            torch.ops.sglang.reg_all_gather_into_tensor(
+                output, input, group_name=self.unique_name
+            )
+
     def all_gather(
         self,
         input_: torch.Tensor,
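At the `GroupCoordinator` level, a sketch of the intended calling pattern; `group` stands in for an initialized coordinator with a pynccl communicator, and the stream choreography shown is the caller's responsibility, not prescribed by the method:

```python
# Sketch: overlap the async all-gather with compute on the default stream.
# `group` is assumed to be an initialized GroupCoordinator; the method itself
# only requires that a stream is passed (stream=None asserts).
import torch

comm_stream = torch.cuda.Stream()
inp = torch.randn(2048, 576, device="cuda", dtype=torch.bfloat16)
out = torch.empty(
    group.world_size * inp.shape[0], inp.shape[1],
    device="cuda", dtype=inp.dtype,
)

comm_stream.wait_stream(torch.cuda.current_stream())
group.cp_all_gather_into_tensor_async(out, inp, stream=comm_stream)
# No event synchronization happens on the CPU side, so kernel launches for
# unrelated work keep flowing on the default stream.
torch.cuda.current_stream().wait_stream(comm_stream)  # first read of `out`
```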

python/sglang/srt/layers/attention/nsa/nsa_indexer.py

Lines changed: 221 additions & 8 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
 import torch
 from einops import rearrange
@@ -16,9 +16,18 @@
 except ImportError as e:
     deep_gemm = e
 
+
 from sglang.srt.layers import deep_gemm_wrapper
-from sglang.srt.layers.attention.nsa.utils import NSA_DUAL_STREAM
-from sglang.srt.layers.dp_attention import get_attention_tp_group
+from sglang.srt.layers.attention.nsa.utils import (
+    NSA_DUAL_STREAM,
+    cp_all_gather_rerange_output,
+    is_nsa_enable_prefill_cp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+)
 from sglang.srt.layers.linear import ReplicatedLinear
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.rotary_embedding import get_rope_wrapper
@@ -112,6 +121,13 @@ def __init__(
         self.layer_id = layer_id
         self.alt_stream = alt_stream
         self.fuse_wk_and_weights_proj = fuse_wk_and_weights_proj
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_size = get_attention_tp_size()
+            self.cp_rank = get_attention_tp_rank()
+        else:
+            self.cp_size = None
+            self.cp_rank = None
         if is_cuda():
             self.sm_count = deep_gemm.get_num_sms()
             self.half_device_sm_count = ceil_align(self.sm_count // 2, 8)
@@ -171,6 +187,7 @@ def _get_q_k_bf16(
         x: torch.Tensor,
         positions: torch.Tensor,
         enable_dual_stream: bool,
+        forward_batch: ForwardBatch,
     ):
         weights = None
         if enable_dual_stream:
@@ -228,6 +245,15 @@ def _get_q_k_bf16(
         query[..., : self.rope_head_dim] = q_rope
         key[..., : self.rope_head_dim] = k_rope
 
+        # all-gather + rearrange across CP ranks
+        if forward_batch.nsa_cp_metadata is not None and self.nsa_enable_prefill_cp:
+            key = cp_all_gather_rerange_output(
+                key.contiguous(),
+                self.cp_size,
+                forward_batch,
+                torch.cuda.current_stream(),
+            )
+
         if enable_dual_stream:
             current_stream = torch.cuda.current_stream()
             self.alt_stream.wait_stream(current_stream)
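The gather-and-rearrange step above is easiest to see on a toy example. The sketch below assumes a zigzag-style layout in which rank `i` owns chunk `i` and chunk `2*cp_size - 1 - i` (a common way to balance causal-attention work across CP ranks); the actual chunk layout used by `cp_all_gather_rerange_output` is defined in the design PR linked in the docs:

```python
# Toy, single-process illustration of why an all-gather of per-rank key
# chunks must be re-ordered afterwards. Assumes a zigzag layout: rank i owns
# chunks i and (2*cp_size - 1 - i). Not the real cp_all_gather_rerange_output.
import torch

cp_size = 4
# Chunk ids each rank holds, in the order an all-gather concatenates them.
per_rank = [torch.tensor([r, 2 * cp_size - 1 - r]) for r in range(cp_size)]
gathered = torch.cat(per_rank)   # tensor([0, 7, 1, 6, 2, 5, 3, 4])
order = torch.argsort(gathered)  # permutation back to sequence order
print(gathered[order])           # tensor([0, 1, 2, 3, 4, 5, 6, 7])
```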
@@ -469,6 +495,153 @@ def _forward_cuda_k_only(
         )
         return metadata.topk_transform(dummy_logits, self.index_topk)
 
+    def _get_topk_ragged_with_cp(
+        self,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        q_fp8: torch.Tensor,
+        weights: torch.Tensor,
+        metadata: BaseIndexerMetadata,
+        kv_len: int,
+        actual_seq_q: int,
+        cp_index: Optional[List[Tuple[int, int, int]]] = None,
+    ) -> torch.Tensor:
+        if TYPE_CHECKING:
+            assert isinstance(forward_batch.token_to_kv_pool, NSATokenToKVPool)
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        assert page_size == 64, "only support page size 64"
+        assert len(weights.shape) == 3
+        weights = weights.squeeze(-1)
+        k_fp8_list = []
+        k_scale_list = []
+        ks_list = []
+        ke_offset_list = []
+        offset = 0
+        actual_seq_q_list = []
+        batch_idx_list = []
+
+        block_tables = metadata.get_page_table_64()
+
+        assert (
+            forward_batch.seq_lens_cpu is not None
+            and forward_batch.extend_seq_lens_cpu is not None
+        )
+        if cp_index is not None:
+            # TODO: multi-batch support still has accuracy issues
+            for batch_idx, start_seq_position, end_seq_position in cp_index:
+                pre_chunk_offset = (
+                    forward_batch.seq_lens_cpu[batch_idx].item()
+                    - forward_batch.extend_seq_lens_cpu[batch_idx]
+                )
+                start_seq_position += pre_chunk_offset
+                end_seq_position += pre_chunk_offset
+                if offset == 0 and batch_idx != 0:
+                    offset += forward_batch.extend_seq_lens_cpu[batch_idx - 1]
+                k_fp8 = forward_batch.token_to_kv_pool.get_index_k_continuous(
+                    layer_id,
+                    end_seq_position,
+                    block_tables[batch_idx],
+                )
+                k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_continuous(
+                    layer_id,
+                    end_seq_position,
+                    block_tables[batch_idx],
+                )
+
+                extend_seq_len = end_seq_position - start_seq_position
+                ks = torch.full(
+                    (extend_seq_len,), offset, dtype=torch.int32, device="cuda"
+                )
+                k_fp8_list.append(k_fp8)
+                k_scale_list.append(k_scale)
+                ks_list.append(ks)
+                ke_offset = torch.arange(
+                    start_seq_position + 1,
+                    end_seq_position + 1,
+                    dtype=torch.int32,
+                    device="cuda",
+                )
+                ke_offset_list.append(ke_offset)
+                actual_seq_q = torch.tensor(
+                    [extend_seq_len], dtype=torch.int32, device="cuda"
+                )
+                actual_seq_q_list.append(actual_seq_q)
+                batch_idx_list.append(batch_idx)
+
+            k_fp8 = torch.cat(k_fp8_list, dim=0).view(torch.float8_e4m3fn)
+            k_scale = torch.cat(k_scale_list, dim=0).view(torch.float32).squeeze(-1)
+            kv_fp8 = (k_fp8, k_scale)
+            ks = torch.cat(ks_list, dim=0)
+            ke_offset = torch.cat(ke_offset_list, dim=0)
+            ke = ks + ke_offset
+            actual_seq_q = torch.cat(actual_seq_q_list, dim=0)
+            logits = deep_gemm.fp8_mqa_logits(
+                q_fp8,
+                kv_fp8,
+                weights,
+                ks,
+                ke,
+                clean_logits=False,
+            )
+            topk_result = metadata.topk_transform(
+                logits,
+                self.index_topk,
+                ks=ks,
+                cu_seqlens_q=actual_seq_q,
+                ke_offset=ke_offset,
+                batch_idx_list=batch_idx_list,
+            )
+        else:
+            kv_len = (
+                forward_batch.seq_lens_cpu[0].item()
+                - forward_batch.extend_seq_lens_cpu[0]
+                + kv_len
+            )
+            k_fp8 = forward_batch.token_to_kv_pool.get_index_k_continuous(
+                layer_id,
+                kv_len,
+                block_tables[0],
+            )
+            k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_continuous(
+                layer_id,
+                kv_len,
+                block_tables[0],
+            )
+
+            k_fp8 = k_fp8.view(torch.float8_e4m3fn)
+            k_scale = k_scale.view(torch.float32).squeeze(-1)
+            kv_fp8 = (k_fp8, k_scale)
+            ks = torch.full((actual_seq_q,), offset, dtype=torch.int32, device="cuda")
+            ke_offset = torch.arange(
+                (kv_len - actual_seq_q) + 1,
+                kv_len + 1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+            ke = ks + ke_offset
+
+            logits = deep_gemm.fp8_mqa_logits(
+                q_fp8,
+                kv_fp8,
+                weights,
+                ks,
+                ke,
+                clean_logits=False,
+            )
+            actual_seq_q = torch.tensor([actual_seq_q], dtype=torch.int32).to(
+                device="cuda", non_blocking=True
+            )
+            topk_result = metadata.topk_transform(
+                logits,
+                self.index_topk,
+                ks=ks,
+                cu_seqlens_q=actual_seq_q,
+                ke_offset=ke_offset,
+            )
+
+        return topk_result
+
     def forward_indexer(
         self,
         q_fp8: torch.Tensor,
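The `[ks, ke)` ranges built above encode per-query causal visibility for `deep_gemm.fp8_mqa_logits`. A worked toy example of the single-request (`cp_index is None`) branch, with made-up sizes:

```python
# Worked example of the single-request branch above: query token j may attend
# to keys in [ks[j], ke[j]). Sizes are made up for illustration.
import torch

kv_len = 10        # prior-chunk keys plus this chunk's keys
actual_seq_q = 4   # query tokens handled in this call
offset = 0

ks = torch.full((actual_seq_q,), offset, dtype=torch.int32)
ke_offset = torch.arange(kv_len - actual_seq_q + 1, kv_len + 1, dtype=torch.int32)
ke = ks + ke_offset
print(ks.tolist())  # [0, 0, 0, 0]
print(ke.tolist())  # [7, 8, 9, 10]: each later query sees one more key (causal)
```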
@@ -594,7 +767,7 @@ def forward_cuda(
         skip_logits_computation = max_kv_len <= self.index_topk
 
         # Optimization: fast path when skipping topk computation
-        if skip_logits_computation:
+        if skip_logits_computation and (not self.nsa_enable_prefill_cp):
             return self._forward_cuda_k_only(
                 x,
                 positions,
@@ -607,7 +780,7 @@ def forward_cuda(
         )
 
         query, key, weights = self._get_q_k_bf16(
-            q_lora, x, positions, enable_dual_stream
+            q_lora, x, positions, enable_dual_stream, forward_batch=forward_batch
         )
 
         if enable_dual_stream:
@@ -660,9 +833,49 @@ def forward_cuda(
                     forward_batch, layer_id, q_fp8, weights, metadata
                 )
             else:
-                topk_result = self._get_topk_ragged(
-                    forward_batch, layer_id, q_fp8, weights, metadata
-                )
+                if (
+                    forward_batch.nsa_cp_metadata is not None
+                    and self.nsa_enable_prefill_cp
+                ):
+                    kv_len_prev = forward_batch.nsa_cp_metadata.kv_len_prev
+                    kv_len_next = forward_batch.nsa_cp_metadata.kv_len_next
+                    actual_seq_q_prev = forward_batch.nsa_cp_metadata.actual_seq_q_prev
+                    actual_seq_q_next = forward_batch.nsa_cp_metadata.actual_seq_q_next
+
+                    # TODO: support multi-batch
+                    # cp_batch_seq_index_prev = forward_batch.nsa_cp_metadata["cp_batch_seq_index_prev"]
+                    # cp_batch_seq_index_next = forward_batch.nsa_cp_metadata["cp_batch_seq_index_next"]
+                    # TODO: combine the prev and next halves into a single call
+                    q_fp8_prev, q_fp8_next = torch.split(
+                        q_fp8, (q_fp8.shape[0] + 1) // 2, dim=0
+                    )
+                    weights_prev, weights_next = torch.split(
+                        weights, (weights.shape[0] + 1) // 2, dim=0
+                    )
+                    topk_result_prev = self._get_topk_ragged_with_cp(
+                        forward_batch,
+                        layer_id,
+                        q_fp8_prev,
+                        weights_prev,
+                        metadata,
+                        kv_len_prev,
+                        actual_seq_q_prev,
+                    )
+
+                    topk_result_next = self._get_topk_ragged_with_cp(
+                        forward_batch,
+                        layer_id,
+                        q_fp8_next,
+                        weights_next,
+                        metadata,
+                        kv_len_next,
+                        actual_seq_q_next,
+                    )
+                    return torch.cat([topk_result_prev, topk_result_next], dim=0)
+                else:
+                    topk_result = self._get_topk_ragged(
+                        forward_batch, layer_id, q_fp8, weights, metadata
+                    )
         else:
             topk_result = self.forward_indexer(
                 q_fp8.contiguous(),
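For reference, the prev/next split in the CP branch above is plain `torch.split` arithmetic: the first half gets the extra token when the count is odd, and the two top-k results are concatenated back in the original order. This presumably reflects each rank holding a head ("prev") and a tail ("next") chunk of the sequence, as in the design PR. A toy sketch:

```python
# Toy sketch of the prev/next query split used in the CP branch: the two
# halves are scored against different kv_len_* metadata, then recombined.
import torch

q_fp8 = torch.zeros(7, 64)  # 7 query tokens on this rank (odd on purpose)
prev, nxt = torch.split(q_fp8, (q_fp8.shape[0] + 1) // 2, dim=0)
print(prev.shape[0], nxt.shape[0])  # 4 3
# Results are recombined in order: torch.cat([topk_prev, topk_next], dim=0)
```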
