pd with nixl backend (rebase main) #1002

Open · wants to merge 31 commits into base: main
83 changes: 83 additions & 0 deletions docker/Dockerfile.nixl
@@ -0,0 +1,83 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
g++ \
make \
git && \
rm -rf /var/lib/apt/lists/*
Comment on lines +9 to +16

medium

This RUN command has a couple of issues that go against Docker best practices:

  1. chmod 777 -R /tmp: /tmp normally ships as mode 1777 (world-writable with the sticky bit); recursively applying 777 strips the sticky bit and makes every existing file under /tmp world-writable, so any user can modify or remove other users' files there.
  2. Multiple apt-get update calls: this Dockerfile repeats RUN apt-get update in several instructions (here and on lines 42, 44, 69), which adds layers and can pair a stale cached package index with a later install, hurting both efficiency and caching.

It's recommended to address these for a more secure and efficient Docker image.

RUN chmod 1777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*


RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
esac && \
/opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

medium

Cloning from the main branch for LightKernel (here) and nixl (L72) makes the build non-reproducible, as main can change at any time. This can lead to unexpected behavior or build failures in the future.

RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && git checkout <SPECIFIC_COMMIT_OR_TAG> && pip install --no-deps -v .


RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
rm -rf /usr/lib/ucx && \
rm -rf /opt/hpcx/ucx && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout v1.19.x && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs=yes \
--with-dm \
--with-gdrcopy=/usr/local \
--with-efa \
--enable-mt && \
make -j && \
make -j install-strip && \
ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools; \
cd /usr/local/src; \
pip install --upgrade meson pybind11 patchelf; \
git clone https://github.com/ai-dynamo/nixl.git -b main && \
cd nixl && \
rm -rf build && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
cd build && \
ninja && \
ninja install && \
cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir
121 changes: 121 additions & 0 deletions docker/Dockerfile.nixl.deepep
@@ -0,0 +1,121 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
g++ \
make \
git && \
rm -rf /var/lib/apt/lists/*
Comment on lines +11 to +18

medium

This RUN command has a couple of issues that go against Docker best practices:

  1. chmod 777 -R /tmp: /tmp normally ships as mode 1777 (world-writable with the sticky bit); recursively applying 777 strips the sticky bit and makes every existing file under /tmp world-writable, so any user can modify or remove other users' files there.
  2. Multiple apt-get update calls: this Dockerfile repeats RUN apt-get update in several instructions (here and on lines 44, 82, 107), which adds layers and can pair a stale cached package index with a later install, hurting both efficiency and caching.

It's recommended to address these for a more secure and efficient Docker image.

RUN chmod 1777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*


RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
esac && \
/opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

medium

Cloning from the main branch for LightKernel (here) and nixl (L110) makes the build non-reproducible, as main can change at any time. This can lead to unexpected behavior or build failures in the future.

RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && git checkout <SPECIFIC_COMMIT_OR_TAG> && pip install --no-deps -v .


RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

ENV CUDA_HOME=/usr/local/cuda \
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN mkdir -p /tmp/gdrcopy && cd /tmp \
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
&& cd gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy

# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
&& cd nvshmem \
&& rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
&& NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
&& cmake --build build --target install -j64

ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..

WORKDIR /root/DeepEP
ENV NVSHMEM_DIR=/root/nvshmem/install
RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
rm -rf /usr/lib/ucx && \
rm -rf /opt/hpcx/ucx && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout v1.19.x && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs=yes \
--with-dm \
--with-gdrcopy=/usr/local \
--with-efa \
--enable-mt && \
make -j && \
make -j install-strip && \
ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \
cd /usr/local/src; \
pip install --upgrade meson pybind11 patchelf; \
git clone https://github.com/ai-dynamo/nixl.git -b main && \
cd nixl && \
rm -rf build && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
cd build && \
ninja && \
ninja install && \
cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir
4 changes: 4 additions & 0 deletions lightllm/common/basemodel/basemodel.py
@@ -177,6 +177,10 @@ def _init_kv_move_buffer(self):
        # This initialization step is only needed in the p/d split inference modes
        if self.run_mode in ["prefill", "decode"]:
            self.mem_manager.alloc_kv_move_buffer(self.mem_manager.size)
        elif self.run_mode in ["nixl_prefill", "nixl_decode"]:
            page_num = int(os.getenv("PD_NIXL_MOVE_PAGE_NUM", 32))
            page_size = int(os.getenv("PD_NIXL_MOVE_PAGE_SIZE", 1024))
            self.mem_manager.alloc_paged_kv_move_buffer(page_num, page_size)

    def _check_mem_size(self):
        self.max_total_token_num = self.mem_manager.size
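To get a feel for what the PD_NIXL_MOVE_PAGE_NUM / PD_NIXL_MOVE_PAGE_SIZE defaults imply for the move-buffer footprint, a back-of-the-envelope sketch; the layer/head dimensions and fp16 element size below are hypothetical example values, not taken from this PR:

    # Paged KV move buffer footprint with the default env settings
    # PD_NIXL_MOVE_PAGE_NUM=32 and PD_NIXL_MOVE_PAGE_SIZE=1024, using the
    # (page_num, page_size, layer_num, 2 * head_num, head_dim) layout that
    # MemoryManager.alloc_paged_kv_move_buffer allocates further down in this diff.
    page_num, page_size = 32, 1024
    layer_num, head_num, head_dim = 32, 8, 128   # made-up example model dimensions
    bytes_per_elem = 2                            # fp16 / bf16

    elems = page_num * page_size * layer_num * 2 * head_num * head_dim
    print(f"{elems * bytes_per_elem / 2**30:.1f} GiB")  # -> 4.0 GiB for these example dimensions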
6 changes: 6 additions & 0 deletions lightllm/common/deepseek2_mem_manager.py
@@ -36,6 +36,12 @@ def alloc_kv_move_buffer(self, max_req_total_len):
        self.token_dim_size = self.kv_move_buffer.shape[-1] * self.kv_move_buffer.shape[-2]
        return

    def alloc_paged_kv_move_buffer(self, page_num, page_size):
        self.kv_move_buffer = torch.empty(
            (page_num, page_size, self.layer_num, self.head_num, self.head_dim), dtype=self.dtype, device="cuda"
        )
        return

    def send_to_decode_node(
        self,
        move_tasks: List[KVMoveTask],
8 changes: 8 additions & 0 deletions lightllm/common/mem_manager.py
@@ -96,6 +96,14 @@ def alloc_kv_move_buffer(self, max_req_total_len):
        self.token_dim_size = self.kv_move_buffer.shape[-2] * self.kv_move_buffer.shape[-1]
        return

    def alloc_paged_kv_move_buffer(self, page_num, page_size):
        if isinstance(self, MemoryManager) and type(self) != MemoryManager:
            raise NotImplementedError("subclass need reimpl this method")
Comment on lines +100 to +101

medium

The check isinstance(self, MemoryManager) and type(self) != MemoryManager used to force subclasses to provide their own implementation is fragile and not idiomatic Python: the isinstance test is always true for a bound method, so the condition is just an obscured type comparison, and a missing override only surfaces at call time. There is also a typo ("reimpl") in the error message.

Using Python's abc module to define MemoryManager as an abstract base class and alloc_paged_kv_move_buffer as an abstract method would be a more robust approach.

Suggested change
        if isinstance(self, MemoryManager) and type(self) != MemoryManager:
            raise NotImplementedError("subclass need reimpl this method")
        if type(self) is not MemoryManager:
            raise NotImplementedError("subclass need to reimplement this method")
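For illustration only, a minimal sketch of the abc-based alternative. The split into a hypothetical abstract base plus a concrete default class, the class names, and the example dimensions are assumptions made for this sketch, not part of the PR; only the buffer shape mirrors the default implementation in the diff.

    from abc import ABC, abstractmethod

    import torch


    class PagedKVMoveBufferBase(ABC):
        # Hypothetical abstract base: a subclass that forgets the override fails at
        # instantiation with a TypeError, instead of raising only when the method is called.
        @abstractmethod
        def alloc_paged_kv_move_buffer(self, page_num: int, page_size: int) -> None:
            ...


    class DefaultMemoryManager(PagedKVMoveBufferBase):
        def __init__(self, layer_num=32, head_num=8, head_dim=128, dtype=torch.float16):
            # Example dimensions only; the real MemoryManager gets these from the model config.
            self.layer_num, self.head_num, self.head_dim, self.dtype = layer_num, head_num, head_dim, dtype

        def alloc_paged_kv_move_buffer(self, page_num: int, page_size: int) -> None:
            # Same layout as the default implementation above: K and V stacked along
            # the head dimension, hence the factor of 2.
            self.kv_move_buffer = torch.empty(
                (page_num, page_size, self.layer_num, 2 * self.head_num, self.head_dim),
                dtype=self.dtype,
                device="cuda",
            )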

        self.kv_move_buffer = torch.empty(
            (page_num, page_size, self.layer_num, 2 * self.head_num, self.head_dim), dtype=self.dtype, device="cuda"
        )
        return

    def send_to_decode_node(
        self,
        move_tasks: List[KVMoveTask],
16 changes: 15 additions & 1 deletion lightllm/server/api_cli.py
@@ -7,7 +7,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
    parser.add_argument(
        "--run_mode",
        type=str,
        choices=["normal", "prefill", "decode", "pd_master", "config_server"],
        choices=["normal", "prefill", "decode", "nixl_prefill", "nixl_decode", "pd_master", "config_server"],
        default="normal",
        help="""set run mode, normal is started for a single server, prefill decode pd_master is for pd split run mode,
        config_server is for pd split mode used to register pd_master node, and get pd_master node list,
@@ -54,6 +54,20 @@ def make_argument_parser() -> argparse.ArgumentParser:
        default=None,
        help="The port number for the config server in config_server mode.",
    )
    parser.add_argument(
        "--pd_nixl_remote_prefill_http_port",
        type=int,
        default=42001,
        help="nixl pd mode: http port on the prefill node used to trigger remote prefill.",
    )

    parser.add_argument(
        "--pd_nixl_remote_prefill_port",
        type=int,
        default=42002,
        help="nixl pd mode: port used by prefill and decode nodes to exchange metadata.",
    )

    parser.add_argument(
        "--model_name",
        type=str,
2 changes: 1 addition & 1 deletion lightllm/server/api_start.py
@@ -67,7 +67,7 @@ def normal_or_p_d_start(args):

    enable_mps()

    if args.run_mode not in ["normal", "prefill", "decode"]:
    if args.run_mode not in ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode"]:
        return

    assert args.zmq_mode in ["tcp://", "ipc:///tmp/"]
2 changes: 1 addition & 1 deletion lightllm/server/core/objs/__init__.py
@@ -1,5 +1,5 @@
from .sampling_params import SamplingParams
from .req import Req, FinishStatus
from .req import Req, FinishStatus, PDNIXLChunkedPrefillReq
from .shm_req_manager import ShmReqManager
from .rpc_shm import RpcShmParams, RpcShmResults, ShmSyncStatusArray
from .start_args_type import StartArgs
2 changes: 1 addition & 1 deletion lightllm/server/core/objs/io_objs/__init__.py
@@ -1 +1 @@
from .group_req import GroupReqIndexes, GroupReqObjs, AbortedReqCmd
from .group_req import GroupReqIndexes, GroupReqObjs, AbortedReqCmd, NIXLRemotePrefillDoneCmd, ReqCmd
10 changes: 9 additions & 1 deletion lightllm/server/core/objs/io_objs/group_req.py
@@ -29,5 +29,13 @@ def to_group_req_index(self):


@dataclass
class AbortedReqCmd:
class ReqCmd:
    req_id: int


class AbortedReqCmd(ReqCmd):
    pass


class NIXLRemotePrefillDoneCmd(ReqCmd):
    pass
61 changes: 61 additions & 0 deletions lightllm/server/core/objs/req.py
@@ -105,6 +105,7 @@ def get_str(self):
            f"shm_cur_kv_len:{self.shm_cur_kv_len},"
            f"shm_cur_output_len:{self.shm_cur_output_len},"
            f"finish_status:{self.finish_status.is_finished()}"
            f"group_id: {self.group_req_id}"
        )

    def init(
@@ -326,3 +327,63 @@ def post_init(
        # ... error issues.
        self.sample_params.max_new_tokens = self.sample_params.max_new_tokens + self.prefix_token_ids.size + 6
        return


class PdNixlReqState(ctypes.Structure):
    _pack_ = 4
    _MAX_TP_SIZE = 32
    _fields_ = [("dp_world_size", ctypes.c_int), ("state", ctypes.c_int * _MAX_TP_SIZE)]

    def __init__(self):
        self.dp_world_size = 0
        self.state = (ctypes.c_int * self._MAX_TP_SIZE)(*([0] * self._MAX_TP_SIZE))

    def set_dp_world_size(self, size: int):
        assert size < self._MAX_TP_SIZE, f"size {size} >= max size {self._MAX_TP_SIZE}"
        self.dp_world_size = size
        ctypes.memset(ctypes.addressof(self.state), 0, (self.dp_world_size + 1) * ctypes.sizeof(ctypes.c_int))

    def set_tp_state(self, tp_id: int, state: int):
        assert (
            self.dp_world_size > 0 and tp_id >= 0 and tp_id < self.dp_world_size
        ), f"tp_id {tp_id} out of range [0, {self.dp_world_size})"
        self.state[tp_id] = state

    def set_state(self):
        assert self.dp_world_size > 0, "dp_world_size should be set before calling this"
        unique_state = np.unique(self.state[: self.dp_world_size])
        self.state[self.dp_world_size] = unique_state[0]
        return unique_state[0]

    def get_state(self):
        assert self.dp_world_size > 0, "dp_world_size should be set before calling this"
        return self.state[self.dp_world_size]


class PDNIXLChunkedPrefillReq(ChunkedPrefillReq):
    _pack_ = 4
    _fields_ = ChunkedPrefillReq._fields_ + [
        # used for pd nixl state synchronization
        ("pd_nixl_req_state", PdNixlReqState),
        ("router_nixl_rpd", ctypes.c_bool),
    ]

    def post_init(self):
        self.router_nixl_rpd = False

    def set_dp_world_size(self, dp_world_size):
        self.pd_nixl_req_state.set_dp_world_size(dp_world_size)
        self.router_nixl_rpd = False

    # called by each tp rank, no contention
    def set_pd_req_rank_state(self, tp_id: int, state: int):
        self.pd_nixl_req_state.set_tp_state(tp_id, state)

    # state: -1 for failed, 0 for in progress, 1 for success
    # set by router
    def set_pd_req_state(self):
        return self.pd_nixl_req_state.set_state()

    # read by all ranks
    def get_pd_req_state(self):
        return self.pd_nixl_req_state.get_state()
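The comments above describe the per-rank state protocol; a small standalone sketch of the reduction that set_state performs (np.unique sorts its output, so element 0 is the minimum: any failed rank makes the aggregate failed, any in-progress rank keeps it in progress, and only all-success yields success). The three rank values below are hypothetical:

    import numpy as np

    # Per-tp-rank states as in the diff: -1 = failed, 0 = in progress, 1 = success.
    rank_states = np.array([1, 0, 1])  # hypothetical values for three ranks

    # np.unique returns sorted unique values, so index 0 is the minimum.
    aggregate = np.unique(rank_states)[0]
    print(aggregate)  # -> 0: one rank is still in progress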
5 changes: 4 additions & 1 deletion lightllm/server/core/objs/shm_req_manager.py
@@ -3,7 +3,7 @@
from lightllm.utils.envs_utils import get_unique_server_name
from multiprocessing import shared_memory
from lightllm.utils.log_utils import init_logger
from .req import Req, ChunkedPrefillReq, TokenHealingReq
from .req import Req, ChunkedPrefillReq, TokenHealingReq, PDNIXLChunkedPrefillReq
from .shm_array import ShmArray
from .atomic_array_lock import AtomicShmArrayLock, AtomicLockItem
from .atomic_lock import AtomicShmLock
@@ -33,6 +33,9 @@ def get_req_class_type(self):
        if args.token_healing_mode:
            return TokenHealingReq

        if args.run_mode in ["nixl_prefill", "nixl_decode"]:
            return PDNIXLChunkedPrefillReq

        return ChunkedPrefillReq

    def get_max_req_num(self):