
Disk cache and CPU cache feature #997


Open · wants to merge 64 commits into base: main

Commits (64)
2b5c260
add multi_level_kv_cache
Jul 29, 2025
8672506
fix
Jul 29, 2025
d2a4c53
add shmdict
Jul 29, 2025
bb230a6
add shm dict
Jul 30, 2025
c4caedb
add shm dict
Jul 30, 2025
1851371
fix cpu cache client
Jul 30, 2025
506167d
fix
Jul 30, 2025
44bec0c
fix
Jul 30, 2025
557c536
disk and cpu cache enable mix
Jul 30, 2025
fa5ec1f
add hash utils
hiworldwzj Jul 30, 2025
414645a
fix
hiworldwzj Jul 30, 2025
6752565
add start args for cpu cache and disk cache
Jul 31, 2025
738cfea
add CpuCacheMatch List
Jul 31, 2025
a3dffb5
add multi_level_cache manager
Jul 31, 2025
b617a83
fix
Jul 31, 2025
257f57c
fix
Jul 31, 2025
d6d3851
add calcu_cpu_cache_page_num
Jul 31, 2025
906996e
improve radix cache
Aug 1, 2025
70d762a
improve pd p impl
Aug 1, 2025
a896b3a
add multi_level_cache_manager.py
Aug 1, 2025
565916c
fix
hiworldwzj Aug 2, 2025
7033c0b
add kv cache offload kernel
hiworldwzj Aug 2, 2025
11e67e9
add to do
hiworldwzj Aug 2, 2025
acb6ade
add kv_cache_utils.py
hiworldwzj Aug 3, 2025
3830612
add register_shm_ptr_to_pin
Aug 4, 2025
7d852a7
fix
Aug 4, 2025
1cf9d74
fix
Aug 4, 2025
287ce25
fix
Aug 4, 2025
de03e94
fix
Aug 4, 2025
516e9bc
fix
Aug 4, 2025
79b06b6
fix
Aug 4, 2025
ab46882
fix
Aug 4, 2025
4f9269f
fix
Aug 4, 2025
3363fd3
fix
Aug 4, 2025
8d54b6c
fix
Aug 4, 2025
fc9cbff
fix
hiworldwzj Aug 4, 2025
50b2e9c
fix
hiworldwzj Aug 4, 2025
7218378
fix
hiworldwzj Aug 4, 2025
542b4c2
fix
hiworldwzj Aug 4, 2025
728d447
fix
hiworldwzj Aug 4, 2025
989ca56
fix
hiworldwzj Aug 4, 2025
7683430
fix radix cache insert
hiworldwzj Aug 4, 2025
accd36c
fix
hiworldwzj Aug 4, 2025
4a18b3d
fix
Aug 11, 2025
5437ee7
fix
Aug 11, 2025
4cc6d08
fix
Aug 11, 2025
b011825
add multi_level_kv_cache start
Aug 11, 2025
088345b
add multi_level_kv_cache start
Aug 11, 2025
c3d5e61
rename
Aug 11, 2025
151333f
rename multi level kv cache
Aug 11, 2025
2711e8b
fix
Aug 11, 2025
c9a1838
fix
Aug 11, 2025
937af93
fix
Aug 11, 2025
ff7fa9c
add draft test.py
Aug 11, 2025
df1d500
fix
hiworldwzj Aug 11, 2025
6ef4dcd
fix first version
hiworldwzj Aug 11, 2025
31d57af
add cpu_prompt_cache_len
Aug 12, 2025
9d56ce9
add cpu_prompt_cache_len
Aug 12, 2025
cae4432
fix
Aug 12, 2025
5cbec55
fix
Aug 12, 2025
f4cdbed
fix
Aug 12, 2025
053c922
Fix
Aug 12, 2025
b5ca416
fix
Aug 12, 2025
2d1cc47
fix
Aug 13, 2025
228 changes: 228 additions & 0 deletions lightllm/common/basemodel/triton_kernel/kv_cache_offload.py
@@ -0,0 +1,228 @@
import torch

import triton
import triton.language as tl


@triton.jit
def _offload_gpu_kv_to_cpu(
    token_indexes_ptr,
    gpu_kv_cache_ptr,
    gpu_stride0,
    gpu_stride1,
    gpu_stride2,
    gpu_stride3,
    cpu_kv_cache_ptr,
    cpu_stride0,
    cpu_stride1,
    cpu_stride2,
    cpu_stride3,
    cpu_stride4,
    page_indexes_ptr,
    page_readies_ptr,
    layer_num,
    head_all_dim,
    BLOCK_HEAD_ALL_DIM: tl.constexpr,
    TOKEN_BLOCK: tl.constexpr,
):
    block_index = tl.program_id(0)
    cpu_page_index = tl.load(page_indexes_ptr + block_index).to(tl.int64)
    # a page index of -1 means no cpu page was assigned to this block; skip it
    if cpu_page_index == -1:
        return

    # pages already marked ready hold valid data and need no copy
    ready_state = tl.load(page_readies_ptr + block_index)
    if ready_state:
        return

    token_range = block_index * TOKEN_BLOCK + tl.arange(0, TOKEN_BLOCK)
    token_indexes = tl.load(token_indexes_ptr + token_range).to(tl.int64)
    head_all_dim_range = tl.arange(0, BLOCK_HEAD_ALL_DIM)

    gpu_stride0 = tl.cast(gpu_stride0, dtype=tl.int64)

    for layer_index in range(layer_num):
        gpu_ptr = (
            gpu_kv_cache_ptr
            + layer_index * gpu_stride0
            + token_indexes[:, None] * gpu_stride1
            + head_all_dim_range[None, :]
        )
        gpu_data = tl.load(gpu_ptr, mask=(head_all_dim_range[None, :] < head_all_dim), other=0.0)
        cpu_ptr = (
            cpu_kv_cache_ptr
            + cpu_page_index * cpu_stride0
            + layer_index * cpu_stride1
            + tl.arange(0, TOKEN_BLOCK)[:, None] * cpu_stride2
            + head_all_dim_range[None, :]
        )
        tl.store(
            cpu_ptr,
            gpu_data,
            mask=(head_all_dim_range[None, :] < head_all_dim),
        )
    return



@torch.no_grad()
def offload_gpu_kv_to_cpu(
    token_indexes: torch.Tensor,
    gpu_kv_cache: torch.Tensor,
    cpu_kv_cache: torch.Tensor,
    page_indexes: torch.Tensor,
    page_readies: torch.Tensor,
):
    """
    This function is used to offload GPU KV cache to CPU KV cache.
    Args:
        token_indexes: (token_num,)
        gpu_kv_cache: (layer_num, token_num, head_num, head_dim)
        cpu_kv_cache: (all_page_num, layer_num, token_block_size, head_num, head_dim)
        page_indexes: (page_num,)
        page_readies: (page_num,)
    """
    token_block_size = cpu_kv_cache.shape[2]
    token_num = page_indexes.shape[0] * token_block_size
    assert token_indexes.shape[0] >= token_num
    assert page_indexes.shape == page_readies.shape
    page_num = page_indexes.shape[0]
    head_all_dim = gpu_kv_cache.shape[-1] * gpu_kv_cache.shape[-2]
    BLOCK_HEAD_ALL_DIM = triton.next_power_of_2(head_all_dim)

    grid = (page_num,)
    num_warps = 4

    _offload_gpu_kv_to_cpu[grid](
        token_indexes_ptr=token_indexes,
        gpu_kv_cache_ptr=gpu_kv_cache,
        gpu_stride0=gpu_kv_cache.stride(0),
        gpu_stride1=gpu_kv_cache.stride(1),
        gpu_stride2=gpu_kv_cache.stride(2),
        gpu_stride3=gpu_kv_cache.stride(3),
        cpu_kv_cache_ptr=cpu_kv_cache,
        cpu_stride0=cpu_kv_cache.stride(0),
        cpu_stride1=cpu_kv_cache.stride(1),
        cpu_stride2=cpu_kv_cache.stride(2),
        cpu_stride3=cpu_kv_cache.stride(3),
        cpu_stride4=cpu_kv_cache.stride(4),
        page_indexes_ptr=page_indexes,
        page_readies_ptr=page_readies,
        layer_num=gpu_kv_cache.shape[0],
        head_all_dim=head_all_dim,
        BLOCK_HEAD_ALL_DIM=BLOCK_HEAD_ALL_DIM,
        TOKEN_BLOCK=token_block_size,
        num_warps=num_warps,
        num_stages=1,
    )
    return
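
For orientation, a minimal usage sketch of offload_gpu_kv_to_cpu with made-up sizes. It is not part of the diff; it assumes a CUDA device and that cpu_kv_cache is pinned host memory the GPU can address directly (the register_shm_ptr_to_pin commit suggests this is how the cpu buffer is prepared):

# Illustrative only, not part of this PR: toy call into offload_gpu_kv_to_cpu.
# Assumes a CUDA device; cpu_kv_cache must be pinned host memory the GPU can address.
layer_num, head_num, head_dim = 2, 8, 64
token_block_size, page_num = 16, 4
token_num = page_num * token_block_size

gpu_kv_cache = torch.randn(layer_num, 1024, head_num, head_dim, dtype=torch.float16, device="cuda")
cpu_kv_cache = torch.zeros(32, layer_num, token_block_size, head_num, head_dim, dtype=torch.float16).pin_memory()
token_indexes = torch.arange(token_num, dtype=torch.int32, device="cuda")      # gpu token slots to copy out
page_indexes = torch.tensor([3, 7, -1, 11], dtype=torch.int32, device="cuda")  # -1 pages are skipped
page_readies = torch.zeros(page_num, dtype=torch.bool, device="cuda")          # ready pages are skipped

offload_gpu_kv_to_cpu(token_indexes, gpu_kv_cache, cpu_kv_cache, page_indexes, page_readies)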


@triton.jit
def _load_cpu_cache_to_gpu(
    token_indexes_ptr,
    gpu_kv_cache_ptr,
    gpu_stride0,
    gpu_stride1,
    gpu_stride2,
    gpu_stride3,
    cpu_kv_cache_ptr,
    cpu_stride0,
    cpu_stride1,
    cpu_stride2,
    cpu_stride3,
    cpu_stride4,
    page_indexes_ptr,
    layer_num,
    head_all_dim,
    all_move_token_num,
    BLOCK_HEAD_ALL_DIM: tl.constexpr,
    TOKEN_BLOCK: tl.constexpr,
):
    block_index = tl.program_id(0)
    cpu_page_index = tl.load(page_indexes_ptr + block_index).to(tl.int64)
    if cpu_page_index == -1:
        return

    gpu_stride0 = tl.cast(gpu_stride0, dtype=tl.int64)
    # the copied tokens are aligned to the end of the page range, so the first
    # padded_size slots fall outside mem_indexes and are masked off below
    padded_size = TOKEN_BLOCK * tl.num_programs(0) - all_move_token_num
    head_all_dim_range = tl.arange(0, BLOCK_HEAD_ALL_DIM)
    token_range = block_index * TOKEN_BLOCK + tl.arange(0, TOKEN_BLOCK)
    token_range = token_range - padded_size

    token_mask = token_range >= 0
    head_dim_mask = head_all_dim_range < head_all_dim

    token_indexes = tl.load(token_indexes_ptr + token_range, mask=token_mask, other=0).to(tl.int64)

    cpu_page_index = tl.load(page_indexes_ptr + block_index)

Review comment — medium:

The cpu_page_index is loaded twice. The load at line 157 is redundant, as the value was already loaded at line 140; removing it gives a minor performance improvement and improves code clarity.
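
A minimal suggested diff for this fix might look like (sketch, not part of the original review):

-    cpu_page_index = tl.load(page_indexes_ptr + block_index)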

    for layer_index in range(layer_num):
        cpu_ptr = (
            cpu_kv_cache_ptr
            + cpu_page_index * cpu_stride0
            + layer_index * cpu_stride1
            + tl.arange(0, TOKEN_BLOCK)[:, None] * cpu_stride2
            + head_all_dim_range[None, :]
        )
        cpu_data = tl.load(cpu_ptr, mask=head_dim_mask[None, :], other=0.0)

        gpu_ptr = (
            gpu_kv_cache_ptr
            + layer_index * gpu_stride0
            + token_indexes[:, None] * gpu_stride1
            + head_all_dim_range[None, :]
        )
        tl.store(
            gpu_ptr,
            cpu_data,
            mask=token_mask[:, None] & head_dim_mask[None, :],
        )
    return
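
The end-alignment is easiest to see with concrete numbers; a small worked sketch of the padding arithmetic (illustrative values only, not part of the diff):

# Illustrative: the padding arithmetic in _load_cpu_cache_to_gpu.
TOKEN_BLOCK = 16
num_programs = 2                # one program per cpu page
all_move_token_num = 25         # tokens actually being loaded
padded_size = TOKEN_BLOCK * num_programs - all_move_token_num  # 32 - 25 = 7

# Program 0 covers logical positions -7..8; the 7 negative slots are masked out,
# so the two pages together map exactly onto mem_indexes[0:25].
token_range = [i - padded_size for i in range(TOKEN_BLOCK)]    # [-7, ..., 8]
valid = [t for t in token_range if t >= 0]                     # [0, 1, ..., 8]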


@torch.no_grad()
def load_cpu_kv_to_gpu(
    mem_indexes: torch.Tensor,
    gpu_kv_cache: torch.Tensor,
    cpu_kv_cache: torch.Tensor,
    page_indexes: torch.Tensor,
):
    """
    this function is used to offload GPU KV cache to CPU KV cache.

Review comment — medium:

The docstring is incorrect. It states that the function offloads from GPU to CPU, but it should be loading from CPU to GPU.

Suggested change:
-    this function is used to offload GPU KV cache to CPU KV cache.
+    this function is used to load CPU KV cache to GPU KV cache.

    Args:
        mem_indexes: (token_num,)
        gpu_kv_cache: (layer_num, token_num, head_num, head_dim)
        cpu_kv_cache: (page_num, layer_num, token_block_size, head_num, head_dim)
        page_indexes: (page_num,)
    """
    token_block_size = cpu_kv_cache.shape[2]
    token_num = page_indexes.shape[0] * token_block_size
    assert mem_indexes.shape[0] >= token_num
    page_num = page_indexes.shape[0]
    BLOCK_HEAD_ALL_DIM = triton.next_power_of_2(gpu_kv_cache.shape[-1] * gpu_kv_cache.shape[-2])

    grid = (page_num,)
    num_warps = 1

    _offload_gpu_kv_to_cpu[grid](
        token_indexes_ptr=mem_indexes,
        gpu_kv_cache_ptr=gpu_kv_cache,
        gpu_stride0=gpu_kv_cache.stride(0),
        gpu_stride1=gpu_kv_cache.stride(1),
        gpu_stride2=gpu_kv_cache.stride(2),
        gpu_stride3=gpu_kv_cache.stride(3),
        cpu_kv_cache_ptr=cpu_kv_cache,
        cpu_stride0=cpu_kv_cache.stride(0),
        cpu_stride1=cpu_kv_cache.stride(1),
        cpu_stride2=cpu_kv_cache.stride(2),
        cpu_stride3=cpu_kv_cache.stride(3),
        cpu_stride4=cpu_kv_cache.stride(4),
        page_indexes_ptr=page_indexes,
        layer_num=gpu_kv_cache.shape[0],
        head_all_dim=gpu_kv_cache.shape[-1] * gpu_kv_cache.shape[-2],
        all_move_token_num=len(mem_indexes),
        BLOCK_HEAD_ALL_DIM=BLOCK_HEAD_ALL_DIM,
        TOKEN_BLOCK=token_block_size,
        num_warps=num_warps,
        num_stages=1,
    )
Comment on lines +206 to +227

Review comment — critical:

This function incorrectly calls the _offload_gpu_kv_to_cpu kernel instead of _load_cpu_cache_to_gpu. This is a critical error that will cause data to be moved in the wrong direction, and the arguments passed do not match the intended kernel's signature.

Suggested change:

    _load_cpu_cache_to_gpu[grid](
        token_indexes_ptr=mem_indexes,
        gpu_kv_cache_ptr=gpu_kv_cache,
        gpu_stride0=gpu_kv_cache.stride(0),
        gpu_stride1=gpu_kv_cache.stride(1),
        gpu_stride2=gpu_kv_cache.stride(2),
        gpu_stride3=gpu_kv_cache.stride(3),
        cpu_kv_cache_ptr=cpu_kv_cache,
        cpu_stride0=cpu_kv_cache.stride(0),
        cpu_stride1=cpu_kv_cache.stride(1),
        cpu_stride2=cpu_kv_cache.stride(2),
        cpu_stride3=cpu_kv_cache.stride(3),
        cpu_stride4=cpu_kv_cache.stride(4),
        page_indexes_ptr=page_indexes,
        layer_num=gpu_kv_cache.shape[0],
        head_all_dim=gpu_kv_cache.shape[-1] * gpu_kv_cache.shape[-2],
        all_move_token_num=len(mem_indexes),
        BLOCK_HEAD_ALL_DIM=BLOCK_HEAD_ALL_DIM,
        TOKEN_BLOCK=token_block_size,
        num_warps=num_warps,
        num_stages=1,
    )

    return
21 changes: 21 additions & 0 deletions lightllm/server/api_cli.py
@@ -477,4 +477,25 @@ def make_argument_parser() -> argparse.ArgumentParser:
        default=0.03,
        help="""The interval of the schedule time, default is 30ms.""",
    )
    parser.add_argument(
        "--enable_cpu_cache",
        action="store_true",
        help="""enable cpu cache to store kv cache.""",
    )
    parser.add_argument(
        "--cpu_cache_storage_size",
        type=float,
        default=2,
        help="""The capacity of the cpu cache, in GB.""",
    )
    parser.add_argument(
        "--cpu_cache_token_page_size",
        type=int,
        default=256,
        help="""The token page size of the cpu cache.""",
    )
    parser.add_argument("--enable_disk_cache", action="store_true", help="""enable disk cache to store kv cache.""")
    parser.add_argument(
        "--disk_cache_storage_size", type=float, default=10, help="""The capacity of the disk cache, in GB."""
    )
    return parser
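
For reference, a sketch of how the new flags combine (not part of the diff; it assumes the parser's remaining options all have defaults):

# Illustrative: parsing the new cache flags; values are made up.
from lightllm.server.api_cli import make_argument_parser

args = make_argument_parser().parse_args(
    [
        "--enable_cpu_cache",
        "--cpu_cache_storage_size", "8",       # 8 GB of pinned cpu cache
        "--cpu_cache_token_page_size", "256",  # tokens per cpu cache page
        "--enable_disk_cache",
        "--disk_cache_storage_size", "100",    # 100 GB of disk cache
    ]
)
print(args.enable_cpu_cache, args.cpu_cache_storage_size, args.enable_disk_cache)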
16 changes: 4 additions & 12 deletions lightllm/server/api_http.py
@@ -38,6 +38,7 @@
 from fastapi import BackgroundTasks, FastAPI, Request, WebSocket, WebSocketDisconnect
 from fastapi.responses import Response, StreamingResponse, JSONResponse
 from lightllm.server.core.objs.sampling_params import SamplingParams
+from lightllm.server.core.objs import StartArgs
 from .multimodal_params import MultimodalParams
 from .httpserver.manager import HttpServerManager
 from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster
@@ -71,7 +72,7 @@ class G_Objs:
     httpserver_manager: Union[HttpServerManager, HttpServerManagerForPDMaster] = None
     shared_token_load: TokenLoad = None

-    def set_args(self, args):
+    def set_args(self, args: StartArgs):
         self.args = args
         from .api_lightllm import lightllm_generate, lightllm_generate_stream
         from .api_tgi import tgi_generate_impl, tgi_generate_stream_impl
@@ -86,22 +87,13 @@ def set_args(self, args):
         if args.run_mode == "pd_master":
             self.metric_client = MetricClient(args.metric_port)
             self.httpserver_manager = HttpServerManagerForPDMaster(
-                args,
-                metric_port=args.metric_port,
+                args=args,
             )
         else:
             init_tokenizer(args)  # for openai api
             SamplingParams.load_generation_cfg(args.model_dir)
             self.metric_client = MetricClient(args.metric_port)
-            self.httpserver_manager = HttpServerManager(
-                args,
-                router_port=args.router_port,
-                cache_port=args.cache_port,
-                detokenization_pub_port=args.detokenization_pub_port,
-                visual_port=args.visual_port,
-                enable_multimodal=args.enable_multimodal,
-                metric_port=args.metric_port,
-            )
+            self.httpserver_manager = HttpServerManager(args=args)
         dp_size_in_node = max(1, args.dp // args.nnodes)  # compat for multi-node pure-tp runs, where 1 // 2 == 0 must be handled
         self.shared_token_load = TokenLoad(f"{get_unique_server_name()}_shared_token_load", dp_size_in_node)