Skip to content

Commit 80f5832

Browse files
author
niushengxiao
committed
feat: disk cache v1.0
1 parent 82df7a1 commit 80f5832

File tree

6 files changed

+383
-46
lines changed

6 files changed

+383
-46
lines changed

lightllm/server/core/objs/req.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class Req(ctypes.Structure):
8181
("candetoken_out_len", ctypes.c_int),
8282
("prompt_cache_len", ctypes.c_int), # 用于记录prompt cache 的命中长度,用于统计,这里指gpu kv cache命中长度
8383
("cpu_prompt_cache_len", ctypes.c_int), # 用于记录在 enable_cpu_cache 的场景下,命中的 cpu kv cache 的长度
84+
("disk_prompt_cache_len", ctypes.c_int), # 用于记录从磁盘命中的长度
8485
("is_paused", ctypes.c_bool), # 标记一个Req因为显存资源管理的原因被临时暂停了。
8586
("finish_status", FinishStatus),
8687
# 这个标记变量是http_server 写入,其他进程读取,用于标记该请求是否因为断网被aborted。
@@ -149,6 +150,7 @@ def init(
149150
self.candetoken_out_len = 0
150151
self.prompt_cache_len = 0
151152
self.cpu_prompt_cache_len = 0
153+
self.disk_prompt_cache_len = 0
152154
self.finish_token_index = -1
153155
self.can_released_mark = False
154156
self.reward_score = math.nan

lightllm/server/httpserver/manager.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from frozendict import frozendict
1313

1414
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
15-
from typing import Union, List, Tuple, Dict, Optional
15+
from typing import Union, List, Tuple, Dict, Optional, AsyncGenerator
1616
from websockets import ClientConnection
1717
from fastapi import Request
1818
from ..tokenizer import get_tokenizer
@@ -264,7 +264,7 @@ async def generate(
264264
nixl_pd_upload_websocket: ClientConnection = None,
265265
# 用于等待 pd_master 下发的交换信息
266266
nixl_pd_event: asyncio.Event = None,
267-
) -> Tuple[int, str, dict, FinishStatus]:
267+
) -> AsyncGenerator[Tuple[int, str, dict, FinishStatus], None]:
268268
start_time = time.time()
269269
request_headers = request.headers if request is not None else {}
270270
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
@@ -567,6 +567,7 @@ async def _wait_to_token_package(
567567

568568
prompt_cache_len = metadata.pop("prompt_cache_len", 0)
569569
cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
570+
disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
570571
if is_first_token:
571572
first_token_cost_ms = (time.time() - start_time) * 1000
572573
is_first_token = False
@@ -589,6 +590,8 @@ async def _wait_to_token_package(
589590
x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
590591
x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
591592
prompt_cache_ratio = prompt_cache_len / prompt_tokens
593+
cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
594+
disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
592595

593596
mtp_avg_token_per_step = out_token_counter / max(
594597
(out_token_counter - metadata["mtp_accepted_token_num"]), 1
@@ -604,9 +607,23 @@ async def _wait_to_token_package(
604607
f"prompt_cache_len:{prompt_cache_len} "
605608
f"prompt_cache_ratio:{prompt_cache_ratio} "
606609
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
607-
f"used_cpu_prompt_cache_len:{max(0, cpu_prompt_cache_len - prompt_cache_len)} "
610+
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
611+
f"disk_prompt_cache_len:{disk_prompt_cache_len} "
612+
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
608613
f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
609614
)
615+
if cpu_prompt_cache_len > 0:
616+
logger.info(
617+
f"blueswhen "
618+
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
619+
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
620+
)
621+
if disk_prompt_cache_len > 0:
622+
logger.info(
623+
f"blueswhen "
624+
f"disk_prompt_cache_len:{disk_prompt_cache_len} "
625+
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
626+
)
610627
if group_request_id < 0:
611628
# health 探测请求,不记录日志和监控
612629
return
@@ -726,6 +743,7 @@ async def handle_loop(self):
726743
"count_output_tokens": count_output_tokens,
727744
"prompt_cache_len": req.prompt_cache_len,
728745
"cpu_prompt_cache_len": req.cpu_prompt_cache_len,
746+
"disk_prompt_cache_len": req.disk_prompt_cache_len,
729747
"mtp_accepted_token_num": req.mtp_accepted_token_num,
730748
}
731749
if self.args.return_all_prompt_logprobs:

lightllm/server/multi_level_kv_cache/cpu_cache_client.py

Lines changed: 83 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ def __init__(self, only_create_meta_data: bool, init_shm_data: bool):
3737
self.attach_shm_handle = self._attach_shm_cpu_kv_cache()
3838
return
3939

40+
@staticmethod
41+
def _encode_offload_head(page_index: int) -> int:
42+
return -(page_index + 1)
43+
44+
@staticmethod
45+
def _decode_offload_value(value: int) -> Tuple[int, bool]:
46+
if value < 0:
47+
return -(value + 1), True
48+
return value, False
49+
4050
def get_one_empty_page(self, hash_key: int, disk_offload_enable: bool) -> Optional[int]:
4151
assert self.page_hash_dict.get(hash_key) is None
4252
head = self.page_items.head
@@ -63,15 +73,12 @@ def allocate_one_page(self, hash_key: int, disk_offload_enable: bool) -> Tuple[O
6373
page_index = self.page_hash_dict.get(hash_key)
6474
if page_index is not None:
6575
page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
76+
page_item.ref_count += 1
77+
page_item.del_self_from_list()
78+
self.page_items.add_item_to_tail(index=page_index)
6679
if page_item.is_data_ready():
67-
page_item.ref_count += 1
68-
page_item.del_self_from_list()
69-
self.page_items.add_item_to_tail(index=page_index)
7080
return page_index, True
7181
else:
72-
page_item.ref_count += 1
73-
page_item.del_self_from_list()
74-
self.page_items.add_item_to_tail(index=page_index)
7582
return page_index, False
7683
else:
7784
page_index = self.get_one_empty_page(hash_key=hash_key, disk_offload_enable=disk_offload_enable)
@@ -101,34 +108,54 @@ def allocate_pages(self, hash_keys: List[int], disk_offload_enable: bool) -> Tup
101108
ready_list.extend([False for _ in range(left_num)])
102109
return page_list, ready_list
103110

def update_pages_status_to_ready(
    self,
    page_list: List[int],
    deref: bool = True,
    disk_offload_enable: bool = False,
):
    """Promote every valid page in *page_list* to READY.

    Pages whose status is below READY are promoted; when
    *disk_offload_enable* is set, the freshly promoted pages are pushed
    onto the offload queue as one group whose first entry carries the
    negative group-head encoding.  When *deref* is true each visited
    page also drops one reference (it must currently be referenced).
    """
    newly_ready: List[int] = []
    for page_index in page_list:
        if page_index == -1:
            # -1 is the "no page" placeholder — nothing to update.
            continue
        cur_page: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
        if cur_page.status < cur_page.READY:
            cur_page.status = cur_page.READY
            if disk_offload_enable:
                newly_ready.append(cur_page.self_index)
        if deref:
            assert cur_page.ref_count > 0
            cur_page.ref_count -= 1

    if disk_offload_enable and newly_ready:
        # First entry is encoded as the group head so the consumer
        # (get_pages_to_offloading) can split the flat queue into groups.
        head, *rest = newly_ready
        self.offload_page_indexes.add_item(value=self._encode_offload_head(head))
        for page_index in rest:
            self.offload_page_indexes.add_item(value=page_index)
    return
137+
def mark_pages_recyclable(self, page_list: List[int]):
    """Downgrade every READY-or-better page in *page_list* to READY_RECYCLE.

    Entries equal to -1 (the "no page" placeholder) are skipped; pages
    still below READY are left untouched.
    """
    for idx in page_list:
        if idx == -1:
            continue
        page: _CpuPageStatus = self.page_items.get_item_by_index(idx)
        if page.status >= page.READY:
            page.status = page.READY_RECYCLE
    return
116146

def query_one_page(self, hash_key: int) -> Tuple[Optional[int], bool]:
    """Look up the cpu-cache page holding *hash_key*.

    On a hit the page gains one reference and is refreshed in the LRU
    list (moved to the tail); the result is ``(page_index, data_ready)``.
    On a miss the result is ``(None, False)``.
    """
    page_index = self.page_hash_dict.get(hash_key)
    if page_index is None:
        return None, False

    page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
    page_item.ref_count += 1
    # LRU refresh: unhook and re-append at the tail.
    page_item.del_self_from_list()
    self.page_items.add_item_to_tail(index=page_index)
    if page_item.is_data_ready():
        return page_index, True
    return page_index, False
134161

@@ -138,6 +165,7 @@ def check_allpages_ready(self, page_list: List[int]) -> bool:
138165
continue
139166
page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
140167
if not page_item.is_data_ready():
168+
logger.info("cpu cache page %d not ready, status %d", page_index, page_item.status)
141169
return False
142170
return True
143171

@@ -156,17 +184,30 @@ def deref_one_page(self, page_index: int):
156184
page_item.ref_count -= 1
157185
return
158186

def get_pages_to_offloading(self) -> List[List[int]]:
    """Drain the offload queue and return the ready pages, grouped.

    Queue entries are produced by ``update_pages_status_to_ready``: the
    first page of each group is stored with the negative head encoding.
    Every page that is still ready gains one reference and is moved to
    OFFLOADING status; pages no longer ready are silently dropped from
    their group.  Empty groups are not returned.
    """
    raw_values = self.offload_page_indexes.pop_all_item()
    if raw_values is None:
        return []

    groups: List[List[int]] = []
    current: List[int] = []
    for raw in raw_values:
        page_index, is_head = self._decode_offload_value(raw)
        if is_head:
            # A head marker closes the previous group (if non-empty)
            # and starts a new one.
            if current:
                groups.append(current)
            current = []
        page_item: _CpuPageStatus = self.page_items.get_item_by_index(index=page_index)
        if page_item.is_ready():
            page_item.ref_count += 1
            page_item.status = page_item.OFFLOADING
            current.append(page_index)

    if current:
        groups.append(current)
    return groups
170211

171212
def update_pages_status_to_ready_recycle(self, page_list: List[int], deref: bool = True):
172213
for page_index in page_list:
@@ -179,6 +220,22 @@ def update_pages_status_to_ready_recycle(self, page_list: List[int], deref: bool
179220
cur_page.ref_count -= 1
180221
return
181222

def recycle_pages(self, page_list: List[int]):
    """Return every page in *page_list* to the empty pool.

    Each page is unhooked from the LRU list, its hash mapping (if any)
    is removed, its state is reset to EMPTY with zero references, and it
    is re-appended at the list tail.  Entries of -1 are skipped.
    """
    for page_index in page_list:
        if page_index == -1:
            continue
        page: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
        page.del_self_from_list()
        if not page.is_empty() and page.hash_key != 0:
            # NOTE(review): the dict entry is removed whenever the key is
            # present — if another page has since re-registered the same
            # hash_key, its mapping would be dropped too; confirm the hash
            # cannot be re-registered while this page is being recycled.
            if self.page_hash_dict.get(page.hash_key) is not None:
                self.page_hash_dict.remove(page.hash_key)
        page.hash_key = 0
        page.status = page.EMPTY
        page.ref_count = 0
        self.page_items.add_item_to_tail(page.self_index)
    return
238+
182239
def _create_cpu_status_list(self, init_shm_data: bool):
183240
self.page_items = ShmLinkedList(
184241
name=f"{get_unique_server_name()}_cpu_kv_cache_page_items",

0 commit comments

Comments
 (0)