@@ -23,21 +23,28 @@ class _PagePayload:
 class DiskCacheWorker:
     """Background worker that offloads CPU KV pages to disk using kvcache."""
 
-    def __init__(self, disk_cache_storage_size: float, cpu_cache_client):
+    def __init__(
+        self,
+        disk_cache_storage_size: float,
+        cpu_cache_client,
+        disk_cache_dir: Optional[str] = None,
+    ):
         self.cpu_cache_client = cpu_cache_client
         self._pages_all_idle = False
 
         assert disk_cache_storage_size > 0
         storage_size = int(disk_cache_storage_size * (1024 ** 3))
-        num_shard = 32
-        num_worker = 32
+        num_shard = 64
+        num_worker = 48
+        max_concurrent_write_tasks = 16
 
-        cache_dir = os.getenv("LIGHTLLM_DISK_CACHE_DIR")
+        cache_dir = disk_cache_dir
         if not cache_dir:
             cache_dir = os.path.join(tempfile.gettempdir(), f"lightllm_disk_cache_{get_unique_server_name()}")
         os.makedirs(cache_dir, exist_ok=True)
         cache_file = os.path.join(cache_dir, "cache_file")
 
+        self.max_concurrent_write_tasks = max_concurrent_write_tasks
         self._page_major_tensor = self._prepare_tensor(cpu_cache_client.cpu_kv_cache_tensor)
 
         self.service = PyLocalCacheService(
@@ -49,7 +56,7 @@ def __init__(self, disk_cache_storage_size: float, cpu_cache_client):
         )
 
         logger.info(
-            "blueswhen disk cache worker initialized: dir=%s size_bytes=%d shards=%d workers=%d pages_per_block=%d",
+            "disk cache worker initialized: dir=%s size_bytes=%d shards=%d workers=%d pages_per_block=%d",
             cache_dir,
             storage_size,
             num_shard,
@@ -63,35 +70,15 @@ def _prepare_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
 
     def run(self) -> None:
         while True:
-            time.sleep(0.01)
+            time.sleep(0.1)
             payload_groups = self._gather_offload_payloads()
-            # self._log_idle_once()
             if not payload_groups:
                 continue
             for payloads in payload_groups:
                 if not payloads:
                     continue
                 self._persist_pages_to_disk(payloads)
 
-    def _log_idle_once(self) -> int:
-        locked_pages = 0
-        self.cpu_cache_client.lock.acquire_sleep1ms()
-        try:
-            for page_idx in range(self.cpu_cache_client.page_num):
-                page_item = self.cpu_cache_client.page_items.get_item_by_index(page_idx)
-                if not page_item.is_ready_recycle() or page_item.ref_count != 0:
-                    locked_pages += 1
-        finally:
-            self.cpu_cache_client.lock.release()
-
-        if locked_pages == 0:
-            if not self._pages_all_idle:
-                logger.info("blueswhen all cpu cache pages are idle and ready to reuse")
-            self._pages_all_idle = True
-        else:
-            self._pages_all_idle = False
-        return locked_pages
-
     def _gather_offload_payloads(self) -> List[List[_PagePayload]]:
         self.cpu_cache_client.lock.acquire_sleep1ms()
         try:
@@ -120,16 +107,21 @@ def _persist_pages_to_disk(self, payloads: List[_PagePayload]) -> None:
         kv_indexer = torch.tensor(page_indexes, dtype=torch.int32, device="cpu")
         query_result = self.service.query(tokens)
         if not all(query_result):
+            # Cap write concurrency so read operations keep some worker threads available.
+            while (
+                self.service.active_threads("r") and self.service.active_threads("w") >= self.max_concurrent_write_tasks
+            ):
+                time.sleep(0.001)
+
             task = self.service.create(tokens=tokens, kv_page_indexer=kv_indexer, mode="w")
-            while not task.ready():
+            # Stop waiting once the data is safe; the write does not need to be fully finished.
+            while not task.data_safe():
                 time.sleep(0.001)
 
             self.cpu_cache_client.lock.acquire_sleep1ms()
             self.cpu_cache_client.update_pages_status_to_ready_recycle(page_list=page_indexes, deref=True)
             self.cpu_cache_client.lock.release()
 
-        # self._log_idle_once()
-
     def blocks_exist(self, tokens: List[int], start_pos: int = 0) -> bool:
         if not tokens or start_pos < 0 or start_pos >= len(tokens):
             return False
@@ -147,6 +139,11 @@ def load_pages(self, tokens: List[int], page_indexes: List[int], start_pos: int
         if start_pos < 0 or start_pos >= len(tokens):
             return False
 
+        # Check whether a write is currently in progress and, if so, skip this load request (disabled for now).
+        # if self.service.active_threads("w") > 0:
+        #     logger.warning("disk cache worker is busy writing, skip load_pages")
+        #     return False
+
         kv_indexer = torch.tensor(page_indexes, dtype=torch.int32, device="cpu")
         task = self.service.create(tokens=tokens, kv_page_indexer=kv_indexer, mode="r", start_pos=start_pos)
         while not task.ready():
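
With this change the cache directory is passed in explicitly instead of being read from the LIGHTLLM_DISK_CACHE_DIR environment variable. A minimal construction sketch (not part of this commit); the storage size, directory path, and the cpu_cache_client object are placeholders for whatever the server creates elsewhere:

# Hypothetical usage sketch, assuming an existing cpu_cache_client instance.
worker = DiskCacheWorker(
    disk_cache_storage_size=10.0,                # disk budget in GiB (example value)
    cpu_cache_client=cpu_cache_client,           # real CPU KV cache client from the server
    disk_cache_dir="/data/lightllm_disk_cache",  # falls back to a tempdir-based path when None
)
worker.run()  # blocking offload loop; presumably launched in a dedicated process or thread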