Skip to content

Commit 80f5832

Browse files
author
niushengxiao
committed
feat: disk cache v1.0
1 parent 82df7a1 commit 80f5832

File tree

6 files changed

+383
-46
lines changed

6 files changed

+383
-46
lines changed

lightllm/server/core/objs/req.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class Req(ctypes.Structure):
8181
("candetoken_out_len", ctypes.c_int),
8282
("prompt_cache_len", ctypes.c_int), # 用于记录prompt cache 的命中长度,用于统计,这里指gpu kv cache命中长度
8383
("cpu_prompt_cache_len", ctypes.c_int), # 用于记录在 enable_cpu_cache 的场景下,命中的 cpu kv cache 的长度
84+
("disk_prompt_cache_len", ctypes.c_int), # 用于记录从磁盘命中的长度
8485
("is_paused", ctypes.c_bool), # 标记一个Req因为显存资源管理的原因被临时暂停了。
8586
("finish_status", FinishStatus),
8687
# 这个标记变量是http_server 写入,其他进程读取,用于标记该请求是否因为断网被aborted。
@@ -149,6 +150,7 @@ def init(
149150
self.candetoken_out_len = 0
150151
self.prompt_cache_len = 0
151152
self.cpu_prompt_cache_len = 0
153+
self.disk_prompt_cache_len = 0
152154
self.finish_token_index = -1
153155
self.can_released_mark = False
154156
self.reward_score = math.nan

lightllm/server/httpserver/manager.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from frozendict import frozendict
1313

1414
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
15-
from typing import Union, List, Tuple, Dict, Optional
15+
from typing import Union, List, Tuple, Dict, Optional, AsyncGenerator
1616
from websockets import ClientConnection
1717
from fastapi import Request
1818
from ..tokenizer import get_tokenizer
@@ -264,7 +264,7 @@ async def generate(
264264
nixl_pd_upload_websocket: ClientConnection = None,
265265
# 用于等待 pd_master 下发的交换信息
266266
nixl_pd_event: asyncio.Event = None,
267-
) -> Tuple[int, str, dict, FinishStatus]:
267+
) -> AsyncGenerator[Tuple[int, str, dict, FinishStatus], None]:
268268
start_time = time.time()
269269
request_headers = request.headers if request is not None else {}
270270
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
@@ -567,6 +567,7 @@ async def _wait_to_token_package(
567567

568568
prompt_cache_len = metadata.pop("prompt_cache_len", 0)
569569
cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
570+
disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
570571
if is_first_token:
571572
first_token_cost_ms = (time.time() - start_time) * 1000
572573
is_first_token = False
@@ -589,6 +590,8 @@ async def _wait_to_token_package(
589590
x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
590591
x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
591592
prompt_cache_ratio = prompt_cache_len / prompt_tokens
593+
cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
594+
disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
592595

593596
mtp_avg_token_per_step = out_token_counter / max(
594597
(out_token_counter - metadata["mtp_accepted_token_num"]), 1
@@ -604,9 +607,23 @@ async def _wait_to_token_package(
604607
f"prompt_cache_len:{prompt_cache_len} "
605608
f"prompt_cache_ratio:{prompt_cache_ratio} "
606609
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
607-
f"used_cpu_prompt_cache_len:{max(0, cpu_prompt_cache_len - prompt_cache_len)} "
610+
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
611+
f"disk_prompt_cache_len:{disk_prompt_cache_len} "
612+
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
608613
f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
609614
)
615+
if cpu_prompt_cache_len > 0:
616+
logger.info(
617+
f"blueswhen "
618+
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
619+
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
620+
)
621+
if disk_prompt_cache_len > 0:
622+
logger.info(
623+
f"blueswhen "
624+
f"disk_prompt_cache_len:{disk_prompt_cache_len} "
625+
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
626+
)
610627
if group_request_id < 0:
611628
# health 探测请求,不记录日志和监控
612629
return
@@ -726,6 +743,7 @@ async def handle_loop(self):
726743
"count_output_tokens": count_output_tokens,
727744
"prompt_cache_len": req.prompt_cache_len,
728745
"cpu_prompt_cache_len": req.cpu_prompt_cache_len,
746+
"disk_prompt_cache_len": req.disk_prompt_cache_len,
729747
"mtp_accepted_token_num": req.mtp_accepted_token_num,
730748
}
731749
if self.args.return_all_prompt_logprobs:

lightllm/server/multi_level_kv_cache/cpu_cache_client.py

Lines changed: 83 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ def __init__(self, only_create_meta_data: bool, init_shm_data: bool):
3737
self.attach_shm_handle = self._attach_shm_cpu_kv_cache()
3838
return
3939

40+
@staticmethod
41+
def _encode_offload_head(page_index: int) -> int:
42+
return -(page_index + 1)
43+
44+
@staticmethod
45+
def _decode_offload_value(value: int) -> Tuple[int, bool]:
46+
if value < 0:
47+
return -(value + 1), True
48+
return value, False
49+
4050
def get_one_empty_page(self, hash_key: int, disk_offload_enable: bool) -> Optional[int]:
4151
assert self.page_hash_dict.get(hash_key) is None
4252
head = self.page_items.head
@@ -63,15 +73,12 @@ def allocate_one_page(self, hash_key: int, disk_offload_enable: bool) -> Tuple[O
6373
page_index = self.page_hash_dict.get(hash_key)
6474
if page_index is not None:
6575
page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
76+
page_item.ref_count += 1
77+
page_item.del_self_from_list()
78+
self.page_items.add_item_to_tail(index=page_index)
6679
if page_item.is_data_ready():
67-
page_item.ref_count += 1
68-
page_item.del_self_from_list()
69-
self.page_items.add_item_to_tail(index=page_index)
7080
return page_index, True
7181
else:
72-
page_item.ref_count += 1
73-
page_item.del_self_from_list()
74-
self.page_items.add_item_to_tail(index=page_index)
7582
return page_index, False
7683
else:
7784
page_index = self.get_one_empty_page(hash_key=hash_key, disk_offload_enable=disk_offload_enable)
@@ -101,34 +108,54 @@ def allocate_pages(self, hash_keys: List[int], disk_offload_enable: bool) -> Tup
101108
ready_list.extend([False for _ in range(left_num)])
102109
return page_list, ready_list
103110

def update_pages_status_to_ready(
    self,
    page_list: List[int],
    deref: bool = True,
    disk_offload_enable: bool = False,
):
    """Promote every valid page in *page_list* to READY.

    Pages whose status is below READY are promoted; when
    *disk_offload_enable* is set, the freshly promoted pages are pushed
    onto the offload queue as one group whose first entry carries the
    negative group-head encoding.  When *deref* is true each visited
    page also drops one reference (it must currently be referenced).
    """
    newly_ready: List[int] = []
    for page_index in page_list:
        if page_index == -1:
            # -1 is the "no page" placeholder — nothing to update.
            continue
        cur_page: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
        if cur_page.status < cur_page.READY:
            cur_page.status = cur_page.READY
            if disk_offload_enable:
                newly_ready.append(cur_page.self_index)
        if deref:
            assert cur_page.ref_count > 0
            cur_page.ref_count -= 1

    if disk_offload_enable and newly_ready:
        # First entry is encoded as the group head so the consumer
        # (get_pages_to_offloading) can split the flat queue into groups.
        head, *rest = newly_ready
        self.offload_page_indexes.add_item(value=self._encode_offload_head(head))
        for page_index in rest:
            self.offload_page_indexes.add_item(value=page_index)
    return
137+
def mark_pages_recyclable(self, page_list: List[int]):
    """Downgrade every READY-or-better page in *page_list* to READY_RECYCLE.

    Entries equal to -1 (the "no page" placeholder) are skipped; pages
    still below READY are left untouched.
    """
    for idx in page_list:
        if idx == -1:
            continue
        page: _CpuPageStatus = self.page_items.get_item_by_index(idx)
        if page.status >= page.READY:
            page.status = page.READY_RECYCLE
    return
116146

def query_one_page(self, hash_key: int) -> Tuple[Optional[int], bool]:
    """Look up the cpu-cache page holding *hash_key*.

    On a hit the page gains one reference and is refreshed in the LRU
    list (moved to the tail); the result is ``(page_index, data_ready)``.
    On a miss the result is ``(None, False)``.
    """
    page_index = self.page_hash_dict.get(hash_key)
    if page_index is None:
        return None, False

    page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
    page_item.ref_count += 1
    # LRU refresh: unhook and re-append at the tail.
    page_item.del_self_from_list()
    self.page_items.add_item_to_tail(index=page_index)
    if page_item.is_data_ready():
        return page_index, True
    return page_index, False
134161

@@ -138,6 +165,7 @@ def check_allpages_ready(self, page_list: List[int]) -> bool:
138165
continue
139166
page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
140167
if not page_item.is_data_ready():
168+
logger.info("cpu cache page %d not ready, status %d", page_index, page_item.status)
141169
return False
142170
return True
143171

@@ -156,17 +184,30 @@ def deref_one_page(self, page_index: int):
156184
page_item.ref_count -= 1
157185
return
158186

def get_pages_to_offloading(self) -> List[List[int]]:
    """Drain the offload queue and return the ready pages, grouped.

    Queue entries are produced by ``update_pages_status_to_ready``: the
    first page of each group is stored with the negative head encoding.
    Every page that is still ready gains one reference and is moved to
    OFFLOADING status; pages no longer ready are silently dropped from
    their group.  Empty groups are not returned.
    """
    raw_values = self.offload_page_indexes.pop_all_item()
    if raw_values is None:
        return []

    groups: List[List[int]] = []
    current: List[int] = []
    for raw in raw_values:
        page_index, is_head = self._decode_offload_value(raw)
        if is_head:
            # A head marker closes the previous group (if non-empty)
            # and starts a new one.
            if current:
                groups.append(current)
            current = []
        page_item: _CpuPageStatus = self.page_items.get_item_by_index(index=page_index)
        if page_item.is_ready():
            page_item.ref_count += 1
            page_item.status = page_item.OFFLOADING
            current.append(page_index)

    if current:
        groups.append(current)
    return groups
170211

171212
def update_pages_status_to_ready_recycle(self, page_list: List[int], deref: bool = True):
172213
for page_index in page_list:
@@ -179,6 +220,22 @@ def update_pages_status_to_ready_recycle(self, page_list: List[int], deref: bool
179220
cur_page.ref_count -= 1
180221
return
181222

def recycle_pages(self, page_list: List[int]):
    """Return every page in *page_list* to the empty pool.

    Each page is unhooked from the LRU list, its hash mapping (if any)
    is removed, its state is reset to EMPTY with zero references, and it
    is re-appended at the list tail.  Entries of -1 are skipped.
    """
    for page_index in page_list:
        if page_index == -1:
            continue
        page: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
        page.del_self_from_list()
        if not page.is_empty() and page.hash_key != 0:
            # NOTE(review): the dict entry is removed whenever the key is
            # present — if another page has since re-registered the same
            # hash_key, its mapping would be dropped too; confirm the hash
            # cannot be re-registered while this page is being recycled.
            if self.page_hash_dict.get(page.hash_key) is not None:
                self.page_hash_dict.remove(page.hash_key)
        page.hash_key = 0
        page.status = page.EMPTY
        page.ref_count = 0
        self.page_items.add_item_to_tail(page.self_index)
    return
238+
182239
def _create_cpu_status_list(self, init_shm_data: bool):
183240
self.page_items = ShmLinkedList(
184241
name=f"{get_unique_server_name()}_cpu_kv_cache_page_items",

0 commit comments

Comments
 (0)