Commit 782dfca

[TRTLLM-9050][test] add llama4 disagg case to cover kv cache overflow error (#9172)
Signed-off-by: Ivy Zhang <[email protected]>
Parent commit: 7905d6c

File tree: 3 files changed, +191 −0 lines

  tests/integration/defs/disaggregated/test_configs/disagg_config_llama4_kv_cache_overflow.yaml (new)
  tests/integration/defs/disaggregated/test_disaggregated.py
  tests/integration/test_lists/qa/llm_function_core.txt
tests/integration/defs/disaggregated/test_configs/disagg_config_llama4_kv_cache_overflow.yaml

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+model: llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
+hostname: localhost
+port: 8000
+backend: pytorch
+
+context_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 1
+  enable_attention_dp: false
+  max_num_tokens: 8192
+  max_seq_len: 257000
+  max_input_len: 256000
+  max_batch_size: 1
+  trust_remote_code: true
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.3
+  disable_overlap_scheduler: true
+  cuda_graph_config: null
+  cache_transceiver_config:
+    backend: UCX
+    # Intentionally small to reproduce buffer overflow bug
+    max_tokens_in_buffer: 2048
+  urls:
+    - "localhost:8001"
+
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 1
+  enable_attention_dp: false
+  max_num_tokens: 8192
+  max_seq_len: 257000
+  max_input_len: 256000
+  max_batch_size: 1
+  trust_remote_code: true
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.3
+  disable_overlap_scheduler: true
+  cuda_graph_config: null
+  cache_transceiver_config:
+    backend: UCX
+    # Intentionally small to reproduce buffer overflow bug
+    max_tokens_in_buffer: 2048
+  urls:
+    - "localhost:8002"

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 138 additions & 0 deletions
@@ -265,6 +265,8 @@ def get_test_config(test_desc, example_dir, test_root):
         (3,
          f"{test_configs_root}/disagg_config_deepseek_v3_lite_empty_batch.yaml"
          ),
+        "llama4_kv_cache_overflow":
+        (8, f"{test_configs_root}/disagg_config_llama4_kv_cache_overflow.yaml"),
     }

     if test_desc not in config_map:
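
For orientation, each config_map entry pairs a rank count with a config path. A minimal, hypothetical sketch of the lookup this hunk extends (the real get_test_config also builds test_configs_root and covers many more descriptors):

    import os

    def resolve(test_desc: str, test_root: str):
        # Hypothetical stand-in for get_test_config's core lookup; the real
        # helper handles every descriptor in config_map, not just this one.
        test_configs_root = os.path.join(test_root, "test_configs")
        config_map = {
            "llama4_kv_cache_overflow":
            (8, f"{test_configs_root}/disagg_config_llama4_kv_cache_overflow.yaml"),
        }
        if test_desc not in config_map:
            raise ValueError(f"Unknown test config: {test_desc}")
        return config_map[test_desc]  # -> (num_ranks, config_file)

With num_ranks == 8, the mpirun launch below spawns one MPI rank per GPU for the two TP=4 server groups in the YAML config.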
@@ -1685,6 +1687,104 @@ def get_config_for_benchmark(model_root, backend):
     return serve_config


+def run_disaggregated_genai_perf(config_file,
+                                 model_path,
+                                 num_ranks,
+                                 server_start_timeout=1200,
+                                 input_tokens=128000,
+                                 output_tokens=100,
+                                 env=None,
+                                 cwd=None):
+    """Run disaggregated test with genai-perf for performance/stress testing."""
+    cleanup_output_files()
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
+
+    workers_cmd = [
+        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
+        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
+        config_file
+    ]
+
+    server_cmd = [
+        'trtllm-serve', 'disaggregated', '--server_start_timeout',
+        str(server_start_timeout), '-c', config_file
+    ]
+
+    artifact_dir = os.path.join(cwd or ".", "benchmark-results")
+
+    try:
+        with (open('output_workers.log', 'w') as output_workers,
+              popen(workers_cmd,
+                    stdout=output_workers,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as workers_proc,
+              open('output_disagg.log', 'w') as output_disagg,
+              popen(server_cmd,
+                    stdout=output_disagg,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as server_proc):
+
+            # Wait for server to be ready
+            if not wait_for_server(
+                    "localhost", 8000, timeout_seconds=server_start_timeout):
+                raise RuntimeError(
+                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
+                )
+
+            # Run genai-perf
+            genai_perf_cmd = [
+                'genai-perf', 'profile', '--model', model_path, '--tokenizer',
+                model_path, '--endpoint-type', 'chat', '--endpoint',
+                '/v1/chat/completions', '--streaming', '--url',
+                'localhost:8000', '--synthetic-input-tokens-mean',
+                str(input_tokens), '--synthetic-input-tokens-stddev', '0',
+                '--output-tokens-mean',
+                str(output_tokens), '--output-tokens-stddev', '0',
+                '--extra-inputs', f'max_tokens:{output_tokens}',
+                '--extra-inputs', f'min_tokens:{output_tokens}',
+                '--extra-inputs', 'ignore_eos:true', '--concurrency', '1',
+                '--warmup-request-count', '8', '--num-dataset-entries', '64',
+                '--random-seed', '100', '--artifact-dir', artifact_dir, '--',
+                '-v', '-H', 'Authorization: Bearer NOT USED', '-H',
+                'Accept: text/event-stream', '-p', '200000'
+            ]
+
+            check_call(genai_perf_cmd,
+                       env=env,
+                       poll_procs=[workers_proc, server_proc])
+
+    except Exception:
+        # Print outputs on error
+        logger.error("-------- Workers output (last 30 lines) --------")
+        try:
+            with open('output_workers.log', 'r') as f:
+                lines = f.read().split('\n')
+                for line in lines[-30:]:
+                    if line.strip():
+                        logger.error(line)
+        except FileNotFoundError:
+            pass
+
+        logger.error("-------- Disagg server output (last 30 lines) --------")
+        try:
+            with open('output_disagg.log', 'r') as f:
+                lines = f.read().split('\n')
+                for line in lines[-30:]:
+                    if line.strip():
+                        logger.error(line)
+        except FileNotFoundError:
+            pass
+        raise
+    finally:
+        server_proc.terminate()
+        workers_proc.terminate()
+        server_proc.wait()
+        workers_proc.wait()
+
+
 @pytest.mark.parametrize("benchmark_model_root", [
     'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
     'llama-3.1-8b-instruct-hf-fp8'
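
wait_for_server, popen, check_call, cleanup_output_files, and logger are pre-existing helpers in this module; the diff only calls them. As a rough stand-in for the readiness poll, a minimal equivalent might look like the following (a sketch assuming trtllm-serve exposes an HTTP health endpoint; the real helper may poll differently):

    import time

    import requests

    def wait_for_server_sketch(host: str, port: int, timeout_seconds: int = 1200) -> bool:
        # Poll an assumed /health endpoint until it answers 200 or we time out.
        deadline = time.time() + timeout_seconds
        url = f"http://{host}:{port}/health"
        while time.time() < deadline:
            try:
                if requests.get(url, timeout=5).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # server not up yet; keep polling
            time.sleep(5)
        return False

Note that the finally block re-terminates both launched processes as a belt-and-braces measure, so a failed genai-perf run cannot leak an mpirun job into later tests.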
@@ -1768,3 +1868,41 @@ def test_disaggregated_deepseek_v3_lite_bf16_empty_batch(
     print(f"E2EL: {e2el} ms, TTFT: {ttft} ms")

     assert e2el > 0 and ttft > 0
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(140000)
+@pytest.mark.parametrize(
+    "model_path",
+    ['llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8'])
+def test_llama4_long_context_kv_cache_overflow(disaggregated_test_root,
+                                               disaggregated_example_root,
+                                               llm_venv, model_path):
+    """
+    RCCA: https://nvbugspro.nvidia.com/bug/5555681
+    Test to reproduce KV cache buffer overflow bug with long context.
+    """
+    models_root = llm_models_root()
+    llama4_model_root = os.path.join(models_root, model_path)
+
+    # Create symlink to match config file path
+    src_dst_dict = {
+        llama4_model_root: f"{llm_venv.get_working_directory()}/{model_path}",
+    }
+    for src, dst in src_dst_dict.items():
+        if not os.path.islink(dst):
+            os.makedirs(os.path.dirname(dst), exist_ok=True)
+            os.symlink(src, dst, target_is_directory=True)
+
+    num_ranks, config_file = get_test_config("llama4_kv_cache_overflow",
+                                             disaggregated_example_root,
+                                             os.path.dirname(__file__))
+
+    run_disaggregated_genai_perf(config_file=config_file,
+                                 model_path=llama4_model_root,
+                                 num_ranks=num_ranks,
+                                 server_start_timeout=1200,
+                                 input_tokens=128000,
+                                 output_tokens=100,
+                                 env=llm_venv._new_env,
+                                 cwd=llm_venv.get_working_directory())
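
To run just this case locally, one option is pytest's programmatic entry point (a sketch; the node id matches the test-list entry added below, with the path relative to tests/integration/defs):

    import pytest

    # Hypothetical local invocation; CI drives this through the QA test list.
    pytest.main([
        "disaggregated/test_disaggregated.py::"
        "test_llama4_long_context_kv_cache_overflow"
        "[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8]",
        "-v",
    ])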

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
@@ -787,6 +787,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated.py::test_llama4_long_context_kv_cache_overflow[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]