@@ -265,6 +265,8 @@ def get_test_config(test_desc, example_dir, test_root):
265265 (3 ,
266266 f"{ test_configs_root } /disagg_config_deepseek_v3_lite_empty_batch.yaml"
267267 ),
268+ "llama4_kv_cache_overflow" :
269+ (8 , f"{ test_configs_root } /disagg_config_llama4_kv_cache_overflow.yaml" ),
268270 }
269271
270272 if test_desc not in config_map :
@@ -1685,6 +1687,104 @@ def get_config_for_benchmark(model_root, backend):
16851687 return serve_config
16861688
16871689
def _log_tail(log_path, header, max_lines=30):
    """Log *header* then the last *max_lines* non-empty lines of *log_path*.

    Missing log files are silently ignored (the process may have died before
    producing any output).
    """
    logger.error(header)
    try:
        with open(log_path, 'r') as f:
            lines = f.read().split('\n')
        for line in lines[-max_lines:]:
            if line.strip():
                logger.error(line)
    except FileNotFoundError:
        pass


def run_disaggregated_genai_perf(config_file,
                                 model_path,
                                 num_ranks,
                                 server_start_timeout=1200,
                                 input_tokens=128000,
                                 output_tokens=100,
                                 env=None,
                                 cwd=None):
    """Run disaggregated test with genai-perf for performance/stress testing.

    Starts the MPI workers and the disaggregated server as subprocesses
    (logging to ``output_workers.log`` / ``output_disagg.log``), waits for the
    server to accept connections, then drives it with ``genai-perf profile``.
    On any failure the tails of both logs are dumped before re-raising.

    Args:
        config_file: Path to the disaggregated serving YAML config.
        model_path: Model path passed to genai-perf as ``--model``/``--tokenizer``.
        num_ranks: Number of MPI ranks for the disaggregated workers.
        server_start_timeout: Seconds to wait for the server to become ready.
        input_tokens: Mean synthetic input length in tokens (stddev is 0).
        output_tokens: Exact output length (min_tokens == max_tokens, EOS ignored).
        env: Base environment for the subprocesses; defaults to ``os.environ``.
        cwd: Working directory for the subprocesses and benchmark artifacts.

    Raises:
        RuntimeError: If the server does not become ready within
            ``server_start_timeout`` seconds.
    """
    cleanup_output_files()
    # Bug fix: `env` defaults to None, but the original called `env.copy()`
    # unconditionally, which raised AttributeError for the default case.
    run_env = (env if env is not None else os.environ).copy()
    run_env["UCX_TLS"] = "^ib"

    workers_cmd = [
        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
        config_file
    ]

    server_cmd = [
        'trtllm-serve', 'disaggregated', '--server_start_timeout',
        str(server_start_timeout), '-c', config_file
    ]

    artifact_dir = os.path.join(cwd or ".", "benchmark-results")

    # Bug fix: pre-bind the process handles so the `finally` block below does
    # not raise NameError when an early failure (e.g. in cleanup_output_files
    # or the first popen) prevents the `with` targets from being bound.
    workers_proc = None
    server_proc = None
    try:
        with open('output_workers.log', 'w') as output_workers, \
             popen(workers_cmd,
                   stdout=output_workers,
                   stderr=subprocess.STDOUT,
                   env=run_env,
                   cwd=cwd) as workers_proc, \
             open('output_disagg.log', 'w') as output_disagg, \
             popen(server_cmd,
                   stdout=output_disagg,
                   stderr=subprocess.STDOUT,
                   env=run_env,
                   cwd=cwd) as server_proc:

            # Wait for server to be ready
            if not wait_for_server(
                    "localhost", 8000, timeout_seconds=server_start_timeout):
                raise RuntimeError(
                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
                )

            # Run genai-perf
            genai_perf_cmd = [
                'genai-perf', 'profile', '--model', model_path, '--tokenizer',
                model_path, '--endpoint-type', 'chat', '--endpoint',
                '/v1/chat/completions', '--streaming', '--url',
                'localhost:8000', '--synthetic-input-tokens-mean',
                str(input_tokens), '--synthetic-input-tokens-stddev', '0',
                '--output-tokens-mean',
                str(output_tokens), '--output-tokens-stddev', '0',
                '--extra-inputs', f'max_tokens:{output_tokens}',
                '--extra-inputs', f'min_tokens:{output_tokens}',
                '--extra-inputs', 'ignore_eos:true', '--concurrency', '1',
                '--warmup-request-count', '8', '--num-dataset-entries', '64',
                '--random-seed', '100', '--artifact-dir', artifact_dir, '--',
                '-v', '-H', 'Authorization: Bearer NOT USED', '-H',
                'Accept: text/event-stream', '-p', '200000'
            ]

            check_call(genai_perf_cmd,
                       env=env,
                       poll_procs=[workers_proc, server_proc])

    except Exception:
        # Print outputs on error
        _log_tail('output_workers.log',
                  "-------- Workers output (last 30 lines) --------")
        _log_tail('output_disagg.log',
                  "-------- Disagg server output (last 30 lines) --------")
        raise
    finally:
        # Terminate both first, then wait, so neither can outlive the test.
        for proc in (server_proc, workers_proc):
            if proc is not None:
                proc.terminate()
        for proc in (server_proc, workers_proc):
            if proc is not None:
                proc.wait()
1787+
16881788@pytest .mark .parametrize ("benchmark_model_root" , [
16891789 'DeepSeek-V3-Lite-fp8' , 'DeepSeek-V3-Lite-bf16' , 'llama-v3-8b-hf' ,
16901790 'llama-3.1-8b-instruct-hf-fp8'
@@ -1768,3 +1868,41 @@ def test_disaggregated_deepseek_v3_lite_bf16_empty_batch(
17681868 print (f"E2EL: { e2el } ms, TTFT: { ttft } ms" )
17691869
17701870 assert e2el > 0 and ttft > 0
1871+
1872+
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(140000)
@pytest.mark.parametrize(
    "model_path",
    ['llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8'])
def test_llama4_long_context_kv_cache_overflow(disaggregated_test_root,
                                               disaggregated_example_root,
                                               llm_venv, model_path):
    """
    RCCA: https://nvbugspro.nvidia.com/bug/5555681
    Test to reproduce KV cache buffer overflow bug with long context.
    """
    llama4_model_root = os.path.join(llm_models_root(), model_path)

    # The YAML config references the model via a path relative to the venv
    # working directory; expose the real checkout there through a symlink.
    link_path = os.path.join(llm_venv.get_working_directory(), model_path)
    if not os.path.islink(link_path):
        os.makedirs(os.path.dirname(link_path), exist_ok=True)
        os.symlink(llama4_model_root, link_path, target_is_directory=True)

    num_ranks, config_file = get_test_config("llama4_kv_cache_overflow",
                                             disaggregated_example_root,
                                             os.path.dirname(__file__))

    # 128k-token prompts with fixed-length outputs reproduce the overflow.
    run_disaggregated_genai_perf(config_file=config_file,
                                 model_path=llama4_model_root,
                                 num_ranks=num_ranks,
                                 server_start_timeout=1200,
                                 input_tokens=128000,
                                 output_tokens=100,
                                 env=llm_venv._new_env,
                                 cwd=llm_venv.get_working_directory())
0 commit comments