Commit 782dfca

[TRTLLM-9050][test] add llama4 disagg case to cover kv cache overflow error (#9172)
Signed-off-by: Ivy Zhang <[email protected]>
Parent commit: 7905d6c

File tree: 3 files changed, +191 −0 lines

  tests/integration/defs/disaggregated/test_configs/disagg_config_llama4_kv_cache_overflow.yaml (new)
  tests/integration/defs/disaggregated/test_disaggregated.py
  tests/integration/test_lists/qa/llm_function_core.txt
tests/integration/defs/disaggregated/test_configs/disagg_config_llama4_kv_cache_overflow.yaml

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+model: llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
+hostname: localhost
+port: 8000
+backend: pytorch
+
+context_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 1
+  enable_attention_dp: false
+  max_num_tokens: 8192
+  max_seq_len: 257000
+  max_input_len: 256000
+  max_batch_size: 1
+  trust_remote_code: true
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.3
+  disable_overlap_scheduler: true
+  cuda_graph_config: null
+  cache_transceiver_config:
+    backend: UCX
+    # Intentionally small to reproduce buffer overflow bug
+    max_tokens_in_buffer: 2048
+  urls:
+    - "localhost:8001"
+
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 1
+  enable_attention_dp: false
+  max_num_tokens: 8192
+  max_seq_len: 257000
+  max_input_len: 256000
+  max_batch_size: 1
+  trust_remote_code: true
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.3
+  disable_overlap_scheduler: true
+  cuda_graph_config: null
+  cache_transceiver_config:
+    backend: UCX
+    # Intentionally small to reproduce buffer overflow bug
+    max_tokens_in_buffer: 2048
+  urls:
+    - "localhost:8002"

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 138 additions & 0 deletions
@@ -265,6 +265,8 @@ def get_test_config(test_desc, example_dir, test_root):
         (3,
          f"{test_configs_root}/disagg_config_deepseek_v3_lite_empty_batch.yaml"
          ),
+        "llama4_kv_cache_overflow":
+        (8, f"{test_configs_root}/disagg_config_llama4_kv_cache_overflow.yaml"),
     }

     if test_desc not in config_map:
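
For orientation, each config_map entry pairs a rank count with a config path. A minimal, hypothetical sketch of the lookup this hunk extends (the real get_test_config also builds test_configs_root and covers many more descriptors):

    import os

    def resolve(test_desc: str, test_root: str):
        # Hypothetical stand-in for get_test_config's core lookup; the real
        # helper handles every descriptor in config_map, not just this one.
        test_configs_root = os.path.join(test_root, "test_configs")
        config_map = {
            "llama4_kv_cache_overflow":
            (8, f"{test_configs_root}/disagg_config_llama4_kv_cache_overflow.yaml"),
        }
        if test_desc not in config_map:
            raise ValueError(f"Unknown test config: {test_desc}")
        return config_map[test_desc]  # -> (num_ranks, config_file)

With num_ranks == 8, the mpirun launch below spawns one MPI rank per GPU for the two TP=4 server groups in the YAML config.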
@@ -1685,6 +1687,104 @@ def get_config_for_benchmark(model_root, backend):
     return serve_config


+def run_disaggregated_genai_perf(config_file,
+                                 model_path,
+                                 num_ranks,
+                                 server_start_timeout=1200,
+                                 input_tokens=128000,
+                                 output_tokens=100,
+                                 env=None,
+                                 cwd=None):
+    """Run disaggregated test with genai-perf for performance/stress testing."""
+    cleanup_output_files()
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
+
+    workers_cmd = [
+        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
+        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
+        config_file
+    ]
+
+    server_cmd = [
+        'trtllm-serve', 'disaggregated', '--server_start_timeout',
+        str(server_start_timeout), '-c', config_file
+    ]
+
+    artifact_dir = os.path.join(cwd or ".", "benchmark-results")
+
+    try:
+        with (open('output_workers.log', 'w') as output_workers,
+              popen(workers_cmd,
+                    stdout=output_workers,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as workers_proc,
+              open('output_disagg.log', 'w') as output_disagg,
+              popen(server_cmd,
+                    stdout=output_disagg,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as server_proc):
+
+            # Wait for server to be ready
+            if not wait_for_server(
+                    "localhost", 8000, timeout_seconds=server_start_timeout):
+                raise RuntimeError(
+                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
+                )
+
+            # Run genai-perf
+            genai_perf_cmd = [
+                'genai-perf', 'profile', '--model', model_path, '--tokenizer',
+                model_path, '--endpoint-type', 'chat', '--endpoint',
+                '/v1/chat/completions', '--streaming', '--url',
+                'localhost:8000', '--synthetic-input-tokens-mean',
+                str(input_tokens), '--synthetic-input-tokens-stddev', '0',
+                '--output-tokens-mean',
+                str(output_tokens), '--output-tokens-stddev', '0',
+                '--extra-inputs', f'max_tokens:{output_tokens}',
+                '--extra-inputs', f'min_tokens:{output_tokens}',
+                '--extra-inputs', 'ignore_eos:true', '--concurrency', '1',
+                '--warmup-request-count', '8', '--num-dataset-entries', '64',
+                '--random-seed', '100', '--artifact-dir', artifact_dir, '--',
+                '-v', '-H', 'Authorization: Bearer NOT USED', '-H',
+                'Accept: text/event-stream', '-p', '200000'
+            ]
+
+            check_call(genai_perf_cmd,
+                       env=env,
+                       poll_procs=[workers_proc, server_proc])
+
+    except Exception:
+        # Print outputs on error
+        logger.error("-------- Workers output (last 30 lines) --------")
+        try:
+            with open('output_workers.log', 'r') as f:
+                lines = f.read().split('\n')
+                for line in lines[-30:]:
+                    if line.strip():
+                        logger.error(line)
+        except FileNotFoundError:
+            pass
+
+        logger.error("-------- Disagg server output (last 30 lines) --------")
+        try:
+            with open('output_disagg.log', 'r') as f:
+                lines = f.read().split('\n')
+                for line in lines[-30:]:
+                    if line.strip():
+                        logger.error(line)
+        except FileNotFoundError:
+            pass
+        raise
+    finally:
+        server_proc.terminate()
+        workers_proc.terminate()
+        server_proc.wait()
+        workers_proc.wait()
+
+
 @pytest.mark.parametrize("benchmark_model_root", [
     'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
     'llama-3.1-8b-instruct-hf-fp8'
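
wait_for_server, popen, check_call, cleanup_output_files, and logger are pre-existing helpers in this module; the diff only calls them. As a rough stand-in for the readiness poll, a minimal equivalent might look like the following (a sketch assuming trtllm-serve exposes an HTTP health endpoint; the real helper may poll differently):

    import time

    import requests

    def wait_for_server_sketch(host: str, port: int, timeout_seconds: int = 1200) -> bool:
        # Poll an assumed /health endpoint until it answers 200 or we time out.
        deadline = time.time() + timeout_seconds
        url = f"http://{host}:{port}/health"
        while time.time() < deadline:
            try:
                if requests.get(url, timeout=5).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # server not up yet; keep polling
            time.sleep(5)
        return False

Note that the finally block re-terminates both launched processes as a belt-and-braces measure, so a failed genai-perf run cannot leak an mpirun job into later tests.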
@@ -1768,3 +1868,41 @@ def test_disaggregated_deepseek_v3_lite_bf16_empty_batch(
     print(f"E2EL: {e2el} ms, TTFT: {ttft} ms")

     assert e2el > 0 and ttft > 0
+
+
+@pytest.mark.skip_less_device(8)
+@pytest.mark.skip_less_device_memory(140000)
+@pytest.mark.parametrize(
+    "model_path",
+    ['llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8'])
+def test_llama4_long_context_kv_cache_overflow(disaggregated_test_root,
+                                               disaggregated_example_root,
+                                               llm_venv, model_path):
+    """
+    RCCA: https://nvbugspro.nvidia.com/bug/5555681
+    Test to reproduce KV cache buffer overflow bug with long context.
+    """
+    models_root = llm_models_root()
+    llama4_model_root = os.path.join(models_root, model_path)
+
+    # Create symlink to match config file path
+    src_dst_dict = {
+        llama4_model_root: f"{llm_venv.get_working_directory()}/{model_path}",
+    }
+    for src, dst in src_dst_dict.items():
+        if not os.path.islink(dst):
+            os.makedirs(os.path.dirname(dst), exist_ok=True)
+            os.symlink(src, dst, target_is_directory=True)
+
+    num_ranks, config_file = get_test_config("llama4_kv_cache_overflow",
+                                             disaggregated_example_root,
+                                             os.path.dirname(__file__))
+
+    run_disaggregated_genai_perf(config_file=config_file,
+                                 model_path=llama4_model_root,
+                                 num_ranks=num_ranks,
+                                 server_start_timeout=1200,
+                                 input_tokens=128000,
+                                 output_tokens=100,
+                                 env=llm_venv._new_env,
+                                 cwd=llm_venv.get_working_directory())
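
To run just this case locally, one option is pytest's programmatic entry point (a sketch; the node id matches the test-list entry added below, with the path relative to tests/integration/defs):

    import pytest

    # Hypothetical local invocation; CI drives this through the QA test list.
    pytest.main([
        "disaggregated/test_disaggregated.py::"
        "test_llama4_long_context_kv_cache_overflow"
        "[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8]",
        "-v",
    ])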

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
@@ -787,6 +787,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated.py::test_llama4_long_context_kv_cache_overflow[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]