[https://nvbugs/5649010][fix] increase status-checking interval to avoid instability (NVIDIA#9203)

reasonsolo · web-flow · commit c789000a62ec · 2025-11-19T08:55:42.000+08:00
Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
diff --git a/tests/integration/defs/disaggregated/test_auto_scaling.py b/tests/integration/defs/disaggregated/test_auto_scaling.py
@@ -15,6 +15,7 @@
 TEST_PORT = 8000
 HEARTBEAT_INTERVAL = 1
 INACTIVE_TIMEOUT = 2
+CHECK_STATUS_INTERVAL = 3  # check cluster status with a larger interval than inactive timeout to avoid flaky tests
 
 ROUTER_TYPES = ["round_robin", "load_balancing", "kv_cache_aware"]
 
@@ -317,7 +318,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
         print(response)
         # kill gen1, the request should fail
         terminate(gen_worker1)
-        await asyncio.sleep(INACTIVE_TIMEOUT)
+        await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(False, 1, 0)
         with pytest.raises(Exception):
             request_completion(model_name, "Hello, my name is", port=TEST_PORT)
@@ -330,7 +331,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
                                      TEST_PORT + 201,
                                      device=2)
         await wait_for_worker_ready(TEST_PORT + 201)
-        await asyncio.sleep(INACTIVE_TIMEOUT)
+        await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(True, 1, 1)
 
         response = request_completion(model_name, test_prompt, port=TEST_PORT)
@@ -340,7 +341,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
 
         # kill ctx1, the request should fail
         terminate(ctx_worker1)
-        await asyncio.sleep(INACTIVE_TIMEOUT)
+        await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(False, 0, 1)
         with pytest.raises(Exception):
             request_completion(model_name, test_prompt, port=TEST_PORT)
@@ -362,7 +363,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
         gen_worker1 = run_gen_worker(model_name, worker_config, TEST_PORT + 200)
         await wait_for_worker_ready(TEST_PORT + 100)
         await wait_for_worker_ready(TEST_PORT + 200)
-        await asyncio.sleep(INACTIVE_TIMEOUT)
+        await asyncio.sleep(CHECK_STATUS_INTERVAL)
         verify_cluster_info(True, 2, 2)
 
         # send 10 requests, the responses will be generated by the different ctx/gen workers (but we can't verify it now)
@@ -402,7 +403,7 @@ async def test_disagg_server_restart(model_name, disagg_server_config,
 
         # kill disagg server, the request should fail
         terminate(disagg_server)
-        await asyncio.sleep(INACTIVE_TIMEOUT)
+        await asyncio.sleep(CHECK_STATUS_INTERVAL)
         with pytest.raises(Exception):
             verify_cluster_info(False, 1, 1, expected_code=500)
 
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -388,7 +388,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-b
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5649010)
 unittest/_torch/modules SKIP (https://nvbugs/5637037)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
@@ -405,9 +404,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
 examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5655832)
-disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5633340)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] SKIP (https://nvbugs/5649010)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/5649010)
 disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926)
 unittest/_torch/sampler/test_torch_sampler.py::TestBatchedSampling SKIP (https://nvbugs/5661877)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5568836)