Skip to content

Commit c789000

Browse files
authored
[https://nvbugs/5649010][fix] increase status-checking interval to avoid instability (NVIDIA#9203)
Signed-off-by: Lizhi Zhou <[email protected]>
1 parent 34f845b commit c789000

File tree

2 files changed

+6
-9
lines changed

2 files changed

+6
-9
lines changed

tests/integration/defs/disaggregated/test_auto_scaling.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
15 15
TEST_PORT = 8000
16 16
HEARTBEAT_INTERVAL = 1
17 17
INACTIVE_TIMEOUT = 2
18 +
CHECK_STATUS_INTERVAL = 3 # check cluster status with a larger interval than inactive timeout to avoid flaky tests
18 19

19 20
ROUTER_TYPES = ["round_robin", "load_balancing", "kv_cache_aware"]
20 21

@@ -317,7 +318,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
317 318
print(response)
318 319
# kill gen1, the request should fail
319 320
terminate(gen_worker1)
320 -
await asyncio.sleep(INACTIVE_TIMEOUT)
321 +
await asyncio.sleep(CHECK_STATUS_INTERVAL)
321 322
verify_cluster_info(False, 1, 0)
322 323
with pytest.raises(Exception):
323 324
request_completion(model_name, "Hello, my name is", port=TEST_PORT)
@@ -330,7 +331,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
330 331
TEST_PORT + 201,
331 332
device=2)
332 333
await wait_for_worker_ready(TEST_PORT + 201)
333 -
await asyncio.sleep(INACTIVE_TIMEOUT)
334 +
await asyncio.sleep(CHECK_STATUS_INTERVAL)
334 335
verify_cluster_info(True, 1, 1)
335 336

336 337
response = request_completion(model_name, test_prompt, port=TEST_PORT)
@@ -340,7 +341,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
340 341

341 342
# kill ctx1, the request should fail
342 343
terminate(ctx_worker1)
343 -
await asyncio.sleep(INACTIVE_TIMEOUT)
344 +
await asyncio.sleep(CHECK_STATUS_INTERVAL)
344 345
verify_cluster_info(False, 0, 1)
345 346
with pytest.raises(Exception):
346 347
request_completion(model_name, test_prompt, port=TEST_PORT)
@@ -362,7 +363,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
362 363
gen_worker1 = run_gen_worker(model_name, worker_config, TEST_PORT + 200)
363 364
await wait_for_worker_ready(TEST_PORT + 100)
364 365
await wait_for_worker_ready(TEST_PORT + 200)
365 -
await asyncio.sleep(INACTIVE_TIMEOUT)
366 +
await asyncio.sleep(CHECK_STATUS_INTERVAL)
366 367
verify_cluster_info(True, 2, 2)
367 368

368 369
# send 10 requests, the responses will be generated by the different ctx/gen workers (but we can't verify it now)
@@ -402,7 +403,7 @@ async def test_disagg_server_restart(model_name, disagg_server_config,
402 403

403 404
# kill disagg server, the request should fail
404 405
terminate(disagg_server)
405 -
await asyncio.sleep(INACTIVE_TIMEOUT)
406 +
await asyncio.sleep(CHECK_STATUS_INTERVAL)
406 407
with pytest.raises(Exception):
407 408
verify_cluster_info(False, 1, 1, expected_code=500)
408 409

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -388,7 +388,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-b
388 388
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
389 389
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
390 390
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824)
391 -
disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5649010)
392 391
unittest/_torch/modules SKIP (https://nvbugs/5637037)
393 392
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
394 393
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
@@ -405,9 +404,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp
405 404
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
406 405
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
407 406
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5655832)
408 -
disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5633340)
409 -
disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] SKIP (https://nvbugs/5649010)
410 -
disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/5649010)
411 407
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926)
412 408
unittest/_torch/sampler/test_torch_sampler.py::TestBatchedSampling SKIP (https://nvbugs/5661877)
413 409
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5568836)

0 commit comments

Comments (0)