1515TEST_PORT = 8000
1616HEARTBEAT_INTERVAL = 1
1717INACTIVE_TIMEOUT = 2
18+ CHECK_STATUS_INTERVAL = 3 # sleep longer than INACTIVE_TIMEOUT before checking cluster status, to avoid flaky tests
1819
1920ROUTER_TYPES = ["round_robin" , "load_balancing" , "kv_cache_aware" ]
2021
@@ -317,7 +318,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
317318 print (response )
318319 # kill gen1, the request should fail
319320 terminate (gen_worker1 )
320- await asyncio .sleep (INACTIVE_TIMEOUT )
321+ await asyncio .sleep (CHECK_STATUS_INTERVAL )
321322 verify_cluster_info (False , 1 , 0 )
322323 with pytest .raises (Exception ):
323324 request_completion (model_name , "Hello, my name is" , port = TEST_PORT )
@@ -330,7 +331,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
330331 TEST_PORT + 201 ,
331332 device = 2 )
332333 await wait_for_worker_ready (TEST_PORT + 201 )
333- await asyncio .sleep (INACTIVE_TIMEOUT )
334+ await asyncio .sleep (CHECK_STATUS_INTERVAL )
334335 verify_cluster_info (True , 1 , 1 )
335336
336337 response = request_completion (model_name , test_prompt , port = TEST_PORT )
@@ -340,7 +341,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
340341
341342 # kill ctx1, the request should fail
342343 terminate (ctx_worker1 )
343- await asyncio .sleep (INACTIVE_TIMEOUT )
344+ await asyncio .sleep (CHECK_STATUS_INTERVAL )
344345 verify_cluster_info (False , 0 , 1 )
345346 with pytest .raises (Exception ):
346347 request_completion (model_name , test_prompt , port = TEST_PORT )
@@ -362,7 +363,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config,
362363 gen_worker1 = run_gen_worker (model_name , worker_config , TEST_PORT + 200 )
363364 await wait_for_worker_ready (TEST_PORT + 100 )
364365 await wait_for_worker_ready (TEST_PORT + 200 )
365- await asyncio .sleep (INACTIVE_TIMEOUT )
366+ await asyncio .sleep (CHECK_STATUS_INTERVAL )
366367 verify_cluster_info (True , 2 , 2 )
367368
368369 # send 10 requests, the responses will be generated by the different ctx/gen workers (but we can't verify it now)
@@ -402,7 +403,7 @@ async def test_disagg_server_restart(model_name, disagg_server_config,
402403
403404 # kill disagg server, the request should fail
404405 terminate (disagg_server )
405- await asyncio .sleep (INACTIVE_TIMEOUT )
406+ await asyncio .sleep (CHECK_STATUS_INTERVAL )
406407 with pytest .raises (Exception ):
407408 verify_cluster_info (False , 1 , 1 , expected_code = 500 )
408409
0 commit comments