Commit 227d42e

[https://nvbugs/5651854][fix] Fix dist-serving perf by clearing CPU affinity (#9549)
Signed-off-by: Shixiaowei02 <[email protected]>
1 parent e72ce98 commit 227d42e
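
For context (not part of the diff): process launchers such as mpirun, SLURM, or container runtimes can hand a worker a narrow CPU affinity mask; every worker thread is then pinned to a few cores, which throttles disaggregated serving. A quick way to inspect the mask a process inherited, using psutil as the changed code does:

import psutil

# Prints the logical CPUs this process is allowed to run on; a short list
# on a large machine means the affinity mask is externally constrained.
print(psutil.Process().cpu_affinity())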

3 files changed (+24, -23 lines)

tensorrt_llm/executor/base_worker.py

Lines changed: 19 additions & 9 deletions
@@ -125,12 +125,13 @@ def _configure_affinity(self, device_id):
         Note:
             If the process already has constrained affinity, a warning is logged.
             Configuration is handled as follows:
-            TLLM_NUMA_WORKER_AFFINITY = <unset>
-                -> affinity is auto-configured only if it is unconstrained
-            TLLM_NUMA_WORKER_AFFINITY = 1
-                -> affinity is unconditionally auto-configured
-            TLLM_NUMA_WORKER_AFFINITY = 0 or any other value
-                -> affinity is unconditionally _not_ auto-configured
+            TLLM_NUMA_AWARE_WORKER_AFFINITY = <unset>
+                -> Affinity is automatically configured if it is unconstrained,
+                   and deleted if it is constrained externally by the user.
+            TLLM_NUMA_AWARE_WORKER_AFFINITY = 1
+                -> Affinity is unconditionally auto-configured.
+            TLLM_NUMA_AWARE_WORKER_AFFINITY = 0 or any other value
+                -> Affinity is unconditionally _not_ auto-configured.
         '''

         # Get the current affinity setting
@@ -141,22 +142,31 @@ def _configure_affinity(self, device_id):
         all_cpus = list(range(psutil.cpu_count()))

         constrained_affinity = (cpu_affinity != all_cpus)
+        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")

-        # If the process is affined to a constrained set of CPUs, warn the user
-        # so as to ensure that this is what is intended
+        # If affinity is constrained but the user hasn't explicitly
+        # requested NUMA-aware affinity, remove the constraints.
         if constrained_affinity:
             logger.warning(
                 f"Worker process {pid} is affined to run on the following CPUs: "
                 f"{cpu_affinity} (subset of all logical CPUs). This may harm "
                 f"performance if set incorrectly.")
+            if numa_aware_affinity is None:
+                logger.warning(
+                    f"Worker process {pid} has constrained CPU affinity "
+                    f"but `TLLM_NUMA_AWARE_WORKER_AFFINITY` is not set. "
+                    f"Removing CPU affinity constraints.")
+                process.cpu_affinity(all_cpus)

         # If affinity is unconstrained and the user hasn't explicitly
         # prohibited it or the user has explicitly requested it, choose the
         # optimal affinity based upon the NUMA topology
-        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")
         if ((numa_aware_affinity is None and not constrained_affinity)
                 or (numa_aware_affinity == "1")):
             process.cpu_affinity(get_numa_aware_cpu_affinity(device_id))
+            logger.info(
+                f"Worker process {pid} CPU affinity set to "
+                f"{process.cpu_affinity()} for optimal NUMA-aware scheduling.")

     def _get_comm_ranks_device_id(self):
         device_id = self.global_rank % torch.cuda.device_count()

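The change above means a worker now discards an inherited affinity constraint instead of merely warning about it, unless the user explicitly opted into NUMA-aware placement. A minimal standalone sketch of that policy (the numa_affinity parameter stands in for the repo's get_numa_aware_cpu_affinity(device_id) result, which is not shown in this diff):

import os

import psutil


def configure_worker_affinity(numa_affinity):
    # Sketch of the policy described in the docstring above:
    #   TLLM_NUMA_AWARE_WORKER_AFFINITY unset -> auto-configure when the
    #     mask is unconstrained; clear an external constraint otherwise.
    #   "1"  -> unconditionally auto-configure.
    #   else -> leave affinity untouched.
    process = psutil.Process()
    all_cpus = list(range(psutil.cpu_count()))
    constrained = process.cpu_affinity() != all_cpus
    mode = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")

    if mode is None and constrained:
        # The fix in this commit: widen a mask inherited from e.g. mpirun
        # or a container runtime back to all logical CPUs.
        process.cpu_affinity(all_cpus)
    elif (mode is None and not constrained) or mode == "1":
        process.cpu_affinity(numa_affinity)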
tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 5 additions & 8 deletions
@@ -95,7 +95,8 @@ def launch_disaggregated_llm(
         ctx_model: str = None,
         gen_model: str = None,
         server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16):
+        max_workers: int = 16,
+        enable_perf=False):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -104,9 +105,7 @@ def launch_disaggregated_llm(
         print(
             f"Using unified tp parameter for testing is not recommended. Please use server configs instead."
         )
-
-    enable_perf = True
-    perf_max_requests = 10000
+    perf_max_requests = 50

     def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
         if not isinstance(cfg, dict):
@@ -120,6 +119,7 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
     _apply_perf_flags(disaggregated_server_config)
     _apply_perf_flags(ctx_server_config)
     _apply_perf_flags(gen_server_config)
+
    disaggregated_server_config = revise_disaggregated_server_config_urls_with_free_ports(
        disaggregated_server_config)

@@ -366,7 +366,7 @@ def _get_perf_metrics():
         except requests.exceptions.RequestException as e:
             print(f"Error fetching {perf_url}: {e}")

-    def _show_kvcache_time(kv_cache_perf_dir, max_lines=1000):
+    def _show_kvcache_time(kv_cache_perf_dir, max_lines=100):
         print(f"kv_cache_perf_dir: {kv_cache_perf_dir}")
         for file in os.listdir(kv_cache_perf_dir):
             print(f"file: {file}")
@@ -475,9 +475,6 @@ def test_auto_dtype(self, disable_overlap_scheduler, ctx_enable_block_reuse,
             "disable_overlap_scheduler": disable_overlap_scheduler,
             "kv_cache_config": {
                 "enable_block_reuse": gen_enable_block_reuse
-            },
-            "cache_transceiver_config": {
-                "backend": "DEFAULT"
             }
         }
         gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}

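Net effect in the harness: perf instrumentation is now opt-in via enable_perf (default False), and perf runs are capped at 50 requests rather than 10000. A rough sketch of the opt-in gating this implies; the "perf_max_requests" key below is a placeholder, not the repo's real config flag:

from typing import Any, Dict, Optional


def _apply_perf_flags(cfg: Optional[Dict[str, Any]],
                      enable_perf: bool = False,
                      perf_max_requests: int = 50) -> None:
    # Mutate a server config dict in place, but only when the caller has
    # opted in; non-dict configs are ignored, mirroring the diff's guard.
    if not enable_perf or not isinstance(cfg, dict):
        return
    # Placeholder key for illustration; the real flag names are not
    # visible in this hunk.
    cfg["perf_max_requests"] = perf_max_requests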
tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 6 deletions
@@ -346,8 +346,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-b
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
@@ -367,10 +365,7 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] SKIP
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP (https://nvbugs/5664904)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] SKIP (https://nvbugs/5670480)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-False] SKIP (https://nvbugs/5670480)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5673578)
 examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
@@ -390,7 +385,6 @@ unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_
 test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5688388)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
 unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-False-False-False-False-True-False-False] SKIP (https://nvbugs/5691246)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5698897)
