@@ -577,7 +577,7 @@ def local_mpi_barrier():
577577
578578
def mpi_broadcast(obj, root=0):
    """Broadcast *obj* from rank *root* to every MPI rank.

    On a single-rank run (world size <= 1) the object is returned
    unchanged, which avoids touching the MPI communicator at all
    (see https://github.com/NVIDIA/TensorRT-LLM/issues/5927).
    """
    if global_mpi_size() <= 1:
        # Nothing to broadcast when only one process is running.
        return obj
    return mpi_comm().bcast(obj, root)
581581
582582
583583def mpi_allgather (obj ):
@@ -1141,17 +1141,6 @@ def _unique_tokens_to_json(data):
11411141 }
11421142
11431143
1144- def is_multi_device_enable ():
1145- """
1146- This method evaluates if we are running on multiple GPUs and the flag ENABLE_MULTI_DEVICE is set.
1147- So we can avoid broadcast calls on single GPU.
1148- Issue: https://github.com/NVIDIA/TensorRT-LLM/issues/5927
1149- ENABLE_MULTI_DEVICE is true by default when building TensorRT LLM so we need to also check
1150- the number of devices
1151- """
1152- return local_mpi_size () > 1
1153-
1154-
11551144def set_prometheus_multiproc_dir () -> object :
11561145 # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.10/python/sglang/srt/utils.py#L1266
11571146 global prometheus_multiproc_dir
@@ -1174,3 +1163,19 @@ def torch_pybind11_abi() -> str:
11741163 if TORCH_PYBIND11_ABI is None :
11751164 TORCH_PYBIND11_ABI = f"{ torch ._C ._PYBIND11_COMPILER_TYPE } { torch ._C ._PYBIND11_STDLIB } { torch ._C ._PYBIND11_BUILD_ABI } "
11761165 return TORCH_PYBIND11_ABI
1166+
1167+
@lru_cache(maxsize=1)
def is_device_integrated() -> bool:
    """Tell whether the active GPU is an integrated device.

    An integrated GPU shares physical memory with the CPU (unified memory
    architectures such as DGX Spark). The answer is cached so the CUDA
    device-property query runs at most once per process.

    Returns:
        bool: True when the current CUDA device reports itself as
        integrated; False otherwise, including when CUDA is unavailable.
    """
    cuda_ready = torch.cuda.is_available()
    # Short-circuit keeps the property query from running on CPU-only hosts.
    return bool(cuda_ready and torch.cuda.get_device_properties().is_integrated)
0 commit comments