Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 21 additions & 12 deletions src/libvgpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -872,21 +872,30 @@ void preInit(){
void postInit(){
allocator_init();
map_cuda_visible_devices();
int lock_ret = try_lock_unified_lock();
if (lock_ret != 0) {
LOG_WARN("try_lock_unified_lock failed, skipping set_task_pid");
pidfound=0;

// Use shared memory semaphore to serialize host PID detection
// Returns 1 if lock acquired, 0 if timeout (skip detection)
int lock_acquired = lock_postinit();
nvmlReturn_t res = NVML_SUCCESS;

if (lock_acquired) {
// Lock acquired - safe to call set_task_pid()
res = set_task_pid();
unlock_postinit();
} else {
nvmlReturn_t res = set_task_pid();
try_unlock_unified_lock();
if (res != NVML_SUCCESS) {
LOG_WARN("SET_TASK_PID FAILED.");
pidfound = 0;
} else {
pidfound = 1;
}
// Timeout - another process likely crashed holding the lock
// Skip host PID detection for this process
LOG_WARN("Skipped host PID detection due to lock timeout");
res = NVML_ERROR_TIMEOUT;
}

LOG_MSG("Initialized");
if (res != NVML_SUCCESS) {
LOG_WARN("SET_TASK_PID FAILED - using container PID for accounting");
pidfound = 0;
} else {
pidfound = 1;
}

//add_gpu_device_memory_usage(getpid(),0,context_size,0);
env_utilization_switch = set_env_utilization_switch();
Expand Down
Loading
Loading