Skip to content

Commit bfdb006

Browse files
committed
Avoid deadlock issues caused by logging in a loop.
Signed-off-by: wangfakang <[email protected]>
1 parent ab2b89c commit bfdb006

File tree

1 file changed

+19
-11
lines changed

1 file changed

+19
-11
lines changed

src/misc/param.cc

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,21 +61,29 @@ void initEnv() {
6161

6262
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
6363
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
64+
if (__builtin_expect(__atomic_load_n(cache, __ATOMIC_RELAXED) != uninitialized, true)) {
65+
return;
66+
}
67+
6468
pthread_mutex_lock(&mutex);
6569
if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
6670
const char* str = ncclGetEnv(env);
6771
int64_t value = deftVal;
68-
if (str && strlen(str) > 0) {
69-
errno = 0;
70-
value = strtoll(str, nullptr, 0);
71-
if (errno) {
72-
value = deftVal;
73-
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
74-
} else {
75-
INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
76-
}
72+
if (!str || strlen(str) <= 0) {
73+
__atomic_store_n(cache, value, __ATOMIC_RELAXED);
74+
pthread_mutex_unlock(&mutex);
75+
return;
76+
}
77+
errno = 0;
78+
value = strtoll(str, nullptr, 0);
79+
// Cache the value before logging to prevent deadlocks caused by
80+
// logging in a loop.
81+
__atomic_store_n(cache, errno ? deftVal : value, __ATOMIC_RELAXED);
82+
if (errno) {
83+
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
84+
} else {
85+
INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value);
7786
}
78-
__atomic_store_n(cache, value, __ATOMIC_RELAXED);
7987
}
8088
pthread_mutex_unlock(&mutex);
8189
}
@@ -84,4 +92,4 @@ const char *ncclGetEnv(const char *name) {
8492
static pthread_once_t once = PTHREAD_ONCE_INIT;
8593
pthread_once(&once, initEnv);
8694
return getenv(name);
87-
}
95+
}

0 commit comments

Comments
 (0)