Skip to content

Commit 3f5073b

Browse files
committed
Use poll() to avoid busy-looping in bootstrap socket I/O
Testing: build and run nccl_allreduce_perf, ensuring the initialization path is exercised.
1 parent 3ea7eed commit 3f5073b

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

src/misc/socket.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34);
1818
NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100);
19+
NCCL_PARAM(PollTimeOut, "SOCKET_POLL_TIMEOUT_MSEC", 0);
20+
1921
static void msleep(unsigned int time_msec) {
2022
const long c_1e6 = 1e6;
2123
struct timespec tv = (struct timespec){
@@ -25,6 +27,14 @@ static void msleep(unsigned int time_msec) {
2527
nanosleep(&tv, NULL);
2628
}
2729

30+
/*
 * Wait on one socket descriptor using poll().
 *
 *   fd - descriptor to watch.
 *   op - NCCL_SOCKET_RECV waits for readability (POLLIN); any other value
 *        waits for writability (POLLOUT).
 *
 * The value of ncclParamPollTimeOut() is handed to poll() as its timeout
 * argument (milliseconds). The return value of poll() and the reported
 * revents are not inspected here.
 */
static void pollSocket(int fd, int op) {
  short wanted = POLLOUT;
  if (op == NCCL_SOCKET_RECV) wanted = POLLIN;

  struct pollfd watch;
  watch.revents = 0;
  watch.events = wanted;
  watch.fd = fd;

  poll(&watch, 1, ncclParamPollTimeOut());
}
37+
2838
static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
2939
int bytes = 0;
3040
*closed = 0;
@@ -77,8 +87,12 @@ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, i
7787
}
7888

7989
/*
 * Drive socketProgress() until *offset reaches size.
 *
 *   op, sock, ptr, size, offset - forwarded unchanged to socketProgress().
 *
 * Returns ncclSuccess once *offset >= size. A failing socketProgress() call
 * is handled by NCCLCHECK.
 *
 * After each progress step that leaves the transfer incomplete, and only when
 * ncclParamPollTimeOut() is non-zero, pollSocket() is called on sock->fd so
 * the loop waits for the socket to become readable/writable instead of
 * immediately retrying.
 */
static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
  for (;;) {
    if (*offset >= size) break;

    NCCLCHECK(socketProgress(op, sock, ptr, size, offset));

    // Transfer finished: nothing left to wait for.
    if (*offset >= size) continue;

    // Polling is opt-in; a zero timeout keeps the plain retry loop.
    if (ncclParamPollTimeOut()) {
      pollSocket(sock->fd, op);
    }
  }
  return ncclSuccess;
}
8498

0 commit comments

Comments
 (0)