Skip to content

Commit fa72b3c

Browse files
iftahlpasis
authored and committed
issue: 3697720 Option to postpone socket close
Add VMA option VMA_DEFERRED_CLOSE to postpone shadow socket close. Disabled by default (no changes vs previous code). Signed-off-by: Iftah Levi <[email protected]>
1 parent a9b7fce commit fa72b3c

File tree

6 files changed

+24
-5
lines changed

6 files changed

+24
-5
lines changed

src/vma/main.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -573,6 +573,7 @@ void print_vma_global_settings()
573573
VLOG_PARAM_NUMBER("MSS", safe_mce_sys().lwip_mss, MCE_DEFAULT_MSS, SYS_VAR_MSS); break;
574574
}
575575
VLOG_PARAM_NUMSTR("TCP CC Algorithm", safe_mce_sys().lwip_cc_algo_mod, MCE_DEFAULT_LWIP_CC_ALGO_MOD, SYS_VAR_TCP_CC_ALGO, lwip_cc_algo_str(safe_mce_sys().lwip_cc_algo_mod));
576+
VLOG_PARAM_STRING("Deferred close", safe_mce_sys().deferred_close, MCE_DEFAULT_DEFERRED_CLOSE, SYS_VAR_DEFERRED_CLOSE, safe_mce_sys().deferred_close ? "Enabled " : "Disabled");
576577
VLOG_PARAM_STRING("Polling Rx on Tx TCP", safe_mce_sys().rx_poll_on_tx_tcp, MCE_DEFAULT_RX_POLL_ON_TX_TCP, SYS_VAR_VMA_RX_POLL_ON_TX_TCP, safe_mce_sys().rx_poll_on_tx_tcp ? "Enabled " : "Disabled");
577578
VLOG_PARAM_STRING("Trig dummy send getsockname()", safe_mce_sys().trigger_dummy_send_getsockname, MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME, SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME, safe_mce_sys().trigger_dummy_send_getsockname ? "Enabled " : "Disabled");
578579

src/vma/sock/sock-redirect.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -310,9 +310,9 @@ void dbg_check_if_need_to_send_mcpkt()
310310
dbg_check_if_need_to_send_mcpkt_prevent_nested_calls--;
311311
}
312312

313-
void handle_close(int fd, bool cleanup, bool passthrough)
313+
bool handle_close(int fd, bool cleanup, bool passthrough)
314314
{
315-
315+
bool to_close_now = true;
316316
srdr_logfunc("Cleanup fd=%d", fd);
317317

318318
if (g_p_fd_collection) {
@@ -321,12 +321,17 @@ void handle_close(int fd, bool cleanup, bool passthrough)
321321

322322
if (fd_collection_get_sockfd(fd)) {
323323
g_p_fd_collection->del_sockfd(fd, cleanup);
324+
if (safe_mce_sys().deferred_close) {
325+
to_close_now = false;
326+
}
324327
}
325328
if (fd_collection_get_epfd(fd)) {
326329
g_p_fd_collection->del_epfd(fd, cleanup);
327330
}
328331

329332
}
333+
334+
return to_close_now;
330335
}
331336

332337

@@ -912,9 +917,9 @@ int close(int __fd)
912917

913918
srdr_logdbg_entry("fd=%d", __fd);
914919

915-
handle_close(__fd);
920+
bool close_now = handle_close(__fd);
916921

917-
return orig_os_api.close(__fd);
922+
return close_now ? orig_os_api.close(__fd) : 0;
918923
}
919924

920925
extern "C"

src/vma/sock/sock-redirect.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ extern iomux_stats_t* g_p_epoll_stats;
177177

178178
int do_global_ctors();
179179
void reset_globals();
180-
void handle_close(int fd, bool cleanup = false, bool passthrough = false);
180+
bool handle_close(int fd, bool cleanup = false, bool passthrough = false);
181181

182182
// allow calling our socket(...) implementation safely from within libvma.so
183183
// this is critical in case VMA was loaded using dlopen and not using LD_PRELOAD

src/vma/sock/socket_fd_api.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ socket_fd_api::socket_fd_api(int fd) : m_epoll_event_flags(0), m_fd(fd), m_n_sys
5656

5757
socket_fd_api::~socket_fd_api()
5858
{
59+
if (safe_mce_sys().deferred_close && (m_fd >= 0)) {
60+
orig_os_api.close(m_fd);
61+
}
5962
}
6063

6164

src/vma/util/sys_vars.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,7 @@ void mce_sys_var::get_env_params()
619619
neigh_wait_till_send_arp_msec = MCE_DEFAULT_NEIGH_UC_ARP_DELAY_MSEC;
620620
timer_netlink_update_msec = MCE_DEFAULT_NETLINK_TIMER_MSEC;
621621

622+
deferred_close = MCE_DEFAULT_DEFERRED_CLOSE;
622623
rx_poll_on_tx_tcp = MCE_DEFAULT_RX_POLL_ON_TX_TCP;
623624
trigger_dummy_send_getsockname = MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME;
624625

@@ -1267,6 +1268,10 @@ void mce_sys_var::get_env_params()
12671268
if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO)) != NULL)
12681269
lwip_cc_algo_mod = (uint32_t)atoi(env_ptr);
12691270

1271+
if ((env_ptr = getenv(SYS_VAR_DEFERRED_CLOSE)) != NULL) {
1272+
deferred_close = atoi(env_ptr) ? true : false;
1273+
}
1274+
12701275
if ((env_ptr = getenv(SYS_VAR_VMA_RX_POLL_ON_TX_TCP)) != NULL)
12711276
rx_poll_on_tx_tcp = atoi(env_ptr) ? true : false;
12721277

src/vma/util/sys_vars.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,9 @@ struct mce_sys_var {
424424
uint32_t vma_time_measure_num_samples;
425425
char vma_time_measure_filename[PATH_MAX];
426426
sysctl_reader_t & sysctl_reader;
427+
// Workaround for #3440429: postpone close(2) to the socket destructor, so the sockfd is closed
428+
// after the rfs rule is destroyed. Otherwise, flow_tag or TCP port can be reused too early.
429+
bool deferred_close;
427430
bool rx_poll_on_tx_tcp;
428431
hyper_t hypervisor;
429432
bool trigger_dummy_send_getsockname;
@@ -567,6 +570,7 @@ extern mce_sys_var & safe_mce_sys();
567570

568571
#define SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES "VMA_TIME_MEASURE_NUM_SAMPLES"
569572
#define SYS_VAR_VMA_TIME_MEASURE_DUMP_FILE "VMA_TIME_MEASURE_DUMP_FILE"
573+
#define SYS_VAR_DEFERRED_CLOSE "VMA_DEFERRED_CLOSE"
570574
#define SYS_VAR_VMA_RX_POLL_ON_TX_TCP "VMA_RX_POLL_ON_TX_TCP"
571575
#define SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME "VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME"
572576

@@ -701,6 +705,7 @@ extern mce_sys_var & safe_mce_sys();
701705
#endif /* DEFINED_TSO */
702706
#define MCE_DEFAULT_RX_POLL_ON_TX_TCP (false)
703707
#define MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME (false)
708+
#define MCE_DEFAULT_DEFERRED_CLOSE (false)
704709

705710
#define MCE_ALIGNMENT ((unsigned long)63)
706711
#define RX_BUF_SIZE(mtu) ((mtu) + IPOIB_HDR_LEN + GRH_HDR_LEN) // RX buffers are larger in IB

0 commit comments

Comments
 (0)