Skip to content

Commit 413a35d

Browse files
committed
Add RCCL_P2P_BATCH_ENABLE parameter to multinode rccl tests.

- Added nccl_p2p_batch_enable test parameter with values ["0", "1"] to rccl_multinode_cvs.py
- Updated rccl_lib.py to include the parameter in MPI command execution
- Added parameter to rccl_config.json configuration file
- Doubles test combinations to cover P2P batching enable/disable cases.

Signed-off-by: Ignatious Johnson <[email protected]>
1 parent 20f2be7 commit 413a35d

File tree

3 files changed

+11
-5
lines changed

3 files changed

+11
-5
lines changed

input/config_file/rccl/rccl_config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"nccl_ib_tc": "0",
4242
"nccl_ib_split_data_on_qps": "0",
4343
"nccl_pxn_disable": [ "0", "1" ],
44+
"nccl_p2p_batch_enable": [ "0", "1" ],
4445
"nccl_net_plugin": "none",
4546
"verify_bus_bw": "False",
4647
"verify_bw_dip": "True",

lib/rccl_lib.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ def rccl_cluster_test( phdl, shdl, test_name, cluster_node_list, vpc_node_list,
312312
ib_rx_queue_len=8192, ucx_tls='tcp', hcoll_enable_mcast_all=0, \
313313
nccl_cumem_enable=0, nccl_ib_timeout=30, nccl_ib_sl=0, \
314314
nccl_ib_tc=41, nccl_ib_split_data_on_qps=0, nccl_pxn_disable=1, \
315-
nccl_net_plugin=None, user_password=None, \
315+
nccl_p2p_batch_enable=1, nccl_net_plugin=None, user_password=None, \
316316
min_channels=64, max_channels=64, \
317317
data_type="float", \
318318
user_key_file=None, verify_bus_bw=False, \
@@ -418,6 +418,7 @@ def rccl_cluster_test( phdl, shdl, test_name, cluster_node_list, vpc_node_list,
418418
-x NCCL_IB_TC={nccl_ib_tc} \
419419
-x NCCL_IB_SPLIT_DATA_ON_QPS={nccl_ib_split_data_on_qps} \
420420
-x NCCL_PXN_DISABLE={nccl_pxn_disable} \
421+
-x RCCL_P2P_BATCH_ENABLE={nccl_p2p_batch_enable} \
421422
-x NCCL_NET_PLUGIN={nccl_net_plugin} \
422423
{RCCL_TESTS_INSTALL_DIR}/{test_name} -b {start_msg_size} -e {end_msg_size} -f {step_function} \
423424
-g {threads_per_gpu} -c {check_iteration_count} -w {warmup_iterations} \
@@ -482,7 +483,7 @@ def rccl_cluster_test_default( phdl, shdl, test_name, cluster_node_list, vpc_nod
482483
ib_rx_queue_len=8192, ucx_tls='tcp', hcoll_enable_mcast_all=0, \
483484
nccl_cumem_enable=0, nccl_ib_timeout=30, nccl_ib_sl=0, \
484485
nccl_ib_tc=41, nccl_ib_split_data_on_qps=0, nccl_pxn_disable=1, \
485-
nccl_net_plugin=None, user_password=None, \
486+
nccl_p2p_batch_enable=1, nccl_net_plugin=None, user_password=None, \
486487
min_channels=64, max_channels=64, \
487488
user_key_file=None, verify_bus_bw=False, \
488489
verify_bw_dip=True, verify_lat_dip=True, exp_results_dict=None ):
@@ -584,6 +585,7 @@ def rccl_cluster_test_default( phdl, shdl, test_name, cluster_node_list, vpc_nod
584585
--mca btl_tcp_if_exclude lo,docker0,usb0 \
585586
-x UCX_NET_DEVICES={net_dev_list} \
586587
-x UCX_TLS={ucx_tls} \
588+
-x RCCL_P2P_BATCH_ENABLE={nccl_p2p_batch_enable} \
587589
-x NCCL_NET_PLUGIN={nccl_net_plugin} \
588590
{RCCL_TESTS_INSTALL_DIR}/{test_name} -b {start_msg_size} -e {end_msg_size} -f {step_function} \
589591
-g {threads_per_gpu} -c {check_iteration_count} -w {warmup_iterations} \

tests/rccl/rccl_multinode_cvs.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,10 @@ def pytest_generate_tests(metafunc):
242242
rccl_protocol_list = rccl.get("rccl_protocol", ["simple", "LL128", "LL"])
243243
qp_scale_list = rccl.get("qp_scale", ["1", "2"])
244244
nccl_pxn_disable_list = rccl.get("nccl_pxn_disable", [ "1", "0" ])
245+
nccl_p2p_batch_enable_list = rccl.get("nccl_p2p_batch_enable", [ "1", "0" ])
245246

246247
# Only parametrize fixtures used by this test
247-
all_keys = ("rccl_collective", "rccl_algo", "rccl_protocol", "qp_scale", "nccl_pxn_disable")
248+
all_keys = ("rccl_collective", "rccl_algo", "rccl_protocol", "qp_scale", "nccl_pxn_disable", "nccl_p2p_batch_enable")
248249

249250
active = [k for k in all_keys if k in metafunc.fixturenames]
250251
if not active:
@@ -256,6 +257,7 @@ def pytest_generate_tests(metafunc):
256257
"rccl_protocol": rccl_protocol_list,
257258
"qp_scale": qp_scale_list,
258259
"nccl_pxn_disable": nccl_pxn_disable_list,
260+
"nccl_p2p_batch_enable": nccl_p2p_batch_enable_list,
259261
}
260262
domains = [domain_by_key[k] for k in active]
261263

@@ -329,7 +331,7 @@ def test_disable_firewall( phdl ):
329331

330332

331333
def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, rccl_algo, \
332-
rccl_protocol, qp_scale, nccl_pxn_disable ):
334+
rccl_protocol, qp_scale, nccl_pxn_disable, nccl_p2p_batch_enable ):
333335

334336
"""
335337
Execute RCCL performance test across the cluster with given parameters.
@@ -422,6 +424,7 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, rccl_
422424
nccl_ib_tc = config_dict['nccl_ib_tc'], \
423425
nccl_ib_split_data_on_qps = config_dict['nccl_ib_split_data_on_qps'], \
424426
nccl_pxn_disable = nccl_pxn_disable, \
427+
nccl_p2p_batch_enable = nccl_p2p_batch_enable, \
425428
nccl_net_plugin = config_dict['nccl_net_plugin'], \
426429
user_key_file = cluster_dict['priv_key_file'], \
427430
verify_bus_bw = config_dict['verify_bus_bw'], \
@@ -432,7 +435,7 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, rccl_
432435

433436

434437
print(result_dict)
435-
key_name = f'{rccl_collective}-{rccl_algo}-{rccl_protocol}-{qp_scale}-{nccl_pxn_disable}'
438+
key_name = f'{rccl_collective}-{rccl_algo}-{rccl_protocol}-{qp_scale}-{nccl_pxn_disable}-{nccl_p2p_batch_enable}'
436439
rccl_res_dict[key_name] = result_dict
437440

438441
# Scan dmesg between start and end times cluster wide ..

0 commit comments

Comments
 (0)