Skip to content

Commit dfad51e

Browse files
mustafabar and isaki001 authored
Support gfx950 in topo_expl and resolve dependency on FMT (#1829)
* Support gfx950 in topo_expl
* Fix dependencies and fetch fmt from sources
* Remove third_party folder in make clean
* Add empty target when fmt is found
* Add MI350 example
* Update README.md

---------

Co-authored-by: isaki001 <[email protected]>
1 parent 5e7937e commit dfad51e

File tree

5 files changed

+210
-5
lines changed

5 files changed

+210
-5
lines changed

tools/topo_expl/Makefile

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,43 @@ HIP_PATH = ../../..
55
endif
66
HIPCC = $(HIP_PATH)/bin/hipcc
77

8+
.DEFAULT_GOAL := all
9+
810
EXE = topo_expl
911
CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/include/plugin -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread
1012

1113
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
1214
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \
1315
hipify_rccl/collectives.cc hipify_rccl/register.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc
1416

15-
all: hipify $(EXE)
17+
FMT_DIR := third_party/fmt
18+
FMT_INCLUDE := $(FMT_DIR)/include
19+
FMT_HEADER := $(FMT_INCLUDE)/fmt/format.h
20+
FMT_GIT := https://github.com/fmtlib/fmt.git
21+
22+
# Probe for <fmt/format.h> using a portable pipe (no <<< here-string)
23+
HAVE_FMT := $(shell echo '#include <fmt/format.h>' | \
24+
$(HIPCC) -xc++ -std=c++17 -E - >/dev/null 2>&1 && echo yes || echo no)
25+
26+
ifeq ($(HAVE_FMT),no)
27+
CXXFLAGS += -I$(FMT_INCLUDE)
28+
NEED_FMT := 1
29+
endif
30+
31+
ifeq ($(NEED_FMT),1)
32+
$(FMT_HEADER):
33+
# Remove any stale or partial checkout first; use $(FMT_DIR) so this stays in sync with the mkdir/clone lines of this recipe.
rm -rf $(FMT_DIR)
34+
@echo ">>> fmt not found; cloning $(FMT_GIT) ..."
35+
@mkdir -p $(dir $(FMT_DIR))
36+
@git clone --depth=1 $(FMT_GIT) $(FMT_DIR)
37+
else
38+
$(FMT_HEADER):
39+
endif
40+
# ---------------------------------------------------------------------------
41+
42+
43+
44+
all: $(FMT_HEADER) hipify $(EXE)
1645

1746
$(EXE): $(files)
1847
$(HIPCC) $(CXXFLAGS) $^ -o $@
@@ -30,13 +59,14 @@ hipify:
3059
cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/
3160
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
3261
hipify-perl -inplace -quiet-warnings hipify_rccl/include/plugin/*.h
62+
hipify-perl -inplace -quiet-warnings hipify_rccl/include/latency_profiler/*.h
3363
hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h
34-
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
35-
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
64+
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int USE_ACC, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
65+
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, USE_ACC, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
3666
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
3767
hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/*
3868
hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc
3969

4070
clean:
41-
rm -rf hipify_rccl
71+
rm -rf hipify_rccl third_party
4272
rm -f *.o $(EXE)

tools/topo_expl/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Run `./topo_expl` without arguments to see the list of available models. Each mo
3333

3434
## Example Usage: Print RCCL's algorithm/protocol selections
3535

36-
The tool is typically run with the `NCCL_DEBUG=INFO` environment variable, but for the convenience of just printing the algo/proto table, we use version `NCCL_DEBUG=version` in this example to avoid printing topo details.
36+
The tool is typically run with the `NCCL_DEBUG=INFO` environment variable, which shows the topology information and prints the constructed rings/trees. However, to print only the algo/proto table, this example uses `NCCL_DEBUG=version`, which avoids printing the topology details.
3737

3838
```bash
3939
# List available models
@@ -45,6 +45,9 @@ NCCL_DEBUG=version ./topo_expl -m 55
4545
# Test a multi-node MI300 configuration with 8 nodes
4646
NCCL_DEBUG=version ./topo_expl -m 55 -n 8
4747

48+
# Test a multi-node MI350 configuration with 2 nodes
49+
NCCL_DEBUG=version ./topo_expl -m 59 -n 2
50+
4851
# Test MI250 configuration (model 42)
4952
NCCL_DEBUG=version ./topo_expl -m 42
5053

tools/topo_expl/include/device_table.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,8 @@ static struct rcclKernelItem rcclKernelTable[] = { };
1111

1212
template <int unroll>
1313
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { }
14+
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept { }
15+
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept { }
16+
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept { }
1417

1518
#endif
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
<system version="2">
2+
<cpu host_hash="0x1b2f53a636e182bb" numaid="0" affinity="00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="191" modelid="2">
3+
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc03" link_speed="32.0 GT/s PCIe" link_width="16">
4+
<pci busid="0000:03:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
5+
<nic>
6+
<net name="bnxt_re0" dev="0" latency="0" speed="400000" port="1" guid="0x28303efeffe604d6" maxconn="131073" gdr="1"/>
7+
</nic>
8+
</pci>
9+
<pci busid="0000:06:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
10+
<pci busid="0000:08:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
11+
<gpu dev="0" sm="256" gcn="gfx950" arch="38911" rank="0" gdr="1">
12+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
13+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
14+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
15+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
16+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
17+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
18+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
19+
</gpu>
20+
</pci>
21+
</pci>
22+
</pci>
23+
<pci busid="0000:11:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc03" link_speed="32.0 GT/s PCIe" link_width="16">
24+
<pci busid="0000:13:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
25+
<nic>
26+
<net name="bnxt_re1" dev="1" latency="0" speed="400000" port="1" guid="0x18483efeffe604d6" maxconn="131073" gdr="1"/>
27+
</nic>
28+
</pci>
29+
<pci busid="0000:16:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
30+
<pci busid="0000:18:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
31+
<gpu dev="1" sm="256" gcn="gfx950" arch="38911" rank="1" gdr="1">
32+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
33+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
34+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
35+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
36+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
37+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
38+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
39+
</gpu>
40+
</pci>
41+
</pci>
42+
</pci>
43+
<pci busid="0000:61:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc02" link_speed="32.0 GT/s PCIe" link_width="16">
44+
<pci busid="0000:63:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
45+
<nic>
46+
<net name="bnxt_re2" dev="2" latency="0" speed="400000" port="1" guid="0x78333efeffe604d6" maxconn="131073" gdr="1"/>
47+
</nic>
48+
</pci>
49+
<pci busid="0000:66:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
50+
<pci busid="0000:68:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
51+
<gpu dev="2" sm="256" gcn="gfx950" arch="38911" rank="2" gdr="1">
52+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
53+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
54+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
55+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
56+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
57+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
58+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
59+
</gpu>
60+
</pci>
61+
</pci>
62+
</pci>
63+
<pci busid="0000:71:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc02" link_speed="32.0 GT/s PCIe" link_width="16">
64+
<pci busid="0000:73:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
65+
<nic>
66+
<net name="bnxt_re3" dev="3" latency="0" speed="400000" port="1" guid="0x981e3efeffe604d6" maxconn="131073" gdr="1"/>
67+
</nic>
68+
</pci>
69+
<pci busid="0000:76:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
70+
<pci busid="0000:78:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
71+
<gpu dev="3" sm="256" gcn="gfx950" arch="38911" rank="3" gdr="1">
72+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
73+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
74+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
75+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
76+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
77+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
78+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
79+
</gpu>
80+
</pci>
81+
</pci>
82+
</pci>
83+
</cpu>
84+
<cpu host_hash="0x1b2f53a636e182bb" numaid="1" affinity="ffffffff,ffffffff,ffffffff,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="191" modelid="2">
85+
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc01" link_speed="32.0 GT/s PCIe" link_width="16">
86+
<pci busid="0000:83:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
87+
<nic>
88+
<net name="bnxt_re4" dev="4" latency="0" speed="400000" port="1" guid="0x80963ffeffe604d6" maxconn="131073" gdr="1"/>
89+
</nic>
90+
</pci>
91+
<pci busid="0000:86:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
92+
<pci busid="0000:88:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
93+
<gpu dev="4" sm="256" gcn="gfx950" arch="38911" rank="4" gdr="1">
94+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
95+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
96+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
97+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
98+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
99+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
100+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
101+
</gpu>
102+
</pci>
103+
</pci>
104+
</pci>
105+
<pci busid="0000:91:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc01" link_speed="32.0 GT/s PCIe" link_width="16">
106+
<pci busid="0000:93:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
107+
<nic>
108+
<net name="bnxt_re5" dev="5" latency="0" speed="400000" port="1" guid="0x681e3efeffe604d6" maxconn="131073" gdr="1"/>
109+
</nic>
110+
</pci>
111+
<pci busid="0000:96:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
112+
<pci busid="0000:98:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
113+
<gpu dev="5" sm="256" gcn="gfx950" arch="38911" rank="5" gdr="1">
114+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
115+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
116+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
117+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
118+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
119+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
120+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
121+
</gpu>
122+
</pci>
123+
</pci>
124+
</pci>
125+
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc00" link_speed="32.0 GT/s PCIe" link_width="16">
126+
<pci busid="0000:e3:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
127+
<nic>
128+
<net name="bnxt_re6" dev="6" latency="0" speed="400000" port="1" guid="0xd0373efeffe604d6" maxconn="131073" gdr="1"/>
129+
</nic>
130+
</pci>
131+
<pci busid="0000:e6:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
132+
<pci busid="0000:e8:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
133+
<gpu dev="6" sm="256" gcn="gfx950" arch="38911" rank="6" gdr="1">
134+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
135+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
136+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
137+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
138+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
139+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
140+
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
141+
</gpu>
142+
</pci>
143+
</pci>
144+
</pci>
145+
<pci busid="0000:f1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc00" link_speed="32.0 GT/s PCIe" link_width="16">
146+
<pci busid="0000:f3:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
147+
<nic>
148+
<net name="bnxt_re7" dev="7" latency="0" speed="400000" port="1" guid="0xe84a3efeffe604d6" maxconn="131073" gdr="1"/>
149+
</nic>
150+
</pci>
151+
<pci busid="0000:f7:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
152+
<pci busid="0000:f9:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
153+
<gpu dev="7" sm="256" gcn="gfx950" arch="38911" rank="7" gdr="1">
154+
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
155+
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
156+
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
157+
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
158+
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
159+
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
160+
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
161+
</gpu>
162+
</pci>
163+
</pci>
164+
</pci>
165+
</cpu>
166+
</system>
167+

tools/topo_expl/topo_expl.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ NodeModelDesc model_descs[] = {
136136
{"topo_8p_942vm.xml", " 8gfx942 1H7XGMI 8NIC 2Intel B"},
137137
{"topo_16p_gio-1s-1rp-cascade.xml", "16gfx942 2H7XGMI 1NIC 2AMD A"},
138138
{"topo_16p_gio-3s-1rp-split-flat.xml", "16gfx942 2H7XGMI 1NIC 2AMD B"},
139+
// GFX 950
140+
{"topo_8p_950.xml", " 8gfx950 1H7XGMI 8NIC 2AMD A"},
139141
};
140142

141143
NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);

0 commit comments

Comments
 (0)