Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/scripts/ut_result_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
fi
fi
if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
grep "PASSED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
echo -e "========================================================================="
echo -e "Show Failed cases in ${ut_suite} xpu distributed"
echo -e "========================================================================="
Expand Down
44 changes: 43 additions & 1 deletion test/xpu/run_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,41 @@

# libfabric WA to avoid hang issue
os.environ["FI_PROVIDER"] = "tcp"
# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
# Get the xelink group card affinity
import subprocess


def _parse_xelink_groups(topology_lines):
    """Collect, per card, the topology-matrix columns marked self or Xe Link.

    Args:
        topology_lines: Lines of ``xpu-smi topology -m`` output.

    Returns:
        Dict mapping the card id (the text before "/" in a row's device id)
        to a list of column indices, as strings, whose cell contains "XL"
        or "S" but not "SYS". A later row for the same card id replaces an
        earlier one.
    """
    groups = {}
    for raw_line in topology_lines:
        # The header row is the one carrying the "CPU Affinity" column title.
        if "CPU Affinity" in raw_line:
            continue
        row = raw_line.strip()
        if not row.startswith("GPU "):
            continue
        fields = row.split()
        card_id = fields[1].split("/")[0]
        # fields[0:2] are "GPU" and the device id; matrix cells start at
        # fields[2], so the enumerate index is the column's device index.
        # NOTE(review): presumably "S" marks the device itself and "XL*" an
        # Xe Link connection -- confirm against the xpu-smi legend.
        groups[card_id] = [
            str(column)
            for column, cell in enumerate(fields[2:])
            if "SYS" not in cell and ("XL" in cell or "S" in cell)
        ]
    return groups


# Run xpu-smi directly instead of through a `| tee` shell pipeline: the
# pipeline's exit status is tee's, which hid xpu-smi failures.
try:
    topology = subprocess.run(
        ["xpu-smi", "topology", "-m"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        check=False,
    )
except OSError as err:
    # xpu-smi is missing or not executable.
    print(f"xpu-smi topology failed: {err}")
    sys.exit(255)

# Keep the previous side effects: echo the output and save topology.log.
print(topology.stdout, end="")
with open("topology.log", "w") as log_file:
    log_file.write(topology.stdout)

ret = topology.returncode
if ret != 0:
    print("xpu-smi topology failed")
    sys.exit(255)

gpu_dict = _parse_xelink_groups(topology.stdout.splitlines())

# Pick the group with the most devices (first one wins on a tie). Comparing
# member counts avoids favouring groups with multi-digit device indices.
max_affinity = ",".join(max(gpu_dict.values(), key=len, default=[]))

# NOTE(review): when no GPU row is parsed the mask is set to an empty
# string, as before -- confirm that is the desired fallback.
os.environ["ZE_AFFINITY_MASK"] = max_affinity
print("ZE_AFFINITY_MASK=" + os.environ["ZE_AFFINITY_MASK"])


# run python test
Expand All @@ -24,8 +58,16 @@ def run(test_command):
return result.returncode


# The XCCL c10d ops test runs with CCL_SEND / CCL_RECV set to "direct";
# both variables are removed again before the remaining tests start.
ccl_overrides = ("CCL_SEND", "CCL_RECV")
for ccl_var in ccl_overrides:
    os.environ[ccl_var] = "direct"
test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
res += run(test_command)
for ccl_var in ccl_overrides:
    del os.environ[ccl_var]

# Pipelining tests are launched directly with python; each return code is
# accumulated into res.
for pipelining_test in (
    "../../../../test/distributed/pipelining/test_backward.py",
    "../../../../test/distributed/pipelining/test_microbatch.py",
):
    test_command = ["python", pipelining_test]
    res += run(test_command)

# run pytest with skiplist
for key in skip_dict:
Expand Down
9 changes: 9 additions & 0 deletions test/xpu/skip_list_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"test_ddp_parity_xpu",
),
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
"test_delayed_optim_step_offload_false_no_shard_xpu",
"test_delayed_optim_step_offload_false_none_xpu",
Expand Down Expand Up @@ -102,4 +103,12 @@
# will bring back after oneccl upgrade to 2021.16.1
"test_xccl_barrier",
),
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
"../../../../test/distributed/test_functional_api.py": (
# depends on https://github.com/pytorch/pytorch/pull/159473
"test_tracing_with_fakepg_xpu",
),
"../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
"../../../../test/distributed/_tools/test_mem_tracker.py": None,
"../../../../test/distributed/_tools/test_memory_tracker.py": None,
}
Loading