diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
index c6ac2eb2b..f1074aae7 100644
--- a/.github/actions/linux-uttest/action.yml
+++ b/.github/actions/linux-uttest/action.yml
@@ -161,3 +161,4 @@ runs:
         python run_distributed.py \
           2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
           tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+        cp *.xml ${{ github.workspace }}/ut_log
diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh
index cc3131cce..fd17147b7 100644
--- a/.github/scripts/ut_result_check.sh
+++ b/.github/scripts/ut_result_check.sh
@@ -225,7 +225,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
     fi
 fi
 if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
+    # Delete every collected line whose first character is not "."
+    sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
     grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
     echo -e "========================================================================="
     echo -e "Show Failed cases in ${ut_suite} xpu distributed"
diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
index ddde5f8c8..724715ce8 100644
--- a/test/xpu/run_distributed.py
+++ b/test/xpu/run_distributed.py
@@ -11,7 +11,47 @@
 
 # libfabric WA to avoid hang issue
 os.environ["FI_PROVIDER"] = "tcp"
-# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
+# Get the xelink group card affinity.
+# Run the pipeline under bash with pipefail: os.system() otherwise reports only
+# tee's exit status, so a failing xpu-smi would go unnoticed.
+ret = os.system("bash -o pipefail -c 'xpu-smi topology -m 2>&1 | tee topology.log'")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            # Skip lines mentioning "CPU Affinity" (presumably the header row).
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                # Collect the column positions whose entry contains "XL" or "S"
+                # but not "SYS"; j - 2 skips the leading "GPU" and id fields.
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    # Use the longest affinity string found as the device mask.
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)
 
 
 # run python test
@@ -24,8 +64,17 @@ def run(test_command):
     return result.returncode
 
 
+# CCL_SEND/CCL_RECV are set only for test_c10d_ops_xccl.py and removed afterwards.
+os.environ["CCL_SEND"] = "direct"
+os.environ["CCL_RECV"] = "direct"
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+del os.environ["CCL_SEND"]
+del os.environ["CCL_RECV"]
+test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
+res += run(test_command)
 
 # run pytest with skiplist
 for key in skip_dict:
diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py
index 1210896ec..b380a3aaa 100644
--- a/test/xpu/skip_list_dist.py
+++ b/test/xpu/skip_list_dist.py
@@ -10,6 +10,7 @@
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_core.py": (
         "test_delayed_optim_step_offload_false_no_shard_xpu",
         "test_delayed_optim_step_offload_false_none_xpu",
@@ -102,4 +103,12 @@
         # will bring back after oneccl upgrade to 2021.16.1
         "test_xccl_barrier",
     ),
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/test_functional_api.py": (
+        # depends on https://github.com/pytorch/pytorch/pull/159473
+        "test_tracing_with_fakepg_xpu",
+    ),
+    "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_memory_tracker.py": None,
 }