diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2ebdc63a2..8b8ec6b11 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -37,9 +37,8 @@ jobs: source /opt/spack/share/spack/setup-env.sh spack load python~debug@3.9.2%gcc@10.2.1 cuda@11.8.0 cudnn@8.7.0.84-11.8 intel-mkl@2020.4.304 java@11 gcc@11.3.0 source ci-script/prepare-python-environment.sh - # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission - OMP_PROC_BIND=true LD_LIBRARY_PATH=./install/lib:$LD_LIBRARY_PATH PYTHONPATH=install/lib:python:$PYTHONPATH srun --exclusive -N 1 -p gpu pytest --color=yes test - build-and-test-gcc-minimal-run_in_tree: + LD_LIBRARY_PATH=./install/lib:$LD_LIBRARY_PATH PYTHONPATH=install/lib:python:$PYTHONPATH srun --exclusive -N 1 -p gpu pytest --color=yes test + build-and-test-minimal-run_in_tree: runs-on: self-hosted if: github.event.pull_request.draft == false steps: @@ -64,8 +63,7 @@ jobs: source /opt/spack/share/spack/setup-env.sh spack load python~debug@3.9.2%gcc@10.2.1 java@11 gcc@12.1.0 source ci-script/prepare-python-environment.sh - # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission - OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test + PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test build-and-test-clang-run-in-tree: runs-on: self-hosted if: github.event.pull_request.draft == false @@ -91,5 +89,4 @@ jobs: source /opt/spack/share/spack/setup-env.sh spack load python~debug@3.9.2%gcc@10.2.1 java@11 gcc@11.3.0 llvm@16.0.0 source ci-script/prepare-python-environment.sh - # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission - OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test + PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 
-p cpu pytest --color=yes test diff --git a/ffi/driver.cc b/ffi/driver.cc index 4b1305b0c..01130cd14 100644 --- a/ffi/driver.cc +++ b/ffi/driver.cc @@ -30,7 +30,7 @@ void init_ffi_driver(py::module_ &m) { .def("run", &Driver::run) .def("sync", &Driver::sync) .def("collect_returns", &Driver::collectReturns) - .def("time", &Driver::time, "rounds"_a = 10, "warmpups"_a = 3) + .def("time", &Driver::time, "rounds"_a = 10, "warmups"_a = 3) .def_property_readonly("device", &Driver::device); // Serialization diff --git a/python/freetensor/core/driver.py b/python/freetensor/core/driver.py index babb5e8ff..fe1db6c13 100644 --- a/python/freetensor/core/driver.py +++ b/python/freetensor/core/driver.py @@ -300,6 +300,22 @@ def __call__(self, *args, **kws): self.run() return self.collect_returns() + def time(self, *args, kws={}, rounds=10, warmups=3): + ''' + Measure running time. The return is dropped. + + Returns + ------- + Tuple[float, float] + - [0] = average time, in ms + - [1] = estimated standard deviation of the average time = sqrt(Var(X1 + + X2 + ... + Xn))), in ms + ''' + self.set_args(*args, **kws) + t = super().time(rounds=rounds, warmups=warmups) + self.collect_returns() # Must collect. 
Then we drop the result + return t + @as_decorator def build_binary(code: Optional[NativeCode] = None, diff --git a/src/driver.cc b/src/driver.cc index f36c85b64..a5c4eea7f 100644 --- a/src/driver.cc +++ b/src/driver.cc @@ -5,6 +5,7 @@ #include // memset #include // dlopen #include +#include #include // mkdir #include // SYS_fork #include // waitpid @@ -497,6 +498,9 @@ std::vector> Driver::collectReturns() { } std::pair Driver::time(int rounds, int warmups) { + // Restart OpenMP for a more reproducible result + omp_pause_resource_all(omp_pause_hard); + namespace ch = std::chrono; std::vector times(rounds); diff --git a/src/schedule.cc b/src/schedule.cc index bb335f036..98294d510 100644 --- a/src/schedule.cc +++ b/src/schedule.cc @@ -133,7 +133,18 @@ std::vector Schedule::tuneAutoSchedule( auto &[trace, _1, _2, t, stddev] = trials[i * batchSize + j]; d.setArgs(args, kws); // TODO: Allow setting measuring repeats - std::tie(t, stddev) = d.time(); + for (int k = 0; k < 3; k++) { + std::tie(t, stddev) = d.time(); + if (stddev <= 0.1 * t) { + break; + } + if (k < 2) { + WARNING( + "Rerunning measurement because stddev is too high"); + } else { + WARNING("Cannot get a low enough stddev"); + } + } d.collectReturns(); randCtx_->observeTrace(trace, t, stddev); } diff --git a/test/31.auto_schedule/test_auto_fission_fuse.py b/test/31.auto_schedule/test_auto_fission_fuse.py index c4709b552..96c2ea810 100644 --- a/test/31.auto_schedule/test_auto_fission_fuse.py +++ b/test/31.auto_schedule/test_auto_fission_fuse.py @@ -91,8 +91,10 @@ def test_stmt_in_between_2(): def test_tune_fuse(): - # We may fuse these loops. But fusing them will make it impossible to parallelize. - # After tuning, we will end up in not fusing them + # Plan 1: Fuse these loops, which makes it impossible to parallelize + # Plan 2: Not fusing these loops, then we can parallelize them + # We should decide on real measurement. 
(Parallelization does not always bring speedup, + # especially when there are too many cores). with ft.VarDef([("a", (100, 100, 100), "int32", "input", "cpu"), ("b", (100, 100, 100), "int32", "inout", "cpu"), ("c", (100, 100, 100), "int32", "inout", "cpu")]) as (a, b, @@ -129,18 +131,40 @@ def test_tune_fuse(): logs = list(map(str, s.logs())) print(logs) - for log in logs: - assert "fuse" not in log + s_plan1 = ft.Schedule(func) + s_plan1.fuse("Li1", "Li2") + s_plan1.fuse("Lj1", "Lj2") + s_plan1.fuse("Lk1", "Lk2") + + s_plan2 = ft.Schedule(func) + s_plan2.parallelize("Li1", "openmp") + s_plan2.reorder(["Lj2", "Li2"]) + s_plan2.parallelize("Lj2", "openmp") + + exe1 = ft.optimize(s_plan1.func()) + exe2 = ft.optimize(s_plan2.func()) + for i in range(3): + t1, stddev1 = exe1.time(a, b, c) + if stddev1 <= 0.1 * t1: + break + print("Rerunning because stddev is too high") + for i in range(3): + t2, stddev2 = exe2.time(a, b, c) + if stddev2 <= 0.1 * t2: + break + print("Rerunning because stddev is too high") + print(f"t1 = {t1}ms, stddev1 = {stddev1}ms") + print(f"t2 = {t2}ms, stddev2 = {stddev2}ms") + if (t1 < t2): + assert "fuse" in ", ".join(logs) + else: + assert "fuse" not in ", ".join(logs) def test_tune_fission(): # The reverse schedule of `test_tune_fuse` - # NOTE 1: To pass this test, the OpenMP parallel version must run faster than - # the serial version. However, this is not always true for unknown reasons. - # Set OMP_PROC_BIND=true can mitigate the problem. - - # NOTE 2: Library conflict with PyTorch (#421) may break this test. (FIXME) + # NOTE: Library conflict with PyTorch (#421) may break this test. 
(FIXME) with ft.VarDef([("a", (100, 100, 100), "int32", "input", "cpu"), ("b", (100, 100, 100), "int32", "inout", "cpu"), @@ -149,6 +173,7 @@ def test_tune_fission(): with ft.For("i", 0, 100, label="Li") as i: with ft.For("j", 0, 100, label="Lj") as j: with ft.For("k", 0, 100, label="Lk") as k: + ft.MarkLabel("S0") b[i, j, k] = b[i, (j + 1) % 100, k] + b[i, j, @@ -175,11 +200,40 @@ def test_tune_fission(): logs = list(map(str, s.logs())) print(logs) - assert "fission" in ", ".join(logs) + s_plan1 = ft.Schedule(func) + s_plan1.fission("Lk", ft.FissionSide.After, "S0") + s_plan1.fission("Lj", ft.FissionSide.After, "$fission.0{S0}") + s_plan1.fission("Li", ft.FissionSide.After, "$fission.0{$fission.0{S0}}") + s_plan1.parallelize("$fission.0{Li}", "openmp") + s_plan1.reorder(["$fission.1{$fission.1{Lj}}", "$fission.1{Li}"]) + s_plan1.parallelize("$fission.1{$fission.1{Lj}}", "openmp") + + s_plan2 = ft.Schedule(func) + # Do nothing + + exe1 = ft.optimize(s_plan1.func()) + exe2 = ft.optimize(s_plan2.func()) + for i in range(3): + t1, stddev1 = exe1.time(a, b, c) + if stddev1 <= 0.1 * t1: + break + print("Rerunning because stddev is too high") + for i in range(3): + t2, stddev2 = exe2.time(a, b, c) + if stddev2 <= 0.1 * t2: + break + print("Rerunning because stddev is too high") + print(f"t1 = {t1}ms, stddev1 = {stddev1}ms") + print(f"t2 = {t2}ms, stddev2 = {stddev2}ms") + if (t1 < t2): + assert "fission" in ", ".join(logs) + else: + assert "fission" not in ", ".join(logs) @pytest.mark.skipif(not ft.with_cuda(), reason="requires CUDA") def test_tune_with_cond(): + # Test different decisions in a single program # Fuse loops that can parallelize. Don't fuse loops that can't with ft.VarDef([("a", (100, 100, 10), "int32", "input", "gpu/global"), ("b", (100, 100, 10), "int32", "inout", "gpu/global"),