roastduck · roastduck · May 5, 2023 · May 5, 2023 · May 5, 2023 · May 18, 2023
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -37,9 +37,8 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load python~debug@3.9.2%gcc@10.2.1 cuda@11.8.0 cudnn@8.7.0.84-11.8 intel-mkl@2020.4.304 java@11 gcc@11.3.0
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          OMP_PROC_BIND=true LD_LIBRARY_PATH=./install/lib:$LD_LIBRARY_PATH PYTHONPATH=install/lib:python:$PYTHONPATH srun --exclusive -N 1 -p gpu pytest --color=yes test
-  build-and-test-gcc-minimal-run_in_tree:
+          LD_LIBRARY_PATH=./install/lib:$LD_LIBRARY_PATH PYTHONPATH=install/lib:python:$PYTHONPATH srun --exclusive -N 1 -p gpu pytest --color=yes test
+  build-and-test-minimal-run_in_tree:
     runs-on: self-hosted
     if: github.event.pull_request.draft == false
     steps:
@@ -64,8 +63,7 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load python~debug@3.9.2%gcc@10.2.1 java@11 gcc@12.1.0
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test
+          PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test
   build-and-test-clang-run-in-tree:
     runs-on: self-hosted
     if: github.event.pull_request.draft == false
@@ -91,5 +89,4 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load python~debug@3.9.2%gcc@10.2.1 java@11 gcc@11.3.0 llvm@16.0.0
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test
+          PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p cpu pytest --color=yes test
diff --git a/ffi/driver.cc b/ffi/driver.cc
@@ -30,7 +30,7 @@ void init_ffi_driver(py::module_ &m) {
         .def("run", &Driver::run)
         .def("sync", &Driver::sync)
         .def("collect_returns", &Driver::collectReturns)
-        .def("time", &Driver::time, "rounds"_a = 10, "warmpups"_a = 3)
+        .def("time", &Driver::time, "rounds"_a = 10, "warmups"_a = 3)
         .def_property_readonly("device", &Driver::device);
 
     // Serialization

diff --git a/python/freetensor/core/driver.py b/python/freetensor/core/driver.py
@@ -300,6 +300,22 @@ def __call__(self, *args, **kws):
         self.run()
         return self.collect_returns()
 
+    def time(self, *args, kws=None, rounds=10, warmups=3):
+        '''
+        Measure running time. The return is dropped.
+
+        Returns
+        -------
+        Tuple[float, float]
+            - [0] = average time, in ms
+            - [1] = estimated standard deviation of the average time = sqrt(Var(X1 +
+            X2 + ... + Xn)), in ms
+        '''
+        self.set_args(*args, **(kws or {}))  # `kws` is a dict of keyword arguments
+        t = super().time(rounds=rounds, warmups=warmups)
+        self.collect_returns()  # Must collect. Then we drop the result
+        return t
+
 
 @as_decorator
 def build_binary(code: Optional[NativeCode] = None,

diff --git a/src/driver.cc b/src/driver.cc
@@ -5,6 +5,7 @@
 #include <cstring> // memset
 #include <dlfcn.h> // dlopen
 #include <fstream>
+#include <omp.h>
 #include <sys/stat.h>    // mkdir
 #include <sys/syscall.h> // SYS_fork
 #include <sys/wait.h>    // waitpid
@@ -497,6 +498,9 @@ std::vector<Ref<Array>> Driver::collectReturns() {
 }
 
 std::pair<double, double> Driver::time(int rounds, int warmups) {
+    // Restart OpenMP for a more reproducible result
+    omp_pause_resource_all(omp_pause_hard);
+
     namespace ch = std::chrono;
 
     std::vector<double> times(rounds);

diff --git a/src/schedule.cc b/src/schedule.cc
@@ -133,7 +133,18 @@ std::vector<AutoScheduleTuneTrial> Schedule::tuneAutoSchedule(
                 auto &[trace, _1, _2, t, stddev] = trials[i * batchSize + j];
                 d.setArgs(args, kws);
                 // TODO: Allow setting measuring repeats
-                std::tie(t, stddev) = d.time();
+                for (int k = 0; k < 3; k++) {
+                    std::tie(t, stddev) = d.time();
+                    if (stddev <= 0.1 * t) {
+                        break;
+                    }
+                    if (k < 2) {
+                        WARNING(
+                            "Rerunning measurement because stddev is too high");
+                    } else {
+                        WARNING("Cannot get a low enough stddev");
+                    }
+                }
                 d.collectReturns();
                 randCtx_->observeTrace(trace, t, stddev);
             }

diff --git a/test/31.auto_schedule/test_auto_fission_fuse.py b/test/31.auto_schedule/test_auto_fission_fuse.py
@@ -91,8 +91,10 @@ def test_stmt_in_between_2():
 
 
 def test_tune_fuse():
-    # We may fuse these loops. But fusing them will make it impossible to parallelize.
-    # After tuning, we will end up in not fusing them
+    # Plan 1: Fuse these loops, which makes it impossible to parallelize
+    # Plan 2: Do not fuse these loops, so that we can parallelize them
+    # We should decide based on real measurement. (Parallelization does not always
+    # bring speedup, especially when there are too many cores).
     with ft.VarDef([("a", (100, 100, 100), "int32", "input", "cpu"),
                     ("b", (100, 100, 100), "int32", "inout", "cpu"),
                     ("c", (100, 100, 100), "int32", "inout", "cpu")]) as (a, b,
@@ -129,18 +131,40 @@ def test_tune_fuse():
     logs = list(map(str, s.logs()))
     print(logs)
 
-    for log in logs:
-        assert "fuse" not in log
+    s_plan1 = ft.Schedule(func)
+    s_plan1.fuse("Li1", "Li2")
+    s_plan1.fuse("Lj1", "Lj2")
+    s_plan1.fuse("Lk1", "Lk2")
+
+    s_plan2 = ft.Schedule(func)
+    s_plan2.parallelize("Li1", "openmp")
+    s_plan2.reorder(["Lj2", "Li2"])
+    s_plan2.parallelize("Lj2", "openmp")
+
+    exe1 = ft.optimize(s_plan1.func())
+    exe2 = ft.optimize(s_plan2.func())
+    for i in range(3):
+        t1, stddev1 = exe1.time(a, b, c)
+        if stddev1 <= 0.1 * t1:
+            break
+        print("Rerunning because stddev is too high")
+    for i in range(3):
+        t2, stddev2 = exe2.time(a, b, c)
+        if stddev2 <= 0.1 * t2:
+            break
+        print("Rerunning because stddev is too high")
+    print(f"t1 = {t1}ms, stddev1 = {stddev1}ms")
+    print(f"t2 = {t2}ms, stddev2 = {stddev2}ms")
+    if (t1 < t2):
+        assert "fuse" in ", ".join(logs)
+    else:
+        assert "fuse" not in ", ".join(logs)
 
 
 def test_tune_fission():
     # The reverse schedule of `test_tune_fuse`
 
-    # NOTE 1: To pass this test, the OpenMP parallel version must run faster than
-    # the serial version. However, this is not always true for unknown reasons.
-    # Set OMP_PROC_BIND=true can mitigate the problem.
-
-    # NOTE 2: Library conflict with PyTorch (#421) may break this test. (FIXME)
+    # NOTE: Library conflict with PyTorch (#421) may break this test. (FIXME)
 
     with ft.VarDef([("a", (100, 100, 100), "int32", "input", "cpu"),
                     ("b", (100, 100, 100), "int32", "inout", "cpu"),
@@ -149,6 +173,7 @@ def test_tune_fission():
         with ft.For("i", 0, 100, label="Li") as i:
             with ft.For("j", 0, 100, label="Lj") as j:
                 with ft.For("k", 0, 100, label="Lk") as k:
+                    ft.MarkLabel("S0")
                     b[i, j,
                       k] = b[i,
                              (j + 1) % 100, k] + b[i, j,
@@ -175,11 +200,40 @@ def test_tune_fission():
     logs = list(map(str, s.logs()))
     print(logs)
 
-    assert "fission" in ", ".join(logs)
+    s_plan1 = ft.Schedule(func)
+    s_plan1.fission("Lk", ft.FissionSide.After, "S0")
+    s_plan1.fission("Lj", ft.FissionSide.After, "$fission.0{S0}")
+    s_plan1.fission("Li", ft.FissionSide.After, "$fission.0{$fission.0{S0}}")
+    s_plan1.parallelize("$fission.0{Li}", "openmp")
+    s_plan1.reorder(["$fission.1{$fission.1{Lj}}", "$fission.1{Li}"])
+    s_plan1.parallelize("$fission.1{$fission.1{Lj}}", "openmp")
+
+    s_plan2 = ft.Schedule(func)
+    # Do nothing
+
+    exe1 = ft.optimize(s_plan1.func())
+    exe2 = ft.optimize(s_plan2.func())
+    for i in range(3):
+        t1, stddev1 = exe1.time(a, b, c)
+        if stddev1 <= 0.1 * t1:
+            break
+        print("Rerunning because stddev is too high")
+    for i in range(3):
+        t2, stddev2 = exe2.time(a, b, c)
+        if stddev2 <= 0.1 * t2:
+            break
+        print("Rerunning because stddev is too high")
+    print(f"t1 = {t1}ms, stddev1 = {stddev1}ms")
+    print(f"t2 = {t2}ms, stddev2 = {stddev2}ms")
+    if (t1 < t2):
+        assert "fission" in ", ".join(logs)
+    else:
+        assert "fission" not in ", ".join(logs)
 
 
 @pytest.mark.skipif(not ft.with_cuda(), reason="requires CUDA")
 def test_tune_with_cond():
+    # Test different decisions in a single program
     # Fuse loops that can parallelize. Don't fuse loops that can't
     with ft.VarDef([("a", (100, 100, 10), "int32", "input", "gpu/global"),
                     ("b", (100, 100, 10), "int32", "inout", "gpu/global"),