 
 import pytest
 import torch
-from tests import helper
 
-import tripy as tp
-
-
-def perf_fixture(dtypes):
-    def perf_fixture_impl(func):
-        @pytest.fixture(params=dtypes, scope="session")
-        def wrapped(request):
-            tripy_module, torch_module, input_infos = func(request.param, helper.TORCH_DTYPES[request.param])
-
-            torch_state_dict = {key: torch.from_dlpack(value) for key, value in tripy_module.state_dict().items()}
-            torch_module.load_state_dict(torch_state_dict)
-
-            compiler = tp.Compiler(tripy_module)
-            tripy_compiled = compiler.compile(**input_infos)
-
-            inputs = {
-                key: tp.iota(input_info.shape_bounds.opt, dtype=request.param)
-                for key, input_info in input_infos.items()
-            }
-            for tensor in inputs.values():
-                tensor.eval()
-
-            torch_compiled = torch.compile(torch_module)
+# Importing the cases module populates PERF_CASES and registers the pytest fixtures:
+from tests.performance.cases import *
+from tests.performance.conftest import PERF_CASES
 
-            return tripy_compiled, torch_compiled, inputs
-
-        return wrapped
-
-    return perf_fixture_impl
-
-
-# TODO: File issue for FP32:
-@perf_fixture(dtypes=[pytest.param(tp.float32, marks=pytest.mark.skip("Bug in MLIR-TRT")), tp.float16])
-def linear_block(tripy_dtype, torch_dtype):
+import tripy as tp
 
-    class LinearBlock(tp.Module):
-        def __init__(self):
-            self.layers = [tp.Linear(256, 256, bias=False, dtype=tripy_dtype) for _ in range(10)]
-            for layer in self.layers:
-                # Adjust the weights to prevent FP16 overflows.
-                weight = torch.tile(
-                    torch.tensor([[-1, 1], [1, -1]], dtype=torch_dtype, device=torch.device("cuda")), (128, 128)
-                )
-                layer.weight = tp.Parameter(weight)
 
-        def __call__(self, input):
-            for layer in self.layers:
-                input = layer(input)
-            return input
+@pytest.mark.parametrize("perf_case", PERF_CASES)
+def test_perf_regression(perf_case, benchmark):
+    compiled_tripy_module, _, inputs = perf_case
 
-    class TorchLinearBlock(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layers = torch.nn.ModuleList(
-                [
-                    torch.nn.Linear(256, 256, bias=False, dtype=torch_dtype, device=torch.device("cuda"))
-                    for _ in range(10)
-                ]
-            )
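+    # The benchmark fixture (from pytest-benchmark) calibrates and times this call itself: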
+    benchmark(compiled_tripy_module, **inputs)
 
-        def forward(self, input):
-            for layer in self.layers:
-                input = layer(input)
-            return input
 
-    tripy_block = LinearBlock()
-    torch_block = TorchLinearBlock()
-    input_infos = {"input": tp.InputInfo(shape=(1024, 256), dtype=tripy_dtype)}
-    return tripy_block, torch_block, input_infos
+@pytest.mark.parametrize("perf_case", PERF_CASES)
+def test_perf_comparative(perf_case):
+    compiled_tripy_module, compiled_torch_module, inputs = perf_case
 
+    WARM_UP_RUNS = 2
+    ITERATIONS = 100
 
-def test_perf_regression(linear_block, benchmark):
-    compiled_tripy_module, _, inputs = linear_block
+    # Time Tripy
+    stream = tp.default_stream()
 
-    benchmark(compiled_tripy_module, **inputs)
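+    # Warm up so one-time costs (allocations, caching) don't skew the timed runs: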
+    for _ in range(WARM_UP_RUNS):
+        compiled_tripy_module(**inputs)
+    stream.synchronize()
 
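+    # Execution is asynchronous, so synchronize the stream before reading the host clock: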
+    start = time.perf_counter()
+    for _ in range(ITERATIONS):
+        tripy_out = compiled_tripy_module(**inputs)
+    stream.synchronize()
+    end = time.perf_counter()
 
-def test_perf_comparative(linear_block):
-    compiled_tripy_module, compiled_torch_module, inputs = linear_block
+    # The CUDA events below report milliseconds, so convert to match:
+    tripy_time = (end - start) * 1000
 
-    def time_func(func, kwargs, warm_up_runs=2, iterations=100):
-        for _ in range(warm_up_runs):
-            func(**kwargs)
+    # Time Torch
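+    # Share the same input buffers with Torch via DLPack so both modules see identical data: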
+    torch_inputs = {key: torch.from_dlpack(value).to(device="cuda") for key, value in inputs.items()}
 
-        start = time.perf_counter()
-        for _ in range(iterations):
-            out = func(**kwargs)
-        end = time.perf_counter()
+    with torch.no_grad():
+        for _ in range(WARM_UP_RUNS):
+            compiled_torch_module(**torch_inputs)
+        torch.cuda.synchronize()
 
-        return out, end - start
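+        # CUDA events time the GPU work directly, excluding host-side launch overhead: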
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
 
-    tripy_out, tripy_time = time_func(compiled_tripy_module, inputs)
+        start.record()
+        for _ in range(ITERATIONS):
+            torch_out = compiled_torch_module(**torch_inputs)
+        end.record()
+        torch.cuda.synchronize()
 
-    # TODO: Figure out how to time torch more accurately:
-    torch_out, torch_time = time_func(
-        compiled_torch_module, {key: torch.from_dlpack(value) for key, value in inputs.items()}
-    )
+        torch_time = start.elapsed_time(end)
 
 # If the outputs don't match, then we're either not comparing apples-to-apples
 # or there is an accuracy bug somewhere - either way we want to catch it here.
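For reference, the removed linear_block fixture presumably moves into the new tests/performance/cases/ package. Below is a minimal sketch of what such a case module might look like, assuming perf_fixture moved to tests/performance/conftest.py largely unchanged and still returns the (tripy_module, torch_module, input_infos) triple that both tests unpack; the file path, the import location of perf_fixture, and the omission of the FP16 weight adjustment are all assumptions, not part of this commit.

# Hypothetical tests/performance/cases/linear_block.py -- path and helper locations assumed.
import torch

import tripy as tp
from tests.performance.conftest import perf_fixture  # assumed new home of the decorator


@perf_fixture(dtypes=[tp.float16])
def linear_block(tripy_dtype, torch_dtype):
    # The same module pair the old fixture built inline: a stack of ten 256x256 linears.
    class LinearBlock(tp.Module):
        def __init__(self):
            self.layers = [tp.Linear(256, 256, bias=False, dtype=tripy_dtype) for _ in range(10)]

        def __call__(self, input):
            for layer in self.layers:
                input = layer(input)
            return input

    class TorchLinearBlock(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layers = torch.nn.ModuleList(
                [torch.nn.Linear(256, 256, bias=False, dtype=torch_dtype, device="cuda") for _ in range(10)]
            )

        def forward(self, input):
            for layer in self.layers:
                input = layer(input)
            return input

    input_infos = {"input": tp.InputInfo(shape=(1024, 256), dtype=tripy_dtype)}
    return LinearBlock(), TorchLinearBlock(), input_infos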