 import tripy as tp


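+# Note: `perf_fixture` below is a decorator factory. Given a list of dtypes, it wraps a
+# function returning (tripy_module, torch_module, input_infos) into a session-scoped,
+# dtype-parametrized pytest fixture that copies the Tripy weights into the Torch module,
+# compiles both modules, and pre-evaluates the inputs.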
+def perf_fixture(dtypes):
+    def perf_fixture_impl(func):
+        @pytest.fixture(params=dtypes, scope="session")
+        def wrapped(request):
+            tripy_module, torch_module, input_infos = func(request.param, helper.TORCH_DTYPES[request.param])
+
+            torch_state_dict = {key: torch.from_dlpack(value) for key, value in tripy_module.state_dict().items()}
+            torch_module.load_state_dict(torch_state_dict)
+
+            compiler = tp.Compiler(tripy_module)
+            tripy_compiled = compiler.compile(**input_infos)
+
+            inputs = {
+                key: tp.iota(input_info.shape_bounds.opt, dtype=request.param)
+                for key, input_info in input_infos.items()
+            }
+            for tensor in inputs.values():
+                tensor.eval()
+
+            torch_compiled = torch.compile(torch_module)
+
+            return tripy_compiled, torch_compiled, inputs
+
+        return wrapped
+
+    return perf_fixture_impl
+
+
 # TODO: File issue for FP32:
-@pytest.fixture(params=[pytest.param(tp.float32, marks=pytest.mark.skip("Bug in MLIR-TRT")), tp.float16])
-def linear_block(request):
+@perf_fixture(dtypes=[pytest.param(tp.float32, marks=pytest.mark.skip("Bug in MLIR-TRT")), tp.float16])
+def linear_block(tripy_dtype, torch_dtype):
+
     class LinearBlock(tp.Module):
         def __init__(self):
-            self.layers = [tp.Linear(256, 256, bias=False, dtype=request.param) for _ in range(10)]
+            self.layers = [tp.Linear(256, 256, bias=False, dtype=tripy_dtype) for _ in range(10)]
             for layer in self.layers:
                 # Adjust the weights to prevent FP16 overflows.
-                layer.weight = tp.Parameter((tp.iota((256, 256), dim=1, dtype=request.param) / 256.0) - 0.5)
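+                # Tiling the 2x2 block [[-1, 1], [1, -1]] yields a 256x256 checkerboard of +/-1
+                # whose rows sum to zero, which keeps the layer outputs small in FP16.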
+                weight = torch.tile(
+                    torch.tensor([[-1, 1], [1, -1]], dtype=torch_dtype, device=torch.device("cuda")), (128, 128)
+                )
+                layer.weight = tp.Parameter(weight)

         def __call__(self, input):
             for layer in self.layers:
                 input = layer(input)
-                print(torch.from_dlpack(input))
             return input

     class TorchLinearBlock(torch.nn.Module):
         def __init__(self):
             super().__init__()
-            dtype = helper.TORCH_DTYPES[request.param]
             self.layers = torch.nn.ModuleList(
-                [torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=torch.device("cuda")) for _ in range(10)]
+                [
+                    torch.nn.Linear(256, 256, bias=False, dtype=torch_dtype, device=torch.device("cuda"))
+                    for _ in range(10)
+                ]
             )

         def forward(self, input):
             for layer in self.layers:
                 input = layer(input)
-                print(input)
             return input

     tripy_block = LinearBlock()
     torch_block = TorchLinearBlock()
-
-    torch_state_dict = {key: torch.from_dlpack(value) for key, value in tripy_block.state_dict().items()}
-    torch_block.load_state_dict(torch_state_dict)
-
-    input_infos = {"input": tp.InputInfo(shape=(1024, 256), dtype=request.param)}
-
-    # compiler = tp.Compiler(tripy_block)
-    # tripy_compiled = compiler.compile(**input_infos)
-    tripy_compiled = tripy_block
-
-    inputs = {
-        key: tp.iota(input_info.shape_bounds.opt, dtype=request.param) / 100.0
-        for key, input_info in input_infos.items()
-    }
-    for tensor in inputs.values():
-        tensor.eval()
-
-    torch_compiled = torch.compile(torch_block)
-
-    return tripy_compiled, torch_compiled, inputs
+    input_infos = {"input": tp.InputInfo(shape=(1024, 256), dtype=tripy_dtype)}
+    return tripy_block, torch_block, input_infos


 def test_perf_regression(linear_block, benchmark):
@@ -84,30 +98,29 @@ def test_perf_regression(linear_block, benchmark):
 def test_perf_comparative(linear_block):
     compiled_tripy_module, compiled_torch_module, inputs = linear_block

-    # TODO: Change to 100:
-    NUM_ITERS = 1
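+    # Time `func(**kwargs)` over `iterations` calls, after a few warm-up runs that are
+    # excluded from the measurement; returns the last output and the elapsed wall time.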
+    def time_func(func, kwargs, warm_up_runs=2, iterations=100):
+        for _ in range(warm_up_runs):
+            func(**kwargs)

-    # TODO: Add warm-up runs, factor out into function.
-    start = time.perf_counter()
-    for _ in range(NUM_ITERS):
-        tripy_out = compiled_tripy_module(**inputs)
-    end = time.perf_counter()
+        start = time.perf_counter()
+        for _ in range(iterations):
+            out = func(**kwargs)
+        end = time.perf_counter()

-    tripy_time = end - start
+        return out, end - start

-    start = time.perf_counter()
-    for _ in range(NUM_ITERS):
-        torch_out = compiled_torch_module(**{key: torch.from_dlpack(value) for key, value in inputs.items()})
-    end = time.perf_counter()
+    tripy_out, tripy_time = time_func(compiled_tripy_module, inputs)

-    torch_time = end - start
+    # TODO: Figure out how to time torch more accurately:
+    torch_out, torch_time = time_func(
+        compiled_torch_module, {key: torch.from_dlpack(value) for key, value in inputs.items()}
+    )

     # If the outputs don't match, then we're either not comparing apples-to-apples
     # or there is an accuracy bug somewhere - either way we want to catch it here.
-    # TODO: Adjust tolerance per test?
-    # TODO: File accuracy bug? Check if delta is within expected FP16 error - maybe check CUDA vs. torch CPU.
-    assert torch.allclose(torch_out, torch.from_dlpack(tripy_out), atol=0.01)
+    assert torch.allclose(torch_out, torch.from_dlpack(tripy_out))

+    # TODO: Make this threshold adjustable
     # Check that Tripy inference is at least 5% faster
     print(f"Tripy was {torch_time / float(tripy_time)}x faster than Torch")
     assert (tripy_time * 1.05) < torch_time