# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import time
+
import pytest
import torch
+from tests import helper

import tripy as tp
-from tests import helper


# TODO: File issue for FP32:
@@ -25,39 +27,52 @@ def linear_block(request):
    class LinearBlock(tp.Module):
        def __init__(self):
            self.layers = [tp.Linear(256, 256, bias=False, dtype=request.param) for _ in range(10)]
+            for layer in self.layers:
+                # Adjust the weights to prevent FP16 overflows.
+                layer.weight = tp.Parameter((tp.iota((256, 256), dim=1, dtype=request.param) / 256.0) - 0.5)

        def __call__(self, input):
            for layer in self.layers:
                input = layer(input)
+                print(torch.from_dlpack(input))
            return input

    class TorchLinearBlock(torch.nn.Module):
        def __init__(self):
            super().__init__()
-            self.layers = [
-                torch.nn.Linear(256, 256, bias=False, dtype=helper.TORCH_DTYPES[request.param]) for _ in range(10)
-            ]
+            dtype = helper.TORCH_DTYPES[request.param]
+            self.layers = torch.nn.ModuleList(
+                [torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=torch.device("cuda")) for _ in range(10)]
+            )

        def forward(self, input):
            for layer in self.layers:
                input = layer(input)
+                print(input)
            return input

    tripy_block = LinearBlock()
    torch_block = TorchLinearBlock()

-    tripy_block.load_from_state_dict(state_dict={key: tp.Parameter(value) for key, value in torch_block.state_dict()})
+    torch_state_dict = {key: torch.from_dlpack(value) for key, value in tripy_block.state_dict().items()}
+    torch_block.load_state_dict(torch_state_dict)

    input_infos = {"input": tp.InputInfo(shape=(1024, 256), dtype=request.param)}

-    compiler = tp.Compiler(tripy_block)
-    tripy_compiled = compiler.compile(**input_infos)
+    # compiler = tp.Compiler(tripy_block)
+    # tripy_compiled = compiler.compile(**input_infos)
+    tripy_compiled = tripy_block

-    inputs = {key: tp.iota(input_info.shape_bounds.opt, dtype=request.param) for key, input_info in input_infos.items()}
+    inputs = {
+        key: tp.iota(input_info.shape_bounds.opt, dtype=request.param) / 100.0
+        for key, input_info in input_infos.items()
+    }
    for tensor in inputs.values():
        tensor.eval()

-    return tripy_compiled, torch_block, inputs
+    torch_compiled = torch.compile(torch_block)
+
+    return tripy_compiled, torch_compiled, inputs


def test_perf_regression(linear_block, benchmark):
@@ -67,8 +82,32 @@ def test_perf_regression(linear_block, benchmark):


def test_perf_comparative(linear_block):
-    compiled_tripy_module, torch_module, inputs = linear_block
+    compiled_tripy_module, compiled_torch_module, inputs = linear_block
+
+    # TODO: Change to 100:
+    NUM_ITERS = 1
+
+    # TODO: Add warm-up runs, factor out into a function.
+    start = time.perf_counter()
+    for _ in range(NUM_ITERS):
+        tripy_out = compiled_tripy_module(**inputs)
+    end = time.perf_counter()
+
+    tripy_time = end - start
+
+    start = time.perf_counter()
+    for _ in range(NUM_ITERS):
+        torch_out = compiled_torch_module(**{key: torch.from_dlpack(value) for key, value in inputs.items()})
+    end = time.perf_counter()
+
+    torch_time = end - start

-    # TODO: Check accuracy - update fixture to make weights same
+    # If the outputs don't match, then we're either not comparing apples-to-apples
+    # or there is an accuracy bug somewhere - either way we want to catch it here.
+    # TODO: Adjust tolerance per test?
+    # TODO: File accuracy bug? Check if delta is within expected FP16 error - maybe check CUDA vs. torch CPU.
+    assert torch.allclose(torch_out, torch.from_dlpack(tripy_out), atol=0.01)

-    # TODO: Compare perf after compiling? Maybe compile in fixture
+    # Check that Tripy inference is at least 5% faster
+    print(f"Tripy was {torch_time / float(tripy_time)}x faster than Torch")
+    assert (tripy_time * 1.05) < torch_time
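The warm-up TODO above could be handled by factoring the two timing loops into a small helper along these lines. This is only a sketch: `run_timed`, `num_warmup`, and the `sync` callable are hypothetical names rather than anything that exists in this repo, and it assumes the modules run on CUDA, so a synchronization step is needed before reading the timer.

```python
import time


def run_timed(module, inputs, num_warmup=5, num_iters=100, sync=None):
    # Hypothetical helper for the "add warm-up runs, factor out into a function" TODO.
    # `sync` is an optional callable (e.g. torch.cuda.synchronize) invoked before each
    # timer read so that asynchronous GPU work has actually finished when we measure.
    for _ in range(num_warmup):
        out = module(**inputs)

    if sync is not None:
        sync()
    start = time.perf_counter()
    for _ in range(num_iters):
        out = module(**inputs)
    if sync is not None:
        sync()
    end = time.perf_counter()

    return out, end - start
```

In `test_perf_comparative` this could replace the two hand-rolled loops, e.g. `torch_out, torch_time = run_timed(compiled_torch_module, torch_inputs, sync=torch.cuda.synchronize)`, where `torch_inputs` is the `torch.from_dlpack` dict currently built inline. For the Tripy module, the output would likely also need to be forced inside the timed loop (e.g. via `.eval()`, as the fixture already does for its inputs) if execution is lazy; otherwise the loop mostly measures graph construction rather than inference.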