
Commit 02f66fc

Merge branch 'tripy-resnet50' of github.com:NVIDIA/TensorRT-Incubator into tripy-resnet50
2 parents: b6bd4f4 + 22e1612

36 files changed: +348, -200 lines

tripy/docs/packages.html

Lines changed: 27 additions & 1 deletion
@@ -9,6 +9,12 @@
 
 <body>
 <h1>Package Index</h1>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/tripy-v0.0.5/tripy-0.0.5-py3-none-any.whl">tripy-0.0.5-py3-none-any.whl</a><br>
+
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/tripy-v0.0.4/tripy-0.0.4-py3-none-any.whl">tripy-0.0.4-py3-none-any.whl</a><br>
+
 <a
 href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/tripy-v0.0.3/tripy-0.0.3-py3-none-any.whl">tripy-0.0.3-py3-none-any.whl</a><br>
 

@@ -102,6 +108,26 @@ <h1>Package Index</h1>
 href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.36/mlir_tensorrt_runtime-0.1.36+cuda12.trt102-cp312-cp312-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.36+cuda12.trt102-cp312-cp312-linux_x86_64.whl</a><br>
 <a
 href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.36/mlir_tensorrt_runtime-0.1.36+cuda12.trt102-cp39-cp39-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.36+cuda12.trt102-cp39-cp39-linux_x86_64.whl</a><br>
-</body>
+
+
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp310-cp310-linux_x86_64.whl">mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp310-cp310-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp311-cp311-linux_x86_64.whl">mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp311-cp311-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp312-cp312-linux_x86_64.whl">mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp312-cp312-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp39-cp39-linux_x86_64.whl">mlir_tensorrt_compiler-0.1.37+cuda12.trt102-cp39-cp39-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp310-cp310-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp310-cp310-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp311-cp311-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp311-cp311-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp312-cp312-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp312-cp312-linux_x86_64.whl</a><br>
+<a
+href="https://github.com/NVIDIA/TensorRT-Incubator/releases/download/mlir-tensorrt-v0.1.37/mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp39-cp39-linux_x86_64.whl">mlir_tensorrt_runtime-0.1.37+cuda12.trt102-cp39-cp39-linux_x86_64.whl</a><br>
+
+
+</body>
 
 </html>
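
Note: packages.html is a flat index of wheel links, so these packages can be installed with pip's --find-links mechanism, e.g. `python3 -m pip install tripy -f <URL of the hosted packages.html>`. The hosted URL is not shown in this commit, so it is left as a placeholder here.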

tripy/pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -1,15 +1,15 @@
 [project]
 name = "tripy"
-version = "0.0.3"
+version = "0.0.5"
 authors = [{name = "NVIDIA", email="[email protected]"}]
 description = "Tripy: A Python Programming Model For TensorRT"
 readme = "README.md"
 requires-python = ">= 3.9"
 license = {text = "Apache 2.0"}
 dependencies = [
     "tensorrt~=10.0",
-    "mlir-tensorrt-compiler==0.1.36+cuda12.trt102",
-    "mlir-tensorrt-runtime==0.1.36+cuda12.trt102",
+    "mlir-tensorrt-compiler==0.1.37+cuda12.trt102",
+    "mlir-tensorrt-runtime==0.1.37+cuda12.trt102",
     "colored==2.2.3",
 ]
 

New file (name not shown in this view)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+
+import tripy as tp
+
+
+@pytest.fixture(params=["compile", "eager"])
+def eager_or_compiled(request):
+    def wrapper(func, *args, **kwargs):
+        def get_input_info(x: tp.Tensor):
+            return tp.InputInfo(list(map(int, x.shape)), dtype=x.dtype)
+
+        if request.param == "eager":
+            return func(*args, **kwargs)
+
+        assert request.param == "compile"
+
+        compile_args = []
+        for arg in args:
+            # We don't want to feed DimensionSize as a dynamic input to the compiler (https://github.com/NVIDIA/TensorRT-Incubator/issues/65).
+            if isinstance(arg, tp.Tensor) and not isinstance(arg, tp.DimensionSize):
+                compile_args.append(get_input_info(arg))
+            else:
+                compile_args.append(arg)
+        compile_args = tuple(compile_args)
+
+        compile_kwargs = dict(
+            (
+                k,
+                ((get_input_info(v) if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize) else v)),
+            )
+            for k, v in kwargs.items()
+        )
+
+        compiled_func = tp.compile(func, args=compile_args, kwargs=compile_kwargs)
+
+        tensor_args = tuple(x for x in args if isinstance(x, tp.Tensor) and not isinstance(x, tp.DimensionSize))
+
+        tensor_kwargs = {
+            k: v for k, v in kwargs.items() if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize)
+        }
+
+        return compiled_func(*tensor_args, **tensor_kwargs)
+
+    return wrapper
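
For context (not part of the diff): the fixture returns a wrapper, so any test that accepts `eager_or_compiled` runs once per fixture param, once eagerly and once through tp.compile. A minimal sketch of how a test consumes it; the test name and tensor values here are hypothetical:

    import tripy as tp

    # Hypothetical test; pytest injects `eager_or_compiled` from the fixture above.
    def test_add(eager_or_compiled):
        a = tp.Tensor([1.0, 2.0, 3.0])
        b = tp.Tensor([4.0, 5.0, 6.0])

        def add(x, y):
            return x + y

        # "eager" param: calls add(a, b) directly.
        # "compile" param: compiles add() with InputInfo placeholders derived
        # from a and b, then runs the compiled function on the same tensors.
        out = eager_or_compiled(add, a, b)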

tripy/tests/integration/test_batchnorm.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@ class TestBatchNorm:
 
     @pytest.mark.parametrize("torch_dtype, tp_dtype", DTYPES)
     @pytest.mark.parametrize("input_shape", [(2, 2, 2, 2)])
-    def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape):
+    def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, eager_or_compiled):
         eps = 1e-5
         num_features = input_shape[1]  # Number of channels in the input tensor
         batchnorm = torch.nn.BatchNorm2d(num_features=num_features, eps=eps, dtype=torch_dtype)
@@ -45,7 +45,7 @@ def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape):
         input = torch.randn(input_shape, dtype=torch_dtype).to("cuda")
         tp_input = tp.Tensor(input, dtype=tp_dtype)
 
-        output = tp_batchnorm(tp_input)
+        output = eager_or_compiled(tp_batchnorm, tp_input)
 
         batchnorm.to("cuda").eval()
         with torch.no_grad():
tripy/tests/integration/test_cast.py

Lines changed: 13 additions & 14 deletions
@@ -30,54 +30,53 @@ class TestCast:
         [
             (np.int32, np.float32),
             (np.float32, np.int32),
-            (np.int64, np.float32),
-            (np.float32, np.int64),
-            (np.int64, np.int32),
-            (np.int64, np.int8),
             (np.int32, np.int8),
             (np.float32, np.int8),
-            (np.int8, np.int64),
             (np.int8, np.int32),
             (np.int8, np.float32),
             # important to test conversion into bool because default StableHLO semantics
             # are simply to truncate to i1, which is not desirable
             (np.float32, bool),
             (np.int32, bool),
-            (np.int64, bool),
             # requires a dequantization first
             # TODO(#219): Dequantize fails with dynamic shapes
             # (np.int8, bool),
         ],
     )
-    def test_cast(self, input_dtype, target_dtype):
+    def test_cast(self, input_dtype, target_dtype, eager_or_compiled):
         tp_input_dtype = NUMPY_TO_TRIPY[input_dtype]
         tp_target_dtype = NUMPY_TO_TRIPY[target_dtype]
 
         # TODO(#222): Integer casts with negative numbers fail in many cases
         input_tensor = tp.Tensor([0, 1, 2], dtype=tp_input_dtype)
         np_input = cp.from_dlpack(input_tensor).get()
-        output = tp.cast(input_tensor, tp_target_dtype)
+        output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype)
 
         assert np.array_equal(cp.from_dlpack(output).get(), np_input.astype(target_dtype))
 
     # these dtypes don't have analogues in numpy
     @pytest.mark.parametrize("source_dtype", [pytest.param(tp.float8, marks=skip_if_older_than_sm89), tp.int4])
-    def test_cast_quantized_dtypes_into_bool(self, source_dtype):
+    def test_cast_quantized_dtypes_into_bool(self, source_dtype, eager_or_compiled):
         # TODO(#223): Using an odd size leads to a strange crash, so can't just use [-1.0, 0.0, 1.0]
         input_tensor = tp.Tensor([-1.0, 0.0, 0.0, 1.0], dtype=tp.float32)
-        q = tp.quantize(input_tensor, scale=1.0, dtype=source_dtype)
-        output = tp.cast(q, tp.bool)
+
+        def func(input):
+            q = tp.quantize(input, scale=1.0, dtype=source_dtype)
+            output = tp.cast(q, tp.bool)
+            return output
+
+        output = eager_or_compiled(func, input_tensor)
         assert cp.from_dlpack(output).get().tolist() == [True, False, False, True]
 
-    @pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int64, np.int8])
-    def test_cast_from_bool(self, target_dtype):
+    @pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int8])
+    def test_cast_from_bool(self, target_dtype, eager_or_compiled):
         tp_target_dtype = NUMPY_TO_TRIPY[target_dtype]
 
         # in principle, it is not important what *specific* values we convert to,
         # so long as false is mapped to 0 and true to nonzero
         input_tensor = tp.Tensor([False, True], dtype=tp.bool)
         np_input = cp.from_dlpack(input_tensor).get()
-        output = tp.cast(input_tensor, tp_target_dtype)
+        output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype)
 
         tp_compare_to_zero = cp.from_dlpack(output).get() == 0
         np_compare_to_zero = np_input.astype(target_dtype) == 0
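
For context (not part of the diff): the `func` wrapper introduced in test_cast_quantized_dtypes_into_bool exists because tp.compile traces a single callable, so chaining quantize and cast inside one function lets the fixture compile the whole sequence as a unit. A minimal sketch of what the fixture effectively does in compile mode; the shape and dtypes here are illustrative:

    import tripy as tp

    def func(input):
        q = tp.quantize(input, scale=1.0, dtype=tp.int4)
        return tp.cast(q, tp.bool)

    # Mirrors the fixture: tensor arguments are replaced by InputInfo placeholders.
    compiled = tp.compile(func, args=[tp.InputInfo([4], dtype=tp.float32)])
    out = compiled(tp.Tensor([-1.0, 0.0, 0.0, 1.0]))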

tripy/tests/integration/test_concatenate.py

Lines changed: 4 additions & 4 deletions
@@ -33,9 +33,9 @@ class TestConcatenate:
             ([(2, 3, 4)], 0),
         ],
     )
-    def test_concat(self, tensor_shapes, dim):
+    def test_concat(self, tensor_shapes, dim, eager_or_compiled):
         tensors = [tp.ones(shape) for shape in tensor_shapes]
-        out = tp.concatenate(tensors, dim=dim)
+        out = eager_or_compiled(tp.concatenate, tensors, dim=dim)
         assert np.array_equal(
             cp.from_dlpack(out).get(), np.concatenate([np.ones(shape) for shape in tensor_shapes], axis=dim)
         )
@@ -44,8 +44,8 @@ def test_concat(self, tensor_shapes, dim):
         "tensor_shapes, dim",
         [([(2, 3, 4), (2, 4, 4)], 0), ([(4, 5, 6), (4, 1, 6)], -1)],
     )
-    def test_negative_concat(self, tensor_shapes, dim):
+    def test_negative_concat(self, tensor_shapes, dim, eager_or_compiled):
         tensors = [tp.ones(shape) for shape in tensor_shapes]
         with helper.raises(tp.TripyException, match=f"not compatible at non-concat index"):
-            out = tp.concatenate(tensors, dim=dim)
+            out = eager_or_compiled(tp.concatenate, tensors, dim=dim)
             print(out)

tripy/tests/integration/test_conv.py

Lines changed: 8 additions & 8 deletions
@@ -75,7 +75,7 @@ class ConvTestCase:
 @pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES)
 class TestConvolution:
     @pytest.mark.parametrize("test_case", test_cases_1d)
-    def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         if not test_case.torch_pad:
             test_case.torch_pad = 0
         if not test_case.stride:
@@ -122,7 +122,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
         conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         # FP32 kernel seems to lose some precision, and FP16 needs to be run in FP32 on torch
         rtol_ = 4e-5 if tp_dtype == tp.float32 else 1e-3
@@ -131,7 +131,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case):
         assert list(output_torch.shape) == list(expected.shape)
 
     @pytest.mark.parametrize("test_case", test_cases_2d)
-    def test_convolution_2d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         if not test_case.torch_pad:
             test_case.torch_pad = 0
         if not test_case.stride:
@@ -178,15 +178,15 @@ def test_convolution_2d(self, torch_dtype, tp_dtype, test_case):
         conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         rtol_ = 2e-7 if tp_dtype == tp.float32 else 1.5e-3
         output_torch = torch.from_dlpack(output)
         assert torch.allclose(output_torch, expected, rtol=rtol_)
         assert list(output_torch.shape) == list(expected.shape)
 
     @pytest.mark.parametrize("test_case", test_cases_3d)
-    def test_convolution_3d(self, torch_dtype, tp_dtype, test_case):
+    def test_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         pytest.skip("TODO (#260): Fix accuracy bugs in 3D conv")
         if not test_case.torch_pad:
             test_case.torch_pad = 0
@@ -245,14 +245,14 @@ def test_convolution_3d(self, torch_dtype, tp_dtype, test_case):
             return
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         rtol_ = 2e-4 if tp_dtype == tp.float32 else 1.4e-3  # 3d conv has greater accumulation error
         output_torch = torch.from_dlpack(output)
         assert torch.allclose(output_torch, expected, rtol=rtol_)
         assert list(output_torch.shape) == list(expected.shape)
 
-    def test_uneven_padding(self, torch_dtype, tp_dtype):
+    def test_uneven_padding(self, torch_dtype, tp_dtype, eager_or_compiled):
         input_torch = torch.arange(200, dtype=torch.float32, device=torch.device("cuda")).reshape(*(2, 4, 5, 5))
         input = tp.cast(tp.Tensor(input_torch), tp_dtype)
 
@@ -282,7 +282,7 @@ def test_uneven_padding(self, torch_dtype, tp_dtype):
 
         input_torch = torch_pad(input_torch)
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         rtol_ = 2e-7 if tp_dtype == tp.float32 else 2e-3
         output_torch = torch.from_dlpack(output)

tripy/tests/integration/test_conv_transpose.py

Lines changed: 12 additions & 12 deletions
@@ -81,7 +81,7 @@ class ConvTestCase:
 @pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES)
 class TestConvolution:
     @pytest.mark.parametrize("test_case", test_cases_transpose_1d)
-    def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         if not test_case.torch_pad:
             test_case.torch_pad = 0
         if not test_case.stride:
@@ -129,14 +129,14 @@ def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case):
         conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
-        rtol_ = 1e-3
+        rtol_ = 3e-3
         assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
         assert output.shape == list(expected.shape)
 
     @pytest.mark.parametrize("test_case", test_cases_transpose_2d)
-    def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         if not test_case.torch_pad:
             test_case.torch_pad = 0
         if not test_case.stride:
@@ -184,14 +184,14 @@ def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case):
         conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         rtol_ = 1e-2
         assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
         assert output.shape == list(expected.shape)
 
     @pytest.mark.parametrize("test_case", test_cases_transpose_3d)
-    def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         if not test_case.torch_pad:
             test_case.torch_pad = 0
         if not test_case.stride:
@@ -239,12 +239,12 @@ def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case):
         conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
         rtol_ = 1.3e-6 if tp_dtype == tp.float32 else 1.6e-3
         assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
         assert output.shape == list(expected.shape)
 
-    def test_transposed_equivalency(self, torch_dtype, tp_dtype):
+    def test_transposed_equivalency(self, torch_dtype, tp_dtype, eager_or_compiled):
         input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3))
         input = tp.cast(tp.Tensor(input_torch), tp_dtype)
 
@@ -277,8 +277,8 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype):
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
         expected_transpose = conv_transpose_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
-        output_transpose = conv_transpose_layer(input)
+        output = eager_or_compiled(conv_layer, input)
+        output_transpose = eager_or_compiled(conv_transpose_layer, input)
 
         rtol_ = 2e-7 if tp_dtype == tp.float32 else 9e-4
         assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
@@ -291,7 +291,7 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype):
         assert list(expected.shape) == list(expected_transpose.shape)
 
     @pytest.mark.parametrize("test_case", test_cases_transpose_downscale)
-    def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case):
+    def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case, eager_or_compiled):
         input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3))
         input = tp.cast(tp.Tensor(input_torch), tp_dtype)
 
@@ -320,7 +320,7 @@ def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case):
         conv_layer.weight = tp.cast(tp.Tensor(conv_layer_torch.weight.data), tp_dtype)
 
         expected = conv_layer_torch(input_torch).to(torch_dtype)
-        output = conv_layer(input)
+        output = eager_or_compiled(conv_layer, input)
 
         rtol_ = 1e-15 if tp_dtype == tp.float32 else 1e-10
         assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_)
