
Commit 40d6c05

Add stride support
1 parent 8d67c62 commit 40d6c05

22 files changed: +214 -24 lines changed

tripy/tests/backend/test_compiler_api.py

Lines changed: 1 addition & 2 deletions
@@ -185,8 +185,7 @@ def test_function(self):
         inp = tp.ones((2, 2), dtype=tp.float32)
         out = compiled_gelu(inp)
 
-        # TODO (#225): Replace with tp.all
-        assert cp.array_equal(cp.from_dlpack(out), cp.from_dlpack(tp.relu(inp)))
+        assert tp.allclose(out, tp.relu(inp), rtol=0.0, atol=0.0)
 
     def test_module(self):
         layernorm = tp.LayerNorm(2)
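With both tolerances set to zero, the usual allclose criterion |a - b| <= atol + rtol * |b| reduces to |a - b| <= 0, i.e. exact element-wise equality, so the new assertion is a drop-in replacement for the array_equal check. A quick illustration with torch (assuming tp.allclose follows the same convention):

    import torch

    a = torch.tensor([1.0, 2.0])
    assert torch.allclose(a, a.clone(), rtol=0.0, atol=0.0)      # exact match passes
    assert not torch.allclose(a, a + 1e-6, rtol=0.0, atol=0.0)   # any difference fails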

tripy/tests/flat_ir/ops/test_constant.py

Lines changed: 4 additions & 1 deletion
@@ -30,7 +30,10 @@ def test_str(self):
 
         const = flat_ir.ops[-1]
         assert isinstance(const, ConstantOp)
-        assert str(const) == "out: [rank=(1), shape=((2,)), dtype=(float32), loc=(gpu:0)] = ConstantOp(data=[2.0, 3.0])"
+        assert (
+            str(const)
+            == "out: [rank=(1), shape=((2,)), stride=((1,)), dtype=(float32), loc=(gpu:0)] = ConstantOp(data=[2.0, 3.0])"
+        )
 
     def test_mlir(self):
         out = tp.Tensor([2, 3], dtype=tp.int32, name="out")
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+import re
+import torch
+
+import tripy as tp
+from tests.helper import raises
+
+
+class TestStride:
+
+    def test_non_canonical_stride(self):
+        t = torch.arange(12, dtype=torch.float32).reshape(3, 4)
+        a = tp.Tensor(t)
+        assert a.stride == t.stride()
+
+        t = t.transpose(0, 1)
+        a = tp.Tensor(t)
+
+        assert a.stride == t.stride()
+
+    def test_lazy_stride(self):
+        a = torch.arange(12, dtype=torch.float32).reshape(4, 3).transpose(0, 1)
+        with raises(
+            tp.TripyException,
+            match=re.escape("Non-canonical strides are not supported for Tripy tensors."),
+        ):
+            print(tp.Tensor(a))
+
+    def test_compile_stride(self):
+        def twice(t):
+            return 2 * t
+
+        compiler = tp.Compiler(twice)
+
+        t = tp.Tensor(torch.arange(12, dtype=torch.float32).reshape(4, 3).transpose(0, 1))
+
+        # Create a tensor info with non-canonical stride.
+        t_info = tp.InputInfo(shape=t.shape.tolist(), dtype=t.dtype, stride=t.stride)
+        compiled_add = compiler.compile(t_info)
+
+        with raises(
+            tp.TripyException,
+            match=re.escape("Reason: InvalidArgument: Runtime stride mismatch. Expected [4, 1] but received [1, 3]"),
+        ):
+            print(compiled_add(t))
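For context: a stride is "canonical" when it matches the contiguous row-major (C-order) layout for a given shape, which is what the runtime error above compares against. A minimal sketch of that computation (canonical_strides is an illustrative helper, not part of this commit):

    def canonical_strides(shape):
        # Row-major layout: the last dimension is contiguous, and each earlier
        # dimension steps over the product of all later extents.
        strides = [1] * len(shape)
        for i in reversed(range(len(shape) - 1)):
            strides[i] = strides[i + 1] * shape[i + 1]
        return tuple(strides)

    assert canonical_strides((3, 4)) == (4, 1)

A (4, 3) tensor transposed to shape (3, 4) keeps its original memory layout and so has strides (1, 3), which is why test_compile_stride expects "Expected [4, 1] but received [1, 3]".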

tripy/tests/frontend/trace/test_trace.py

Lines changed: 8 additions & 8 deletions
@@ -95,8 +95,8 @@ def test_str(self):
             str(trace)
             == dedent(
                 """
-                a = storage(data=[0], shape=(1,), dtype=int32, device=gpu:0)
-                b = storage(data=[1], shape=(1,), dtype=int32, device=gpu:0)
+                a = storage(data=[0], shape=(1,), stride=(1,), dtype=int32, device=gpu:0)
+                b = storage(data=[1], shape=(1,), stride=(1,), dtype=int32, device=gpu:0)
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(int32), loc=(gpu:0)]
@@ -133,8 +133,8 @@ def test_multiple_outputs(self):
             str(trace)
             == dedent(
                 """
-                a = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
-                b = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
+                a = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
+                b = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
                 c = a + b
                 d = c + c
                 outputs:
@@ -168,8 +168,8 @@ def test_all_inputs(self):
             == dedent(
                 """
                 inputs:
-                    a: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
-                    b: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
+                    a: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
+                    b: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(float32), loc=(gpu:0)]
@@ -191,8 +191,8 @@ def test_const_and_input(self):
             == dedent(
                 """
                 inputs:
-                    a: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
-                b = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
+                    a: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
+                b = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(float32), loc=(gpu:0)]

tripy/tests/integration/test_allclose.py

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ class TestAllClose:
         ],
     )
     def test_all_close_float32(self, tensor_a, tensor_b, rtol, atol):
-        np_result = torch.allclose(torch.FloatTensor(tensor_a), torch.FloatTensor(tensor_b), rtol=rtol, atol=atol)
+        torch_result = torch.allclose(torch.FloatTensor(tensor_a), torch.FloatTensor(tensor_b), rtol=rtol, atol=atol)
         tp_result = tp.allclose(
             tp.Tensor(tensor_a, dtype=tp.float32), tp.Tensor(tensor_b, dtype=tp.float32), rtol=rtol, atol=atol
         )
-        assert np_result == tp_result
+        assert torch_result == tp_result

tripy/tests/integration/test_quantize.py

Lines changed: 1 addition & 1 deletion
@@ -118,4 +118,4 @@ def test_non_constant_scale(self):
         scale = tp.ones((4,))
         quantized = tp.quantize(input, scale, tp.int8, dim=0)
 
-        assert bool(tp.all(quantized == tp.ones((4, 4), dtype=tp.int8)))
+        assert tp.allclose(quantized, tp.ones((4, 4), dtype=tp.int8), rtol=0.0, atol=0.0)

tripy/tripy/backend/api/compiler.py

Lines changed: 3 additions & 1 deletion
@@ -147,6 +147,7 @@ def add(a, b):
         """
 
         shapes = []
+        strides = []
         trace_input_map = {}
         input_names = set()
 
@@ -162,6 +163,7 @@ def process_arg(name, arg):
 
                 trace_input_map[name] = tensor
                 shapes.append(arg.shape_bounds)
+                strides.append(arg.stride)
                 input_names.add(name)
 
             return tensor
@@ -196,7 +198,7 @@ def process_arg(name, arg):
 
         # Order of trace inputs also needs to match that of the compiled_arg_names
         trace_inputs = [trace_input_map[name] for name in compiled_arg_names]
-        trace = Trace(trace_outputs, trace_inputs, shapes=shapes)
+        trace = Trace(trace_outputs, trace_inputs, shapes=shapes, strides=strides)
 
         flat_ir = trace.to_flat_ir()
         mlir = flat_ir.to_mlir()

tripy/tripy/backend/api/executable.py

Lines changed: 6 additions & 1 deletion
@@ -161,6 +161,10 @@ def add(a, b):
                         tensor,
                     ],
                 )
+            elif "Runtime stride mismatch" in str(err):
+                # Just raise the error for now.
+                raise raise_error(str(err))
+
             raise
 
         from tripy.utils.stack_info import StackInfo
@@ -175,10 +179,11 @@ def _get_arg_info(self, idx):
         arg = runtime.MemRefType(arg)
         arg_bound = self._executable_signature.get_arg_bound(idx)
         shape_bounds = tuple(zip(arg_bound.min(), arg_bound.max()))
+        stride = arg.strides
         if len(shape_bounds) == 0:
             # For static shape arguments, get_arg_bound returns an empty list and we fallback to arg.shape
             shape_bounds = tuple((x, x) for x in arg.shape)
-        return ArgInfo(shape_bounds, mlir_utils.convert_runtime_dtype_to_tripy_dtype(arg.dtype))
+        return ArgInfo(shape_bounds, stride, mlir_utils.convert_runtime_dtype_to_tripy_dtype(arg.dtype))
 
     def get_input_info(self) -> Sequence[ArgInfo]:
         """

tripy/tripy/backend/api/input_info.py

Lines changed: 10 additions & 2 deletions
@@ -29,7 +29,10 @@ class InputInfo:
     """
 
     def __init__(
-        self, shape: Sequence[Union[int, Tuple[int], Tuple[int, int], Tuple[int, int, int]]], dtype: "tripy.dtype"
+        self,
+        shape: Sequence[Union[int, Tuple[int], Tuple[int, int], Tuple[int, int, int]]],
+        dtype: "tripy.dtype",
+        stride: Sequence[int] = None,
     ) -> None:
         """
         Args:
@@ -88,10 +91,13 @@ def __init__(
             max_shape.append(elem[2])
 
         self.shape_bounds = ShapeBounds(tuple(min_shape), tuple(opt_shape), tuple(max_shape))
+        self.stride = stride
         self.dtype = dtype
 
     def __str__(self) -> str:
-        return f"InputInfo(min={self.shape_bounds.min}, opt={self.shape_bounds.opt}, max={self.shape_bounds.max}, dtype={self.dtype})"
+        base_info = f"InputInfo(min={self.shape_bounds.min}, opt={self.shape_bounds.opt}, max={self.shape_bounds.max}"
+        stride_info = f", stride={self.stride}" if self.stride is not None else ""
+        return f"{base_info}{stride_info}, dtype={self.dtype})"
 
 
 # TODO(MLIR-TRT #923): Can generalize `InputInfo` and drop this class.
@@ -100,5 +106,7 @@ def __str__(self) -> str:
 class ArgInfo:
     shape_bounds: Sequence[Tuple[int, int]]
     """A sequence of tuple(min, max) indicating the bounds of each dimension"""
+    stride: Sequence[int]
+    """A sequence of integers indicating stride"""
     dtype: "tripy.dtype"
     """The datatype of the argument"""

tripy/tripy/backend/mlir/executor.py

Lines changed: 8 additions & 2 deletions
@@ -114,7 +114,9 @@ def _get_output_tensor_info(self, outputs_runtime_shape, output_devices):
             is_static_shape = all(dim >= 0 for dim in memref.shape)
             if is_static_shape:
                 outputs_tensor_info.append(
-                    TensorInfo(len(memref.shape), tuple(memref.shape), dtype, device(device_type))
+                    TensorInfo(
+                        len(memref.shape), tuple(memref.shape), tuple(memref.strides), dtype, device(device_type)
+                    )
                 )
             else:
                 runtime_shape = [
@@ -124,6 +126,7 @@ def _get_output_tensor_info(self, outputs_runtime_shape, output_devices):
                 TensorInfo(
                     len(runtime_shape),
                     tuple(runtime_shape),
+                    tuple(memref.strides),
                     dtype,
                     device(device_type),
                 )
@@ -174,7 +177,10 @@ def execute(self, output_devices=List[device], inputs: List["Tensor"] = []) -> L
         # Allocate output memory and store buffer pointers.
         outputs = [
             create_empty_memref(
-                shape=info.shape, dtype=info.dtype, device=info.device, stream=self.stream._active_cuda_stream
+                shape=info.shape,
+                dtype=info.dtype,
+                device=info.device,
+                stream=self.stream._active_cuda_stream,
             )
             for info in out_tensor_info
        ]
