
Commit 5863d9b

Add stride support
1 parent 54f9819 commit 5863d9b

File tree

16 files changed: +143 −18 lines

tripy/tests/backend/test_compiler_api.py

Lines changed: 1 addition & 2 deletions
@@ -185,8 +185,7 @@ def test_function(self):
         inp = tp.ones((2, 2), dtype=tp.float32)
         out = compiled_gelu(inp)
 
-        # TODO (#225): Replace with tp.all
-        assert cp.array_equal(cp.from_dlpack(out), cp.from_dlpack(tp.relu(inp)))
+        assert tp.equal(out, tp.relu(inp))
 
     def test_module(self):
         layernorm = tp.LayerNorm(2)

tripy/tests/flat_ir/ops/test_constant.py

Lines changed: 4 additions & 1 deletion
@@ -30,7 +30,10 @@ def test_str(self):
 
         const = flat_ir.ops[-1]
         assert isinstance(const, ConstantOp)
-        assert str(const) == "out: [rank=(1), shape=((2,)), dtype=(float32), loc=(gpu:0)] = ConstantOp(data=[2.0, 3.0])"
+        assert (
+            str(const)
+            == "out: [rank=(1), shape=((2,)), stride=((1,)), dtype=(float32), loc=(gpu:0)] = ConstantOp(data=[2.0, 3.0])"
+        )
 
     def test_mlir(self):
         out = tp.Tensor([2, 3], dtype=tp.int32, name="out")

tripy/tests/frontend/trace/test_trace.py

Lines changed: 8 additions & 8 deletions
@@ -95,8 +95,8 @@ def test_str(self):
             str(trace)
             == dedent(
                 """
-                a = storage(data=[0], shape=(1,), dtype=int32, device=gpu:0)
-                b = storage(data=[1], shape=(1,), dtype=int32, device=gpu:0)
+                a = storage(data=[0], shape=(1,), stride=(1,), dtype=int32, device=gpu:0)
+                b = storage(data=[1], shape=(1,), stride=(1,), dtype=int32, device=gpu:0)
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(int32), loc=(gpu:0)]
@@ -133,8 +133,8 @@ def test_multiple_outputs(self):
             str(trace)
             == dedent(
                 """
-                a = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
-                b = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
+                a = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
+                b = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
                 c = a + b
                 d = c + c
                 outputs:
@@ -168,8 +168,8 @@ def test_all_inputs(self):
             == dedent(
                 """
                 inputs:
-                    a: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
-                    b: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
+                    a: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
+                    b: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(float32), loc=(gpu:0)]
@@ -191,8 +191,8 @@ def test_const_and_input(self):
             == dedent(
                 """
                 inputs:
-                    a: [rank=(1), shape=((1,)), dtype=(float32), loc=(gpu:0)]
-                b = storage(data=[1.0000], shape=(1,), dtype=float32, device=gpu:0)
+                    a: [rank=(1), shape=((1,)), stride=((1,)), dtype=(float32), loc=(gpu:0)]
+                b = storage(data=[1.0000], shape=(1,), stride=(1,), dtype=float32, device=gpu:0)
                 c = a + b
                 outputs:
                     c: [rank=(1), dtype=(float32), loc=(gpu:0)]

tripy/tests/integration/test_quantize.py

Lines changed: 1 addition & 1 deletion
@@ -118,4 +118,4 @@ def test_non_constant_scale(self):
         scale = tp.ones((4,))
         quantized = tp.quantize(input, scale, tp.int8, dim=0)
 
-        assert bool(tp.all(quantized == tp.ones((4, 4), dtype=tp.int8)))
+        assert tp.equal(quantized, tp.ones((4, 4), dtype=tp.int8))

tripy/tripy/backend/mlir/executor.py

Lines changed: 8 additions & 2 deletions
@@ -114,7 +114,9 @@ def _get_output_tensor_info(self, outputs_runtime_shape, output_devices):
             is_static_shape = all(dim >= 0 for dim in memref.shape)
             if is_static_shape:
                 outputs_tensor_info.append(
-                    TensorInfo(len(memref.shape), tuple(memref.shape), dtype, device(device_type))
+                    TensorInfo(
+                        len(memref.shape), tuple(memref.shape), tuple(memref.strides), dtype, device(device_type)
+                    )
                 )
             else:
                 runtime_shape = [
@@ -124,6 +126,7 @@ def _get_output_tensor_info(self, outputs_runtime_shape, output_devices):
                 TensorInfo(
                     len(runtime_shape),
                     tuple(runtime_shape),
+                    tuple(memref.strides),
                     dtype,
                     device(device_type),
                 )
@@ -174,7 +177,10 @@ def execute(self, output_devices=List[device], inputs: List["Tensor"] = []) -> L
         # Allocate output memory and store buffer pointers.
         outputs = [
             create_empty_memref(
-                shape=info.shape, dtype=info.dtype, device=info.device, stream=self.stream._active_cuda_stream
+                shape=info.shape,
+                dtype=info.dtype,
+                device=info.device,
+                stream=self.stream._active_cuda_stream,
             )
             for info in out_tensor_info
         ]
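For context on what `memref.strides` encodes: a stride tuple maps a multi-dimensional index to a flat element offset, which is what lets `TensorInfo` describe non-contiguous buffers. A minimal sketch in plain Python (element strides, not bytes; an illustration, not the runtime's implementation):

    def flat_offset(index, strides):
        # Element offset of a multi-dimensional index into a strided buffer.
        return sum(i * s for i, s in zip(index, strides))

    # A contiguous 2x2 row-major buffer has strides (2, 1): adjacent rows are 2 elements apart.
    assert flat_offset((1, 0), (2, 1)) == 2
    assert flat_offset((0, 1), (2, 1)) == 1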

tripy/tripy/backend/utils.py

Lines changed: 3 additions & 1 deletion
@@ -27,6 +27,7 @@
 class TensorInfo:
     rank: int
     shape: Sequence[int]
+    stride: Sequence[int]
     dtype: "tripy.dtype"
     device: "tripy.device"
 
@@ -36,11 +37,12 @@ def encode(tensor_info: TensorInfo) -> Dict[str, Any]:
     return {
         "rank": tensor_info.rank,
         "shape": tensor_info.shape,
+        "stride": tensor_info.stride,
         "dtype": tensor_info.dtype,
         "device": tensor_info.device,
     }
 
 
 @Decoder.register(TensorInfo)
 def decode(dct: Dict[str, Any]) -> TensorInfo:
-    return TensorInfo(dct["rank"], dct["shape"], dct["dtype"], dct["device"])
+    return TensorInfo(dct["rank"], dct["shape"], dct["stride"], dct["dtype"], dct["device"])
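Because `TensorInfo` is constructed positionally both here in `decode` and in the executor above, the new `stride` field must keep its slot between `shape` and `dtype`. A round-trip sketch (assuming `TensorInfo` is a dataclass with value equality; the dtype and device strings are stand-ins for real tripy objects):

    info = TensorInfo(rank=2, shape=(2, 3), stride=(3, 1), dtype="float32", device="gpu:0")
    encoded = encode(info)
    # encoded == {"rank": 2, "shape": (2, 3), "stride": (3, 1), "dtype": "float32", "device": "gpu:0"}
    assert decode(encoded) == info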

tripy/tripy/flat_ir/tensor.py

Lines changed: 4 additions & 0 deletions
@@ -38,6 +38,7 @@ class FlatIRTensor:
     rank: int
     producer: "BaseFlatIROp" = None
     shape: Optional[List[int]] = None
+    stride: Optional[List[int]] = None
     reason_details: Optional[List[Any]] = None
     """
     Describes why this tensor was created.
@@ -69,6 +70,7 @@ def build(
         rank: int,
         reason_details: List[Any],
         shape: List[int] = None,
+        stride: List[int] = None,
     ) -> "FlatIRTensor":
         return FlatIRTensor(
             name=None,
@@ -80,6 +82,7 @@ def build(
             rank=rank,
             producer=None,
             shape=shape,
+            stride=utils.get_stride(shape, stride),
             reason_details=reason_details,
             reason_context=copy.copy(_BUILD_CONTEXT),
         )
@@ -88,6 +91,7 @@ def __str__(self) -> str:
         return (
             f"{self.name}: [rank=({self.rank}), "
             + (f"shape=({self.shape}), " if self.shape is not None else "")
+            + (f"stride=({self.stride}), " if self.stride is not None else "")
             + (f"dtype=({self.dtype.name}), " if self.dtype is not None else "")
             + f"loc=({self.device})]"
         )
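`utils.get_stride` is called here but not included in this diff. Judging from the call site (a shape plus an optional explicit stride), a plausible reading is a fallback to contiguous row-major strides; the helper below is hypothetical, not tripy's actual implementation:

    from typing import List, Optional

    def get_stride(shape: Optional[List[int]], stride: Optional[List[int]]) -> Optional[List[int]]:
        # Hypothetical: keep an explicit stride; otherwise derive a contiguous
        # row-major stride from the shape, if one is known.
        if stride is not None or shape is None:
            return stride
        result, running = [], 1
        for extent in reversed(shape):
            result.append(running)
            running *= extent
        return list(reversed(result))

    assert get_stride([2, 2], None) == [2, 1]
    assert get_stride([1], None) == [1]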

tripy/tripy/frontend/ops/equal.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from tripy import export, constraints
+from tripy.common.exception import raise_error
+
+
+@export.public_api(document_under="operations/functions")
+@constraints.dtype_info(
+    dtype_variables={"T1": ["float32", "float16", "bfloat16", "float8", "int4", "int8", "int32", "int64", "bool"]},
+    dtype_constraints={"a": "T1", "b": "T1"},
+)
+def equal(a: "tripy.Tensor", b: "tripy.Tensor") -> bool:
+    r"""
+    Returns ``True`` if all elements in ``a`` and ``b`` are exactly equal.
+
+    This function performs an element-wise equality comparison between tensors ``a`` and ``b``,
+    and returns ``True`` only if all elements are exactly equal.
+
+    Args:
+        a: First tensor to compare.
+        b: Second tensor to compare.
+
+    Returns:
+        ``True`` if all elements in both tensors are exactly equal, ``False`` otherwise.
+
+    .. code-block:: python
+        :linenos:
+        :caption: Equal Tensors
+
+        # doc: print-locals out
+        out = tp.equal(tp.Tensor([1, 2, 3]), tp.Tensor([1, 2, 3]))
+        assert out
+
+    .. code-block:: python
+        :linenos:
+        :caption: Unequal Tensors
+
+        # doc: print-locals out
+        out = tp.equal(tp.Tensor([1, 2, 3]), tp.Tensor([1, 2, 4]))
+        assert not out
+    """
+    from tripy.frontend.trace.ops.reduce import all
+
+    return bool(all(a == b))
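The test changes earlier in this commit (test_compiler_api.py, test_quantize.py) migrate to this helper. Since the body is just `bool(all(a == b))`, the old and new assertion styles are equivalent; converting to a Python `bool` presumably forces evaluation of the lazy graph:

    a = tp.ones((4, 4), dtype=tp.int8)
    b = tp.ones((4, 4), dtype=tp.int8)

    assert bool(tp.all(a == b))  # pre-commit style, as in test_quantize.py
    assert tp.equal(a, b)        # post-commit equivalent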

tripy/tripy/frontend/tensor.py

Lines changed: 6 additions & 1 deletion
@@ -125,6 +125,7 @@ def __init__(
                 attr is not None
                 for attr in [
                     self.trace_tensor.shape,
+                    self.trace_tensor.stride,
                     self.trace_tensor.dtype,
                     self.trace_tensor.device,
                     self.trace_tensor.producer,
@@ -171,6 +172,10 @@ def dtype(self):
     def rank(self):
         return self.trace_tensor.rank
 
+    @property
+    def stride(self):
+        return self.trace_tensor.stride
+
     def eval(self) -> runtime.MemRefValue:
         from tripy.backend.mlir.compiler import Compiler
         from tripy.backend.mlir.executor import Executor
@@ -228,7 +233,7 @@ def __repr__(self) -> str:
         return (
             f"tensor({sep}"
             f"{indent(arr_str, prefix=indentation)}, {sep}"
-            f"{indent(f'dtype={self.dtype}, loc={self.device}, shape={arr.shape}', prefix=indentation)}"
+            f"{indent(f'dtype={self.dtype}, loc={self.device}, shape={arr.shape}, stride={arr.strides}', prefix=indentation)}"
             f")"
         )
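With the new property, stride is queryable from the frontend alongside rank. A small usage sketch; the (2, 1) value assumes a freshly created tensor is contiguous with element strides, consistent with the stride=(1,) strings in the trace tests above, and the property may return None before the trace tensor's stride has been populated:

    x = tp.ones((2, 2), dtype=tp.float32)
    print(x.rank)    # 2
    print(x.stride)  # (2, 1) expected for a contiguous row-major tensor (assumption)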

tripy/tripy/frontend/trace/ops/binary_elementwise.py

Lines changed: 4 additions & 0 deletions
@@ -140,6 +140,7 @@ def to_flat_ir(self, inputs, outputs):
             # First apply DivideOp
             divide_out = FlatIRTensor.build(
                 shape=outputs[0].shape,
+                stride=outputs[0].stride,
                 rank=outputs[0].rank,
                 dtype=outputs[0].dtype,
                 device=outputs[0].device,
@@ -152,6 +153,7 @@ def to_flat_ir(self, inputs, outputs):
             # Step 1: Perform DivideOp
             divide_out = FlatIRTensor.build(
                 shape=outputs[0].shape,
+                stride=outputs[0].stride,
                 rank=outputs[0].rank,
                 dtype=outputs[0].dtype,
                 device=outputs[0].device,
@@ -162,6 +164,7 @@ def to_flat_ir(self, inputs, outputs):
             # Step 2: Apply FloorOp
             floor_out = FlatIRTensor.build(
                 shape=outputs[0].shape,
+                stride=outputs[0].stride,
                 rank=outputs[0].rank,
                 dtype=outputs[0].dtype,
                 device=outputs[0].device,
@@ -172,6 +175,7 @@ def to_flat_ir(self, inputs, outputs):
             # Step 3: Multiply divisor with floored division result (FloorOp output)
             multiply_out = FlatIRTensor.build(
                 shape=outputs[0].shape,
+                stride=outputs[0].stride,
                 rank=outputs[0].rank,
                 dtype=outputs[0].dtype,
                 device=outputs[0].device,
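Threading `stride=outputs[0].stride` through every intermediate of the divide/floor/multiply decomposition keeps stride metadata consistent at each step of the lowering, so `FlatIRTensor.__str__` reports the same stride for intermediates as for the final output.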
