NVIDIA
diff --git a/tripy/tests/flat_ir/ops/test_gather.py b/tripy/tests/flat_ir/ops/test_gather.py
Lines changed: 2 additions & 2 deletions
diff --git a/tripy/tests/flat_ir/ops/test_plugin.py b/tripy/tests/flat_ir/ops/test_plugin.py
Lines changed: 1 addition & 1 deletion
diff --git a/tripy/tests/flat_ir/test_constant_deduplication.py b/tripy/tests/flat_ir/test_constant_deduplication.py
Lines changed: 7 additions & 0 deletions
diff --git a/tripy/tests/flat_ir/test_function_deduplication.py b/tripy/tests/flat_ir/test_function_deduplication.py
Lines changed: 122 additions & 0 deletions
diff --git a/tripy/tests/helper.py b/tripy/tests/helper.py
Lines changed: 1 addition & 2 deletions
diff --git a/tripy/tripy/backend/mlir/executor.py b/tripy/tripy/backend/mlir/executor.py
Lines changed: 5 additions & 1 deletion
diff --git a/tripy/tripy/backend/mlir/memref.py b/tripy/tripy/backend/mlir/memref.py
Lines changed: 28 additions & 6 deletions
diff --git a/tripy/tripy/backend/mlir/utils.py b/tripy/tripy/backend/mlir/utils.py
Lines changed: 30 additions & 0 deletions
@@ -55,7 +55,7 @@ def test_gather_mlir(self, axis):
         flat_ir = trace.to_flat_ir()
         mlir_text = str(flat_ir.to_mlir())
         if axis == 0:
-            target = '"stablehlo.dynamic_gather"(%arg0, %arg1, %2) <{dimension_numbers = #stablehlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>}> : (tensor<2x3xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<1x?xi32>'
+            target = '"stablehlo.dynamic_gather"(%arg0, %arg1, %6) <{dimension_numbers = #stablehlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>}> : (tensor<?x?xi32>, tensor<?xi32>, tensor<2xi32>) -> tensor<?x?xi32>'
         else:
-            target = '"stablehlo.dynamic_gather"(%arg0, %arg1, %2) <{dimension_numbers = #stablehlo.gather<offset_dims = [0], collapsed_slice_dims = [1], start_index_map = [1], index_vector_dim = 1>}> : (tensor<2x3xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<?x1xi32>'
+            target = '"stablehlo.dynamic_gather"(%arg0, %arg1, %6) <{dimension_numbers = #stablehlo.gather<offset_dims = [0], collapsed_slice_dims = [1], start_index_map = [1], index_vector_dim = 1>}> : (tensor<?x?xi32>, tensor<?xi32>, tensor<2xi32>) -> tensor<?x?xi32>'
         assert target in mlir_text, mlir_text
@@ -59,7 +59,7 @@ def test_str(self, flat_ir):
 
     def test_mlir(self, flat_ir):
         assert """
-            tensorrt.opaque_plugin {creator_params = {output_height = 5 : i32, output_width = 5 : i32}, plugin_name = "ROIAlign_TRT", plugin_namespace = "", plugin_version = "1"}(%0, %cst, %1) : (tensor<?x?x?x?xf32>, tensor<2x4xf32>, tensor<?xi32>) -> tensor<?x?x?x?xf32>
+            tensorrt.opaque_plugin {creator_params = {output_height = 5 : i32, output_width = 5 : i32}, plugin_name = "ROIAlign_TRT", plugin_namespace = "", plugin_version = "1"}(%0, %cst, %2) : (tensor<?x?x?x?xf32>, tensor<2x4xf32>, tensor<?xi32>) -> tensor<?x?x?x?xf32>
             """.strip() in str(
             flat_ir.to_mlir()
         )
 
@@ -92,3 +92,10 @@ def test_integrate_subgraph_constant_deduplication(config):
     mock_op = [op for op in ops if isinstance(op, MockOp)][0]
     assert mock_op.inputs[0] is mock_op.inputs[1], "The mock op should use the same tensor for its first two inputs"
     assert mock_op.inputs[0] is not mock_op.inputs[2], "The mock op should still have a different third input"
+
+    if config == "main":
+        # Verify that tensor replacements were applied
+        assert len(flat_ir.tensor_replacements) > 0, "There should be tensor replacements after integration"
+
+        # Verify that the constant map has the correct number of entries
+        assert len(flat_ir.constant_map) == 2, "Constant map should have 2 entries"
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+import re
+from dataclasses import dataclass
+from typing import List, Optional
+
+from tripy.flat_ir.flat_ir import FlatIR
+from tripy.flat_ir.ops.base import FlatIRFunction, BaseFlatIROp
+from tripy.flat_ir.ops import ConstantOp
+from tripy.flat_ir.tensor import FlatIRTensor
+from tripy.common.device import device
+from tripy.common.datatype import float32, int32
+
+
+@dataclass(repr=False, eq=False)
+class MockOp(BaseFlatIROp):
+    """
+    Minimal stand-in for a real FlatIR op, used to populate function bodies in tests.
+
+    It stores the given inputs and outputs, registers itself as the producer of every
+    output tensor, and compares equal to any other object.
+    """
+
+    def __init__(self, inputs, outputs):
+        self.inputs = inputs
+        self.outputs = outputs
+        self.trace_input_names = []
+        self.trace_output_names = []
+        # Wire each output tensor back to this op.
+        for output in outputs:
+            output.producer = self
+
+    def __eq__(self, other):
+        # Unconditionally equal to anything.
+        # NOTE(review): presumably so that op-by-op comparisons made by the
+        # structural-equivalence check succeed for mock ops - confirm.
+        return True
+
+    def to_mlir(self, operands):
+        # A mock op has no MLIR lowering. The previous `assert "Not implemented"` could
+        # never fail (a non-empty string is truthy), so the call silently returned None;
+        # fail loudly instead.
+        raise NotImplementedError("MockOp does not support lowering to MLIR")
+
+
+def test_is_structurally_equivalent():
+    """
+    Test the structural equivalence of two FlatIR functions.
+
+    Builds two functions with the same body (a `MockOp` followed by a `ConstantOp`), checks
+    that `is_structurally_equivalent` reports them as equivalent, integrates both subgraphs
+    into the `FlatIR`, and finally inspects the printed IR: a definition of `Func1` must be
+    present and the main function must reference `function Func1` twice.
+    """
+    flat_ir = FlatIR()
+
+    def create_tensor(reason_details: str, name: Optional[str] = None) -> FlatIRTensor:
+        """
+        Create and register a FlatIRTensor.
+
+        The tensor is built with shape [3], rank 1, dtype float32 on the "gpu" device. If
+        `name` is given (and non-empty) it overrides the tensor's name before registration.
+        """
+        t = FlatIRTensor.build(
+            shape=[3],
+            rank=1,
+            dtype=float32,
+            device=device("gpu"),
+            reason_details=reason_details,
+        )
+        if name:
+            t.name = name
+        flat_ir.register_tensor(t)
+        return t
+
+    def create_function(
+        name: str,
+        input_tensor: FlatIRTensor,
+        output_tensors: List[FlatIRTensor],
+    ) -> FlatIRFunction:
+        """
+        Create a FlatIRFunction with associated operations.
+
+        The function's own (callee-side) tensors are clones of `input_tensor` and
+        `output_tensors`; each clone is registered with `flat_ir` and tagged with a
+        `caller_tensor` attribute pointing at the tensor it was cloned from.
+        `output_tensors` must hold at least two tensors because the first two clones are
+        indexed directly below.
+        """
+        callee_input = input_tensor.clone(reason_details=f"{name} input cloned from {input_tensor}")
+        callee_outputs = [out.clone(reason_details=f"{name} output cloned from {out}") for out in output_tensors]
+
+        flat_ir.register_tensor(callee_input)
+        setattr(callee_input, "caller_tensor", input_tensor)
+
+        for callee_out, original_out in zip(callee_outputs, output_tensors):
+            flat_ir.register_tensor(callee_out)
+            setattr(callee_out, "caller_tensor", original_out)
+
+        func = FlatIRFunction(name, [callee_input], callee_outputs)
+        # Function body: a mock op producing the first output and a constant producing the second.
+        mock_op = MockOp([callee_input], [callee_outputs[0]])
+        const_op = ConstantOp.build([], [callee_outputs[1]], data=[3, 4, 5])
+        # NOTE(review): the producer is set by hand here; presumably `ConstantOp.build` does
+        # not do it itself - confirm.
+        callee_outputs[1].producer = const_op
+
+        func.ops.extend([mock_op, const_op])
+        # The caller-side output tensors are produced by the function as a whole.
+        for out in output_tensors:
+            out.producer = func
+
+        return func
+
+    # Create main tensors
+    input_tensor = create_tensor("Function 1 input", "main_input_tensor")
+    intermediates = [create_tensor(f"Function 1 output {i}", f"intermediate_tensor_{i}") for i in range(2)]
+    outputs = [create_tensor(f"Function 2 output {i}", f"main_output_tensor_{i}") for i in range(2)]
+
+    # Create two structurally equivalent functions
+    # They are chained: Func2 consumes the first output of Func1.
+    func_1 = create_function("Func1", input_tensor, intermediates)
+    func_2 = create_function("Func2", intermediates[0], outputs)
+
+    # Assert structural equivalence
+    assert func_1.is_structurally_equivalent(func_2)
+
+    # Set up FlatIR inputs and outputs
+    flat_ir.inputs = [input_tensor]
+    flat_ir.outputs = outputs
+
+    # Integrate subgraphs
+    # One call per function, using the same (input, outputs) pairs the functions were built from.
+    for in_tensor, out_tensors in [(input_tensor, intermediates), (intermediates[0], outputs)]:
+        flat_ir.integrate_subgraph([in_tensor], out_tensors)
+
+    flat_ir_str = str(flat_ir)
+
+    # Check Func1 structure
+    # Matches a printed definition of the form `function Func1(<name>: ...) -> (...) { ... return ... }`.
+    func_pattern = re.compile(r"function\s+Func1\s*\(\s*\w+:.*?\)\s*->\s*\(.*?\)\s*{.*?return.*?}", re.DOTALL)
+    assert func_pattern.search(flat_ir_str), "Function Func1 structure is incorrect"
+
+    # Check Main Function structure
+    # Requires two `= function Func1` references between `inputs:` and `outputs:`.
+    # NOTE(review): presumably the second reference is the Func2 call after it was
+    # deduplicated into Func1 - confirm against `integrate_subgraph`.
+    main_pattern = re.compile(
+        r"Main Function:.*?inputs:.*?=\s*function Func1.*?=\s*function Func1.*?outputs:", re.DOTALL
+    )
+    assert main_pattern.search(flat_ir_str), "Main Function structure is incorrect"
+
+    print("All assertions passed. Function structures are correct.")
@@ -76,8 +76,7 @@ def raises(ExcType: type, match: Optional[str] = None, has_stack_info_for: Seque
     for tensor in has_stack_info_for:
         # Stack info is indented since it's part of the `details` block in `raise_error`
         expected_stack_info = indent(_make_stack_info_message(tensor.stack_info).strip(), " " * 4)
-        # TODO: How to add stack information for broadcasted tensors.
-        # assert expected_stack_info in error_msg, f"Missing stack information for tensor:\n{expected_stack_info}"
+        assert expected_stack_info in error_msg, f"Missing stack information for tensor:\n{expected_stack_info}"
 
 
 @contextlib.contextmanager
 
@@ -174,7 +174,11 @@ def execute(self, output_devices=List[device], inputs: List["Tensor"] = []) -> L
         # Allocate output memory and store buffer pointers.
         outputs = [
             create_empty_memref(
-                shape=info.shape, dtype=info.dtype, device=info.device, stream=self.stream._active_cuda_stream
+                shape=info.shape,
+                dtype=info.dtype,
+                device=info.device,
+                stream=self.stream._active_cuda_stream,
+                use_cache=False,
             )
             for info in out_tensor_info
         ]
 
@@ -15,6 +15,8 @@
 # limitations under the License.
 #
 
+from functools import lru_cache
+from typing import Sequence
 
 from tripy.backend.mlir import utils as mlir_utils
 from tripy.common import device as tp_device
@@ -23,11 +25,9 @@
 import mlir_tensorrt.runtime.api as runtime
 
 
-def create_empty_memref(shape, dtype, device=tp_device("gpu"), stream=None):
-    """
-    Creates an empty memref, used for allocating memory.
-    """
-    mlirtrt_device = mlir_utils.MLIRRuntimeClient().get_devices()[0] if device == tp_device("gpu") else None
+@lru_cache(maxsize=None)
+def _cached_create_memref(shape: Sequence[int], dtype: str, device_kind: str, stream):
+    mlirtrt_device = mlir_utils.MLIRRuntimeClient().get_devices()[0] if device_kind == "gpu" else None
     mlir_dtype = mlir_utils.convert_tripy_dtype_to_runtime_dtype(dtype)
     return mlir_utils.MLIRRuntimeClient().create_memref(
         shape=list(shape),
@@ -37,11 +37,33 @@ def create_empty_memref(shape, dtype, device=tp_device("gpu"), stream=None):
     )
 
 
+def create_empty_memref(
+    shape: Sequence[int],
+    dtype: str,
+    device: tp_device = tp_device("gpu"),
+    stream=None,
+    use_cache: bool = True,
+):
+    """
+    Creates an empty memref, used for allocating memory.
+
+    Args:
+        shape: Dimensions of the memref. Converted to a tuple before being forwarded so that
+            it is hashable and can be part of the cache key.
+        dtype: Data type of the memref, forwarded unchanged to `_cached_create_memref`.
+        device: Device to allocate for; only its `kind` is forwarded.
+        stream: Stream forwarded unchanged to `_cached_create_memref`.
+        use_cache: If True (the default), the call goes through the `lru_cache`-wrapped
+            `_cached_create_memref`, so repeated calls with the same
+            (shape, dtype, device kind, stream) return the cached result.
+            If False, the undecorated function is called and the cache is bypassed.
+
+    Returns:
+        The value returned by `_cached_create_memref` for the given arguments.
+    """
+    # `functools.lru_cache` exposes the undecorated function as `__wrapped__`.
+    allocate = _cached_create_memref if use_cache else _cached_create_memref.__wrapped__
+    return allocate(tuple(shape), dtype, device.kind, stream)
+
+
 def create_memref_view(data):
     """
     Creates a memref view of an array object that implements the dlpack interface.
+
+    Args:
+        data: An object providing `__dlpack__()`. The value it returns is passed to the
+            MLIR runtime client's `create_memref_view_from_dlpack`.
+
+    Returns:
+        The value returned by `create_memref_view_from_dlpack`.
     """
-
     return mlir_utils.MLIRRuntimeClient().create_memref_view_from_dlpack(data.__dlpack__())
 
 
 
@@ -228,6 +228,9 @@ def parse_tensor_names_from_location(msg: str) -> Tuple[List[str], List[str], Li
     if not loc:
         return [], [], [], [], msg
 
+    # Hack: Extract callsite for function call locations.
+    if "at" in loc:
+        _, _, loc = loc.partition('at "')
     input_names, _, loc = loc.partition(OUTPUT_SEPARATOR)
     output_names, _, loc = loc.partition(TRACE_INPUTS_SEPARATOR)
     trace_inputs, _, trace_outputs = loc.partition(TRACE_OUTPUTS_SEPARATOR)
@@ -306,6 +309,33 @@ def is_any_dim_dynamic(mlir_tensor):
     return any([type.is_dynamic_dim(i) for i in range(type.rank)])
 
 
+def has_all_dynamic_dims(tensor_type: ir.RankedTensorType) -> bool:
+    """
+    Report whether every dimension of a ranked tensor type is dynamic.
+
+    Args:
+        tensor_type: The ranked tensor type to inspect.
+
+    Returns:
+        True if each entry of `tensor_type.shape` equals the dynamic-size sentinel
+        (`ir.ShapedType.get_dynamic_size()`); this is also True for an empty shape.
+
+    Raises:
+        ValueError: If `tensor_type` is not an instance of `ir.RankedTensorType`.
+    """
+    if isinstance(tensor_type, ir.RankedTensorType):
+        dynamic_size = ir.ShapedType.get_dynamic_size()
+        return all(extent == dynamic_size for extent in tensor_type.shape)
+
+    raise ValueError("Input must be a RankedTensorType")
+
+
+def cast_to_dynamic_ranked_tensor(input_tensor: ir.Value, always_insert_cast: bool = False) -> ir.Value:
+    """
+    Convert a tensor value to a ranked tensor type whose dimensions are all dynamic.
+
+    The target type keeps the rank and element type of the input; only the dimension sizes
+    are replaced by the dynamic-size sentinel.
+
+    Args:
+        input_tensor: The value to convert. Its type must be a ranked tensor type.
+        always_insert_cast: If True, a `stablehlo.ConvertOp` is emitted even when the input
+            type already has only dynamic dimensions. If False (the default), such an input
+            is returned unchanged.
+
+    Returns:
+        `input_tensor` itself when no conversion is needed, otherwise the result of the
+        inserted `stablehlo.ConvertOp`.
+
+    Raises:
+        ValueError: If the type of `input_tensor` is not a ranked tensor type.
+    """
+    from mlir_tensorrt.compiler.dialects._ods_common import get_op_result_or_value
+    from mlir_tensorrt.compiler.dialects import stablehlo
+
+    source_type = get_op_result_or_value(input_tensor).type
+    if not ir.RankedTensorType.isinstance(source_type):
+        raise ValueError("Input must be a RankedTensorType")
+
+    # A cast is needed when the caller forces one or when some dimension is still static.
+    needs_cast = always_insert_cast or not has_all_dynamic_dims(source_type)
+    if not needs_cast:
+        return input_tensor
+
+    all_dynamic = [ir.ShapedType.get_dynamic_size()] * source_type.rank
+    target_type = ir.RankedTensorType.get(all_dynamic, source_type.element_type)
+    return stablehlo.ConvertOp(result=target_type, operand=input_tensor).result
+
+
 class ShapeContext:
     _instance = None
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ def test_str(self, flat_ir):`
`59`	`59`
`60`	`60`	`def test_mlir(self, flat_ir):`
`61`	`61`	`assert """`
`62`		`- tensorrt.opaque_plugin {creator_params = {output_height = 5 : i32, output_width = 5 : i32}, plugin_name = "ROIAlign_TRT", plugin_namespace = "", plugin_version = "1"}(%0, %cst, %1) : (tensor<?x?x?x?xf32>, tensor<2x4xf32>, tensor<?xi32>) -> tensor<?x?x?x?xf32>`
	`62`	`+ tensorrt.opaque_plugin {creator_params = {output_height = 5 : i32, output_width = 5 : i32}, plugin_name = "ROIAlign_TRT", plugin_namespace = "", plugin_version = "1"}(%0, %cst, %2) : (tensor<?x?x?x?xf32>, tensor<2x4xf32>, tensor<?xi32>) -> tensor<?x?x?x?xf32>`
`63`	`63`	`""".strip() in str(`
`64`	`64`	`flat_ir.to_mlir()`
`65`	`65`	`)`
Original file line number	Diff line number	Diff line change
`@@ -174,7 +174,11 @@ def execute(self, output_devices=List[device], inputs: List["Tensor"] = []) -> L`
`174`	`174`	`# Allocate output memory and store buffer pointers.`
`175`	`175`	`outputs = [`
`176`	`176`	`create_empty_memref(`
`177`		`- shape=info.shape, dtype=info.dtype, device=info.device, stream=self.stream._active_cuda_stream`
	`177`	`+ shape=info.shape,`
	`178`	`+ dtype=info.dtype,`
	`179`	`+ device=info.device,`
	`180`	`+ stream=self.stream._active_cuda_stream,`
	`181`	`+ use_cache=False,`
`178`	`182`	`)`
`179`	`183`	`for info in out_tensor_info`
`180`	`184`	`]`