Commit 2eee4cf
Replaces tp.Compiler with tp.compile

Removes the `tp.Compiler` class and replaces it with a standalone function, `tp.compile`. Also splits up the `backend/api` tests to mirror the structure of the code.
1 parent 928b541 commit 2eee4cf
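
In short, call sites move from the removed two-step `Compiler` flow to a single standalone call. A minimal before/after sketch distilled from the diffs below (the `add` function and the `InputInfo` shapes are illustrative):

```py
import tripy as tp


def add(a, b):
    return a + b


a_info = tp.InputInfo((2, 2), dtype=tp.float32)
b_info = tp.InputInfo((2, 2), dtype=tp.float32)

# Before this commit: construct a tp.Compiler, then call its compile() method.
#   compiler = tp.Compiler(add)
#   compiled_add = compiler.compile(a_info, b_info)

# After this commit: a single call; InputInfo constraints are passed via `args`.
compiled_add = tp.compile(add, args=[a_info, b_info])
print(compiled_add(tp.ones((2, 2), dtype=tp.float32), tp.ones((2, 2), dtype=tp.float32)))
```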

19 files changed: +724 −694 lines changed

tripy/README.md

Lines changed: 1 addition & 3 deletions

````diff
@@ -48,8 +48,6 @@ Tripy can also compile functions to generate efficient machine code for faster e
 def add(a, b):
     return a + b
 
-compiler = tp.Compiler(add)
-
 # When compiling, we need to specify shape and data type constraints on the inputs:
 
 # a is a 1D dynamic shape tensor of shape (d,), where `d` can range from 1 to 5.
@@ -59,7 +57,7 @@ a_info = tp.InputInfo(shape=([1, 2, 5],), dtype=tp.float32)
 # `b` is a 1D tensor of shape (1,).
 b_info = tp.InputInfo((1,), dtype=tp.float32)
 
-compiled_add = compiler.compile(a_info, b_info)
+compiled_add = tp.compile(add, args=[a_info, b_info])
 
 print(compiled_add(tp.Tensor([1., 2., 3.]), tp.Tensor([3.])))
 ```
````

tripy/docs/pre0_user_guides/00-introduction-to-tripy.md

Lines changed: 3 additions & 5 deletions

````diff
@@ -105,21 +105,19 @@ All the code we've seen so far has been using Tripy's eager mode. It is also pos
 functions or modules ahead of time, which can result in significantly better performance.
 
 *Note that the compiler imposes some requirements on the functions/modules it can compile.*
-*See {class}`tripy.Compiler` for details.*
+*See {func}`tripy.compile` for details.*
 
 Let's compile the MLP module we defined above as an example:
 
 ```py
 # doc: no-print-locals
-compiler = tp.Compiler(mlp)
-
 # When we compile, we need to indicate which parameters to the function should be runtime inputs.
 # In this case, MLP takes a single input tensor for which we can specify our desired shape and datatype.
-fast_mlp = compiler.compile(tp.InputInfo(shape=(1, 2), dtype=tp.float32))
+fast_mlp = tp.compile(mlp, args=[tp.InputInfo(shape=(1, 2), dtype=tp.float32)])
 ```
 
 It is also possible to compile for a range of possible input shapes.
-See {func}`tripy.Compiler.compile` for details.
+See {func}`tripy.compile` for details.
 
 Now let's benchmark the compiled version against eager mode:
 ```py
````

tripy/docs/pre0_user_guides/02-compiler.md

Lines changed: 13 additions & 19 deletions

````diff
@@ -28,17 +28,9 @@ inp = tp.ones((1, 2))
 out = layer(inp)
 ```
 
-Now, let's try to optimize this model for inference using Tripy's {class}`tripy.Compiler`.
+Now, let's try to optimize this model for inference using Tripy's {func}`tripy.compile`.
 
-First, let's initialize the compiler with the module we want to compile, `layer`,
-which lets the compiler know its properties, like the function signature.
-
-```py
-# doc: no-print-locals
-compiler = tp.Compiler(layer)
-```
-
-Next, we need to provide information about each input using {class}`tripy.InputInfo`.
+When we compile our module, we need to provide information about each input using {class}`tripy.InputInfo`.
 The first argument for `InputInfo` is `shape`, where we specify either the static or
 dynamic shape information for each dimension. In the example below, we assume the
 shape of `inp` is static (`(1, 2)`). The second argument specifies the `dtype` for the input:
@@ -51,7 +43,7 @@ Now, we can call the `compile` function to obtain a compiled function and use it
 
 ```py
 # doc: no-print-locals
-fast_geglu = compiler.compile(inp_info)
+fast_geglu = tp.compile(layer, args=[inp_info])
 fast_geglu(inp).eval()
 ```
 
@@ -67,7 +59,7 @@ and it should optimize for a size of 8.
 ```py
 # doc: print-locals out out_change_shape
 inp_info = tp.InputInfo(shape=((1, 8, 16), 2), dtype=tp.float32)
-fast_geglu = compiler.compile(inp_info)
+fast_geglu = tp.compile(layer, args=[inp_info])
 out = fast_geglu(inp)
 
 # Let's change the shape of input to (2, 2)
@@ -94,20 +86,23 @@ Saving an executable to disk:
 
 ```py
 # doc: no-print-locals
-import tempfile, os
-temp_dir = tempfile.mkdtemp()
-executable_file_path = os.path.join(temp_dir, "executable.json")
+import tempfile # doc: omit
+import os
+
+out_dir = tempfile.mkdtemp() # doc: omit
+executable_file_path = os.path.join(out_dir, "executable.json")
 fast_geglu.save(executable_file_path)
 ```
 
 Reading an executable and running inference:
 
 ```py
 # doc: no-print-locals
-inp = tp.Tensor([[1., 2.], [2., 3.]], dtype=tp.float32)
 loaded_fast_geglu = tp.Executable.load(executable_file_path)
+
+inp = tp.Tensor([[1., 2.], [2., 3.]], dtype=tp.float32)
 out = loaded_fast_geglu(inp)
-os.remove(executable_file_path)
+os.remove(executable_file_path) # doc: omit
 ```
 
 ### Querying Executable Properties
@@ -134,9 +129,8 @@ def add_times_two(a, b):
     print(f"c : {c}")
     return c + a + b
 
-compiler = tp.Compiler(add_times_two)
 inp_info = tp.InputInfo(shape=(1, 2), dtype=tp.float32)
-fast_myadd = compiler.compile(inp_info, inp_info)
+fast_myadd = tp.compile(add_times_two, args=[inp_info, inp_info])
 a = tp.Tensor([[1.0, 2.0]], dtype=tp.float32)
 b = tp.Tensor([[2.0, 3.0]], dtype=tp.float32)
 
````
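
The min/opt/max range syntax shown above works the same way through `tp.compile`. A minimal sketch, mirroring `test_dynamic_shapes` in the new tests further down (the `add` helper is the same two-tensor addition used there):

```py
import tripy as tp


def add(a, b):
    return a + b


# (1, 2, 3) declares min/opt/max sizes for the first dimension;
# the second dimension is fixed at 1.
dyn = tp.InputInfo(((1, 2, 3), 1), dtype=tp.float32)
compiled_add = tp.compile(add, args=[dyn, dyn])

# One executable serves every input shape inside the declared range.
print(compiled_add(tp.ones((2, 1), dtype=tp.float32), tp.ones((2, 1), dtype=tp.float32)))
print(compiled_add(tp.ones((3, 1), dtype=tp.float32), tp.ones((3, 1), dtype=tp.float32)))
```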

tripy/examples/nanogpt/example.py

Lines changed: 2 additions & 4 deletions

````diff
@@ -1,6 +1,5 @@
-
 #
-# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -99,14 +98,13 @@ def main():
 
     # Compile the model before running inference.
     compile_start_time = time.perf_counter()
-    compiler = tp.Compiler(model)
     input_shape = (
         1,
         # We can specify dynamic dimensions by using a sequence indicating the min/opt/max values that
         # a dimension should support:
         [1, len(input_ids), padded_seq_len],
     )
-    model = compiler.compile(tp.InputInfo(input_shape, dtype=tp.int32))
+    model = tp.compile(model, args=[tp.InputInfo(input_shape, dtype=tp.int32)])
     compile_end_time = time.perf_counter()
     print(f"Compilation took {compile_end_time - compile_start_time} seconds.")
 
````

Lines changed: 14 additions & 0 deletions

````diff
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
````
Lines changed: 40 additions & 0 deletions

````diff
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def add(a, b):
+    return a + b
+
+
+def sub(a, b):
+    return a - b
+
+
+def returns_non_tensor(a):
+    return "not a tensor"
+
+
+def returns_nothing(a):
+    return
+
+
+def returns_multiple_tensors(a, b):
+    return a + b, a - b
+
+
+def variadic_positional(*args):
+    pass
+
+
+def variadic_keyword(**kwargs):
+    pass
````
Lines changed: 156 additions & 0 deletions

````diff
@@ -0,0 +1,156 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import cupy as cp
+import pytest
+from tests import helper
+from tests.backend.api.conftest import *
+
+import tripy as tp
+
+
+class TestCompile:
+    # TODO (#246): Verify that it's actually compiling somehow here and below.
+    # Need to return something programmatically queryable from compile to do this.
+    def test_function(self):
+        compiled_relu = tp.compile(tp.relu, args=[tp.InputInfo((2, 2), dtype=tp.float32)])
+
+        inp = tp.ones((2, 2), dtype=tp.float32)
+        out = compiled_relu(inp)
+
+        # TODO (#225): Replace with tp.all
+        assert cp.array_equal(cp.from_dlpack(out), cp.from_dlpack(tp.relu(inp)))
+
+    def test_module(self):
+        layernorm = tp.LayerNorm(2)
+        compiled_layernorm = tp.compile(layernorm, args=[tp.InputInfo((2, 2), dtype=tp.float32)])
+
+        inp = tp.ones((2, 2), dtype=tp.float32)
+        out = compiled_layernorm(inp)
+
+        assert cp.array_equal(cp.from_dlpack(out), cp.from_dlpack(layernorm(inp)))
+
+    def test_compile_arg_order_irrelevant(self):
+        # The order of arguments we specify to `compile` should not affect the order
+        # of the arguments in the compiled function, which should just follow the order
+        # of the original function.
+        compiled_sub = tp.compile(
+            sub, kwargs=dict(b=tp.InputInfo((2, 2), dtype=tp.float32), a=tp.InputInfo((2, 2), dtype=tp.float32))
+        )
+
+        a = tp.ones((2, 2), dtype=tp.float32) * 2
+        b = tp.ones((2, 2), dtype=tp.float32)
+
+        # Compiled function should still take arguments in (a, b) order.
+        out = compiled_sub(a, b)
+        assert cp.array_equal(cp.from_dlpack(out), cp.ones((2, 2), dtype=cp.float32))
+
+    @pytest.mark.parametrize("b", [2, tp.ones((2, 2), dtype=tp.float32) * 2])
+    def test_constants_baked(self, b):
+        # Any non-InputInfo argument to compile is baked into the compiled function.
+        compiled_add = tp.compile(add, args=[tp.InputInfo((2, 2), dtype=tp.float32), b])
+
+        a = tp.zeros((2, 2), dtype=tp.float32)
+
+        out = compiled_add(a)
+
+        assert cp.array_equal(cp.from_dlpack(out), cp.ones((2, 2), dtype=cp.float32) * 2)
+
+    @pytest.mark.parametrize("func", [variadic_positional, variadic_keyword])
+    def test_variadic_arguments_rejected(self, func):
+        with helper.raises(tp.TripyException, "Variadic positional/keyword arguments are not currently supported."):
+            tp.compile(func)
+
+    @pytest.mark.parametrize("func", [returns_non_tensor, returns_nothing])
+    def test_invalid_return_rejected(self, func):
+        with helper.raises(tp.TripyException, "Function must return 1 or more Tensors"):
+            tp.compile(func, args=[tp.InputInfo((2, 2), dtype=tp.float32)])
+
+    def test_multiple_return_values(self):
+        compiled_func = tp.compile(
+            returns_multiple_tensors,
+            args=[tp.InputInfo((2, 2), dtype=tp.float32), tp.InputInfo((2, 2), dtype=tp.float32)],
+        )
+
+        a = tp.ones((2, 2), dtype=tp.float32) * 2
+        b = tp.ones((2, 2), dtype=tp.float32)
+
+        plus, minus = compiled_func(a, b)
+
+        assert cp.array_equal(cp.from_dlpack(plus), cp.ones((2, 2), dtype=cp.float32) * 3)
+        assert cp.array_equal(cp.from_dlpack(minus), cp.ones((2, 2), dtype=cp.float32))
+
+    def test_incorrect_dtype_rejected(self):
+        a = tp.ones((2, 2), dtype=tp.int32)
+
+        with helper.raises(tp.TripyException, "Unexpected tensor data type.", has_stack_info_for=[a]):
+            compiled_add = tp.compile(
+                add, args=[tp.InputInfo((2, 2), dtype=tp.float32), tp.InputInfo((2, 2), dtype=tp.float32)]
+            )
+            compiled_add(a, a)
+
+    def test_incorrect_shape_rejected(self):
+        a = tp.ones((1, 2), dtype=tp.float32)
+
+        with helper.raises(tp.TripyException, "Unexpected tensor shape.", has_stack_info_for=[a]):
+            compiled_add = tp.compile(
+                add, args=[tp.InputInfo((2, 2), dtype=tp.float32), tp.InputInfo((2, 2), dtype=tp.float32)]
+            )
+            compiled_add(a, a)
+
+    @pytest.mark.skip("TODO (#155): Re-enable once we no longer implicitly copy inputs to device")
+    def test_incorrect_device_rejected(self):
+        compiled_add = tp.compile(
+            add, args=[tp.InputInfo((2, 2), dtype=tp.float32), tp.InputInfo((2, 2), dtype=tp.float32)]
+        )
+        a = tp.copy(tp.ones((2, 2), dtype=tp.float32), device=tp.device("cpu"))
+
+        with helper.raises(tp.TripyException):
+            compiled_add(a, a)
+
+    # TODO (#244): Add multi-profile test
+    def test_dynamic_shapes(self):
+        compiled_add = tp.compile(
+            add, args=[tp.InputInfo(((1, 2, 3), 1), dtype=tp.float32), tp.InputInfo(((1, 2, 3), 1), dtype=tp.float32)]
+        )
+
+        out = compiled_add(tp.ones((2, 1), dtype=tp.float32), tp.ones((2, 1), dtype=tp.float32))
+        assert cp.array_equal(cp.from_dlpack(out), cp.ones((2, 1), dtype=cp.float32) * 2)
+
+        out = compiled_add(tp.ones((3, 1), dtype=tp.float32), tp.ones((3, 1), dtype=tp.float32))
+        assert cp.array_equal(cp.from_dlpack(out), cp.ones((3, 1), dtype=cp.float32) * 2)
+
+
+# TODO (#256): Remove these tests and replace with exhaustive integration testing
+class TestCompiledOps:
+    def test_cast(self):
+        compiled_cast = tp.compile(tp.cast, args=[tp.InputInfo((2, 2), dtype=tp.float32)], kwargs=dict(dtype=tp.int32))
+
+        a = tp.ones((2, 2), dtype=tp.float32)
+        out = compiled_cast(a)
+
+        assert cp.array_equal(cp.from_dlpack(out), cp.ones((2, 2), dtype=cp.int32))
+
+    def test_linear(self):
+        linear = tp.Linear(2, 3)
+
+        compiled_linear = tp.compile(linear, args=[tp.InputInfo((2, 2), dtype=tp.float32)])
+
+        a = tp.ones((2, 2), dtype=tp.float32)
+
+        out = compiled_linear(a)
+
+        assert cp.array_equal(cp.from_dlpack(out), cp.from_dlpack(linear(a)))
````
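
As `test_constants_baked` and `test_cast` above illustrate, any argument that is not an `InputInfo` is baked into the executable as a compile-time constant, and inputs can also be supplied through `kwargs`. A small usage sketch combining both behaviors:

```py
import tripy as tp

# `dtype=tp.int32` is not an InputInfo, so it becomes a baked-in constant:
# the compiled function takes only the one runtime tensor input.
compiled_cast = tp.compile(
    tp.cast,
    args=[tp.InputInfo((2, 2), dtype=tp.float32)],
    kwargs=dict(dtype=tp.int32),
)

out = compiled_cast(tp.ones((2, 2), dtype=tp.float32))  # result has dtype int32
```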
