pytorch-labs
diff --git a/examples/concatenate.py b/examples/concatenate.py
Lines changed: 5 additions & 4 deletions
diff --git a/examples/cross_entropy.py b/examples/cross_entropy.py
Lines changed: 2 additions & 1 deletion
diff --git a/examples/fp8_attention.py b/examples/fp8_attention.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/jagged_dense_add.py b/examples/jagged_dense_add.py
Lines changed: 5 additions & 3 deletions
diff --git a/examples/jagged_mean.py b/examples/jagged_mean.py
Lines changed: 8 additions & 5 deletions
diff --git a/examples/matmul_split_k.py b/examples/matmul_split_k.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/moe_matmul_ogs.py b/examples/moe_matmul_ogs.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/segment_reduction.py b/examples/segment_reduction.py
Lines changed: 4 additions & 4 deletions
diff --git a/helion/_testing.py b/helion/_testing.py
Lines changed: 12 additions & 0 deletions
diff --git a/helion/ref/__init__.py b/helion/ref/__init__.py
Lines changed: 6 additions & 0 deletions
@@ -15,16 +15,17 @@ def concat2d_dim1(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     )
     for tile0, tile1 in hl.tile(out.size()):
         # Most masking is automatic in helion, but tile1 spans both x and y we need to do some manual masking
+        tile1_indices = hl.tile_index(tile1)
         x_part = hl.load(
-            x, [tile0, tile1], extra_mask=(tile1.index < x.size(1))[None, :]
+            x, [tile0, tile1], extra_mask=(tile1_indices < x.size(1))[None, :]
         )
         y_part = hl.load(
             y,
-            [tile0, tile1.index - x.size(1)],
-            extra_mask=(tile1.index >= x.size(1))[None, :],
+            [tile0, tile1_indices - x.size(1)],
+            extra_mask=(tile1_indices >= x.size(1))[None, :],
         )
         out[tile0, tile1] = torch.where(
-            (tile1.index < x.size(1))[None, :], x_part, y_part
+            (tile1_indices < x.size(1))[None, :], x_part, y_part
         )
     return out
 
 
@@ -28,7 +28,8 @@ def cross_entropy(
     for tile_n in hl.tile(n):
         # Get data for this tile
         labels_tile = labels[tile_n]  # [tile_size]
-        base_indices_tile = tile_n.index * v  # [tile_size]
+        tile_n_indices = hl.tile_index(tile_n)
+        base_indices_tile = tile_n_indices * v  # [tile_size]
 
         # Compute the actual flat indices by adding the label offset
         flat_indices = base_indices_tile + labels_tile
 
@@ -22,7 +22,7 @@ def fp8_attention_kernel(
     head_dim = q.size(2)
 
     # Output tensor with 4D shape in FP8 format
-    out = torch.empty(
+    out = torch.zeros(
         [batch, heads, seq_len, head_dim], dtype=torch.float8_e5m2, device=q.device
     )
 
 
@@ -44,17 +44,19 @@ def jagged_dense_add_2d(
     out = torch.zeros_like(y)
     for tile0 in hl.tile(num_rows):
         starts = x_offsets[tile0]
-        ends = x_offsets[tile0.index + 1]
+        tile0_indices = hl.tile_index(tile0)
+        ends = x_offsets[tile0_indices + 1]
         nnz = ends - starts
         max_nnz = nnz.amax()
         # Note, the dynamic loop bounds aren't strictly necessary for this example, since
         # the output is dense, and we iterate over the rest in the next loop. However,
         # it is useful to illustrate how more complex jagged+jagged ops can be handled.
         for tile1 in hl.tile(0, max_nnz):
+            tile1_indices = hl.tile_index(tile1)
             x_slice = hl.load(
                 x_data,
-                [starts[:, None] + tile1.index[None, :]],
-                extra_mask=tile1.index[None, :] < nnz[:, None],
+                [starts[:, None] + tile1_indices[None, :]],
+                extra_mask=tile1_indices[None, :] < nnz[:, None],
             )
             out[tile0, tile1] = y[tile0, tile1] + x_slice
         for tile1 in hl.tile(max_nnz, out.size(1)):
 
@@ -48,7 +48,8 @@ def jagged_mean_kernel(
     # Process rows in tiles
     for tile_b in hl.tile(num_rows):
         starts = x_offsets[tile_b]
-        ends = x_offsets[tile_b.index + 1]
+        tile_b_indices = hl.tile_index(tile_b)
+        ends = x_offsets[tile_b_indices + 1]
         nnz = ends - starts
         max_nnz = nnz.amax()
 
@@ -58,21 +59,23 @@ def jagged_mean_kernel(
         # Process features in tiles
         for tile_m in hl.tile(max_M):
             # Create mask for valid features
-            feature_valid = tile_m.index < feature_counts[:, None]
+            tile_m_indices = hl.tile_index(tile_m)
+            feature_valid = tile_m_indices < feature_counts[:, None]
 
             # Initialize accumulator
             row_sums = hl.zeros([tile_b, tile_m], dtype=x_data.dtype)
 
             # Process elements within each row
             for tile_k in hl.tile(0, max_nnz):
                 # Compute flattened indices
-                base_indices = starts[:, None] + tile_k.index[None, :]
+                tile_k_indices = hl.tile_index(tile_k)
+                base_indices = starts[:, None] + tile_k_indices[None, :]
                 flat_indices = (
-                    base_indices[:, :, None] * max_M + tile_m.index[None, None, :]
+                    base_indices[:, :, None] * max_M + tile_m_indices[None, None, :]
                 )
 
                 # Combined mask: valid row element AND valid feature
-                row_mask = tile_k.index[None, :] < nnz[:, None]
+                row_mask = tile_k_indices[None, :] < nnz[:, None]
                 combined_mask = row_mask[:, :, None] & feature_valid[:, None, :]
 
                 x_slice = hl.load(
 
@@ -21,7 +21,7 @@ def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     k_block = helion.next_power_of_2(helion.cdiv(k, split_k))
     for tile_m, tile_n, outer_k in hl.tile([m, n, k], block_size=[None, None, k_block]):
         acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
-        for inner_k in hl.tile(outer_k.begin, outer_k.end):
+        for inner_k in hl.tile(hl.tile_begin(outer_k), hl.tile_end(outer_k)):
             acc = torch.addmm(acc, x[tile_m, inner_k], y[inner_k, tile_n])
         hl.atomic_add(out, [tile_m, tile_n], acc)
     return out
 
@@ -46,7 +46,7 @@ def moe_matmul_ogs(
             for tile_t, tile_n in hl.tile([max_T_per_expert, N]):
                 # Get local token offsets for this tile
                 # (i.e. the tile's corresponding chunk in [0 .. max_T_per_expert-1] token range)
-                local_token_offsets = tile_t.index  # [BLOCK_T]
+                local_token_offsets = hl.tile_index(tile_t)  # [BLOCK_T]
 
                 # Create mask for valid tokens (some tiles may be partially filled)
                 token_valid = local_token_offsets < num_tokens  # bool[BLOCK_T]
 
@@ -34,14 +34,14 @@ def segmented_reduction_helion(
     for tile_e, tile_f in hl.tile([num_elements, num_features]):
         vals = input_data[tile_e, tile_f]
         idxs = indices[tile_e]
+        tile_e_indices = hl.tile_index(tile_e)
         idxs_next = hl.load(
-            indices, [tile_e.index + 1], extra_mask=tile_e.index < num_elements - 1
+            indices, [tile_e_indices + 1], extra_mask=tile_e_indices < num_elements - 1
         )
         tuple_in = (vals, idxs.float().unsqueeze(1).expand_as(vals))
         out_vals, _ = hl.associative_scan(combine_fn_helion, tuple_in, dim=0)
-        mask = (idxs != idxs_next) | (
-            tile_e.index % tile_e.block_size == tile_e.block_size - 1
-        )
+        block_size = hl.tile_block_size(tile_e)
+        mask = (idxs != idxs_next) | (tile_e_indices % block_size == block_size - 1)
         segment_vals = torch.where(mask.unsqueeze(1), out_vals, 0.0)
         hl.atomic_add(output, [idxs, tile_f], segment_vals)
     return output
 
@@ -45,6 +45,11 @@ def code_and_output(
     args: tuple[object, ...],
     **kwargs: object,
 ) -> tuple[str, object]:
+    bound = fn.bind(args)
+    if bound.ref_eager or bound.ref_compile:
+        result = fn(*args)
+        return "", result
+
     if kwargs:
         config = Config(
             **kwargs  # pyright: ignore[reportArgumentType]
@@ -306,6 +311,13 @@ def assertExpectedJournal(self, value: str) -> None:
         Note:
             Use EXPECTTEST_ACCEPT=1 environment variable to update expected outputs.
         """
+        # Skip expected code checks in ref modes since they use the exact same code as the original Helion kernel.
+        if (
+            os.environ.get("HELION_REF_EAGER") == "1"
+            or os.environ.get("HELION_REF_COMPILE") == "1"
+        ):
+            return
+
         value, expected = self._expected_journal.lookup(self.id(), value)
         self.assertMultiLineEqual(
             value,
 
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from . import hl_patch
+from . import torch_patch
+
+__all__ = ["hl_patch", "torch_patch"]
Original file line number	Diff line number	Diff line change
`@@ -15,16 +15,17 @@ def concat2d_dim1(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:`
`15`	`15`	`)`
`16`	`16`	`for tile0, tile1 in hl.tile(out.size()):`
`17`	`17`	`# Most masking is automatic in helion, but tile1 spans both x and y we need to do some manual masking`
	`18`	`+ tile1_indices = hl.tile_index(tile1)`
`18`	`19`	`x_part = hl.load(`
`19`		`- x, [tile0, tile1], extra_mask=(tile1.index < x.size(1))[None, :]`
	`20`	`+ x, [tile0, tile1], extra_mask=(tile1_indices < x.size(1))[None, :]`
`20`	`21`	`)`
`21`	`22`	`y_part = hl.load(`
`22`	`23`	`y,`
`23`		`- [tile0, tile1.index - x.size(1)],`
`24`		`- extra_mask=(tile1.index >= x.size(1))[None, :],`
	`24`	`+ [tile0, tile1_indices - x.size(1)],`
	`25`	`+ extra_mask=(tile1_indices >= x.size(1))[None, :],`
`25`	`26`	`)`
`26`	`27`	`out[tile0, tile1] = torch.where(`
`27`		`- (tile1.index < x.size(1))[None, :], x_part, y_part`
	`28`	`+ (tile1_indices < x.size(1))[None, :], x_part, y_part`
`28`	`29`	`)`
`29`	`30`	`return out`
`30`	`31`
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ def fp8_attention_kernel(`
`22`	`22`	`head_dim = q.size(2)`
`23`	`23`
`24`	`24`	`# Output tensor with 4D shape in FP8 format`
`25`		`- out = torch.empty(`
	`25`	`+ out = torch.zeros(`
`26`	`26`	`[batch, heads, seq_len, head_dim], dtype=torch.float8_e5m2, device=q.device`
`27`	`27`	`)`
`28`	`28`