Commit 3c4b6d1

fix
1 parent e695ab4 commit 3c4b6d1

8 files changed: +294 -56 lines changed

helion/_compiler/compile_environment.py

Lines changed: 60 additions & 0 deletions
@@ -112,6 +112,7 @@ def __init__(
         self.specialized_vars: set[sympy.Symbol] = set()
         self.loop_dependency_checker = LoopDependencyChecker()
         self._symint_cache: dict[object, torch.SymInt] = {}
+        self._tile_index_block_ids: dict[int, int] = {}  # id(tensor) -> block_id
         self.device_load_count = (
             0  # Track number of loads in all device code for eviction policy tuning
         )
@@ -272,6 +273,65 @@ def cached_create_unbacked_symint(
         self._symint_cache[key] = result
         return result

+    def get_tile_index_tensor_block_id(self, tensor: torch.Tensor) -> int | None:
+        """Return the originating ``tile.index`` block id if present."""
+        return self._tile_index_block_ids.get(tensor._helion_id)  # type: ignore[attr-defined]
+
+    def tensor_indexer_broadcast_shape(
+        self, tensors: typing.Sequence[torch.Tensor] | None
+    ) -> list[int | torch.SymInt] | None:
+        """Compute broadcast shape for tensor indexers, or None if not applicable."""
+        tlist = [t for t in tensors or [] if isinstance(t, torch.Tensor)]
+        if not tlist or all(self.get_tile_index_tensor_block_id(t) for t in tlist):
+            return None
+        shapes = [list(t.size()) for t in tlist]
+        if all(len(s) == 1 for s in shapes) and len(shapes) > 1:  # Cartesian
+            return [s[0] for s in shapes]
+        max_ndim = max(len(s) for s in shapes)
+        padded = [([1] * (max_ndim - len(s)) + s) for s in shapes]
+        return [
+            next((d for d in dims if self.size_hint(d) != 1), 1)
+            for dims in zip(*padded, strict=True)
+        ]
+
+    def tensor_indexer_dims(
+        self, indexer_tensor: torch.Tensor, base_dim_size: int | torch.SymInt
+    ) -> list[int | torch.SymInt]:
+        """Return dims contributed by a tensor indexer (non-broadcast case)."""
+        dims = list(indexer_tensor.size())
+        non_bc = [d for d in dims if self.size_hint(d) != 1]
+        if len(non_bc) > 1:
+            return typing.cast("list[int | torch.SymInt]", dims)
+        bid = (
+            self.get_tile_index_tensor_block_id(indexer_tensor)
+            or (self.get_block_id(base_dim_size) if base_dim_size else None)
+            or (self.get_block_id(non_bc[0]) if non_bc else None)
+        )
+        return (
+            [self.block_sizes[bid].var]
+            if bid
+            else (typing.cast("list[int | torch.SymInt]", non_bc) or [1])
+        )
+
+    def new_index_result(
+        self, tensor: torch.Tensor, output_shape: typing.Sequence[int | torch.SymInt]
+    ) -> torch.Tensor:
+        """Create tensor for indexing ops, preserving tile index provenance."""
+        shape = list(output_shape)
+        non_bc = [i for i, s in enumerate(shape) if self.size_hint(s) != 1]
+        bid = self.get_tile_index_tensor_block_id(tensor)
+        if bid is None:
+            bids = {self.get_block_id(shape[i]) for i in non_bc} - {None}
+            bid = bids.pop() if len(bids) == 1 else None
+        if bid and len(non_bc) == 1:
+            shape[non_bc[0]] = self.block_sizes[bid].var
+        elif len(non_bc) > 1:
+            bid = None
+        result = tensor.new_empty(shape)
+        if bid is not None:
+            self._tile_index_block_ids[result._helion_id] = bid  # type: ignore[attr-defined]
+        return result
+
     def to_fake(self, obj: object, origin: Origin) -> object:
         if obj is None:
             return None
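
Note: the broadcast rule implemented by `tensor_indexer_broadcast_shape` above is ordinary right-aligned broadcasting over the indexer shapes, plus a special Cartesian case when several 1-D indexers are given. The standalone sketch below mirrors that rule with plain ints standing in for the `size_hint`/`SymInt` handling; the helper name `broadcast_indexer_shape` is illustrative only and is not part of this commit.

import torch


def broadcast_indexer_shape(tensors: list[torch.Tensor]) -> list[int] | None:
    """Sketch of the broadcast-shape rule used for tensor indexers."""
    if not tensors:
        return None
    shapes = [list(t.size()) for t in tensors]
    # Cartesian case: several 1-D indexers each contribute their own dim.
    if len(shapes) > 1 and all(len(s) == 1 for s in shapes):
        return [s[0] for s in shapes]
    # General case: right-align shapes, pad with 1s, take the non-1 size per dim.
    max_ndim = max(len(s) for s in shapes)
    padded = [[1] * (max_ndim - len(s)) + s for s in shapes]
    return [next((d for d in dims if d != 1), 1) for dims in zip(*padded)]


# A 2-D indexer broadcast against a 1-D indexer -> [4, 8]
print(broadcast_indexer_shape([torch.zeros(4, 8, dtype=torch.long),
                               torch.zeros(8, dtype=torch.long)]))
# Cartesian: two 1-D indexers -> [4, 8]
print(broadcast_indexer_shape([torch.zeros(4, dtype=torch.long),
                               torch.zeros(8, dtype=torch.long)]))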

helion/_compiler/indexing_strategy.py

Lines changed: 140 additions & 36 deletions
@@ -575,8 +575,13 @@ def compute_shape(
         input_size = collections.deque(tensor.size())
         output_size = []
         env = CompileEnvironment.current()
+
+        tensors = [k for k in index if isinstance(k, torch.Tensor)]
+        broadcast_shape = env.tensor_indexer_broadcast_shape(tensors)
+        first_broadcast_tensor_idx: int | None = None
+
         k_index = 0
-        for k in index:
+        for position, k in enumerate(index):
             if k is None:
                 output_size.append(1)
             elif isinstance(k, int):
@@ -617,11 +622,13 @@ def compute_shape(
                 else:
                     output_size.append(1)
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and (
-                k.ndim == 1 or (len(index) == 1 and tensor.ndim == 1)
-            ):
-                input_size.popleft()
-                output_size.extend(k.size())
+            elif isinstance(k, torch.Tensor):
+                base_dim = input_size.popleft()
+                if broadcast_shape is None:
+                    output_size.extend(env.tensor_indexer_dims(k, base_dim))
+                elif first_broadcast_tensor_idx is None:
+                    output_size.extend(broadcast_shape)
+                    first_broadcast_tensor_idx = position
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(k)
@@ -667,13 +674,115 @@ def create(
         output_size = SubscriptIndexing.compute_shape(fake_value, index, state)
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
+        all_tensors = [k for k in index if isinstance(k, torch.Tensor)]
+        broadcast_shape = env.tensor_indexer_broadcast_shape(all_tensors)
+        tensor_shapes = [list(t.size()) for t in all_tensors]
+        first_tensor_idx = 0
+        tensor_count = 0
         if dtype == "tl.int32" and SubscriptIndexing._needs_int64(fake_value):
             raise exc.IndexOffsetOutOfRangeForInt32(env.index_dtype)

         def _is_size_one(size: int | torch.SymInt) -> bool:
             return env.known_equal(size, 1)

         k_index = 0
+
+        def tensor_index_source_and_mask(
+            index_elem: torch.Tensor, index_var: str, pos: int
+        ) -> tuple[str, int | None]:
+            tile_id = env.get_tile_index_tensor_block_id(index_elem)
+            src = state.codegen.index_var(tile_id) if tile_id else index_var
+            mask_id = tile_id or (
+                env.get_block_id(output_size[pos]) if pos < len(output_size) else None
+            )
+            return src, mask_id
+
+        def handle_broadcast_tensor(
+            position: int, index_elem: torch.Tensor, index_var: str
+        ) -> None:
+            """Handle tensor index with broadcast shape (cartesian or general)."""
+            nonlocal first_tensor_idx, output_idx, tensor_count, k_index
+            assert broadcast_shape is not None
+            dims = len(broadcast_shape)
+            if tensor_count == 0:
+                first_tensor_idx = output_idx
+                output_idx += dims
+
+            shape = (
+                tensor_shapes[tensor_count]
+                if tensor_count < len(tensor_shapes)
+                else [1]
+            )
+            # Cartesian: multiple 1D tensors each contributing one dim
+            is_cartesian = (
+                dims >= 2
+                and len(tensor_shapes) == dims
+                and all(
+                    len(s) == 1 or sum(1 for d in s if env.size_hint(d) != 1) <= 1
+                    for s in tensor_shapes
+                )
+            )
+            # Find position(s) where this tensor contributes
+            offset = max(0, dims - len(shape))
+            contrib = [
+                first_tensor_idx + offset + i
+                for i, d in enumerate(shape)
+                if env.size_hint(d) != 1
+            ]
+            pos = (
+                first_tensor_idx + tensor_count
+                if is_cartesian
+                else (
+                    contrib[0]
+                    if contrib
+                    else max(
+                        0,
+                        min(
+                            first_tensor_idx + offset + len(shape) - 1,
+                            len(output_size) - 1,
+                        ),
+                    )
+                )
+            )
+            # Generate index expression
+            if is_cartesian or len(contrib) <= 1:
+                src, mask_id = tensor_index_source_and_mask(index_elem, index_var, pos)
+                expand = (
+                    tile_strategy.expand_str(output_size, pos)
+                    if index_elem.ndim == 1
+                    else ""
+                )
+                index_values.append(f"({src}){expand}")
+                if (
+                    tensor_count == 0
+                    and mask_id
+                    and (mv := state.codegen.mask_var(mask_id))
+                ):
+                    if not _is_size_one(fake_value.size(len(index_values) - 1)):
+                        mask_values.setdefault(f"({mv}){expand}")
+            else:
+                index_values.append(f"({index_var})")
+                if tensor_count == 0:
+                    for p in contrib:
+                        if p < len(output_size) and (
+                            bid := env.get_block_id(output_size[p])
+                        ):
+                            if (mv := state.codegen.mask_var(bid)) and not _is_size_one(
+                                fake_value.size(len(index_values) - 1)
+                            ):
+                                mask_values.setdefault(
+                                    f"({mv}){tile_strategy.expand_str(output_size, p)}"
+                                )
+            # Padded iota mask
+            if (
+                orig_len := _get_padded_iota_original_length(state, position)
+            ) is not None:
+                mask_values.setdefault(
+                    f"(({index_var} < {orig_len}){tile_strategy.expand_str(output_size, first_tensor_idx + tensor_count)})"
+                )
+            tensor_count += 1
+            k_index += 1
+
         for n, k in enumerate(index):
             if k is None:
                 output_idx += 1
@@ -752,40 +861,35 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                 index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(output_size, output_idx)
+            elif isinstance(k, torch.Tensor):
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == len(index)
                 index_var = state.codegen.lift(ast_index[n], prefix="index").id
-                index_values.append(f"({index_var}){expand}")
-                if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
-                # Check if this index comes from a padded hl.arange and generate mask
-                if (
-                    original_length := _get_padded_iota_original_length(state, n)
-                ) is not None:
-                    mask_values.setdefault(f"({index_var} < {original_length}){expand}")
-                output_idx += 1
-                k_index += 1
-            elif (
-                isinstance(k, torch.Tensor) and len(index) == 1 and fake_value.ndim == 1
-            ):
-                # TODO(jansel): combine this case with the above
-                ast_index = state.ast_args[1]
-                assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == 1
-                index_var = state.codegen.lift(ast_index[0], prefix="index").id
-                index_values.append(index_var)
-                output_idx += k.ndim
-                for n, s in enumerate(output_size):
-                    if (block_idx := env.get_block_id(s)) is not None and (
-                        mask := state.codegen.mask_var(block_idx)
+
+                # Use broadcast handling for: multiple tensors, or single tensor with ndim > 1
+                if broadcast_shape is not None and (len(all_tensors) > 1 or k.ndim > 1):
+                    handle_broadcast_tensor(n, k, index_var)
+                    continue
+
+                index_source, mask_block_id = tensor_index_source_and_mask(
+                    k, index_var, output_idx
+                )
+
+                expand = (
+                    tile_strategy.expand_str(output_size, output_idx)
+                    if k.ndim < len(output_size)
+                    else ""
+                )
+                index_values.append(f"({index_source}){expand}")
+                if mask_block_id is not None:
+                    mask_var = state.codegen.mask_var(mask_block_id)
+                    if mask_var and not _is_size_one(
+                        fake_value.size(len(index_values) - 1)
                     ):
-                        mask_values.setdefault(
-                            f"({mask}){tile_strategy.expand_str(output_size, n)}"
-                        )
+                        mask_values.setdefault(f"({mask_var}){expand}")
+
+                output_idx += k.ndim
+                tensor_count += 1
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(type(k))
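
Note: for intuition, the two situations `handle_broadcast_tensor` distinguishes correspond to familiar eager-mode indexing shapes. The sketch below is plain PyTorch used only to illustrate the output shapes involved (the Cartesian case matches the "multiple 1D tensors each contributing one dim" comment above); it is not the Triton code this path emits.

import torch

# Eager-mode analogue of the two shapes handle_broadcast_tensor distinguishes.
x = torch.arange(64, dtype=torch.float32).reshape(8, 8)

# General broadcast case: a single 2-D indexer contributes all of its dims.
idx2d = torch.tensor([[0, 1], [2, 3]])
assert x[idx2d, 0].shape == (2, 2)

# Cartesian case: the eager equivalent of several 1-D indexers each
# contributing one output dim is outer-product indexing via unsqueezing.
rows = torch.tensor([0, 1, 2])
cols = torch.tensor([4, 5])
assert x[rows[:, None], cols[None, :]].shape == (3, 2)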

helion/_compiler/type_propagation.py

Lines changed: 46 additions & 8 deletions
@@ -460,7 +460,10 @@ def _device_indexing_size(self, key: TypeInfo) -> list[int | torch.SymInt]:
         inputs_consumed = 0
         output_sizes = []
         env = CompileEnvironment.current()
-        for k in keys:
+        tensor_indexers = [k.fake_value for k in keys if isinstance(k, TensorType)]
+        broadcast_shape = env.tensor_indexer_broadcast_shape(tensor_indexers)
+        first_broadcast_tensor_idx: int | None = None
+        for position, k in enumerate(keys):
             if isinstance(k, LiteralType):
                 if isinstance(k.value, (int, torch.SymInt)):
                     inputs_consumed += 1
@@ -505,9 +508,19 @@ def _device_indexing_size(self, key: TypeInfo) -> list[int | torch.SymInt]:
                 raise exc.DataDependentOutputShapeNotSupported(
                     op_desc="Boolean mask indexing (tensor[boolean_mask])"
                 )
-            elif isinstance(k, TensorType) and k.fake_value.ndim == 1:
+            elif isinstance(k, TensorType):
+                base_dim_size = self.fake_value.size(inputs_consumed)
                 inputs_consumed += 1
-                output_sizes.append(k.fake_value.size(0))
+                if broadcast_shape is None:
+                    output_sizes.extend(
+                        env.tensor_indexer_dims(
+                            k.fake_value,
+                            base_dim_size,
+                        )
+                    )
+                elif first_broadcast_tensor_idx is None:
+                    output_sizes.extend(broadcast_shape)
+                    first_broadcast_tensor_idx = position
             elif k.contains_type(TileIndexType):
                 raise exc.OverpackedTile(k)
             else:
@@ -553,9 +566,11 @@ def propagate_getitem(self, key: TypeInfo, origin: Origin) -> TypeInfo:
             raise exc.TypeInferenceError(
                 f"Subscript not supported on {self!s} with key={key!s}"
             ) from None
-        return TensorType(
-            origin, self.fake_value.new_empty(self._device_indexing_size(key))
-        )
+        new_sizes = self._device_indexing_size(key)
+        env = CompileEnvironment.current()
+        new_fake = env.new_index_result(self.fake_value, new_sizes)
+
+        return TensorType(origin, new_fake)

     def merge(self, other: TypeInfo, var_name: str | None = None) -> TypeInfo:
         if isinstance(other, TensorType):
@@ -2143,8 +2158,31 @@ def visit_NamedExpr(self, node: ast.NamedExpr) -> TypeInfo:
         return type_info

     def visit_Subscript(self, node: ast.Subscript) -> TypeInfo:
-        value_type = self.visit(node.value)
-        slice_type = self.visit(node.slice)
+        value_type, slice_type = self.visit(node.value), self.visit(node.slice)
+        # In device loops, check for overpacked tiles and rank mismatch
+        if self.device_loop_depth > 0 and isinstance(value_type, TensorType):
+            keys = (
+                slice_type.unpack()
+                if isinstance(slice_type, SequenceType)
+                else [slice_type]
+            )
+            consumed, has_tensor = 0, False
+            for k in keys:
+                if k.contains_type(TileIndexType) and not isinstance(k, TileIndexType):
+                    raise exc.OverpackedTile(k)
+                if isinstance(k, TensorType):
+                    has_tensor, consumed = True, consumed + 1
+                elif isinstance(k, SliceType) or (
+                    isinstance(k, (LiteralType, SymIntType, TileIndexType))
+                    and not (isinstance(k, LiteralType) and k.value is None)
+                ):
+                    consumed += 1
+            if not has_tensor and consumed < value_type.fake_value.ndim:
+                raise exc.RankMismatch(
+                    value_type.fake_value.ndim,
+                    consumed,
+                    f"tensor shape: {tuple(value_type.fake_value.shape)}, indexed {consumed} dimensions",
+                )
         return value_type.propagate_getitem(slice_type, self.origin())

     def visit_Slice(self, node: ast.Slice) -> TypeInfo:
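
Note: the rank check added to `visit_Subscript` above counts how many input dimensions a subscript consumes and raises `RankMismatch` when a tensor-free subscript covers fewer dims than the value has. A simplified standalone sketch of that check on plain Python values follows; the `check_rank` helper is hypothetical and stands in for the `TypeInfo`-based logic.

import torch


def check_rank(tensor: torch.Tensor, keys: tuple[object, ...]) -> None:
    """Sketch of the rank check: each non-None key consumes one input dim;
    a subscript with no tensor indexer must consume every dim."""
    consumed = sum(1 for k in keys if k is not None)
    has_tensor = any(isinstance(k, torch.Tensor) for k in keys)
    if not has_tensor and consumed < tensor.ndim:
        raise ValueError(
            f"rank mismatch: tensor shape {tuple(tensor.shape)}, "
            f"indexed {consumed} of {tensor.ndim} dimensions"
        )


x = torch.zeros(4, 8)
check_rank(x, (slice(None), 0))      # ok: both dims indexed
check_rank(x, (torch.tensor([0]),))  # ok: tensor indexers are exempt
try:
    check_rank(x, (0,))              # raises: only one of two dims indexed
except ValueError as e:
    print(e)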
