Commit 6cd21fb

initial version

1 parent 2882b3b commit 6cd21fb

7 files changed
+364 -46 lines changed

helion/_compiler/compile_environment.py

Lines changed: 131 additions & 0 deletions
@@ -72,6 +72,12 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self.specialized_vars: set[sympy.Symbol] = set()
         self.loop_dependency_checker = LoopDependencyChecker()
         self._symint_cache: dict[object, torch.SymInt] = {}
+
+        # Track tile.index tensors to preserve their block_id through broadcast indexing operations.
+        # When tile.index creates indices [0,1,2...] for a tiled dimension, we map the tensor to its
+        # block_id. This origin is preserved through ops like tensor[:, None] so the symbolic size is maintained.
+        self._tile_index_tensor_to_block_id_map: dict[int, int] = {}  # unique_tensor_id -> block_id
+        self._next_tensor_id = 0  # Counter for generating unique tensor IDs
 
     def add_kernel_tensor_size(self, sizes: Sequence[int | torch.SymInt]) -> None:
         from .device_function import contains_only_block_size_symbols
@@ -142,6 +148,30 @@ def allocate_reduction_dimension(self, size: torch.SymInt | int) -> BlockSizeInf
             if rdim.reduction and rdim.size == size:
                 return rdim
 
+        # Check if size matches any tile dimension for symbolic equality.
+        # When building expressions that mix sizes derived from tiles
+        # (e.g., via slicing) with sizes coming directly from tile block vars, we
+        # want them to share the same SymInt variable whenever they are equal by
+        # construction. This preserves equality in the shape environment and avoids
+        # spurious "size mismatch" issues during fake-tensor broadcasting and
+        # arithmetic in type propagation.
+        if isinstance(size, torch.SymInt):
+            size_str = str(size)
+            for block_info in self.block_sizes:
+                if not block_info.reduction and str(block_info.var) == size_str:
+                    # Create reduction dimension with the same var to preserve
+                    # symbolic equality and ensure all later users see identical
+                    # symbols (rather than equal-but-distinct SymInts).
+                    rdim_idx = self.allocate_block_size(
+                        size,
+                        reduction=True,
+                        source=ReductionLoopBlockSizeSource(
+                            reduction_loop=len([b for b in self.block_sizes if b.reduction])
+                        ),
+                    )
+                    self.block_sizes[rdim_idx].var = block_info.var
+                    return self.block_sizes[rdim_idx]
+
         # Allocate a new reduction dimension
         rdim_idx = self.allocate_block_size(
             size,
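The reuse above relies on two SymInts that print the same string referring to the same block-size symbol. A self-contained sketch of that idea, using plain strings in place of SymInts and a hypothetical FakeBlockInfo stand-in for BlockSizeInfo:

from dataclasses import dataclass

@dataclass
class FakeBlockInfo:  # hypothetical stand-in for BlockSizeInfo
    var: str          # stand-in for the SymInt block-size symbol
    reduction: bool

def allocate_reduction(block_sizes: list[FakeBlockInfo], size: str) -> FakeBlockInfo:
    # Reuse an existing reduction dim of the same size (the context lines above).
    for info in block_sizes:
        if info.reduction and info.var == size:
            return info
    # New behavior: a reduction dim matching a tile dim reuses that dim's symbol,
    # so later shape comparisons see literally the same symbol.
    for info in block_sizes:
        if not info.reduction and info.var == size:
            rdim = FakeBlockInfo(var=info.var, reduction=True)
            block_sizes.append(rdim)
            return rdim
    # Otherwise allocate a fresh reduction dimension.
    rdim = FakeBlockInfo(var=size, reduction=True)
    block_sizes.append(rdim)
    return rdim

blocks = [FakeBlockInfo("block_size_0", reduction=False)]
assert allocate_reduction(blocks, "block_size_0").var == "block_size_0"
assert blocks[1].reduction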
@@ -203,6 +233,107 @@ def cached_create_unbacked_symint(
         self._symint_cache[key] = result
         return result
 
+
+    def register_tile_index_tensor_block_id(self, tensor: torch.Tensor, block_id: int) -> None:
+        """Register a tensor as originating from a specific tile block.
+
+        This is called when tile.index creates a 1D tensor of indices for a
+        specific tiled dimension. The tensor represents indices [0, 1, 2, ...]
+        for ONE dimension that is being tiled.
+
+        Args:
+            tensor: A 1D tensor created by tile.index containing indices for
+                a single tiled dimension
+            block_id: The block ID representing the tiled dimension this tensor
+                corresponds to. This is NOT a multi-dimensional concept -
+                each tile.index tensor tracks exactly one dimension.
+
+        Example:
+            When tiling x.size(0) with block_id=3:
+            - tile.index creates tensor([0, 1, 2, ..., block_size-1])
+            - This tensor is registered with block_id=3
+            - Later, when this tensor is used as an indexer, we know the
+              output should have the symbolic size from block_id=3
+        """
+        # Assign a unique ID to this tensor
+        tensor_id = self._next_tensor_id
+        self._next_tensor_id += 1
+
+        # Store the mapping from this tile.index tensor to its dimension's block_id
+        self._tile_index_tensor_to_block_id_map[tensor_id] = block_id
+
+        # Tag the tensor with its unique ID
+        tensor._tile_index_tensor_id = tensor_id
+
+    def get_tile_index_tensor_block_id(self, tensor: torch.Tensor) -> int | None:
+        """Get the block_id for a tensor if it originated from tile.index.
+
+        Returns the block_id of the single dimension this index tensor represents,
+        or None if this tensor didn't originate from tile.index.
+        """
+        # Check if tensor has our unique ID tag
+        tensor_id = getattr(tensor, '_tile_index_tensor_id', None)
+        if tensor_id is None:
+            return None
+        return self._tile_index_tensor_to_block_id_map.get(tensor_id)
+
+    def is_tile_index_tensor(self, tensor: torch.Tensor) -> bool:
+        """Check if a tensor originated from a tile.index operation."""
+        tensor_id = getattr(tensor, '_tile_index_tensor_id', None)
+        if tensor_id is None:
+            return False
+        # If tensor has an ID, it must be in the map
+        assert tensor_id in self._tile_index_tensor_to_block_id_map
+        return True
+
+    def preserve_tile_index_tensor_block_id(
+        self,
+        input_tensor: torch.Tensor,
+        output_tensor: torch.Tensor,
+        output_shape: list[int | torch.SymInt]
+    ) -> None:
+        """Preserve tile.index tensor's origin block id through broadcast-only view operations.
+
+        Note: Caller must check is_tile_index_tensor() before calling this method.
+        """
+        # Get the block_id from input tensor
+        input_tensor_id = getattr(input_tensor, '_tile_index_tensor_id')
+        src_block_id = self._tile_index_tensor_to_block_id_map[input_tensor_id]
+
+        # Only preserve for broadcast-only views (at most one non-1 dimension)
+        non_broadcast_dims = [i for i, s in enumerate(output_shape) if self.size_hint(s) != 1]
+        if len(non_broadcast_dims) <= 1:
+            # Register the output tensor with the same block_id
+            self.register_tile_index_tensor_block_id(output_tensor, src_block_id)
+            # Ensure the non-broadcast dimension uses the correct symbol
+            if non_broadcast_dims and src_block_id < len(self.block_sizes):
+                output_shape[non_broadcast_dims[0]] = self.block_sizes[src_block_id].var
+
+    def get_indexer_output_size(
+        self,
+        indexer_tensor: torch.Tensor,
+        base_dim_size: int | torch.SymInt | None
+    ) -> int | torch.SymInt | list:
+        """Get the output size for a tensor indexer, preserving tile.index tensor's origin block id."""
+        dims = list(indexer_tensor.size())
+        non_broadcast_dims = [d for d in dims if self.size_hint(d) != 1]
+
+        # Multi-dimensional indexer - return full shape
+        if len(non_broadcast_dims) > 1:
+            return dims
+
+        # Try to find block_id from different sources in order
+        if block_id := self.get_tile_index_tensor_block_id(indexer_tensor):
+            return self.block_sizes[block_id].var
+
+        if base_dim_size and (block_id := self.get_block_id(base_dim_size)):
+            return self.block_sizes[block_id].var
+
+        if non_broadcast_dims and (block_id := self.get_block_id(non_broadcast_dims[0])):
+            return self.block_sizes[block_id].var
+
+        return non_broadcast_dims[0] if non_broadcast_dims else 1
+
     def to_fake(self, obj: object, origin: Origin) -> object:
         if isinstance(obj, torch.Tensor):
             return self._to_fake_tensor(obj, origin.to_source())
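A self-contained sketch of the tagging scheme these helpers implement (module-level globals here only for illustration; in the diff the state lives on CompileEnvironment): each tile.index tensor gets a unique id attribute, and a side table maps that id to the originating block_id.

import torch

_tile_index_map: dict[int, int] = {}  # unique tensor id -> block_id
_next_id = 0

def register(tensor: torch.Tensor, block_id: int) -> None:
    global _next_id
    tensor._tile_index_tensor_id = _next_id  # tag the tensor object itself
    _tile_index_map[_next_id] = block_id     # remember which tiled dim it indexes
    _next_id += 1

def origin_block_id(tensor: torch.Tensor) -> int | None:
    tid = getattr(tensor, "_tile_index_tensor_id", None)
    return None if tid is None else _tile_index_map.get(tid)

idx = torch.arange(8)                  # stand-in for what tile.index produces
register(idx, 3)                       # the dimension tiled with block_id=3
assert origin_block_id(idx) == 3
view = idx[:, None]                    # a broadcast view is a new tensor object...
assert origin_block_id(view) is None   # ...so the tag is lost unless re-registered,
register(view, 3)                      # which is what preserve_tile_index_tensor_block_id does
assert origin_block_id(view) == 3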

helion/_compiler/indexing_strategy.py

Lines changed: 129 additions & 15 deletions
@@ -490,11 +490,20 @@ def compute_shape(
                     output_size.append(rdim.var)
                 else:
                     output_size.append(1)
-            elif isinstance(k, torch.Tensor) and (
-                k.ndim == 1 or (len(index) == 1 and tensor.ndim == 1)
-            ):
-                input_size.popleft()
-                output_size.extend(k.size())
+            elif isinstance(k, torch.Tensor):
+                # Advanced tensor indexer: consume one base dim and splice indexer shape.
+                base_dim = input_size.popleft()
+                dims = list(k.size())
+                non_broadcast_dims = [d for d in dims if env.size_hint(d) != 1]
+
+                # Multi-d indexer contributes its own shape
+                if len(non_broadcast_dims) > 1:
+                    output_size.extend(dims)
+                    continue
+
+                # Single or broadcast-only indexer - use origin tracking helper
+                size = env.get_indexer_output_size(k, base_dim)
+                output_size.append(size)
             else:
                 raise exc.InvalidIndexingType(k)
         assert len(input_size) == 0, "invalid subscript"
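Independent of Helion types, the shape rule this branch implements can be stated as a small pure function (a simplified sketch, using concrete ints where the real code carries SymInts and origin tracking):

def indexer_contribution(indexer_shape: list[int]) -> list[int]:
    # One base dimension is always consumed; what replaces it depends on the indexer.
    non_broadcast = [d for d in indexer_shape if d != 1]
    if len(non_broadcast) > 1:
        return list(indexer_shape)  # true multi-d indexer splices its whole shape
    return [non_broadcast[0] if non_broadcast else 1]  # broadcast-only: one size

assert indexer_contribution([8]) == [8]        # plain 1D indexer
assert indexer_contribution([8, 1]) == [8]     # broadcast-only view such as tile.index[:, None]
assert indexer_contribution([4, 4]) == [4, 4]  # genuine 2D gather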
@@ -514,6 +523,7 @@ def create(
         output_size = SubscriptIndexing.compute_shape(fake_value, index)
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
+
         for n, k in enumerate(index):
             if k is None:
                 output_idx += 1
@@ -573,16 +583,16 @@
                 else:
                     index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
-            elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(output_size, output_idx)
-                ast_index = state.ast_args[1]
-                assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == len(index)
-                index_var = state.codegen.lift(ast_index[n], prefix="index").id
-                index_values.append(f"({index_var}){expand}")
-                if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
+            elif isinstance(k, torch.Tensor) and (
+                k.ndim == 1
+                or sum(CompileEnvironment.current().size_hint(d) != 1 for d in k.size())
+                <= 1
+            ):
+                # Broadcast-only 1D indexer
+                SubscriptIndexing._handle_broadcast_indexer(
+                    k, n, output_size, output_idx, index,
+                    state, tile_strategy, index_values, mask_values, env
+                )
                 output_idx += 1
             elif (
                 isinstance(k, torch.Tensor) and len(index) == 1 and fake_value.ndim == 1
@@ -601,6 +611,24 @@
                     mask_values.setdefault(
                         f"({mask}){tile_strategy.expand_str(output_size, n)}"
                     )
+            elif isinstance(k, torch.Tensor) and k.ndim > 1 and len(index) > 1:
+                # Multi-dimensional tensor indexer combined with other indices
+                non_broadcast_dims = [dim for dim in k.size() if env.size_hint(dim) != 1]
+
+                if len(non_broadcast_dims) <= 1:
+                    # Broadcast-only multi-dim indexer: treat as single dimension
+                    SubscriptIndexing._handle_broadcast_indexer(
+                        k, n, output_size, output_idx, index,
+                        state, tile_strategy, index_values, mask_values, env
+                    )
+                    output_idx += 1
+                else:
+                    # True multi-dim indexer: handle all dims at once
+                    SubscriptIndexing._handle_multidim_indexer(
+                        k, n, output_size, output_idx, index,
+                        state, tile_strategy, index_values, mask_values, env
+                    )
+                    output_idx += k.ndim
             else:
                 raise exc.InvalidIndexingType(type(k))
         assert len(output_size) == output_idx
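For reference, the expand strings these branches pass around are brackets like "[None, :, None]". A hypothetical, simplified stand-in for how tile_strategy.expand_str is consumed here (the real implementation may differ in details):

def expand_str(output_rank: int, axis: int) -> str:
    # Place one value axis at `axis` and broadcast (None) everywhere else.
    tokens = [":" if i == axis else "None" for i in range(output_rank)]
    return "" if tokens == [":"] else f"[{', '.join(tokens)}]"

assert expand_str(3, 1) == "[None, :, None]"
assert expand_str(1, 0) == ""  # rank-1 output needs no broadcasting bracket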
@@ -618,10 +646,96 @@
         if extra_mask is not None:
             mask_values.setdefault("{_extra_mask}")
             kwargs["_extra_mask"] = extra_mask
+
         return SubscriptIndexing(
             expr_from_string("+".join(index_expr)),
             expr_from_string("&".join(mask_values) or "None", **kwargs),
         )
+
+    @staticmethod
+    def _handle_broadcast_indexer(
+        k: torch.Tensor, n: int, output_size: list, output_idx: int, index: list,
+        state: CodegenState, tile_strategy: Any, index_values: list,
+        mask_values: dict, env: CompileEnvironment
+    ) -> None:
+        """Handle broadcast-only tensor indexer (all dims but one are size 1)."""
+        expand = tile_strategy.expand_str(output_size, output_idx)
+
+        # Try to get tile.index tensor's origin block_id
+        tile_origin_block_id = env.get_tile_index_tensor_block_id(k)
+
+        if tile_origin_block_id is not None:
+            # Use the tile_index tensor's block id directly
+            index_var = state.codegen.index_var(tile_origin_block_id)
+            index_values.append(f"({index_var}){expand}")
+            if (mask := state.codegen.mask_var(tile_origin_block_id)) is not None:
+                mask_values.setdefault(f"({mask}){expand}")
+        else:
+            # Lift AST to preserve expressions like tile.index + 1
+            ast_index = state.ast_args[1]
+            assert isinstance(ast_index, (list, tuple))
+            assert len(ast_index) == len(index)
+            lifted = state.codegen.lift(ast_index[n], prefix="index").id
+            index_values.append(f"({lifted}){expand}")
+            # Even if we lift, we still know the block-id for this axis from output_size
+            output_block_id = env.get_block_id(output_size[output_idx])
+            if output_block_id is not None and (mask := state.codegen.mask_var(output_block_id)) is not None:
+                mask_values.setdefault(f"({mask}){expand}")
+
+    @staticmethod
+    def _handle_multidim_indexer(
+        k: torch.Tensor, n: int, output_size: list, output_idx: int, index: list,
+        state: CodegenState, tile_strategy: Any, index_values: list,
+        mask_values: dict, env: CompileEnvironment
+    ) -> None:
+        """Handle multi-dimensional tensor indexer."""
+        # Lift the indexer once
+        ast_index = state.ast_args[1]
+        assert isinstance(ast_index, (list, tuple))
+        assert len(ast_index) == len(index)
+        index_var = state.codegen.lift(ast_index[n], prefix="index").id
+
+        # Build merged broadcast bracket for all dims
+        # Start with first dimension
+        base = tile_strategy.expand_str(output_size, output_idx)
+        if base == "":
+            tokens = []
+        else:
+            assert base.startswith("[") and base.endswith("]"), base
+            tokens = base[1:-1].split(", ") if len(base) > 2 else []
+
+        # Merge with other dimensions
+        for d in range(1, k.ndim):
+            s = tile_strategy.expand_str(output_size, output_idx + d)
+            if s == "":
+                s_tokens = [":"]
+            else:
+                assert s.startswith("[") and s.endswith("]"), s
+                s_tokens = s[1:-1].split(", ") if len(s) > 2 else []
+
+            # Merge tokens: use ':' if either has ':', else 'None'
+            if not tokens:
+                tokens = s_tokens
+            elif s_tokens:
+                tokens = [
+                    ":" if (a == ":" or b == ":") else "None"
+                    for a, b in zip(tokens, s_tokens, strict=True)
+                ]
+
+        if tokens == [":"] or not tokens:
+            bracket = ""
+        else:
+            bracket = f"[{', '.join(tokens)}]"
+
+        index_values.append(f"({index_var}){bracket}")
+
+        # Add mask contributions for each output dim
+        for d in range(k.ndim):
+            if (block_idx := env.get_block_id(output_size[output_idx + d])) is not None:
+                if mask := state.codegen.mask_var(block_idx):
+                    mask_values.setdefault(
+                        f"({mask}){tile_strategy.expand_str(output_size, output_idx + d)}"
+                    )
 
 
 @dataclasses.dataclass
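A self-contained sketch of the bracket-merging rule used by _handle_multidim_indexer: per output position, ':' wins over 'None', so the per-dimension brackets of a multi-dim indexer collapse into one subscript (simplified relative to the diff, which also special-cases the first empty bracket):

def merge_brackets(brackets: list[str]) -> str:
    tokens: list[str] = []
    for b in brackets:
        parts = b[1:-1].split(", ") if b else [":"]
        if not tokens:
            tokens = parts
        elif parts:
            # ':' in either operand keeps the axis; otherwise it stays broadcast.
            tokens = [":" if (a == ":" or p == ":") else "None"
                      for a, p in zip(tokens, parts, strict=True)]
    return "" if tokens in ([], [":"]) else f"[{', '.join(tokens)}]"

assert merge_brackets(["[:, None]", "[None, :]"]) == "[:, :]"
assert merge_brackets(["[None, :]"]) == "[None, :]"
assert merge_brackets([""]) == ""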
