
Commit 22edfe6

Covered the bool mask cases

1 parent fa53986

File tree: 4 files changed (+110, -1 lines)

py/torch_tensorrt/dynamo/conversion/impl/select.py

Lines changed: 3 additions & 1 deletion

@@ -759,7 +759,9 @@ def index_put_converter(
     ) + list(values.shape)
     broadcast_shape = []
     for exp_dim, val_dim in zip(expected_shape, values_shape_padded):
-        if val_dim == 1 or exp_dim == val_dim:
+        if val_dim == DYNAMIC_DIM or exp_dim == DYNAMIC_DIM:
+            broadcast_shape.append(-1)
+        elif val_dim == 1 or exp_dim == val_dim:
             broadcast_shape.append(exp_dim)
         else:
             raise ValueError(
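The converter change above propagates dynamic dimensions through the broadcast-shape computation instead of erroring on them. A minimal standalone sketch of the rule, assuming DYNAMIC_DIM is the -1 sentinel torch_tensorrt uses for dynamic dimensions (the helper name below is invented for illustration):

DYNAMIC_DIM = -1  # assumption: torch_tensorrt's sentinel for a dynamic dimension

def compute_broadcast_shape(expected_shape, values_shape_padded):
    # Mirrors the converter logic: dynamic dims stay dynamic (-1), size-1 dims
    # broadcast, equal dims pass through, anything else is an error.
    broadcast_shape = []
    for exp_dim, val_dim in zip(expected_shape, values_shape_padded):
        if val_dim == DYNAMIC_DIM or exp_dim == DYNAMIC_DIM:
            broadcast_shape.append(-1)
        elif val_dim == 1 or exp_dim == val_dim:
            broadcast_shape.append(exp_dim)
        else:
            raise ValueError(f"cannot broadcast value dim {val_dim} to expected dim {exp_dim}")
    return broadcast_shape

print(compute_broadcast_shape([-1, 10], [1, 10]))  # [-1, 10]: the dynamic dim propagates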

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,7 @@
 from .complex_graph_rewrite import complex_graph_detection
 from .constant_folding import constant_fold
 from .fuse_prims_broadcast import fuse_prims_broadcast
+from .index_put_replace_bool_with_indices import index_put_replace_bool_with_indices
 from .pass_manager import DynamoPassManager
 from .remove_assert_nodes import remove_assert_nodes
 from .remove_detach import remove_detach
@@ -22,6 +23,7 @@
     repair_input_as_output,
     fuse_prims_broadcast,
     replace_max_pool_with_indices,
+    index_put_replace_bool_with_indices,
     remove_assert_nodes,
     remove_num_users_is_0_nodes,
     complex_graph_detection,
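The registration above only appends the new pass to the ordered lowering-pass list. As a rough illustration of the pattern (not torch_tensorrt's actual DynamoPassManager implementation), applying such a list amounts to threading the GraphModule through each pass in order:

import torch.fx as fx
from torch_tensorrt.dynamo._settings import CompilationSettings

def apply_lowering_passes(gm: fx.GraphModule, settings: CompilationSettings, passes) -> fx.GraphModule:
    # Each lowering pass has the signature (GraphModule, CompilationSettings) -> GraphModule
    # and may rewrite the graph in place before returning it.
    for lowering_pass in passes:
        gm = lowering_pass(gm, settings)
    return gm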
py/torch_tensorrt/dynamo/lowering/passes/index_put_replace_bool_with_indices.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+import logging
+import operator
+
+import torch
+import torch.fx as fx
+from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    clean_up_graph_after_modifications,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _bool_tensor_to_long_indices(
+    graph: fx.Graph, mask_node: fx.Node, before: fx.Node
+) -> fx.Node:
+
+    with graph.inserting_before(before):
+        nz_tuple = graph.call_function(
+            torch.nonzero, args=(mask_node,), kwargs={"as_tuple": True}
+        )
+        idx = graph.call_function(operator.getitem, args=(nz_tuple, 0))
+
+    return idx
+
+
+def index_put_replace_bool_with_indices(
+    gm: fx.GraphModule, settings: CompilationSettings
+) -> fx.GraphModule:
+
+    graph = gm.graph
+    modified_graph = False
+    for node in list(graph.nodes):
+        if node.target != torch.ops.aten.index_put.default:
+            continue
+
+        indices = node.args[1]
+        if isinstance(indices, (list, tuple)):
+            new_elems = []
+            for it in indices:
+                if isinstance(it, fx.Node) and it.meta["val"].dtype == torch.bool:
+                    # bool Tensor → long indices Tensor
+                    idx = _bool_tensor_to_long_indices(graph, it, before=node)
+                    new_elems.append(idx)
+                elif isinstance(it, (list, tuple)) and all(
+                    isinstance(b, bool) for b in it
+                ):
+                    new_elems.append([i for i, b in enumerate(it) if b])
+                else:
+                    new_elems.append(it)
+            new_indices = type(indices)(new_elems)
+            node.args = (node.args[0], new_indices, *node.args[2:])
+        elif isinstance(indices, fx.Node) and indices.meta["val"].dtype == torch.bool:
+            idx = _bool_tensor_to_long_indices(graph, indices, before=node)
+            node.args = (node.args[0], idx, *node.args[2:])
+        modified_graph = True
+
+    if modified_graph:
+        gm = clean_up_graph_after_modifications(gm)
+
+    return gm
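The pass rewrites boolean-mask index arguments of aten.index_put into integer index tensors via nonzero, presumably so the existing index_put converter only has to handle integer indices. The rewrite relies on the standard PyTorch equivalence below; this eager-mode check is not part of the commit, just a sanity sketch of the semantics for a 1-D mask:

import torch

src_mask = torch.ones(5, 10)
src_idx = src_mask.clone()
mask = torch.tensor([False, False, True, False, True])
values = torch.zeros(2, 10)

src_mask[mask] = values                             # bool-mask form of index_put
src_idx[mask.nonzero(as_tuple=True)[0]] = values    # long-index form produced by the pass

assert torch.equal(src_mask, src_idx)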

tests/py/dynamo/conversion/test_index_put_aten.py

Lines changed: 44 additions & 0 deletions

@@ -328,6 +328,50 @@ def forward(self, x, y, z, a, b):
         result = trt_mod(*inputs)
         assert torch.allclose(result, torch_output, atol=1e-4, rtol=1e-4)

+    def test_bool_mask_test(self):
+
+        source_tensor = torch.ones([5, 10], dtype=torch.float32).cuda()
+        indices_tensor = torch.tensor([False, False, True, False, True])
+        # indices_tensor = torch.tensor([3,4])
+        value_tensor = torch.zeros([2, 10], dtype=torch.float32).cuda()
+
+        dim1 = torch.export.Dim("dim1", min=1, max=5)
+        dim2 = torch.export.Dim("dim2", min=1, max=5)
+
+        # source_tensor=torch.zeros([5, 5], dtype=torch.int32).cuda()
+        # indices_tensor=(torch.tensor([0, 0], dtype=torch.int32).cuda(), torch.tensor([1, 1], dtype=torch.int32).cuda())
+        # value_tensor=torch.tensor([1, 2], dtype=torch.int32).cuda()
+        # accumulate=False
+
+        class TestIndexPut(torch.nn.Module):
+            def forward(self, source_tensor, indices_tensor, value_tensor):
+                # indices_tensor = torch.where(indices_tensor)[0]
+                source_tensor[indices_tensor] = value_tensor
+                return source_tensor
+
+        model = TestIndexPut()
+        torch_output = model.forward(source_tensor, indices_tensor, value_tensor)
+
+        ep = torch.export.export(
+            model,
+            (source_tensor, indices_tensor, value_tensor),
+            dynamic_shapes=({0: dim1}, {0: dim1}, {0: dim2}),
+        )
+        with torchtrt.dynamo.Debugger(log_level="debug"):
+            trt_engine = torchtrt.dynamo.compile(
+                ep,
+                inputs=(source_tensor, indices_tensor, value_tensor),
+                enabled_precisions={torch.float32},
+                min_block_size=1,
+                use_explicit_typing=False,
+                use_fp32_acc=False,
+                disable_tf32=True,
+                use_python_runtime=True,
+            )
+        result = trt_engine(source_tensor, indices_tensor, value_tensor)
+
+        assert torch.allclose(result, torch_output, atol=1e-4, rtol=1e-4)
+

 if __name__ == "__main__":
     run_tests()
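To exercise just this case locally (assuming a working CUDA + TensorRT development install of torch_tensorrt), a pytest name filter along these lines should select it:

pytest tests/py/dynamo/conversion/test_index_put_aten.py -k bool_mask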
