
Commit e7cad5b

Added decorator and tests
1 parent b9fe0c1 commit e7cad5b

4 files changed (+120, -13 lines)

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 4 additions & 0 deletions

@@ -618,6 +618,10 @@ def compile(
             "'arg_inputs' and 'inputs' should not be used at the same time."
         )
 
+    assert (
+        cpu_memory_budget >= 2 * 1024 * 1024 * 1024
+    ), "CPU memory budget must be at least 2GB"
+
     arg_inputs = inputs or arg_inputs
 
     if kwarg_inputs is None:
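For context, a minimal sketch of how a caller would hit this new check, assuming cpu_memory_budget is a byte-count keyword argument of torch_tensorrt.dynamo.compile (as the assertion above implies); the toy model and shapes are placeholders, not part of the commit:

# Sketch only: exercising the new cpu_memory_budget validation in dynamo compile.
import torch
import torch_tensorrt as torchtrt

model = torch.nn.Sequential(torch.nn.Conv2d(3, 16, 3), torch.nn.ReLU()).eval().cuda()
inputs = [torch.randn(1, 3, 224, 224).cuda()]
exp_program = torch.export.export(model, tuple(inputs))

trt_module = torchtrt.dynamo.compile(
    exp_program,
    arg_inputs=inputs,
    cpu_memory_budget=2 * 1024 * 1024 * 1024,  # 2 GiB: the smallest value the new assert allows
)
# Any budget below 2 GiB would trip the assertion before compilation starts.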

py/torch_tensorrt/dynamo/partitioning/_resource_partitioner.py

Lines changed: 1 addition & 1 deletion

@@ -211,7 +211,7 @@ def calculate_size_budget(
             int: Budget in bytes for a single accelerated subgraph.
         """
 
-        used_rss: int = psutil.virtual_memory().used
+        used_rss: int = psutil.Process().memory_info().rss
         available_rss = self.cpu_memory_budget - used_rss
        return available_rss // engine_compilation_memory_usage_multiplier
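The one-line change above swaps a system-wide memory figure for the current process's resident set size. A small illustrative sketch of the difference (only the two psutil calls come from the diff; the budget value is hypothetical):

# Illustration of the two psutil measurements compared above.
# virtual_memory().used counts memory held by every process on the host,
# while Process().memory_info().rss counts only the current process, so the
# remaining budget is no longer shrunk by unrelated workloads on the machine.
import psutil

cpu_memory_budget = 2 * 1024 * 1024 * 1024            # hypothetical 2 GiB budget
system_used = psutil.virtual_memory().used            # old basis: whole machine
process_rss = psutil.Process().memory_info().rss      # new basis: this process only

print(f"system-wide used : {system_used >> 20} MiB")
print(f"this process RSS : {process_rss >> 20} MiB")
print(f"budget remaining : {(cpu_memory_budget - process_rss) >> 20} MiB")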

py/torch_tensorrt/dynamo/partitioning/fusion_patterns.py

Lines changed: 22 additions & 12 deletions

@@ -1,11 +1,25 @@
 from functools import lru_cache
-from typing import Dict, List, Set
+from typing import Callable, Dict, List, Set
 
 import torch
 from torch.fx.passes.utils.matcher_utils import SubgraphMatcher
 from torch.ops import aten
 
+ATOMIC_SUBGRAPHS = []
 
+
+def register_atomic_subgraph(
+    is_aten: bool = False,
+) -> Callable[[torch.nn.Module], torch.nn.Module]:
+
+    def decorator(subgraph: torch.nn.Module) -> torch.nn.Module:
+        ATOMIC_SUBGRAPHS.append((subgraph, is_aten))
+        return subgraph
+
+    return decorator
+
+
+@register_atomic_subgraph(is_aten=True)
 class ConvBNReLU(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -46,6 +60,7 @@ def forward(
         return x
 
 
+@register_atomic_subgraph(is_aten=True)
 class ConvReLU(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -77,6 +92,7 @@ def forward(
         return x
 
 
+@register_atomic_subgraph(is_aten=True)
 class ConvGelu(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -108,6 +124,7 @@ def forward(
         return x
 
 
+@register_atomic_subgraph(is_aten=True)
 class ConvSilu(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -122,6 +139,7 @@ def forward(
         return x
 
 
+@register_atomic_subgraph(is_aten=True)
 class MulAdd(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -134,6 +152,7 @@ def forward(
         return x
 
 
+@register_atomic_subgraph(is_aten=True)
 class MulMul(torch.nn.Module):  # type: ignore[misc]
     def __init__(self) -> None:
         super().__init__()

@@ -146,16 +165,6 @@ def forward(
         return x
 
 
-All_FUSION_PATTERNS = [
-    ConvBNReLU,
-    ConvReLU,
-    ConvGelu,
-    ConvSilu,
-    MulAdd,
-    MulMul,
-]
-
-
 @lru_cache(maxsize=None)
 def get_node_in_fusion_pattern(
     graph: torch.fx.Graph,

@@ -166,8 +175,9 @@ def get_node_in_fusion_pattern(
         Value: the list of nodes that should be fused together
     """
     fusion_nodes = {}
-    for pattern in All_FUSION_PATTERNS:
+    for pattern, is_aten in ATOMIC_SUBGRAPHS:
         pattern_graph = torch.fx.symbolic_trace(pattern())
+        # TODO: Add decomposition and lowering if is_aten is False
         subgraph_matcher = SubgraphMatcher(pattern_graph.graph)
         match_result = subgraph_matcher.match(graph)
         for match in match_result:
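The decorator replaces the hard-coded All_FUSION_PATTERNS list, so patterns self-register at import time and the matcher loop above picks them up automatically. A hypothetical downstream registration, following the same shape as the built-in patterns (ConvAdd and its forward signature are illustrative, not part of this commit):

# Hypothetical use of the new registration hook: any module decorated with
# register_atomic_subgraph is appended to ATOMIC_SUBGRAPHS and will be matched
# by get_node_in_fusion_pattern alongside the built-in patterns.
import torch
from torch.ops import aten

from torch_tensorrt.dynamo.partitioning.fusion_patterns import register_atomic_subgraph


@register_atomic_subgraph(is_aten=True)
class ConvAdd(torch.nn.Module):  # illustrative pattern, not in the commit
    def forward(
        self,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        other: torch.Tensor,
    ) -> torch.Tensor:
        # aten::convolution(input, weight, bias, stride, padding, dilation,
        #                   transposed, output_padding, groups)
        x = aten.convolution.default(
            x, weight, bias, [1, 1], [0, 0], [1, 1], False, [0, 0], 1
        )
        return aten.add.Tensor(x, other)

Because registration happens at decoration time, merely importing the module that defines the pattern is enough for it to be treated as an atomic subgraph.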
New file

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+from typing import Any
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_tensorrt as torchtrt
+from torch.testing._internal.common_utils import TestCase, run_tests
+from torch_tensorrt.dynamo import partitioning
+from torch_tensorrt.dynamo.conversion import CompilationSettings
+from torch_tensorrt.dynamo.lowering import (
+    get_decompositions,
+    post_lowering,
+    pre_export_lowering,
+)
+from torch_tensorrt.dynamo.lowering.passes import post_lowering, pre_export_lowering
+from torch_tensorrt.dynamo.partitioning._resource_partitioner import resource_partition
+
+
+class TestResourcePartitioning(TestCase):
+    def test_resource_partitioning(self):
+        class net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = nn.Conv2d(1024, 4096, 3, padding=1)
+                self.bn1 = nn.BatchNorm2d(4096)
+                self.conv2 = nn.Conv2d(4096, 1024, 3, padding=1)
+                self.bn2 = nn.BatchNorm2d(1024)
+                self.fc1 = nn.Linear(1024 * 56 * 56, 10)
+
+            def forward(self, x):
+                x = self.conv1(x)
+                x = self.bn1(x)
+                x = F.relu(x)
+                x = F.max_pool2d(x, (2, 2))
+                x = self.conv2(x)
+                x = self.bn2(x)
+                x = F.relu(x)
+                x = F.max_pool2d(x, (2, 2))
+                x = torch.flatten(x, 1)
+                return self.fc1(x)
+
+        model = net().eval()
+        model.to("cuda")
+        inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")]
+
+        enabled_precisions = {torch.float}
+        use_python_runtime = False
+
+        exp_program = torch.export.export(model, tuple(inputs))
+
+        compilation_options = {
+            "use_python_runtime": use_python_runtime,
+            "enabled_precisions": enabled_precisions,
+            "min_block_size": 1,
+            "immutable_weights": True,
+            "reuse_cached_engines": False,
+        }
+        settings = CompilationSettings(**compilation_options)
+        with torchtrt.dynamo.Debugger(
+            log_level="debug",
+            logging_dir="/home/profile/logging/moe",
+            engine_builder_monitor=False,
+        ):
+
+            exported_program = pre_export_lowering(exp_program, settings)
+            exported_program = exported_program.run_decompositions(
+                get_decompositions(False)
+            )
+
+            gm = exported_program.module()
+            gm = post_lowering(gm, settings)
+
+            partitioned_module, supported_ops = partitioning.fast_partition(
+                gm,
+                min_block_size=settings.min_block_size,
+                torch_executed_ops=settings.torch_executed_ops,
+                require_full_compilation=settings.require_full_compilation,
+                skip_fusion=True,
+            )
+
+            partitioned_module = resource_partition(
+                gm, partitioned_module, cpu_memory_budget=2 * 1024 * 1024 * 1024  # 2GB
+            )
+
+            self.assertEqual(
+                len(list[Any](partitioned_module.named_children())),
+                2,
+                "The graph should have 2 subgraphs",
+            )
+
+
+if __name__ == "__main__":
+    run_tests()

0 commit comments
