Added example and fixed lru problem

cehongwang · cehongwang · commit 3b02ef9bc589 · 2025-11-14T20:17:10.000Z
diff --git a/examples/dynamo/low_cpu_memory_compilation.py b/examples/dynamo/low_cpu_memory_compilation.py
@@ -0,0 +1,84 @@
+"""
+
+.. _low_cpu_memory_compilation:
+
+Low CPU Memory Compilation Example
+==================================
+
+This example demonstrates compiling a model with a bounded CPU (host) memory
+budget using Torch-TensorRT Dynamo. Limiting host RAM use is helpful on
+memory-constrained machines or when compiling very large models.
+
+Key notes:
+- The toy model below has roughly 430 MB of parameters. We set the CPU
+  memory budget to 2 GiB. At compile time, only about 900 MB of host RAM
+  may remain available. We expect at most 430 * 4 = 1720 MB of host memory
+  to be used by the model, so it is partitioned into two subgraphs to fit the memory budget.
+
+- Performance impact varies by model. When the number of TensorRT engines
+  created is small, the impact is typically minimal.
+
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_tensorrt as torchtrt
+from torch_tensorrt.dynamo.conversion import CompilationSettings
+
+
+class net(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Intentionally large layers to stress host memory during compilation.
+        self.conv1 = nn.Conv2d(1024, 4096, 3, padding=1)
+        self.bn1 = nn.BatchNorm2d(4096)
+        self.conv2 = nn.Conv2d(4096, 1024, 3, padding=1)
+        self.bn2 = nn.BatchNorm2d(1024)
+        self.fc1 = nn.Linear(1024 * 56 * 56, 10)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, (2, 2))
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, (2, 2))
+        x = torch.flatten(x, 1)
+        return self.fc1(x)
+
+
+model = net().eval()
+model.to("cuda")
+inputs = [torch.randn((1, 1024, 224, 224)).to("cuda")]
+
+enabled_precisions = {torch.float}
+use_python_runtime = False
+
+compilation_options = {
+    "use_python_runtime": use_python_runtime,
+    "enabled_precisions": enabled_precisions,
+    "min_block_size": 1,
+    "immutable_weights": True,
+    "reuse_cached_engines": False,
+    "cpu_memory_budget": 2 * 1024 * 1024 * 1024,  # 2 GiB in bytes
+}
+
+settings = CompilationSettings(**compilation_options)
+with torchtrt.dynamo.Debugger(
+    log_level="debug",
+    logging_dir="/home/profile/logging/moe",
+    engine_builder_monitor=False,
+):
+
+    exp_program = torch.export.export(model, tuple(inputs))
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program,
+        inputs=inputs,
+        **compilation_options,
+    )
+
+    # Expect two back-to-back TensorRT engines due to partitioning under the memory budget.
+    print(trt_gm)
diff --git a/py/torch_tensorrt/dynamo/partitioning/_atomic_subgraphs.py b/py/torch_tensorrt/dynamo/partitioning/_atomic_subgraphs.py
@@ -165,7 +165,6 @@ def forward(
         return x
 
 
-@lru_cache(maxsize=None)
 def get_node_in_fusion_pattern(
     graph: torch.fx.Graph,
 ) -> Dict[torch.fx.Node, Set[torch.fx.Node]]:
@@ -175,10 +174,8 @@ def get_node_in_fusion_pattern(
     Value: the list of nodes that should be fused together
     """
     fusion_nodes = {}
-    for pattern, is_aten in ATOMIC_SUBGRAPHS:
-        pattern_graph = torch.fx.symbolic_trace(pattern())
-        # TODO: Add decomposition and lowering if is_aten is False
-        subgraph_matcher = SubgraphMatcher(pattern_graph.graph)
+    for compiled_pattern_graph in get_compiled_atomic_subgraphs():
+        subgraph_matcher = SubgraphMatcher(compiled_pattern_graph.graph)
         match_result = subgraph_matcher.match(graph)
         for match in match_result:
             fusion_group = {
@@ -193,3 +190,21 @@ def get_node_in_fusion_pattern(
                 fusion_nodes[node] = fusion_group
 
     return fusion_nodes
+
+
+@lru_cache(maxsize=None)
+def get_compiled_atomic_subgraphs() -> List[torch.fx.GraphModule]:
+    """
+    Symbolically trace every pattern in ATOMIC_SUBGRAPHS into a GraphModule.
+    The result is LRU-cached to avoid re-tracing the same patterns on every call.
+    """
+    compiled_atomic_subgraphs = []
+    for pattern, is_aten in ATOMIC_SUBGRAPHS:
+        pattern_graph = torch.fx.symbolic_trace(pattern())
+        if not is_aten:
+            # TODO: Add decomposition and lowering if is_aten is False
+            raise NotImplementedError(
+                "Atomic subgraphs are not supported for non-aten subgraphs yet."
+            )
+        compiled_atomic_subgraphs.append(pattern_graph)
+    return compiled_atomic_subgraphs