
Commit d663c7e

Support trace_me and xp.Trace in assume_pure (#9311)
1 parent 6e8e7db commit d663c7e

5 files changed, +61 -12 lines changed

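For context, this change lets profiler annotations created inside an `assume_pure`-wrapped function show up in captured device profiles. A minimal usage sketch modeled on the new test below; the `assume_pure` import path and the `/tmp/profile` output directory are assumptions for illustration, not part of this commit:

```python
# Minimal sketch based on the new test in test_assume_pure.py.
# Assumption: assume_pure is importable from torch_xla.experimental.assume_pure.
import torch
import torch_xla
import torch_xla.debug.profiler as xp
from torch_xla.experimental.assume_pure import assume_pure

@assume_pure
def matmul(a, b):
  # The trace annotation now survives the JAX-based lowering that assume_pure
  # performs, so 'my_matmul' shows up in the captured device profile.
  with xp.Trace('my_matmul'):
    return torch.matmul(a, b)

a = torch.randn(3, 3, device='xla')
b = torch.randn(3, 3, device='xla')

xp.start_trace('/tmp/profile')   # writes *.xplane.pb files under this directory
matmul(a, b)
torch_xla.sync(wait=True)
xp.stop_trace()
```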

test/test_assume_pure.py

Lines changed: 39 additions & 1 deletion
@@ -1,4 +1,6 @@
 from copy import deepcopy
+import glob
+import os
 from absl.testing import absltest
 from absl import flags
 import time
@@ -369,7 +371,7 @@ def original_func(a, b):
     self.assertIsNone(a_pure.grad)
     self.assertIsNone(b_pure.grad)
 
-  def test_composibility_with_call_jax(self):
+  def test_composability_with_call_jax(self):
 
     def jax_func(a, b):
       return jnp.dot(a, b)
@@ -407,6 +409,42 @@ def f(a, b):
         msg="Forward outputs do not match",
         check_device=False)
 
+  def test_assume_pure_profile(self):
+    """Test that xp.Trace works inside assume_pure."""
+    import torch_xla.debug.profiler as xp
+
+    # Arrange
+    MAGIC_STRING = 'foobar123'
+
+    @assume_pure
+    def torch_func(a, b):
+      with xp.Trace(MAGIC_STRING):
+        return torch.matmul(a, b)
+
+    # Precompile it such that it won't be traced again on CPU.
+    # This way we exclusively test the device-side profiles.
+    a = torch.randn(3, 3, device='xla')
+    b = torch.randn(3, 3, device='xla')
+    _ = torch_func(a, b)
+
+    # Act
+    tempdir = self.create_tempdir().full_path
+    xp.start_trace(tempdir)
+    _ = torch_func(a, b)
+    torch_xla.sync(wait=True)
+    xp.stop_trace()
+
+    # Assert
+    files = glob.glob(
+        os.path.join(tempdir, '**', '*.xplane.pb'), recursive=True)
+    self.assertEqual(len(files), 1)
+
+    path = files[0]
+    with open(path, 'rb') as f:
+      proto_str = str(f.read())
+    self.assertTrue(MAGIC_STRING in proto_str,
+                    f'Expected "{MAGIC_STRING}" trace in: {path}')
+
 
 FLAGS = flags.FLAGS
 flags.DEFINE_integer(

test/test_pallas.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,6 @@
 from torch.ao.quantization.utils import determine_qparams
 
 import torch_xla
-import torch_xla.core.xla_model as xm
 from torch_xla import runtime as xr
 from torch_xla._internal import tpu
 
@@ -26,6 +25,7 @@
 def with_jax_high_precision(func):
 
   def wrapper(*args, **kwargs):
+    import jax
     jax.config.update('jax_default_matmul_precision', "highest")
     try:
       result = func(*args, **kwargs)
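
The `import jax` move above defers the jax import until a decorated test actually runs, so importing the test module no longer requires jax. A generic sketch of the same deferred-import pattern; the `requires_jax` name is hypothetical and not part of this repository:

```python
# Hypothetical sketch of the deferred-import pattern; `requires_jax` is an
# illustrative name only.
import functools

def requires_jax(func):

  @functools.wraps(func)
  def wrapper(*args, **kwargs):
    import jax  # imported lazily, so merely importing this module does not need jax
    jax.config.update('jax_default_matmul_precision', 'highest')
    return func(*args, **kwargs)

  return wrapper
```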

test/test_profiler.py

Lines changed: 13 additions & 4 deletions
@@ -5,7 +5,7 @@
 import os
 import sys
 import tempfile
-import time
+import signal
 import unittest
 
 import args_parse
@@ -23,7 +23,7 @@ def train_worker(port, training_started):
       batch_size=16,
       momentum=0.5,
       lr=0.01,
-      num_epochs=10)
+      num_epochs=100)
   flags.fake_data = True
   flags.profiler_port = port
 
@@ -86,17 +86,26 @@ def test_trace_and_metrics(self):
     training_started = context.Event()
     p = context.Process(
         target=train_worker, args=(port, training_started), daemon=True)
+
+    # Wait for training to start.
     p.start()
-    training_started.wait(60)
+    training_started.wait(600)
 
+    # Take a profile.
     logdir = tempfile.mkdtemp()
     xp.trace(
         f'localhost:{port}',
         logdir,
         duration_ms=5000,
         num_tracing_attempts=5,
         delay_ms=1000)
-    p.terminate()
+    pid = p.pid
+    assert pid is not None, 'Process ID should not be None'
+    # Gracefully interrupt the process.
+    os.kill(pid, signal.SIGINT)
+    p.join()
+
+    # Validate the profiling output.
     path = self._check_xspace_pb_exist(logdir)
     self._check_trace_namespace_exists(path)
     self._check_metrics_warnings_exist(self.fname)
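
Replacing `p.terminate()` with a SIGINT plus `p.join()` lets the training worker unwind normally: the signal surfaces as `KeyboardInterrupt` in the child, so teardown code can run before the process exits. A standalone sketch of the pattern under that assumption, independent of this test:

```python
# Standalone sketch: interrupt a multiprocessing worker with SIGINT and wait
# for it to exit, instead of killing it mid-step with terminate().
import multiprocessing as mp
import os
import signal
import time

def worker():
  try:
    while True:
      time.sleep(0.1)  # stand-in for a training loop
  except KeyboardInterrupt:
    pass  # SIGINT arrives here, so cleanup/teardown code can still run

if __name__ == '__main__':
  p = mp.Process(target=worker, daemon=True)
  p.start()
  time.sleep(0.5)
  os.kill(p.pid, signal.SIGINT)  # graceful interrupt instead of p.terminate()
  p.join()
```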

torch_xla/debug/profiler.py

Lines changed: 7 additions & 5 deletions
@@ -107,13 +107,9 @@ class Trace(torch_xla._XLAC.profiler.TraceMe):
 
   The traces generated can then be collected using the above profiling APIs.
   The profiling server first needs to be started up and then can be sampled
-  either using Tensorboard profiler plugin
-  (https://github.com/tensorflow/profiler) or the
+  either using xprof (https://github.com/openxla/xprof) or the
   :func:`~torch_xla.debug.profiler.trace` method.
 
-  Note: currently only supports PyTorch/XLA client side trace events. i.e.,
-  the namespace won't group TPU worker side trace.
-
   Example usage:
   ```python
   server = xp.start_server(9012)
@@ -132,7 +128,13 @@ def __enter__(self):
     self.scope = torch_xla._XLAC.profiler.scope_pusher(self.name)
     super().__enter__()
 
+    # Also enter the JAX named scope, to support torchax lowering.
+    import jax
+    self._jax_scope = jax.named_scope(self.name)
+    self._jax_scope.__enter__()
+
   def __exit__(self, type, value, traceback):
+    self._jax_scope.__exit__(type, value, traceback)
     if getattr(self, 'scope', None):
       del self.scope
     super().__exit__(type, value, traceback)
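
This `__enter__`/`__exit__` change is what makes the new `assume_pure` test pass: `xp.Trace` now pushes a `jax.named_scope` alongside the existing client-side scope, so the annotation survives JAX-based lowering (torchax / `assume_pure`). A rough plain-JAX illustration of the effect; the `scoped_sin` function is hypothetical, and the scope name is expected, not guaranteed, to appear in the lowered text:

```python
# Rough illustration (plain JAX, hypothetical function name): jax.named_scope
# attaches the name to every op traced inside it, which is what xp.Trace now
# does in addition to the XLA client-side TraceMe/scope push.
import jax
import jax.numpy as jnp

def scoped_sin(x):
  with jax.named_scope('my_region'):
    return jnp.sin(x)

lowered = jax.jit(scoped_sin).lower(jnp.ones((3,)))
# 'my_region' should show up in the op metadata / locations of the lowered program.
print(lowered.as_text())
```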

torch_xla/experimental/custom_kernel.py

Lines changed: 1 addition & 1 deletion
@@ -868,7 +868,7 @@ def flash_attention(
       sm_scale, ab, partition_spec, mesh)
 
 
-# This function should only be called and excuted on runtime.
+# This function should only be called and executed on runtime.
 def _ragged_paged_attention_runtime_check(
     q,  # [max_num_batched_tokens, num_q_heads, head_dim]
     kv_pages,  # [total_num_pages, page_size, num_combined_kv_heads, head_dim]
