
Commit b8546e1

Compare microbatch forward outputs and gradients
stack-info: PR: #246, branch: xmfan/stack/20
1 parent d17cb4c commit b8546e1

File tree

5 files changed: +174 −32 lines


autoparallel/api.py

Lines changed: 15 additions & 2 deletions
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import copy
+import functools
 import itertools
 import warnings
 from contextlib import ExitStack, contextmanager
@@ -42,7 +43,11 @@
 )
 from .init_weights import hook_params_setters
 from .optimize_sharding import ShardingOptimizer
-from .utils import _get_device_from_mesh
+from .utils import (
+    NumericsLogger,
+    _get_device_from_mesh,
+    debug_boxed_nop_preserve_node_meta,
+)
 
 _APPLY_VIEW_MM_VIEW_PATTERN = False
 
@@ -212,6 +217,7 @@ def __init__(
         reshard_after_forward: bool = True,
         dynamic: bool = False,
         loss_fn: Optional[Callable] = None,
+        numerics_logger: NumericsLogger | None = None,
         **kwargs,
     ):
         self.stack = ExitStack()
@@ -239,7 +245,14 @@ def __init__(
         self.model = move_to_fake(model, self.fake_mode, device)
         self.input_fn = input_fn
         self.mesh = mesh
-        self.compiler_fn = compile_fx_inner if compile else boxed_nop_preserve_node_meta
+        if compile:
+            self.compiler_fn = compile_fx_inner
+        elif numerics_logger:
+            self.compiler_fn = functools.partial(
+                debug_boxed_nop_preserve_node_meta, numerics_logger=numerics_logger
+            )
+        else:
+            self.compiler_fn = boxed_nop_preserve_node_meta  # type: ignore[assignment]
         self.enable_ac = enable_ac
         self.ac_stage_size_in_GiB = ac_stage_size_in_GiB
         self.reshard_after_forward = reshard_after_forward

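The new numerics_logger argument only affects the non-compiled path: when compile=False and a logger is supplied, tracing runs through debug_boxed_nop_preserve_node_meta instead of the plain boxed nop. A minimal usage sketch follows; the import paths and the log directory value are assumptions for illustration, not part of this diff.

    # Sketch only; assumes AutoParallel and NumericsLogger are importable as shown.
    from autoparallel.api import AutoParallel
    from autoparallel.utils import NumericsLogger

    numerics_logger = NumericsLogger("/tmp/numerics_logs")  # hypothetical log directory
    with AutoParallel(
        model,
        input_fn,
        mesh,
        dynamic=True,
        numerics_logger=numerics_logger,  # routes compilation through the DebugInterpreter-backed compiler_fn
    ) as autop:
        autop.add_parameter_memory_constraint(low=None, high=None)
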
autoparallel/graph_pp_runner.py

Lines changed: 3 additions & 0 deletions
@@ -214,6 +214,7 @@ def stage_forward(
     action: _Action,
     ctx: _PipelineContext,
     numerics_logs: Optional[list[str]] = None,
+    forward_hook: Callable | None = None,
 ) -> None:
     schedule, stage_index_to_stage, stage = _get_stage_from_action(action, ctx)
     stage_index = stage.stage_index
@@ -279,6 +280,8 @@ def stage_forward(
     stage.output_chunks.append(output)
     if ctx.target_mbs is not None:
         ctx.schedule_ref._internal_losses.append(output)
+    if forward_hook:
+        forward_hook(stage, action, output)
 
     stage.fwd_cache[mb_index] = (output_tuple, saved_intermediates)  # type: ignore[assignment]

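stage_forward now accepts an optional forward_hook invoked as forward_hook(stage, action, output) after each microbatch's forward output is recorded. A sketch of wiring a custom hook through the schedule's custom-function registry, mirroring the example_ds3_pp.py change below; the hook body here is illustrative and assumes schedule, FORWARD, and stage_forward are in scope as in that example.

    import functools

    def print_last_stage_output(stage, action, output):
        # runs once per microbatch, right after the stage produces its output
        if stage.is_last:
            print(f"mb{action.microbatch_index} fwd out norm: {output.norm()}")

    schedule.register_custom_function(
        FORWARD,
        functools.partial(
            stage_forward, numerics_logs=None, forward_hook=print_last_stage_output
        ),
    )
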
autoparallel/utils.py

Lines changed: 54 additions & 4 deletions
@@ -341,7 +341,7 @@ def log(self, node: str, args: Iterable[Any], inputs_or_outputs: str):
                 continue
 
             self._logs.append(
-                f"{node=}, {inputs_or_outputs}[{i}]={torch.hash_tensor(arg)}"
+                f"{node=}, {inputs_or_outputs}[{i}]={torch.hash_tensor(arg)} nan={torch.any(torch.isnan(arg))}"
             )
 
     def run_node(self, n: torch.fx.Node) -> Any:
@@ -429,13 +429,27 @@ def log_model_weights(self, parallel_mod):
 
         print(f"Weight hashes written to {path}")
 
-    def log_pp_model_weights(self, orig_mod, stage_mods, num_world_stages, ranks):
+    def log_fw_intermediates(self, logs):
+        rank = torch.distributed.get_rank()
+        path = self.dir / f"rank_{rank}_fw_intermediates.log"
+        with open(path, "a") as f:
+            f.write("\n".join(logs) + "\n")
+
+    def log_diff(self, t, rank=0, prefix="?"):
+        if self.rank == rank:
+            path = self.dir / "diff.log"
+            if isinstance(t, torch.distributed.tensor.DTensor):
+                t = t.to_local()
+            with open(path, "a") as f:
+                f.write(f"[{prefix}] hash={hash_tensor(t)}, norm={torch.norm(t)}\n")
+
+    def log_pp_model_weights(self, orig_mod, stage_mods, num_world_stages, should_log):
         path = self.dir / "pp_weights.log"
 
         torch.distributed.barrier()
         # First print the params of every stage
         for i in range(num_world_stages):
-            if self.rank in ranks and i in stage_mods:
+            if should_log and i in stage_mods:
                 param_logs = []
                 real_params = dict(stage_mods[i].named_parameters())
                 for name, _ in orig_mod.named_parameters():
@@ -449,7 +463,7 @@ def log_pp_model_weights(self, orig_mod, stage_mods, num_world_stages, ranks):
 
         # Then print the buffers of every stage
         for i in range(num_world_stages):
-            if self.rank in ranks and i in stage_mods:
+            if should_log and i in stage_mods:
                 buffer_logs = []
                 real_buffers = dict(stage_mods[i].named_buffers())
                 for name, _ in orig_mod.named_buffers():
@@ -463,3 +477,39 @@ def log_pp_model_weights(self, orig_mod, stage_mods, num_world_stages, ranks):
 
         if self.rank == 0:
             print(f"Weight hashes written to {path}")
+
+    def log_pp_grads(self, orig_mod, stage_mods, num_world_stages, should_log):
+        path = self.dir / "diff.log"
+
+        for i in range(num_world_stages):
+            if should_log and i in stage_mods:
+                grad_logs = []
+                real_params = dict(stage_mods[i].named_parameters())
+                for name, _ in orig_mod.named_parameters():
+                    if name not in real_params:
+                        continue
+                    grad = real_params[name].grad
+                    if grad is None:
+                        grad_logs.append(f"[grad {name}] None")
+                    else:
+                        grad = grad.to_local()
+                        grad_logs.append(
+                            f"[grad {name}] hash={hash_tensor(grad)}, norm={torch.norm(grad)}"
+                        )
+                with open(path, "a") as f:
+                    f.write("\n".join(grad_logs) + "\n")
+            torch.distributed.barrier()
+
+
+def debug_boxed_nop_preserve_node_meta(fx_g, example_inputs, numerics_logger):
+    def run(args):
+        with torch.fx.traceback.preserve_node_meta():
+            interp = DebugInterpreter(fx_g)
+            out = interp.boxed_run(args)
+            mylogs = interp.get_logs()
+            if numerics_logger:
+                numerics_logger.log_fw_intermediates(mylogs)
+            return out
+
+    run._boxed_call = True
+    return run

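log_diff appends lines of the form "[<prefix>] hash=..., norm=..." to diff.log on a single designated rank, so the local_map run and the pipeline-parallel run can be compared textually. A hypothetical comparison helper, not part of this commit; it assumes both runs wrote their diff.log entries with identical prefixes in the same order, and the paths are placeholders.

    def compare_diff_logs(path_a: str, path_b: str) -> None:
        # Print any line where the two diff.log files disagree.
        with open(path_a) as fa, open(path_b) as fb:
            for lineno, (a, b) in enumerate(zip(fa, fb), start=1):
                if a != b:
                    print(f"line {lineno} differs:\n  {a.rstrip()}\n  {b.rstrip()}")

    compare_diff_logs("run_local_map/diff.log", "run_pp/diff.log")  # hypothetical paths
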
examples/example_ds3_local_map.py

Lines changed: 37 additions & 19 deletions
@@ -118,7 +118,8 @@ def run_test(fake_evaluate: bool, rng_seed: Optional[int], logs_dir: str):
         mscale=0.70,
     )
 
-    bs = 4 * mesh.shape[0] * mesh.shape[1]
+    local_batch_size = 2
+    global_batch_size = local_batch_size * mesh.shape[0] * mesh.shape[1]
     device = torch.device(f"cuda:{local_rank}")
 
     # parallelize the model
@@ -129,11 +130,16 @@ def input_fn():
         return torch.randint(
             0,
             config.vocab_size,
-            (bs, seq_len),
+            (global_batch_size, seq_len),
             device=device,
         )
 
-    with AutoParallel(model, input_fn, mesh, dynamic=True) as autop:
+    numerics_logger = None
+    if rng_seed is not None:
+        numerics_logger = NumericsLogger(logs_dir)
+    with AutoParallel(
+        model, input_fn, mesh, dynamic=True, numerics_logger=None
+    ) as autop:
         autop.add_parameter_memory_constraint(low=None, high=None)
 
         # x_sharding = (Shard(0), Replicate())
@@ -153,17 +159,22 @@ def input_fn():
     # ) # maybe not correct value
     parallel_mod.init_weights(buffer_device=device, seed=rng_seed)
     if rng_seed is not None:
-        numerics_logger = NumericsLogger(logs_dir)
         numerics_logger.log_model_weights(parallel_mod)
-
-    x = (
-        torch.randint(
-            0,
-            config.vocab_size,
-            (bs // mesh.shape[0] // mesh.shape[1], seq_len),
-            device=device,
-        ),
+        torch.manual_seed(rng_seed)
+
+    n_microbatches = 16
+    full_batch = torch.randint(
+        0,
+        config.vocab_size,
+        (local_batch_size * n_microbatches, seq_len),
+        device=device,
     )
+    microbatches = torch.split(full_batch, local_batch_size, dim=0)
+    assert len(microbatches) == n_microbatches
+    if rng_seed:
+        numerics_logger.log_diff(
+            full_batch.to(torch.float32), prefix="full batch input"
+        )
 
     # Symbolically evaluate in case you want to test running a graph bigger than your gpu
     if fake_evaluate:
@@ -173,15 +184,22 @@ def input_fn():
         allow_non_fake_inputs=True,
         shape_env=shape_env,
     ):
-        # # now let's run it
-        out = parallel_mod(*x)
-        out.backward(torch.randn_like(out))
+        # now let's run it
+        for x in microbatches:
+            out = parallel_mod(x)
+            out.backward(torch.ones_like(out))
     else:
-        out = parallel_mod(*x)
-        assert not torch.any(torch.isnan(out)), "Found NaNs in forward output"
+        for i, x in enumerate(microbatches):
+            assert x.shape[0] == 2
+            out = parallel_mod(x)
+            assert not torch.any(torch.isnan(out)), "Found NaNs in forward output"
+            out.backward(torch.ones_like(out))
+            if rng_seed is not None:
+                numerics_logger.log_diff(out, prefix=f"mb{i} fwd out")
 
         if rng_seed is not None:
-            numerics_logger.log_forward_output(out)
-            out.backward(torch.randn_like(out))
+            for k, v in parallel_mod.named_parameters():
+                numerics_logger.log_diff(v.grad, prefix=f"grad {k}")
 
     print("All good!")

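The example now feeds the 16 microbatches one at a time and lets autograd accumulate into .grad, using ones_like instead of randn_like so the backward signal is deterministic across runs. The accumulation pattern in isolation, as a self-contained sketch with a toy model; the shapes and the Linear module are placeholders, not from this diff.

    import torch

    local_batch_size, n_microbatches = 2, 16
    model = torch.nn.Linear(8, 8)
    full_batch = torch.randn(local_batch_size * n_microbatches, 8)

    microbatches = torch.split(full_batch, local_batch_size, dim=0)
    assert len(microbatches) == n_microbatches
    for x in microbatches:
        out = model(x)
        out.backward(torch.ones_like(out))  # grads sum into p.grad across microbatches

    for name, p in model.named_parameters():
        print(name, p.grad.norm())
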
examples/example_ds3_pp.py

Lines changed: 65 additions & 7 deletions
@@ -90,6 +90,7 @@ def build_pipeline_schedule(
     local_batch_size: int,
     pipeline_parallel_degree: int,
     backward_requires_autograd: bool = False,
+    scale_grads: bool = True,
 ) -> _PipelineSchedule:
     """Builds a pipeline schedule for the given configuration and stages."""
     schedule_class = get_schedule_class(pipeline_parallel_schedule)
@@ -115,6 +116,7 @@
         n_microbatches=n_microbatches,
         loss_fn=loss_fn,
         backward_requires_autograd=backward_requires_autograd,
+        scale_grads=scale_grads,
     )
     logger.info(
         f"Using pipeline schedule {pipeline_parallel_schedule} "
@@ -192,9 +194,9 @@ def run_test(
     # This is the spmd mesh to be used for tracing
     mesh = world_mesh[("dp_mod_ep", "ep")]
 
-    global_batch_size = 32 * dp_degree
     # Batch size that will be supplied to the schedule and will be broken down into microbatches
-    local_batch_size = global_batch_size // dp_degree
+    local_batch_size = 32
+    # global_batch_size = local_batch_size * dp_degree
     n_microbatches = 16
     # Batch size with which the spmd graphs will actually be executed
     microbatch_size = local_batch_size // n_microbatches
@@ -379,6 +381,31 @@ def last_stage_inp_with_loss_fn():
     for stages in pp_rank_to_stage_indices.values():
         assert len(stages) * pp_degree == len(virtual_pp_stages)
     stage_indices_current_pp_rank = pp_rank_to_stage_indices[pp_rank]
+    if rng_seed:
+        # Compute the ranks to log from
+        # 1. for fw_outs, log from coord [pp_rank_containing_last_stage, 0, 0]
+        last_stage_idx = total_pp_stages - 1
+        pp_rank_containing_last_stage = None
+        for pp_rank_, stage_indices in pp_rank_to_stage_indices.items():
+            if last_stage_idx in stage_indices:
+                assert pp_rank_containing_last_stage is None
+                pp_rank_containing_last_stage = pp_rank_
+
+        log_fw_out_rank_coordinate = []
+        for mesh_dim_name in world_mesh.mesh_dim_names:
+            if mesh_dim_name == "pp":
+                log_fw_out_rank_coordinate.append(pp_rank_containing_last_stage)
+            else:
+                log_fw_out_rank_coordinate.append(0)
+        should_log_fw_outs = world_mesh.get_coordinate() == log_fw_out_rank_coordinate
+
+        # 2. for weights, log from coords [:, 0, 0]
+        pp_world_size = world_mesh.shape[world_mesh._get_mesh_dim_by_name("pp")]
+        log_weights_rank_coordinates = [(i, 0, 0) for i in range(pp_world_size)]
+        should_log_weights = (
+            tuple(world_mesh.get_coordinate()) in log_weights_rank_coordinates
+        )
+
     stage_mods: dict[int, torch.nn.Module] = {}
     stage_graphs: dict[int, GraphCallables] = {}
     stage_graph_metas: dict[int, GraphMeta] = {}
@@ -504,10 +531,6 @@ def last_stage_inp_with_loss_fn():
 
     world_size = torch.distributed.get_world_size()
     num_world_stages = world_size * len(stage_mods)
-    if rng_seed is not None:
-        NumericsLogger(logs_dir).log_pp_model_weights(
-            model, stage_mods, num_world_stages, ranks=[0, 4]
-        )
 
     stages = []
     # Step 4. Construct pipeline stages for this pp_rank using the stage modules, graphs and metadata
@@ -532,6 +555,7 @@ def last_stage_inp_with_loss_fn():
             group=world_mesh.get_group("pp"),
         )
         stages.append(stage)
+
     # Step 5. Construct the pipeline runner using the pipeline stages for this pp_rank
     schedule = build_pipeline_schedule(
         stages=stages,
@@ -541,11 +565,37 @@ def last_stage_inp_with_loss_fn():
         local_batch_size=local_batch_size,
         pipeline_parallel_degree=pp_degree,
         backward_requires_autograd=False,
+        scale_grads=rng_seed is None,  # In determinism mode, don't scale grads
     )
     assert isinstance(schedule, _PipelineScheduleRuntime)
+
+    if rng_seed is not None:
+        numerics_logger = NumericsLogger(logs_dir)
+        numerics_logger.log_pp_model_weights(
+            model, stage_mods, num_world_stages, should_log=should_log_weights
+        )
+        torch.manual_seed(rng_seed)
+
+    def last_stage_forward_hook(
+        stage: GraphPipelineStage, action: str, output: torch.Tensor
+    ):
+        if not stage.is_last or rng_seed is None:
+            # hook is only for numerics mode
+            return
+
+        if should_log_fw_outs:
+            numerics_logger.log_diff(
+                output,
+                rank=torch.distributed.get_rank(),
+                prefix=f"mb{action.microbatch_index} fwd out",
+            )
+
     # Step 6. Override the pipeline runner's action implementations
     schedule.register_custom_function(
-        FORWARD, functools.partial(stage_forward, numerics_logs=None)
+        FORWARD,
+        functools.partial(
+            stage_forward, numerics_logs=None, forward_hook=last_stage_forward_hook
+        ),
     )
     schedule.register_custom_function(FULL_BACKWARD, stage_full_backward)
     schedule.register_custom_function(REDUCE_GRAD, stage_reduce_grad)
@@ -576,6 +626,10 @@ def last_stage_inp_with_loss_fn():
         )
         if pp_rank == 0:
             x = runtime_input_fn_first_stage()
+            if rng_seed:
+                numerics_logger.log_diff(
+                    x.to(torch.float32), prefix="full batch input"
+                )
             graph_pp_runner.step(
                 x, target=target, losses=losses, return_outputs=False
             )
@@ -590,6 +644,10 @@ def last_stage_inp_with_loss_fn():
             payload_fn=lambda: f"losses: {losses}",
         )
 
+    numerics_logger.log_pp_grads(
+        model, stage_mods, num_world_stages, should_log=should_log_weights
+    )
+
     print("All good!")
 
     if torch.distributed.is_initialized():

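In the pipeline-parallel example, forward outputs exist only on the rank that owns the last stage, while weights and grads are logged from one rank per pp rank, so the two cases use different logging coordinates. The selection logic in isolation, as a sketch assuming a 3-D mesh ordered ["pp", "dp_mod_ep", "ep"] and a placeholder stage mapping; none of the literal values below come from this diff. Disabling scale_grads when rng_seed is set is presumably so the accumulated grads can be compared directly against the unscaled per-microbatch accumulation in examples/example_ds3_local_map.py.

    # Placeholder: two pp ranks, interleaved stages 0..3.
    pp_rank_to_stage_indices = {0: [0, 2], 1: [1, 3]}
    total_pp_stages = 4

    last_stage_idx = total_pp_stages - 1
    pp_rank_containing_last_stage = next(
        r for r, idxs in pp_rank_to_stage_indices.items() if last_stage_idx in idxs
    )
    # fwd outputs: only the rank at coordinate [pp_rank_containing_last_stage, 0, 0] logs.
    log_fw_out_rank_coordinate = [pp_rank_containing_last_stage, 0, 0]
    # weights/grads: the ranks at coordinates [i, 0, 0] for every pp rank i log.
    log_weights_rank_coordinates = [(i, 0, 0) for i in range(len(pp_rank_to_stage_indices))]
    print(log_fw_out_rank_coordinate, log_weights_rank_coordinates)
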