
Commit 36903f2

Grad scaling parity between both pp and non-pp
1 parent: 7c45448

File tree: 2 files changed (+7 −5 lines)


autoparallel/graph_pp_runner.py

Lines changed: 6 additions & 5 deletions
@@ -190,12 +190,12 @@ def _accumulate_stage_grads(
 ) -> None:
     assert len(unsharded_grads) == len(grads_to_accumulate)
     assert not all(grad is None for grad in grads_to_accumulate), "All grads are None"
-    for unsharded_grad, grad_to_accumulate in zip(unsharded_grads, grads_to_accumulate):
-        if grad_to_accumulate is not None:
-            if unsharded_grad is None:
-                unsharded_grad = grad_to_accumulate
+    for i in range(len(unsharded_grads)):
+        if grads_to_accumulate[i] is not None:
+            if unsharded_grads[i] is None:
+                unsharded_grads[i] = grads_to_accumulate[i]
             else:
-                unsharded_grad += grad_to_accumulate
+                unsharded_grads[i] += grads_to_accumulate[i]


 def _run_forward_microbatch(
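
The switch from iterating with zip() to indexing is what makes accumulation actually land in unsharded_grads: rebinding the loop variable (unsharded_grad = grad_to_accumulate) never writes back into the list, so accumulator slots that start as None were silently left unfilled. A minimal sketch of the difference, using plain Python floats rather than the runner's tensors, with hypothetical helper names:

def accumulate_rebinding(acc, incoming):
    # Buggy pattern: `a = g` only rebinds the local name `a`;
    # a None slot in `acc` is never replaced.
    for a, g in zip(acc, incoming):
        if g is not None:
            if a is None:
                a = g       # lost: `acc` still holds None
            else:
                a += g      # only works for objects that mutate in place

def accumulate_indexed(acc, incoming):
    # Fixed pattern: indexing writes the result back into the list.
    for i in range(len(acc)):
        if incoming[i] is not None:
            if acc[i] is None:
                acc[i] = incoming[i]
            else:
                acc[i] += incoming[i]

acc1, acc2 = [None, 1.0], [None, 1.0]
grads = [2.0, 3.0]
accumulate_rebinding(acc1, grads)
accumulate_indexed(acc2, grads)
print(acc1)  # [None, 1.0] -- the None slot was never filled
print(acc2)  # [2.0, 4.0]  -- both slots updated

With tensors, the += branch already mutated the stored tensor in place, so only the None-slot case was affected; the indexed form handles both branches uniformly.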
@@ -374,6 +374,7 @@ def stage_full_backward(
     # next stage
     # TODO(sanketpurandare)
     # HACK till we have loss function, we populate the tangents here manually
+    assert len(stage_output) == 1
     bwd_kwargs = {
         "stage_output": loss,
         "tangents": [torch.ones_like(stage_output[0])],

examples/example_ds3_pp.py

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ def build_pipeline_schedule(
         n_microbatches=n_microbatches,
         loss_fn=loss_fn,
         backward_requires_autograd=backward_requires_autograd,
+        scale_grads=False,
     )
     logger.info(
         f"Using pipeline schedule {pipeline_parallel_schedule} "
