From 90f7c0369d625a634cdfb8e7d2ca5dff50b28d94 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Wed, 10 Sep 2025 16:03:31 +0800
Subject: [PATCH 1/7] Enhance logger metrics retrieval to conditionally include progress bar metrics based on the presence of a progress bar callback.

Signed-off-by: Wil Kong
---
 .../trainer/connectors/logger_connector/logger_connector.py | 4 +++-
 .../pytorch/trainer/connectors/logger_connector/result.py   | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
index f6e8885ee050a..9db1a6a595943 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
@@ -234,7 +234,9 @@ def metrics(self) -> _METRICS:
         """This function returns either batch or epoch metrics."""
         on_step = self._first_loop_iter is not None
         assert self.trainer._results is not None
-        return self.trainer._results.metrics(on_step)
+        # Only include progress bar metrics if a progress bar callback is present
+        include_pbar_metrics = self.trainer.progress_bar_callback is not None
+        return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics)

     @property
     def callback_metrics(self) -> _OUT_DICT:
diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
index 1a4aa7c401960..a0efc9ff22a20 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
@@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
             forked_name += dataloader_suffix
         return name, forked_name

-    def metrics(self, on_step: bool) -> _METRICS:
+    def metrics(self, on_step: bool, *, include_pbar_metrics: bool = True) -> _METRICS:
         metrics = _METRICS(callback={}, log={}, pbar={})

         for _, result_metric in self.valid_items():
@@ -489,7 +489,7 @@ def metrics(self, on_step: bool) -> _METRICS:
                 metrics["callback"][forked_name] = value

             # populate progress_bar metrics. convert tensors to numbers
-            if result_metric.meta.prog_bar:
+            if result_metric.meta.prog_bar and include_pbar_metrics:
                 metrics["pbar"][forked_name] = convert_tensors_to_scalars(value)

         return metrics

From 4223f697ce998b52d06f55fac674763d47d3e5f9 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Wed, 10 Sep 2025 18:37:45 +0800
Subject: [PATCH 2/7] Refactor tensor handling in __to_tensor method to optimize device management for scalar metrics.

Signed-off-by: Wil Kong
---
 src/lightning/pytorch/core/module.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
index 85f631ee40f75..b87773c43f916 100644
--- a/src/lightning/pytorch/core/module.py
+++ b/src/lightning/pytorch/core/module.py
@@ -656,11 +656,20 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
         raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")

     def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
-        value = (
-            value.clone().detach()
-            if isinstance(value, Tensor)
-            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
-        )
+        if isinstance(value, Tensor):
+            # Keep tensor on its original device to avoid unnecessary transfers
+            value = value.clone().detach()
+        else:
+            if self.device.type == "cuda":
+                # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
+                # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
+                # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
+                device = "cpu"
+            else:
+                # For non-CUDA devices, maintain original behavior
+                device = self.device
+            value = torch.tensor(value, device=device, dtype=_get_default_dtype())
+
         if not torch.numel(value) == 1:
             raise ValueError(
                 f"`self.log({name}, {value})` was called, but the tensor must have a single element."

From daac9561a51dda1e2b241fc3bc84a2c29e0a5ca6 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Sun, 21 Sep 2025 14:33:14 +0800
Subject: [PATCH 3/7] Fix issue reported by pre-commit.

Signed-off-by: Wil Kong
---
 src/lightning/pytorch/core/module.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
index b87773c43f916..bd7b272f9f947 100644
--- a/src/lightning/pytorch/core/module.py
+++ b/src/lightning/pytorch/core/module.py
@@ -660,14 +660,11 @@ def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor
             # Keep tensor on its original device to avoid unnecessary transfers
             value = value.clone().detach()
         else:
-            if self.device.type == "cuda":
-                # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
-                # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
-                # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
-                device = "cpu"
-            else:
-                # For non-CUDA devices, maintain original behavior
-                device = self.device
+            # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
+            # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
+            # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
+            # For non-CUDA devices, maintain original behavior
+            device = "cpu" if self.device.type == "cuda" else self.device
             value = torch.tensor(value, device=device, dtype=_get_default_dtype())

         if not torch.numel(value) == 1:

From 7c1a4b979fb401b1a4b2dce222bac820caef1af1 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Mon, 22 Sep 2025 17:43:41 +0800
Subject: [PATCH 4/7] Update src/lightning/pytorch/trainer/connectors/logger_connector/result.py

Co-authored-by: Nicki Skafte Detlefsen
---
 .../pytorch/trainer/connectors/logger_connector/result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
index a0efc9ff22a20..8298d280c98fc 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
@@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
             forked_name += dataloader_suffix
         return name, forked_name

-    def metrics(self, on_step: bool, *, include_pbar_metrics: bool = True) -> _METRICS:
+    def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS:
         metrics = _METRICS(callback={}, log={}, pbar={})

         for _, result_metric in self.valid_items():

From 293233cacdfa9ad02a536c79a3fc5561d96daa05 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Mon, 22 Sep 2025 21:14:05 +0800
Subject: [PATCH 5/7] Add unittest for metric sync-free changes.

Signed-off-by: Wil Kong
---
 .../core/test_lightning_module.py             |  50 +++++++++
 .../trainer/logging_/test_logger_connector.py | 104 ++++++++++++++++++
 2 files changed, 154 insertions(+)

diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py
index c33488a4f2626..c724567ab7ba8 100644
--- a/tests/tests_pytorch/core/test_lightning_module.py
+++ b/tests/tests_pytorch/core/test_lightning_module.py
@@ -594,3 +594,53 @@ def __init__(self):
         fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
     else:
         fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)
+
+
+@RunIf(min_cuda_gpus=1)
+def test_log_no_cuda_sync():
+    """Test logging scalars and tensors doesn't introduce CUDA sync."""
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.to("cuda")
+
+        def training_step(self, batch, batch_idx):
+            # Create tensors before enabling sync debug mode to avoid sync
+            cuda_tensor = torch.tensor(0.7, device=self.device)
+            cpu_tensor = torch.tensor(1.0, device="cpu")
+
+            # Enable sync debug mode to catch any synchronization
+            torch.cuda.set_sync_debug_mode("error")
+            try:
+                # Test scalar value (should be placed on CPU to avoid sync)
+                self.log("scalar_loss", 0.5)
+
+                # Test CUDA tensor (should stay on original device)
+                self.log("cuda_tensor", cuda_tensor)
+
+                # Test CPU tensor (should stay on CPU)
+                self.log("cpu_tensor", cpu_tensor)
+
+            except RuntimeError as e:
+                if "called a synchronizing CUDA operation" in str(e):
+                    msg = f"Unexpected CUDA synchronization: {e}"
+                    pytest.fail(msg)
+                else:
+                    raise
+            finally:
+                torch.cuda.set_sync_debug_mode("default")
+
+            return super().training_step(batch, batch_idx)
+
+    model = TestModel()
+    trainer = Trainer(
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        accelerator="gpu",
+        devices=1,
+        enable_progress_bar=False,
+        enable_checkpointing=False,
+    )
+    trainer.fit(model)
diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
index d3d355edb003b..315d23ae657ca 100644
--- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
+++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -660,3 +660,107 @@ def test_result_collection_changes_device():
     # same device as the new tensor
     results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
     assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device
+
+
+@RunIf(min_cuda_gpus=1)
+def test_logger_connector_no_sync_without_progress_bar():
+    """Test logger connector doesn't sync when no progress bar."""
+    from lightning.pytorch import Trainer
+    from lightning.pytorch.demos.boring_classes import BoringModel
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.to("cuda")
+
+        def training_step(self, batch, batch_idx):
+            # Log some metrics with progress bar enabled
+            loss = super().training_step(batch, batch_idx)["loss"]
+
+            # Enable sync debug mode to catch any synchronization
+            torch.cuda.set_sync_debug_mode("error")
+            try:
+                # These logs have prog_bar=True but should not sync
+                # when progress bar callback is not present
+                self.log("train_loss", loss, prog_bar=True)
+                self.log("train_acc", 0.95, prog_bar=True)
+
+            except RuntimeError as e:
+                if "called a synchronizing CUDA operation" in str(e):
+                    msg = f"Unexpected CUDA synchronization: {e}"
+                    pytest.fail(msg)
+                else:
+                    raise
+            finally:
+                torch.cuda.set_sync_debug_mode("default")
+
+            return loss
+
+    model = TestModel()
+    trainer = Trainer(
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        accelerator="gpu",
+        devices=1,
+        enable_progress_bar=False,  # Key - no progress bar callback
+        enable_checkpointing=False,
+    )
+    trainer.fit(model)
+
+
+def test_result_collection_metrics_include_pbar_parameter():
+    """Test metrics method handles include_pbar_metrics parameter."""
+    from lightning.pytorch.trainer.connectors.logger_connector.result import (
+        _ResultCollection,
+    )
+
+    results = _ResultCollection(training=True)
+
+    # Log some metrics with different prog_bar settings
+    results.log(
+        "training_step",
+        "regular_metric",
+        torch.tensor(1.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=False,
+    )
+    results.log(
+        "training_step",
+        "pbar_metric",
+        torch.tensor(2.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=True,
+    )
+    results.log(
+        "training_step",
+        "both_metric",
+        torch.tensor(3.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=True,
+        logger=True,
+    )
+
+    # Test with include_pbar_metrics=True (default behavior)
+    metrics_with_pbar = results.metrics(
+        on_step=True, include_pbar_metrics=True
+    )
+    assert "pbar_metric" in metrics_with_pbar["pbar"]
+    assert "both_metric" in metrics_with_pbar["pbar"]
+    assert "both_metric" in metrics_with_pbar["log"]
+
+    # Test with include_pbar_metrics=False (optimization)
+    metrics_without_pbar = results.metrics(
+        on_step=True, include_pbar_metrics=False
+    )
+    # No progress bar metrics should be included
+    assert len(metrics_without_pbar["pbar"]) == 0
+    # Logger metrics should still be included
+    assert "both_metric" in metrics_without_pbar["log"]
+
+    # Verify callback metrics are not affected
+    assert "regular_metric" in metrics_with_pbar["callback"]
+    assert "regular_metric" in metrics_without_pbar["callback"]

From bfdfad360f08c8f390da1e5e4f136c3a911c4401 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:14:43 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../trainer/logging_/test_logger_connector.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
index 315d23ae657ca..4af5812e441f5 100644
--- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
+++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -745,17 +745,13 @@ def test_result_collection_metrics_include_pbar_parameter():
     )

     # Test with include_pbar_metrics=True (default behavior)
-    metrics_with_pbar = results.metrics(
-        on_step=True, include_pbar_metrics=True
-    )
+    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
     assert "pbar_metric" in metrics_with_pbar["pbar"]
     assert "both_metric" in metrics_with_pbar["pbar"]
     assert "both_metric" in metrics_with_pbar["log"]

     # Test with include_pbar_metrics=False (optimization)
-    metrics_without_pbar = results.metrics(
-        on_step=True, include_pbar_metrics=False
-    )
+    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
     # No progress bar metrics should be included
     assert len(metrics_without_pbar["pbar"]) == 0
     # Logger metrics should still be included
     assert "both_metric" in metrics_without_pbar["log"]

From 1d2eced57affb674d08ec7ca8c99e2cee4388b89 Mon Sep 17 00:00:00 2001
From: Nicki Skafte Detlefsen
Date: Wed, 1 Oct 2025 07:08:39 +0200
Subject: [PATCH 7/7] changelog

---
 src/lightning/pytorch/CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 863a3a4a7e939..b332fefa38ddd 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -52,6 +52,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Fixed preventing recursive symlink creation iwhen `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))

+- Fixed redundant host-device sync in progressbar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233))
+
 ---

 ## [2.5.5] - 2025-09-05
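
Reviewer note on [PATCH 1/7]: a minimal sketch, written for this note and not part of the patch series, of the synchronization that skipping the progress bar metrics avoids. It assumes a CUDA GPU, reuses the torch.cuda.set_sync_debug_mode facility from the tests above, and the variable names are made up. Building the "pbar" dictionary converts every logged tensor to a Python scalar, and calling .item() on a CUDA tensor blocks the host until the GPU stream has drained, which is wasted work when no progress bar callback will ever display the value.

    import torch

    assert torch.cuda.is_available()

    # A GPU-resident metric, e.g. a loss that was logged with prog_bar=True.
    metric = torch.rand(1, device="cuda")

    torch.cuda.set_sync_debug_mode("warn")  # report synchronizing CUDA calls instead of raising
    try:
        # Converting the tensor to a Python scalar forces a host-device sync;
        # this is the step that metrics(..., include_pbar_metrics=False) skips
        # when the Trainer has no progress bar callback.
        scalar = metric.item()
    finally:
        torch.cuda.set_sync_debug_mode("default")

    print(scalar)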
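Reviewer note on [PATCH 2/7] and [PATCH 3/7]: a similarly hedged sketch of the __to_tensor reasoning, again not part of the series. Creating a CUDA tensor from a Python scalar copies host memory onto the device, which the patch comment identifies as synchronizing, while keeping scalar metrics on the CPU (and merely cloning tensors on whatever device they already occupy) stays off that path. Exactly which calls the debug mode reports can differ between PyTorch versions, so treat this as illustrative rather than as the library's guaranteed behavior.

    import torch

    assert torch.cuda.is_available()
    device = torch.device("cuda")

    torch.cuda.set_sync_debug_mode("warn")  # report synchronizing CUDA calls instead of raising
    try:
        # Pre-patch path for self.log("acc", 0.95) on a CUDA module: the Python
        # scalar is copied host-to-device when the tensor is materialized.
        on_gpu = torch.tensor(0.95, device=device)

        # Post-patch path: scalar inputs stay on the CPU, where the metric is consumed.
        on_cpu = torch.tensor(0.95, device="cpu")

        # Tensors that already live on the GPU are only cloned/detached in place.
        already_there = torch.rand(1, device=device).clone().detach()
    finally:
        torch.cuda.set_sync_debug_mode("default")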