From 90f7c0369d625a634cdfb8e7d2ca5dff50b28d94 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Wed, 10 Sep 2025 16:03:31 +0800
Subject: [PATCH 1/7] Enhance logger metrics retrieval to conditionally include progress bar metrics based on the presence of a progress bar callback.

Signed-off-by: Wil Kong
---
 .../trainer/connectors/logger_connector/logger_connector.py | 4 +++-
 .../pytorch/trainer/connectors/logger_connector/result.py   | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
index f6e8885ee050a..9db1a6a595943 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py
@@ -234,7 +234,9 @@ def metrics(self) -> _METRICS:
         """This function returns either batch or epoch metrics."""
         on_step = self._first_loop_iter is not None
         assert self.trainer._results is not None
-        return self.trainer._results.metrics(on_step)
+        # Only include progress bar metrics if a progress bar callback is present
+        include_pbar_metrics = self.trainer.progress_bar_callback is not None
+        return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics)

     @property
     def callback_metrics(self) -> _OUT_DICT:
diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
index 1a4aa7c401960..a0efc9ff22a20 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
@@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
             forked_name += dataloader_suffix
         return name, forked_name

-    def metrics(self, on_step: bool) -> _METRICS:
+    def metrics(self, on_step: bool, *, include_pbar_metrics: bool = True) -> _METRICS:
         metrics = _METRICS(callback={}, log={}, pbar={})

         for _, result_metric in self.valid_items():
@@ -489,7 +489,7 @@ def metrics(self, on_step: bool) -> _METRICS:
                 metrics["callback"][forked_name] = value

             # populate progress_bar metrics. convert tensors to numbers
-            if result_metric.meta.prog_bar:
+            if result_metric.meta.prog_bar and include_pbar_metrics:
                 metrics["pbar"][forked_name] = convert_tensors_to_scalars(value)

         return metrics

From 4223f697ce998b52d06f55fac674763d47d3e5f9 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Wed, 10 Sep 2025 18:37:45 +0800
Subject: [PATCH 2/7] Refactor tensor handling in __to_tensor method to optimize device management for scalar metrics.

Signed-off-by: Wil Kong
---
 src/lightning/pytorch/core/module.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
index 85f631ee40f75..b87773c43f916 100644
--- a/src/lightning/pytorch/core/module.py
+++ b/src/lightning/pytorch/core/module.py
@@ -656,11 +656,20 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
         raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")

     def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
-        value = (
-            value.clone().detach()
-            if isinstance(value, Tensor)
-            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
-        )
+        if isinstance(value, Tensor):
+            # Keep tensor on its original device to avoid unnecessary transfers
+            value = value.clone().detach()
+        else:
+            if self.device.type == "cuda":
+                # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
+                # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
+                # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
+                device = "cpu"
+            else:
+                # For non-CUDA devices, maintain original behavior
+                device = self.device
+            value = torch.tensor(value, device=device, dtype=_get_default_dtype())
+
         if not torch.numel(value) == 1:
             raise ValueError(
                 f"`self.log({name}, {value})` was called, but the tensor must have a single element."

From daac9561a51dda1e2b241fc3bc84a2c29e0a5ca6 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Sun, 21 Sep 2025 14:33:14 +0800
Subject: [PATCH 3/7] Fix issue reported by pre-commit.

Signed-off-by: Wil Kong
---
 src/lightning/pytorch/core/module.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
index b87773c43f916..bd7b272f9f947 100644
--- a/src/lightning/pytorch/core/module.py
+++ b/src/lightning/pytorch/core/module.py
@@ -660,14 +660,11 @@ def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor
             # Keep tensor on its original device to avoid unnecessary transfers
             value = value.clone().detach()
         else:
-            if self.device.type == "cuda":
-                # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
-                # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
-                # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
-                device = "cpu"
-            else:
-                # For non-CUDA devices, maintain original behavior
-                device = self.device
+            # Place scalar metrics on CPU to avoid CPU-GPU transfer and synchronization.
+            # `torch.tensor(value, device="cuda")` contains such synchronization, while the metric
+            # itself is only used on the CPU side. So placing metric on CPU for scalar inputs is more efficient.
+            # For non-CUDA devices, maintain original behavior
+            device = "cpu" if self.device.type == "cuda" else self.device
             value = torch.tensor(value, device=device, dtype=_get_default_dtype())

         if not torch.numel(value) == 1:

From 7c1a4b979fb401b1a4b2dce222bac820caef1af1 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Mon, 22 Sep 2025 17:43:41 +0800
Subject: [PATCH 4/7] Update src/lightning/pytorch/trainer/connectors/logger_connector/result.py

Co-authored-by: Nicki Skafte Detlefsen
---
 .../pytorch/trainer/connectors/logger_connector/result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
index a0efc9ff22a20..8298d280c98fc 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
@@ -468,7 +468,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
             forked_name += dataloader_suffix
         return name, forked_name

-    def metrics(self, on_step: bool, *, include_pbar_metrics: bool = True) -> _METRICS:
+    def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS:
         metrics = _METRICS(callback={}, log={}, pbar={})

         for _, result_metric in self.valid_items():

From 293233cacdfa9ad02a536c79a3fc5561d96daa05 Mon Sep 17 00:00:00 2001
From: Wil Kong
Date: Mon, 22 Sep 2025 21:14:05 +0800
Subject: [PATCH 5/7] Add unittest for metric sync-free changes.

Signed-off-by: Wil Kong
---
 .../core/test_lightning_module.py             |  50 +++++++++
 .../trainer/logging_/test_logger_connector.py | 104 ++++++++++++++++++
 2 files changed, 154 insertions(+)

diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py
index c33488a4f2626..c724567ab7ba8 100644
--- a/tests/tests_pytorch/core/test_lightning_module.py
+++ b/tests/tests_pytorch/core/test_lightning_module.py
@@ -594,3 +594,53 @@ def __init__(self):
         fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
     else:
         fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)
+
+
+@RunIf(min_cuda_gpus=1)
+def test_log_no_cuda_sync():
+    """Test logging scalars and tensors doesn't introduce CUDA sync."""
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.to("cuda")
+
+        def training_step(self, batch, batch_idx):
+            # Create tensors before enabling sync debug mode to avoid sync
+            cuda_tensor = torch.tensor(0.7, device=self.device)
+            cpu_tensor = torch.tensor(1.0, device="cpu")
+
+            # Enable sync debug mode to catch any synchronization
+            torch.cuda.set_sync_debug_mode("error")
+            try:
+                # Test scalar value (should be placed on CPU to avoid sync)
+                self.log("scalar_loss", 0.5)
+
+                # Test CUDA tensor (should stay on original device)
+                self.log("cuda_tensor", cuda_tensor)
+
+                # Test CPU tensor (should stay on CPU)
+                self.log("cpu_tensor", cpu_tensor)
+
+            except RuntimeError as e:
+                if "called a synchronizing CUDA operation" in str(e):
+                    msg = f"Unexpected CUDA synchronization: {e}"
+                    pytest.fail(msg)
+                else:
+                    raise
+            finally:
+                torch.cuda.set_sync_debug_mode("default")
+
+            return super().training_step(batch, batch_idx)
+
+    model = TestModel()
+    trainer = Trainer(
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        accelerator="gpu",
+        devices=1,
+        enable_progress_bar=False,
+        enable_checkpointing=False,
+    )
+    trainer.fit(model)
diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
index d3d355edb003b..315d23ae657ca 100644
--- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
+++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -660,3 +660,107 @@ def test_result_collection_changes_device():
     # same device as the new tensor
     results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
     assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device
+
+
+@RunIf(min_cuda_gpus=1)
+def test_logger_connector_no_sync_without_progress_bar():
+    """Test logger connector doesn't sync when no progress bar."""
+    from lightning.pytorch import Trainer
+    from lightning.pytorch.demos.boring_classes import BoringModel
+
+    class TestModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.to("cuda")
+
+        def training_step(self, batch, batch_idx):
+            # Log some metrics with progress bar enabled
+            loss = super().training_step(batch, batch_idx)["loss"]
+
+            # Enable sync debug mode to catch any synchronization
+            torch.cuda.set_sync_debug_mode("error")
+            try:
+                # These logs have prog_bar=True but should not sync
+                # when progress bar callback is not present
+                self.log("train_loss", loss, prog_bar=True)
+                self.log("train_acc", 0.95, prog_bar=True)
+
+            except RuntimeError as e:
+                if "called a synchronizing CUDA operation" in str(e):
+                    msg = f"Unexpected CUDA synchronization: {e}"
+                    pytest.fail(msg)
+                else:
+                    raise
+            finally:
+                torch.cuda.set_sync_debug_mode("default")
+
+            return loss
+
+    model = TestModel()
+    trainer = Trainer(
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        accelerator="gpu",
+        devices=1,
+        enable_progress_bar=False,  # Key - no progress bar callback
+        enable_checkpointing=False,
+    )
+    trainer.fit(model)
+
+
+def test_result_collection_metrics_include_pbar_parameter():
+    """Test metrics method handles include_pbar_metrics parameter."""
+    from lightning.pytorch.trainer.connectors.logger_connector.result import (
+        _ResultCollection,
+    )
+
+    results = _ResultCollection(training=True)
+
+    # Log some metrics with different prog_bar settings
+    results.log(
+        "training_step",
+        "regular_metric",
+        torch.tensor(1.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=False,
+    )
+    results.log(
+        "training_step",
+        "pbar_metric",
+        torch.tensor(2.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=True,
+    )
+    results.log(
+        "training_step",
+        "both_metric",
+        torch.tensor(3.0),
+        on_step=True,
+        on_epoch=False,
+        prog_bar=True,
+        logger=True,
+    )
+
+    # Test with include_pbar_metrics=True (default behavior)
+    metrics_with_pbar = results.metrics(
+        on_step=True, include_pbar_metrics=True
+    )
+    assert "pbar_metric" in metrics_with_pbar["pbar"]
+    assert "both_metric" in metrics_with_pbar["pbar"]
+    assert "both_metric" in metrics_with_pbar["log"]
+
+    # Test with include_pbar_metrics=False (optimization)
+    metrics_without_pbar = results.metrics(
+        on_step=True, include_pbar_metrics=False
+    )
+    # No progress bar metrics should be included
+    assert len(metrics_without_pbar["pbar"]) == 0
+    # Logger metrics should still be included
+    assert "both_metric" in metrics_without_pbar["log"]
+
+    # Verify callback metrics are not affected
+    assert "regular_metric" in metrics_with_pbar["callback"]
+    assert "regular_metric" in metrics_without_pbar["callback"]

From bfdfad360f08c8f390da1e5e4f136c3a911c4401 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:14:43 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../trainer/logging_/test_logger_connector.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
index 315d23ae657ca..4af5812e441f5 100644
--- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
+++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -745,17 +745,13 @@ def test_result_collection_metrics_include_pbar_parameter():
     )

     # Test with include_pbar_metrics=True (default behavior)
-    metrics_with_pbar = results.metrics(
-        on_step=True, include_pbar_metrics=True
-    )
+    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
     assert "pbar_metric" in metrics_with_pbar["pbar"]
     assert "both_metric" in metrics_with_pbar["pbar"]
     assert "both_metric" in metrics_with_pbar["log"]

     # Test with include_pbar_metrics=False (optimization)
-    metrics_without_pbar = results.metrics(
-        on_step=True, include_pbar_metrics=False
-    )
+    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
     # No progress bar metrics should be included
     assert len(metrics_without_pbar["pbar"]) == 0
     # Logger metrics should still be included
     assert "both_metric" in metrics_without_pbar["log"]

From 1d2eced57affb674d08ec7ca8c99e2cee4388b89 Mon Sep 17 00:00:00 2001
From: Nicki Skafte Detlefsen
Date: Wed, 1 Oct 2025 07:08:39 +0200
Subject: [PATCH 7/7] changelog

---
 src/lightning/pytorch/CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 863a3a4a7e939..b332fefa38ddd 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -52,6 +52,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Fixed preventing recursive symlink creation iwhen `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))

+- Fixed redundant host-device sync in progressbar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233))
+
 ---

 ## [2.5.5] - 2025-09-05
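
Reviewer note on [PATCH 1/7]: a minimal sketch, written for this note and not part of the patch series, of the synchronization that skipping the progress bar metrics avoids. It assumes a CUDA GPU, reuses the torch.cuda.set_sync_debug_mode facility from the tests above, and the variable names are made up. Building the "pbar" dictionary converts every logged tensor to a Python scalar, and calling .item() on a CUDA tensor blocks the host until the GPU stream has drained, which is wasted work when no progress bar callback will ever display the value.

    import torch

    assert torch.cuda.is_available()

    # A GPU-resident metric, e.g. a loss that was logged with prog_bar=True.
    metric = torch.rand(1, device="cuda")

    torch.cuda.set_sync_debug_mode("warn")  # report synchronizing CUDA calls instead of raising
    try:
        # Converting the tensor to a Python scalar forces a host-device sync;
        # this is the step that metrics(..., include_pbar_metrics=False) skips
        # when the Trainer has no progress bar callback.
        scalar = metric.item()
    finally:
        torch.cuda.set_sync_debug_mode("default")

    print(scalar)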
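Reviewer note on [PATCH 2/7] and [PATCH 3/7]: a similarly hedged sketch of the __to_tensor reasoning, again not part of the series. Creating a CUDA tensor from a Python scalar copies host memory onto the device, which the patch comment identifies as synchronizing, while keeping scalar metrics on the CPU (and merely cloning tensors on whatever device they already occupy) stays off that path. Exactly which calls the debug mode reports can differ between PyTorch versions, so treat this as illustrative rather than as the library's guaranteed behavior.

    import torch

    assert torch.cuda.is_available()
    device = torch.device("cuda")

    torch.cuda.set_sync_debug_mode("warn")  # report synchronizing CUDA calls instead of raising
    try:
        # Pre-patch path for self.log("acc", 0.95) on a CUDA module: the Python
        # scalar is copied host-to-device when the tensor is materialized.
        on_gpu = torch.tensor(0.95, device=device)

        # Post-patch path: scalar inputs stay on the CPU, where the metric is consumed.
        on_cpu = torch.tensor(0.95, device="cpu")

        # Tensors that already live on the GPU are only cloned/detached in place.
        already_there = torch.rand(1, device=device).clone().detach()
    finally:
        torch.cuda.set_sync_debug_mode("default")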