diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fa6d145d6..d259014cb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,7 +76,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ([#4654](https://github.com/open-telemetry/opentelemetry-python/pull/4654)). - Fix type checking for built-in metric exporters ([#4820](https://github.com/open-telemetry/opentelemetry-python/pull/4820)) - +- `opentelemetry-exporter-otlp-proto-http`: fix retry logic and error handling for connection failures in trace, metric, and log exporters + ([#4709](https://github.com/open-telemetry/opentelemetry-python/pull/4709)) + ([#4712](https://github.com/open-telemetry/opentelemetry-python/issues/4712)) + ## Version 1.38.0/0.59b0 (2025-10-16) - Add `rstcheck` to pre-commit to stop introducing invalid RST diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index b120a2cca4..de5747f408 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -186,26 +186,42 @@ def export( serialized_data = encode_logs(batch).SerializeToString() deadline_sec = time() + self._timeout for retry_num in range(_MAX_RETRYS): - resp = self._export(serialized_data, deadline_sec - time()) - if resp.ok: - return LogRecordExportResult.SUCCESS # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + try: + resp = self._export(serialized_data, deadline_sec - time()) + if resp.ok: + return LogRecordExportResult.SUCCESS + except requests.exceptions.RequestException as error: + reason = str(error) + retryable = isinstance(error, ConnectionError) + status_code = None + else: + reason = resp.reason + retryable = _is_retryable(resp) + status_code = resp.status_code + + if not retryable: + _logger.error( + "Failed to export logs batch code: %s, reason: %s", + status_code, + reason, + ) + return LogRecordExportResult.FAILURE + if ( - not _is_retryable(resp) - or retry_num + 1 == _MAX_RETRYS + retry_num + 1 == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) or self._shutdown ): _logger.error( - "Failed to export logs batch code: %s, reason: %s", - resp.status_code, - resp.text, + "Failed to export logs batch due to timeout, " + "max retries or shutdown." ) return LogRecordExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting logs batch, retrying in %.2fs.", - resp.reason, + reason, backoff_seconds, ) shutdown = self._shutdown_is_occuring.wait(backoff_seconds) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index c6d657e7ae..12d36b7827 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -231,26 +231,41 @@ def export( serialized_data = encode_metrics(metrics_data).SerializeToString() deadline_sec = time() + self._timeout for retry_num in range(_MAX_RETRYS): - resp = self._export(serialized_data, deadline_sec - time()) - if resp.ok: - return MetricExportResult.SUCCESS # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + try: + resp = self._export(serialized_data, deadline_sec - time()) + if resp.ok: + return MetricExportResult.SUCCESS + except requests.exceptions.RequestException as error: + reason = str(error) + retryable = isinstance(error, ConnectionError) + status_code = None + else: + reason = resp.reason + retryable = _is_retryable(resp) + status_code = resp.status_code + + if not retryable: + _logger.error( + "Failed to export metrics batch code: %s, reason: %s", + status_code, + reason, + ) + return MetricExportResult.FAILURE if ( - not _is_retryable(resp) - or retry_num + 1 == _MAX_RETRYS + retry_num + 1 == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) or self._shutdown ): _logger.error( - "Failed to export metrics batch code: %s, reason: %s", - resp.status_code, - resp.text, + "Failed to export metrics batch due to timeout, " + "max retries or shutdown." ) return MetricExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting metrics batch, retrying in %.2fs.", - resp.reason, + reason, backoff_seconds, ) shutdown = self._shutdown_in_progress.wait(backoff_seconds) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 055e829dab..b247b44660 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -179,26 +179,42 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: serialized_data = encode_spans(spans).SerializePartialToString() deadline_sec = time() + self._timeout for retry_num in range(_MAX_RETRYS): - resp = self._export(serialized_data, deadline_sec - time()) - if resp.ok: - return SpanExportResult.SUCCESS # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + try: + resp = self._export(serialized_data, deadline_sec - time()) + if resp.ok: + return SpanExportResult.SUCCESS + except requests.exceptions.RequestException as error: + reason = str(error) + retryable = isinstance(error, ConnectionError) + status_code = None + else: + reason = resp.reason + retryable = _is_retryable(resp) + status_code = resp.status_code + + if not retryable: + _logger.error( + "Failed to export span batch code: %s, reason: %s", + status_code, + reason, + ) + return SpanExportResult.FAILURE + if ( - not _is_retryable(resp) - or retry_num + 1 == _MAX_RETRYS + retry_num + 1 == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) or self._shutdown ): _logger.error( - "Failed to export span batch code: %s, reason: %s", - resp.status_code, - resp.text, + "Failed to export span batch due to timeout, " + "max retries or shutdown." ) return SpanExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting span batch, retrying in %.2fs.", - resp.reason, + reason, backoff_seconds, ) shutdown = self._shutdown_in_progress.wait(backoff_seconds) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index eca1aed5d9..2dbbadccb9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -19,7 +19,9 @@ from unittest import TestCase from unittest.mock import ANY, MagicMock, Mock, patch +import requests from requests import Session +from requests.exceptions import ConnectionError from requests.models import Response from opentelemetry.exporter.otlp.proto.common.metrics_encoder import ( @@ -555,6 +557,40 @@ def test_retry_timeout(self, mock_post): warning.records[0].message, ) + @patch.object(Session, "post") + def test_export_no_collector_available_retryable(self, mock_post): + exporter = OTLPMetricExporter(timeout=1.5) + msg = "Server not available." + mock_post.side_effect = ConnectionError(msg) + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + # Check for greater 2 because the request is on each retry + # done twice at the moment. + self.assertGreater(mock_post.call_count, 2) + self.assertIn( + f"Transient error {msg} encountered while exporting metrics batch, retrying in", + warning.records[0].message, + ) + + @patch.object(Session, "post") + def test_export_no_collector_available(self, mock_post): + exporter = OTLPMetricExporter(timeout=1.5) + + mock_post.side_effect = requests.exceptions.RequestException() + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + self.assertEqual(mock_post.call_count, 1) + self.assertIn( + "Failed to export metrics batch code", + warning.records[0].message, + ) + @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): resp = Response() diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 31e824a980..c86ac1f6ba 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -24,6 +24,7 @@ import requests from google.protobuf.json_format import MessageToDict from requests import Session +from requests.exceptions import ConnectionError from requests.models import Response from opentelemetry._logs import LogRecord, SeverityNumber @@ -483,6 +484,40 @@ def test_retry_timeout(self, mock_post): warning.records[0].message, ) + @patch.object(Session, "post") + def test_export_no_collector_available_retryable(self, mock_post): + exporter = OTLPLogExporter(timeout=1.5) + msg = "Server not available." + mock_post.side_effect = ConnectionError(msg) + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export(self._get_sdk_log_data()), + LogRecordExportResult.FAILURE, + ) + # Check for greater 2 because the request is on each retry + # done twice at the moment. + self.assertGreater(mock_post.call_count, 2) + self.assertIn( + f"Transient error {msg} encountered while exporting logs batch, retrying in", + warning.records[0].message, + ) + + @patch.object(Session, "post") + def test_export_no_collector_available(self, mock_post): + exporter = OTLPLogExporter(timeout=1.5) + + mock_post.side_effect = requests.exceptions.RequestException() + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export(self._get_sdk_log_data()), + LogRecordExportResult.FAILURE, + ) + self.assertEqual(mock_post.call_count, 1) + self.assertIn( + "Failed to export logs batch code", + warning.records[0].message, + ) + @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): resp = Response() diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 10dcb1a9e0..5f61344bbf 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -20,6 +20,7 @@ import requests from requests import Session +from requests.exceptions import ConnectionError from requests.models import Response from opentelemetry.exporter.otlp.proto.http import Compression @@ -303,6 +304,40 @@ def test_retry_timeout(self, mock_post): warning.records[0].message, ) + @patch.object(Session, "post") + def test_export_no_collector_available_retryable(self, mock_post): + exporter = OTLPSpanExporter(timeout=1.5) + msg = "Server not available." + mock_post.side_effect = ConnectionError(msg) + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export([BASIC_SPAN]), + SpanExportResult.FAILURE, + ) + # Check for greater 2 because the request is on each retry + # done twice at the moment. + self.assertGreater(mock_post.call_count, 2) + self.assertIn( + f"Transient error {msg} encountered while exporting span batch, retrying in", + warning.records[0].message, + ) + + @patch.object(Session, "post") + def test_export_no_collector_available(self, mock_post): + exporter = OTLPSpanExporter(timeout=1.5) + + mock_post.side_effect = requests.exceptions.RequestException() + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export([BASIC_SPAN]), + SpanExportResult.FAILURE, + ) + self.assertEqual(mock_post.call_count, 1) + self.assertIn( + "Failed to export span batch code", + warning.records[0].message, + ) + @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): resp = Response()