diff --git a/src/deadline/client/api/_stack_trace_sanitizer.py b/src/deadline/client/api/_stack_trace_sanitizer.py new file mode 100644 index 000000000..c35f38c92 --- /dev/null +++ b/src/deadline/client/api/_stack_trace_sanitizer.py @@ -0,0 +1,76 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +""" +Stack trace sanitizer for Deadline Cloud client telemetry. + +Uses an allowlist approach: only explicitly chosen fields (sanitized filename, +line number, function name, exception type) are emitted. Source code context +and exception messages are intentionally omitted as they could contain +customer data. + +Conforms to ADR 2024-02-19: "No customer content or other information provided +by the customer can be submitted, such as bucket names, file names, or similar." +""" + +import traceback +from typing import FrozenSet, List + +# Packages we control — safe to include relative paths for +_KNOWN_PACKAGES: FrozenSet[str] = frozenset( + { + "deadline", + "openjd", + "boto3", + "botocore", + } +) + + +def _sanitize_path(filepath: str) -> str: + """Replace a full file path with the package-relative portion or bare filename.""" + if filepath.startswith("<"): + return filepath + + parts = filepath.replace("\\", "/").split("/") + + for i, part in enumerate(parts): + stem = part.split(".")[0] + if stem in _KNOWN_PACKAGES: + return "/".join(parts[i:]) + + for i, part in enumerate(parts): + if part == "site-packages" and i + 1 < len(parts): + return "/".join(parts[i + 1 :]) + + return parts[-1] + + +def _sanitize_traceback(te: traceback.TracebackException) -> List[str]: + """Recursively format a TracebackException chain using only allowlisted fields.""" + lines: List[str] = [] + + # Handle chained exceptions (cause or context) + if te.__cause__ is not None: + lines.extend(_sanitize_traceback(te.__cause__)) + lines.append("\nThe above exception was the direct cause of the following exception:\n") + elif te.__context__ is not None and not te.__suppress_context__: + lines.extend(_sanitize_traceback(te.__context__)) + lines.append("\nDuring handling of the above exception, another exception occurred:\n") + + lines.append("Traceback (most recent call last):") + for frame in te.stack: + safe_path = _sanitize_path(frame.filename) + lines.append(f' File "{safe_path}", line {frame.lineno}, in {frame.name}') + # Intentionally omit frame.line — source code context could + # contain credentials, customer data, or other sensitive values + + # Only emit the exception type, not the message + exc_name = te.exc_type.__qualname__ if te.exc_type else "UnknownException" + lines.append(exc_name) + + return lines + + +def sanitize_exception(exc: BaseException) -> str: + """Format and sanitize a live exception using only allowlisted fields.""" + te = traceback.TracebackException.from_exception(exc) + return "\n".join(_sanitize_traceback(te)) diff --git a/src/deadline/client/api/_telemetry.py b/src/deadline/client/api/_telemetry.py index a5f7d91af..05638d893 100644 --- a/src/deadline/client/api/_telemetry.py +++ b/src/deadline/client/api/_telemetry.py @@ -28,6 +28,7 @@ get_boto3_client, get_boto3_session, ) +from ._stack_trace_sanitizer import sanitize_exception from ..config import config_file from .. import version @@ -164,6 +165,27 @@ def initialize(self, config: Optional[ConfigParser] = None) -> None: # Silently swallow any exceptions return + def record_error_with_trace( + self, + exc: BaseException, + exception_scope: str, + extra_details: Optional[dict] = None, + from_gui: bool = False, + ) -> None: + event_details: dict = { + "exception_type": type(exc).__qualname__, + "exception_scope": exception_scope, + "stack_trace": sanitize_exception(exc), + } + if extra_details: + event_details.update(extra_details) + + self.record_event( + event_type="com.amazon.rum.deadline.error", + event_details=event_details, + from_gui=from_gui, + ) + @property def is_initialized(self) -> bool: return self._initialized diff --git a/src/deadline/client/cli/_groups/bundle_group.py b/src/deadline/client/cli/_groups/bundle_group.py index 21080a0b4..fc3fd308f 100644 --- a/src/deadline/client/cli/_groups/bundle_group.py +++ b/src/deadline/client/cli/_groups/bundle_group.py @@ -353,10 +353,7 @@ def _check_create_job_wait_canceled() -> bool: click.echo("Job submission canceled.") sys.exit(1) except Exception as exc: - api.get_deadline_cloud_library_telemetry_client().record_error( - event_details={"exception_scope": "on_submit"}, - exception_type=str(type(exc)), - ) + api.get_deadline_cloud_library_telemetry_client().record_error_with_trace(exc, "on_submit") raise finally: if snapshot_tmpdir: diff --git a/src/deadline/client/ui/dialogs/submit_job_to_deadline_dialog.py b/src/deadline/client/ui/dialogs/submit_job_to_deadline_dialog.py index 2bd2bba9a..4653384a0 100644 --- a/src/deadline/client/ui/dialogs/submit_job_to_deadline_dialog.py +++ b/src/deadline/client/ui/dialogs/submit_job_to_deadline_dialog.py @@ -621,10 +621,8 @@ def on_submit(self): job_progress_dialog.close() except Exception as exc: logger.exception("error submitting job") - api.get_deadline_cloud_library_telemetry_client().record_error( - event_details={"exception_scope": "on_submit"}, - exception_type=str(type(exc)), - from_gui=True, + api.get_deadline_cloud_library_telemetry_client().record_error_with_trace( + exc, "on_submit", from_gui=True ) QMessageBox.critical( self, diff --git a/test/unit/deadline_client/api/test_api_telemetry.py b/test/unit/deadline_client/api/test_api_telemetry.py index ee8e66bbe..75812f88e 100644 --- a/test/unit/deadline_client/api/test_api_telemetry.py +++ b/test/unit/deadline_client/api/test_api_telemetry.py @@ -18,6 +18,10 @@ record_success_fail_telemetry_event, record_function_latency_telemetry_event, ) +from deadline.client.api._stack_trace_sanitizer import ( + _sanitize_path, + sanitize_exception, +) from deadline.job_attachments.progress_tracker import SummaryStatistics @@ -60,6 +64,7 @@ def test_opt_out_config(fresh_deadline_config): client.record_hashing_summary(SummaryStatistics(), from_gui=True) client.record_upload_summary(SummaryStatistics(), from_gui=False) client.record_error({}, str(type(Exception))) + client.record_error_with_trace(RuntimeError("opt-out test"), "test") @pytest.mark.parametrize( @@ -92,6 +97,7 @@ def test_opt_out_env_var(fresh_deadline_config, monkeypatch, env_var_value): client.record_hashing_summary(SummaryStatistics(), from_gui=True) client.record_upload_summary(SummaryStatistics(), from_gui=False) client.record_error({}, str(type(Exception))) + client.record_error_with_trace(RuntimeError("opt-out test"), "test") def test_initialize_failure_then_success(fresh_deadline_config): @@ -282,6 +288,81 @@ def test_record_error(fresh_deadline_config, mock_telemetry_client): queue_mock.put_nowait.assert_called_once_with(expected_event) +def test_record_error_with_trace(fresh_deadline_config, mock_telemetry_client): + """Test that record_error_with_trace sends a TelemetryEvent with sanitized stack trace fields""" + # GIVEN + queue_mock = MagicMock() + mock_telemetry_client.event_queue = queue_mock + + try: + raise ValueError("something broke") + except ValueError as exc: + with patch.object( + mock_telemetry_client, "get_account_id", return_value="111122223333" + ), patch.object(api._telemetry, "get_boto3_session"): + # WHEN + mock_telemetry_client.record_error_with_trace(exc, "test_scope") + + # THEN + queue_mock.put_nowait.assert_called_once() + event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0] + assert event.event_type == "com.amazon.rum.deadline.error" + assert event.event_details["exception_type"] == "ValueError" + assert event.event_details["exception_scope"] == "test_scope" + assert "ValueError" in event.event_details["stack_trace"] + assert "message" not in event.event_details + assert event.event_details["usage_mode"] == "CLI" + assert event.event_details["accountId"] == "111122223333" + + +def test_record_error_with_trace_extra_details(fresh_deadline_config, mock_telemetry_client): + """Test that extra_details are merged into the event""" + # GIVEN + queue_mock = MagicMock() + mock_telemetry_client.event_queue = queue_mock + + try: + raise RuntimeError("fail") + except RuntimeError as exc: + with patch.object( + mock_telemetry_client, "get_account_id", return_value="111122223333" + ), patch.object(api._telemetry, "get_boto3_session"): + # WHEN + mock_telemetry_client.record_error_with_trace( + exc, "cli", extra_details={"command": "bundle submit"} + ) + + # THEN + event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0] + assert event.event_details["command"] == "bundle submit" + assert event.event_details["exception_type"] == "RuntimeError" + + +def test_record_error_with_trace_sanitizes_paths(fresh_deadline_config, mock_telemetry_client): + """Test that customer paths are stripped from the stack trace""" + # GIVEN + queue_mock = MagicMock() + mock_telemetry_client.event_queue = queue_mock + + try: + raise TypeError("bad type") + except TypeError as exc: + with patch.object( + mock_telemetry_client, "get_account_id", return_value="111122223333" + ), patch.object(api._telemetry, "get_boto3_session"): + # WHEN + mock_telemetry_client.record_error_with_trace(exc, "test") + + # THEN + event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0] + stack_trace = event.event_details["stack_trace"] + # The stack trace should not contain the full absolute path to this test file + for line in stack_trace.splitlines(): + if line.strip().startswith('File "'): + path = line.split('"')[1] + assert not path.startswith("/"), f"Absolute path leaked: {path}" + + @pytest.mark.parametrize( "endpoint,prefix,expected_result", [ @@ -441,3 +522,112 @@ def fake_init(self, **kwargs): assert client_b.package_name == "package-b" telemetry_mod.__cached_telemetry_clients = {} + + +# --- Stack trace sanitizer tests --- + + +class TestSanitizePath: + def test_known_package_deadline(self): + assert ( + _sanitize_path( + "/home/customer/secret/venv/lib/python3.11/site-packages/deadline/client/api/_telemetry.py" + ) + == "deadline/client/api/_telemetry.py" + ) + + def test_known_package_openjd(self): + assert _sanitize_path("/opt/libs/openjd/sessions/runner.py") == "openjd/sessions/runner.py" + + def test_known_package_botocore(self): + assert ( + _sanitize_path("/usr/lib/python3/dist-packages/botocore/client.py") + == "botocore/client.py" + ) + + def test_site_packages_unknown_lib(self): + assert ( + _sanitize_path("/home/user/venv/lib/python3.11/site-packages/somelib/core.py") + == "somelib/core.py" + ) + + def test_customer_script_returns_filename_only(self): + assert _sanitize_path("/home/customer/my-bucket-name/scripts/render.py") == "render.py" + + def test_windows_path(self): + assert ( + _sanitize_path( + "C:\\Users\\customer\\AppData\\Local\\deadline\\client\\api\\_telemetry.py" + ) + == "deadline/client/api/_telemetry.py" + ) + + def test_frozen_module(self): + assert _sanitize_path("") == "" + + def test_string_input(self): + assert _sanitize_path("") == "" + + +class TestSanitizeException: + def test_live_exception(self): + try: + raise RuntimeError("test error") + except RuntimeError as e: + result = sanitize_exception(e) + assert "RuntimeError" in result + assert "Traceback (most recent call last):" in result + + def test_no_absolute_paths(self): + try: + raise RuntimeError("path test") + except RuntimeError as e: + result = sanitize_exception(e) + for line in result.splitlines(): + if line.strip().startswith('File "'): + path = line.split('"')[1] + assert not path.startswith("/"), f"Absolute path leaked: {path}" + + def test_no_source_code_context(self): + """Source code lines are omitted to avoid leaking customer data.""" + try: + customer_secret = "sensitive" # noqa: F841 + raise ValueError("fail") + except ValueError as e: + result = sanitize_exception(e) + assert "customer_secret" not in result + assert "sensitive" not in result + + def test_message_omitted(self): + """Exception messages are not included — only the type.""" + try: + raise FileNotFoundError("/home/customer/secret/file.txt") + except FileNotFoundError as e: + result = sanitize_exception(e) + assert "customer" not in result + assert "secret" not in result + assert "FileNotFoundError" in result + + def test_chained_exception_cause(self): + try: + try: + raise KeyError("original") + except KeyError as e: + raise ValueError("wrapper") from e + except ValueError as e: + result = sanitize_exception(e) + assert "KeyError" in result + assert "ValueError" in result + assert "direct cause" in result + + def test_chained_exception_context(self): + try: + try: + raise KeyError("original") + except KeyError: + raise ValueError("during handling") + except ValueError as e: + result = sanitize_exception(e) + assert "KeyError" in result + assert "ValueError" in result + assert "During handling" in result