Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions src/deadline/client/api/_stack_trace_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
"""
Stack trace sanitizer for Deadline Cloud client telemetry.

Uses an allowlist approach: only explicitly chosen fields (sanitized filename,
line number, function name, exception type) are emitted. Source code context
and exception messages are intentionally omitted as they could contain
customer data.

Conforms to ADR 2024-02-19: "No customer content or other information provided
by the customer can be submitted, such as bucket names, file names, or similar."
"""

import traceback
from typing import FrozenSet, List

# Packages we control — safe to include relative paths for
_KNOWN_PACKAGES: FrozenSet[str] = frozenset(
{
"deadline",
"openjd",
"boto3",
"botocore",
}
)


def _sanitize_path(filepath: str) -> str:
"""Replace a full file path with the package-relative portion or bare filename."""
if filepath.startswith("<"):
return filepath

parts = filepath.replace("\\", "/").split("/")

for i, part in enumerate(parts):
stem = part.split(".")[0]
if stem in _KNOWN_PACKAGES:
return "/".join(parts[i:])

for i, part in enumerate(parts):
if part == "site-packages" and i + 1 < len(parts):
return "/".join(parts[i + 1 :])

return parts[-1]


def _sanitize_traceback(te: traceback.TracebackException) -> List[str]:
"""Recursively format a TracebackException chain using only allowlisted fields."""
lines: List[str] = []

# Handle chained exceptions (cause or context)
if te.__cause__ is not None:
lines.extend(_sanitize_traceback(te.__cause__))
lines.append("\nThe above exception was the direct cause of the following exception:\n")
elif te.__context__ is not None and not te.__suppress_context__:
lines.extend(_sanitize_traceback(te.__context__))
lines.append("\nDuring handling of the above exception, another exception occurred:\n")

lines.append("Traceback (most recent call last):")
for frame in te.stack:
safe_path = _sanitize_path(frame.filename)
lines.append(f' File "{safe_path}", line {frame.lineno}, in {frame.name}')
# Intentionally omit frame.line — source code context could
# contain credentials, customer data, or other sensitive values

# Only emit the exception type, not the message
exc_name = te.exc_type.__qualname__ if te.exc_type else "UnknownException"
lines.append(exc_name)

return lines


def sanitize_exception(exc: BaseException) -> str:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of this denylist approach (taking the whole exception, putting into a string and then looking for things to take out), consider an allowlist approach. You could use something like traceback.extract_tb() to work with structure data instead, and only emit the fields you explicitly choose.

This would look something like:

def sanitize_exception(exc: BaseException) -> str:
    lines = ["Traceback (most recent call last):"]
    for frame in traceback.extract_tb(exc.__traceback__):
        safe_path = _sanitize_path(frame.filename)
        lines.append(f'  File "{safe_path}", line {frame.lineno}, in {frame.name}')
        # Intentionally omit frame.line — source code context could
        # contain credentials, customer data, or other sensitive values
    lines.append(f"{type(exc).__qualname__}: {sanitize_message(str(exc))}")
    return "\n".join(lines)

This gives you filename, lineno, and name as discrete values you can sanitize individually, and it avoids passing through anything unexpected.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You would need to consider chained exceptions as well, but traceback.TracebackException should have the full chain.

"""Format and sanitize a live exception using only allowlisted fields."""
te = traceback.TracebackException.from_exception(exc)
return "\n".join(_sanitize_traceback(te))
22 changes: 22 additions & 0 deletions src/deadline/client/api/_telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
get_boto3_client,
get_boto3_session,
)
from ._stack_trace_sanitizer import sanitize_exception
from ..config import config_file
from .. import version

Expand Down Expand Up @@ -164,6 +165,27 @@ def initialize(self, config: Optional[ConfigParser] = None) -> None:
# Silently swallow any exceptions
return

def record_error_with_trace(
self,
exc: BaseException,
exception_scope: str,
extra_details: Optional[dict] = None,
from_gui: bool = False,
) -> None:
event_details: dict = {
"exception_type": type(exc).__qualname__,
"exception_scope": exception_scope,
"stack_trace": sanitize_exception(exc),
}
if extra_details:
event_details.update(extra_details)

self.record_event(
event_type="com.amazon.rum.deadline.error",
event_details=event_details,
from_gui=from_gui,
)

@property
def is_initialized(self) -> bool:
return self._initialized
Expand Down
5 changes: 1 addition & 4 deletions src/deadline/client/cli/_groups/bundle_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,10 +353,7 @@ def _check_create_job_wait_canceled() -> bool:
click.echo("Job submission canceled.")
sys.exit(1)
except Exception as exc:
api.get_deadline_cloud_library_telemetry_client().record_error(
event_details={"exception_scope": "on_submit"},
exception_type=str(type(exc)),
)
api.get_deadline_cloud_library_telemetry_client().record_error_with_trace(exc, "on_submit")
raise
finally:
if snapshot_tmpdir:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -621,10 +621,8 @@ def on_submit(self):
job_progress_dialog.close()
except Exception as exc:
logger.exception("error submitting job")
api.get_deadline_cloud_library_telemetry_client().record_error(
event_details={"exception_scope": "on_submit"},
exception_type=str(type(exc)),
from_gui=True,
api.get_deadline_cloud_library_telemetry_client().record_error_with_trace(
exc, "on_submit", from_gui=True
)
QMessageBox.critical(
self,
Expand Down
190 changes: 190 additions & 0 deletions test/unit/deadline_client/api/test_api_telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
record_success_fail_telemetry_event,
record_function_latency_telemetry_event,
)
from deadline.client.api._stack_trace_sanitizer import (
_sanitize_path,
sanitize_exception,
)
from deadline.job_attachments.progress_tracker import SummaryStatistics


Expand Down Expand Up @@ -60,6 +64,7 @@
client.record_hashing_summary(SummaryStatistics(), from_gui=True)
client.record_upload_summary(SummaryStatistics(), from_gui=False)
client.record_error({}, str(type(Exception)))
client.record_error_with_trace(RuntimeError("opt-out test"), "test")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -92,6 +97,7 @@
client.record_hashing_summary(SummaryStatistics(), from_gui=True)
client.record_upload_summary(SummaryStatistics(), from_gui=False)
client.record_error({}, str(type(Exception)))
client.record_error_with_trace(RuntimeError("opt-out test"), "test")


def test_initialize_failure_then_success(fresh_deadline_config):
Expand Down Expand Up @@ -282,6 +288,81 @@
queue_mock.put_nowait.assert_called_once_with(expected_event)


def test_record_error_with_trace(fresh_deadline_config, mock_telemetry_client):
"""Test that record_error_with_trace sends a TelemetryEvent with sanitized stack trace fields"""
# GIVEN
queue_mock = MagicMock()
mock_telemetry_client.event_queue = queue_mock

try:
raise ValueError("something broke")
except ValueError as exc:
with patch.object(
mock_telemetry_client, "get_account_id", return_value="111122223333"
), patch.object(api._telemetry, "get_boto3_session"):
# WHEN
mock_telemetry_client.record_error_with_trace(exc, "test_scope")

# THEN
queue_mock.put_nowait.assert_called_once()
event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0]
assert event.event_type == "com.amazon.rum.deadline.error"
assert event.event_details["exception_type"] == "ValueError"
assert event.event_details["exception_scope"] == "test_scope"
assert "ValueError" in event.event_details["stack_trace"]
assert "message" not in event.event_details
assert event.event_details["usage_mode"] == "CLI"
assert event.event_details["accountId"] == "111122223333"


def test_record_error_with_trace_extra_details(fresh_deadline_config, mock_telemetry_client):
"""Test that extra_details are merged into the event"""
# GIVEN
queue_mock = MagicMock()
mock_telemetry_client.event_queue = queue_mock

try:
raise RuntimeError("fail")
except RuntimeError as exc:
with patch.object(
mock_telemetry_client, "get_account_id", return_value="111122223333"
), patch.object(api._telemetry, "get_boto3_session"):
# WHEN
mock_telemetry_client.record_error_with_trace(
exc, "cli", extra_details={"command": "bundle submit"}
)

# THEN
event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0]
assert event.event_details["command"] == "bundle submit"
assert event.event_details["exception_type"] == "RuntimeError"


def test_record_error_with_trace_sanitizes_paths(fresh_deadline_config, mock_telemetry_client):
"""Test that customer paths are stripped from the stack trace"""
# GIVEN
queue_mock = MagicMock()
mock_telemetry_client.event_queue = queue_mock

try:
raise TypeError("bad type")
except TypeError as exc:
with patch.object(
mock_telemetry_client, "get_account_id", return_value="111122223333"
), patch.object(api._telemetry, "get_boto3_session"):
# WHEN
mock_telemetry_client.record_error_with_trace(exc, "test")

# THEN
event: TelemetryEvent = queue_mock.put_nowait.call_args[0][0]
stack_trace = event.event_details["stack_trace"]
# The stack trace should not contain the full absolute path to this test file
for line in stack_trace.splitlines():
if line.strip().startswith('File "'):
path = line.split('"')[1]
assert not path.startswith("/"), f"Absolute path leaked: {path}"


@pytest.mark.parametrize(
"endpoint,prefix,expected_result",
[
Expand Down Expand Up @@ -441,3 +522,112 @@
assert client_b.package_name == "package-b"

telemetry_mod.__cached_telemetry_clients = {}


# --- Stack trace sanitizer tests ---


class TestSanitizePath:
def test_known_package_deadline(self):
assert (
_sanitize_path(
"/home/customer/secret/venv/lib/python3.11/site-packages/deadline/client/api/_telemetry.py"
)
== "deadline/client/api/_telemetry.py"
)

def test_known_package_openjd(self):
assert _sanitize_path("/opt/libs/openjd/sessions/runner.py") == "openjd/sessions/runner.py"

def test_known_package_botocore(self):
assert (
_sanitize_path("/usr/lib/python3/dist-packages/botocore/client.py")
== "botocore/client.py"
)

def test_site_packages_unknown_lib(self):
assert (
_sanitize_path("/home/user/venv/lib/python3.11/site-packages/somelib/core.py")
== "somelib/core.py"
)

def test_customer_script_returns_filename_only(self):
assert _sanitize_path("/home/customer/my-bucket-name/scripts/render.py") == "render.py"

def test_windows_path(self):
assert (
_sanitize_path(
"C:\\Users\\customer\\AppData\\Local\\deadline\\client\\api\\_telemetry.py"
)
== "deadline/client/api/_telemetry.py"
)

def test_frozen_module(self):
assert _sanitize_path("<frozen importlib._bootstrap>") == "<frozen importlib._bootstrap>"

def test_string_input(self):
assert _sanitize_path("<string>") == "<string>"


class TestSanitizeException:
def test_live_exception(self):
try:
raise RuntimeError("test error")
except RuntimeError as e:
result = sanitize_exception(e)
assert "RuntimeError" in result
assert "Traceback (most recent call last):" in result

def test_no_absolute_paths(self):
try:
raise RuntimeError("path test")
except RuntimeError as e:
result = sanitize_exception(e)
for line in result.splitlines():
if line.strip().startswith('File "'):
path = line.split('"')[1]
assert not path.startswith("/"), f"Absolute path leaked: {path}"

def test_no_source_code_context(self):
"""Source code lines are omitted to avoid leaking customer data."""
try:
customer_secret = "sensitive" # noqa: F841

Check notice

Code scanning / CodeQL

Unused local variable Note test

Variable customer_secret is not used.
raise ValueError("fail")
except ValueError as e:
result = sanitize_exception(e)
assert "customer_secret" not in result
assert "sensitive" not in result

def test_message_omitted(self):
"""Exception messages are not included — only the type."""
try:
raise FileNotFoundError("/home/customer/secret/file.txt")
except FileNotFoundError as e:
result = sanitize_exception(e)
assert "customer" not in result
assert "secret" not in result
assert "FileNotFoundError" in result

def test_chained_exception_cause(self):
try:
try:
raise KeyError("original")
except KeyError as e:
raise ValueError("wrapper") from e
except ValueError as e:
result = sanitize_exception(e)
assert "KeyError" in result
assert "ValueError" in result
assert "direct cause" in result

def test_chained_exception_context(self):
try:
try:
raise KeyError("original")
except KeyError:
raise ValueError("during handling")
except ValueError as e:
result = sanitize_exception(e)
assert "KeyError" in result
assert "ValueError" in result
assert "During handling" in result
Loading