fix: make retry examples deterministic using seeded random

vip-amzn · vip-amzn · commit 8e47e50a3660 · 2025-11-06T11:49:33.000Z
diff --git a/examples/src/step/step_with_retry.py b/examples/src/step/step_with_retry.py
@@ -1,4 +1,4 @@
-from random import random
+import random
 from typing import Any
 
 from aws_durable_execution_sdk_python.config import StepConfig
@@ -14,12 +14,18 @@
 )
 
 
+# Seed random at module level for deterministic behavior across retries
+random.seed(42)
+
+
 @durable_step
 def unreliable_operation(
     _step_context: StepContext,
 ) -> str:
+    # Use seeded random for deterministic behavior
+    # With seed 42, this will fail once then succeed
     failure_threshold = 0.5
-    if random() > failure_threshold:  # noqa: S311
+    if random.random() > failure_threshold:  # noqa: S311
         msg = "Random error occurred"
         raise RuntimeError(msg)
     return "Operation succeeded"
diff --git a/examples/src/step/steps_with_retry.py b/examples/src/step/steps_with_retry.py
@@ -1,26 +1,31 @@
 """Example demonstrating multiple steps with retry logic."""
 
-from random import random
+import random
 from typing import Any
 
 from aws_durable_execution_sdk_python.config import StepConfig
-from aws_durable_execution_sdk_python.context import DurableContext
+from aws_durable_execution_sdk_python.context import DurableContext, StepContext
 from aws_durable_execution_sdk_python.execution import durable_execution
 from aws_durable_execution_sdk_python.retries import (
     RetryStrategyConfig,
     create_retry_strategy,
 )
 
 
-def simulated_get_item(name: str) -> dict[str, Any] | None:
-    """Simulate getting an item that may fail randomly."""
+# Seed random at module level for deterministic behavior across retries
+random.seed(42)
+
+
+def simulated_get_item(_step_context: StepContext, name: str) -> dict[str, Any] | None:
+    """Simulate getting an item with deterministic seeded random behavior."""
+    # Use seeded random for deterministic behavior
     # Fail 50% of the time
-    if random() < 0.5:  # noqa: S311
+    if random.random() < 0.5:  # noqa: S311
         msg = "Random failure"
         raise RuntimeError(msg)
 
     # Simulate finding item after some attempts
-    if random() > 0.3:  # noqa: S311
+    if random.random() > 0.3:  # noqa: S311
         return {"id": name, "data": "item data"}
 
     return None
@@ -49,7 +54,7 @@ def handler(event: Any, context: DurableContext) -> dict[str, Any]:
 
             # Try to get the item with retry
             get_response = context.step(
-                lambda _, n=name: simulated_get_item(n),
+                lambda _, n=name: simulated_get_item(_, n),
                 name=f"get_item_poll_{poll_count}",
                 config=step_config,
             )
diff --git a/examples/test/conftest.py b/examples/test/conftest.py
@@ -4,6 +4,7 @@
 import json
 import logging
 import os
+import random
 import sys
 from enum import StrEnum
 from pathlib import Path
@@ -29,6 +30,18 @@
 logger = logging.getLogger(__name__)
 
 
+@pytest.fixture(autouse=True)
+def reset_random_seed():
+    """Reset random seed before each test for deterministic behavior.
+
+    The example modules call random.seed(42) only once, at import time, so
+    re-seeding here keeps results identical for any test order or selection.
+    """
+    random.seed(42)
+    yield
+    # No cleanup needed - next test will reset
+
+
 def deserialize_operation_payload(
     payload: OperationPayload | None, serdes: ExtendedTypeSerDes | None = None
 ) -> Any:
diff --git a/examples/test/step/test_step_with_retry.py b/examples/test/step/test_step_with_retry.py
@@ -14,20 +14,28 @@
     lambda_function_name="step with retry",
 )
 def test_step_with_retry(durable_runner):
-    """Test step with retry configuration."""
+    """Test step with retry configuration.
+
+    With seed 42 at module level, the random state persists across retries:
+    - Attempt 1: random() = 0.639... > 0.5 → raises RuntimeError ❌
+    - Attempt 2: random() = 0.025... > 0.5 → False, succeeds ✓
+
+    The function deterministically fails once then succeeds on the second attempt.
+    """
     with durable_runner:
         result = durable_runner.run(input="test", timeout=30)
 
-    # The function uses random() so it may succeed or fail
-    # We just verify it completes and has retry configuration
-    assert result.status in [InvocationStatus.SUCCEEDED, InvocationStatus.FAILED]
+    # With seeded random (seed=42) at module level, succeeds on attempt 2
+    assert result.status is InvocationStatus.SUCCEEDED
+    assert deserialize_operation_payload(result.result) == "Operation succeeded"
 
-    # Verify step operation exists
+    # Verify step operation exists with retry details
     step_ops = [
         op for op in result.operations if op.operation_type == OperationType.STEP
     ]
-    assert len(step_ops) >= 1
+    assert len(step_ops) == 1
 
-    # If it succeeded, verify the result
-    if result.status is InvocationStatus.SUCCEEDED:
-        assert deserialize_operation_payload(result.result) == "Operation succeeded"
+    # The step should have succeeded on attempt 2 (after 1 failure)
+    # Attempt numbering: 1 (initial attempt), 2 (first retry)
+    step_op = step_ops[0]
+    assert step_op.attempt == 2  # Succeeded on first retry (1-indexed: 2=first retry)
diff --git a/examples/test/step/test_steps_with_retry.py b/examples/test/step/test_steps_with_retry.py
@@ -14,20 +14,42 @@
     lambda_function_name="steps with retry",
 )
 def test_steps_with_retry(durable_runner):
-    """Test steps_with_retry pattern."""
+    """Test steps_with_retry pattern.
+
+    With seed 42 set by conftest fixture, the random state persists:
+    - Poll 1, Attempt 1: random() = 0.639 ≥ 0.5 (passes), random() = 0.025 ≤ 0.3 → returns None
+    - Poll 2, Attempt 1: random() = 0.275 < 0.5 → raises RuntimeError ❌
+    - Poll 2, Attempt 2: random() = 0.736 ≥ 0.5 (passes), random() = 0.676 > 0.3 → returns item ✓
+
+    The function finds the item on poll 2 after 1 retry.
+    """
     with durable_runner:
         result = durable_runner.run(input={"name": "test-item"}, timeout=30)
 
     assert result.status is InvocationStatus.SUCCEEDED
 
-    # Result should be either success with item or error
-    assert isinstance(deserialize_operation_payload(result.result), dict)
-    assert "success" in deserialize_operation_payload(
-        result.result
-    ) or "error" in deserialize_operation_payload(result.result)
+    # With seeded random (seed=42) at module level, finds item on poll 2
+    result_data = deserialize_operation_payload(result.result)
+    assert isinstance(result_data, dict)
+    assert result_data.get("success") is True
+    assert result_data.get("pollsRequired") == 2
+    assert "item" in result_data
+    assert result_data["item"]["id"] == "test-item"
 
-    # Verify step operations exist (polling steps)
+    # Verify step operations exist
     step_ops = [
         op for op in result.operations if op.operation_type == OperationType.STEP
     ]
-    assert len(step_ops) >= 1
+    # Should have exactly 2 step operations (poll 1 and poll 2)
+    assert len(step_ops) == 2
+
+    # Poll 1: succeeded immediately (returned None)
+    assert step_ops[0].name == "get_item_poll_1"
+    assert step_ops[0].result == "null"
+    assert step_ops[0].attempt == 1  # No retries needed (1-indexed: 1=initial)
+
+    # Poll 2: succeeded after 1 retry (returned item)
+    assert step_ops[1].name == "get_item_poll_2"
+    assert (
+        step_ops[1].attempt == 2
+    )  # Exactly 1 retry occurred (1-indexed: 2=first retry)
diff --git a/src/aws_durable_execution_sdk_python_testing/checkpoint/processors/base.py b/src/aws_durable_execution_sdk_python_testing/checkpoint/processors/base.py
@@ -12,6 +12,7 @@
     ContextDetails,
     ExecutionDetails,
     Operation,
+    OperationAction,
     OperationStatus,
     OperationType,
     OperationUpdate,
@@ -72,9 +73,15 @@ def _create_context_details(self, update: OperationUpdate) -> ContextDetails | N
         )
 
     def _create_step_details(
-        self, update: OperationUpdate, current_operation: Operation | None = None
+        self,
+        update: OperationUpdate,
+        current_operation: Operation | None = None,
     ) -> StepDetails | None:
-        """Create StepDetails from OperationUpdate."""
+        """Create StepDetails from OperationUpdate.
+
+        Adds one to the current operation's attempt on RETRY, SUCCEED, and FAIL.
+        """
+
         attempt: int = 0
         next_attempt_timestamp: datetime.datetime | None = None
 
@@ -84,6 +91,13 @@ def _create_step_details(
                 next_attempt_timestamp = (
                     current_operation.step_details.next_attempt_timestamp
                 )
+            # Increment attempt for RETRY, SUCCEED, and FAIL actions
+            if update.action in {
+                OperationAction.RETRY,
+                OperationAction.SUCCEED,
+                OperationAction.FAIL,
+            }:
+                attempt += 1
             return StepDetails(
                 attempt=attempt,
                 next_attempt_timestamp=next_attempt_timestamp,
diff --git a/tests/checkpoint/processors/step_test.py b/tests/checkpoint/processors/step_test.py
@@ -230,6 +230,7 @@ def test_process_succeed_action_with_current_operation():
 
     current_op = Mock()
     current_op.start_timestamp = datetime.now(UTC)
+    current_op.step_details = StepDetails()
 
     update = OperationUpdate(
         operation_id="step-123",
@@ -243,6 +244,7 @@ def test_process_succeed_action_with_current_operation():
 
     assert result.start_timestamp == current_op.start_timestamp
     assert result.status == OperationStatus.SUCCEEDED
+    assert result.step_details.attempt == 1
 
 
 def test_process_fail_action():
@@ -274,6 +276,7 @@ def test_process_fail_action_with_current_operation():
 
     current_op = Mock()
     current_op.start_timestamp = datetime.now(UTC)
+    current_op.step_details = StepDetails()
 
     error = ErrorObject.from_message("step failed")
     update = OperationUpdate(
@@ -288,6 +291,7 @@ def test_process_fail_action_with_current_operation():
 
     assert result.start_timestamp == current_op.start_timestamp
     assert result.status == OperationStatus.FAILED
+    assert result.step_details.attempt == 1
 
 
 def test_process_invalid_action():