Commit c2c7ce6

Merge pull request #10 from workflowai/guillaume/fix-retry-logic
fix: retry logic returns
2 parents d3d937b + a7eb2d5 commit c2c7ce6

7 files changed: +166 -92 lines changed

conftest.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import pytest
+from freezegun import freeze_time
+
+
+@pytest.fixture()
+def frozen_time():
+    with freeze_time("2024-01-01T00:00:00Z") as frozen_time:
+        yield frozen_time
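
The fixture yields freezegun's frozen clock factory, so a test can advance time explicitly instead of sleeping. A minimal usage sketch, assuming only this fixture and freezegun's tick() API; the test name and duration are illustrative:

from datetime import timedelta
from time import time


def test_clock_is_frozen(frozen_time):
    # time() is patched by freezegun and starts at 2024-01-01T00:00:00Z
    start = time()
    # advance the frozen clock by exactly 10 seconds, with no real waiting
    frozen_time.tick(timedelta(seconds=10))
    assert time() - start == 10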

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "workflowai"
-version = "0.2.2"
+version = "0.2.3"
 description = ""
 authors = ["Guillaume Aquilina <guillaume@workflowai.com>"]
 readme = "README.md"

workflowai/core/client/__init__.py

Lines changed: 6 additions & 9 deletions
@@ -33,9 +33,8 @@ async def run(
         use_cache: "cache_usage.CacheUsage" = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> "task_run.TaskRun[task.TaskInput, task.TaskOutput]": ...

     @overload
@@ -50,9 +49,8 @@ async def run(
         use_cache: "cache_usage.CacheUsage" = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> AsyncIterator["task.TaskOutput"]: ...

     async def run(
@@ -66,9 +64,8 @@ async def run(
         use_cache: "cache_usage.CacheUsage" = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> Union[
         "task_run.TaskRun[task.TaskInput, task.TaskOutput]",
         AsyncIterator["task.TaskOutput"],
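
With this change the retry knobs are expressed in seconds rather than milliseconds, and the fixed retry_delay argument is gone: the wait between attempts is now driven by the server's Retry-After header. A hedged caller-side sketch, reusing the HelloTask fixtures from client_test.py as stand-ins for a real task:

# Retry up to 5 times, stopping once 30 seconds have elapsed since the first attempt.
task_run = await client.run(
    HelloTask(id="123", schema_id=1),
    task_input=HelloTaskInput(name="Alice"),
    max_retry_count=5,
    max_retry_delay=30,  # seconds; the 0.2.2 API took milliseconds (default 60000)
)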

workflowai/core/client/client.py

Lines changed: 63 additions & 80 deletions
@@ -1,7 +1,6 @@
-import asyncio
 import importlib.metadata
 import os
-from email.utils import parsedate_to_datetime
+from collections.abc import Awaitable, Callable
 from typing import (
     Any,
     AsyncIterator,
@@ -25,6 +24,7 @@
     RunTaskStreamChunk,
     TaskRunResponse,
 )
+from workflowai.core.client.utils import build_retryable_wait
 from workflowai.core.domain.cache_usage import CacheUsage
 from workflowai.core.domain.errors import BaseError, WorkflowAIError
 from workflowai.core.domain.task import Task, TaskInput, TaskOutput
@@ -77,9 +77,8 @@ async def run(
         use_cache: CacheUsage = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> TaskRun[TaskInput, TaskOutput]: ...

     @overload
@@ -94,12 +93,11 @@ async def run(
         use_cache: CacheUsage = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> AsyncIterator[TaskOutput]: ...

-    async def run(  # noqa: C901
+    async def run(
         self,
         task: Task[TaskInput, TaskOutput],
         task_input: TaskInput,
@@ -110,9 +108,8 @@ async def run(  # noqa: C901
         use_cache: CacheUsage = "when_available",
         labels: Optional[set[str]] = None,
         metadata: Optional[dict[str, Any]] = None,
-        retry_delay: int = 5000,
-        max_retry_delay: int = 60000,
-        max_retry_count: int = 1,
+        max_retry_delay: float = 60,
+        max_retry_count: float = 1,
     ) -> Union[TaskRun[TaskInput, TaskOutput], AsyncIterator[TaskOutput]]:
         await self._auto_register(task)

@@ -135,76 +132,62 @@ async def run(  # noqa: C901
         )

         route = f"/tasks/{task.id}/schemas/{task.schema_id}/run"
+        should_retry, wait_for_exception = build_retryable_wait(max_retry_delay, max_retry_count)

         if not stream:
-            res = None
-            delay = retry_delay / 1000
-            retry_count = 0
-            while retry_count < max_retry_count:
-                try:
-                    res = await self.api.post(route, request, returns=TaskRunResponse)
-                    return res.to_domain(task)
-                except HTTPStatusError as e:
-                    if e.response.status_code == 404:
-                        raise WorkflowAIError(
-                            error=BaseError(
-                                status_code=404,
-                                code="not_found",
-                                message="Task not found",
-                            ),
-                        ) from e
-                    retry_after = e.response.headers.get("Retry-After")
-                    if retry_after:
-                        try:
-                            # for 429 errors this is non-negative decimal
-                            delay = float(retry_after)
-                        except ValueError:
-                            try:
-                                retry_after_date = parsedate_to_datetime(retry_after)
-                                current_time = asyncio.get_event_loop().time()
-                                delay = retry_after_date.timestamp() - current_time
-                            except (TypeError, ValueError, OverflowError):
-                                delay = min(delay * 2, max_retry_delay / 1000)
-                        await asyncio.sleep(delay)
-                    elif e.response.status_code == 429:
-                        if delay < max_retry_delay / 1000:
-                            delay = min(delay * 2, max_retry_delay / 1000)
-                        await asyncio.sleep(delay)
-                    retry_count += 1
-
-        async def _stream():
-            delay = retry_delay / 1000
-            retry_count = 0
-            while retry_count < max_retry_count:
-                try:
-                    async for chunk in self.api.stream(
-                        method="POST",
-                        path=route,
-                        data=request,
-                        returns=RunTaskStreamChunk,
-                    ):
-                        yield task.output_class.model_construct(None, **chunk.task_output)
-                except HTTPStatusError as e:
-                    if e.response.status_code == 404:
-                        raise WorkflowAIError(error=BaseError(message="Task not found")) from e
-                    retry_after = e.response.headers.get("Retry-After")
-
-                    if retry_after:
-                        try:
-                            delay = float(retry_after)
-                        except ValueError:
-                            try:
-                                retry_after_date = parsedate_to_datetime(retry_after)
-                                current_time = asyncio.get_event_loop().time()
-                                delay = retry_after_date.timestamp() - current_time
-                            except (TypeError, ValueError, OverflowError):
-                                delay = min(delay * 2, max_retry_delay / 1000)
-                    elif e.response.status_code == 429 and delay < max_retry_delay / 1000:
-                        delay = min(delay * 2, max_retry_delay / 1000)
-                    await asyncio.sleep(delay)
-                    retry_count += 1
-
-        return _stream()
+            return await self._retriable_run(
+                route,
+                request,
+                task,
+                should_retry=should_retry,
+                wait_for_exception=wait_for_exception,
+            )
+
+        return self._retriable_stream(
+            route,
+            request,
+            task,
+            should_retry=should_retry,
+            wait_for_exception=wait_for_exception,
+        )
+
+    async def _retriable_run(
+        self,
+        route: str,
+        request: RunRequest,
+        task: Task[TaskInput, TaskOutput],
+        should_retry: Callable[[], bool],
+        wait_for_exception: Callable[[HTTPStatusError], Awaitable[None]],
+    ):
+        while should_retry():
+            try:
+                res = await self.api.post(route, request, returns=TaskRunResponse)
+                return res.to_domain(task)
+            except HTTPStatusError as e:  # noqa: PERF203
+                await wait_for_exception(e)
+
+        raise WorkflowAIError(error=BaseError(message="max retries reached"))
+
+    async def _retriable_stream(
+        self,
+        route: str,
+        request: RunRequest,
+        task: Task[TaskInput, TaskOutput],
+        should_retry: Callable[[], bool],
+        wait_for_exception: Callable[[HTTPStatusError], Awaitable[None]],
+    ):
+        while should_retry():
+            try:
+                async for chunk in self.api.stream(
+                    method="POST",
+                    path=route,
+                    data=request,
+                    returns=RunTaskStreamChunk,
+                ):
+                    yield task.output_class.model_construct(None, **chunk.task_output)
+                return
+            except HTTPStatusError as e:  # noqa: PERF203
+                await wait_for_exception(e)

     async def import_run(
         self,
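
Both helpers share the retry budget built in run(): _retriable_run falls through to a WorkflowAIError("max retries reached") if the loop exits without a response, while _retriable_stream restarts the stream after wait_for_exception sleeps, so chunks yielded before a failure may be yielded again. A rough consumption sketch; HelloTask/HelloTaskInput come from the test fixtures, and the stream=True keyword is assumed from the `if not stream` branch rather than shown in this hunk:

task = HelloTask(id="123", schema_id=1)

# Non-streaming: a single TaskRun, with retries handled inside the client.
run = await client.run(task, task_input=HelloTaskInput(name="Alice"), max_retry_count=5)

# Streaming: partial outputs as they arrive; on HTTPStatusError the stream is retried from the start.
async for output in client.run(task, task_input=HelloTaskInput(name="Alice"), stream=True):
    print(output)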

workflowai/core/client/client_test.py

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ async def test_success_with_headers(self, httpx_mock: HTTPXMock, client: Client)
     async def test_run_retries_on_too_many_requests(self, httpx_mock: HTTPXMock, client: Client):
         task = HelloTask(id="123", schema_id=1)

-        httpx_mock.add_response(status_code=429)
+        httpx_mock.add_response(headers={"Retry-After": "0.01"}, status_code=429)
         httpx_mock.add_response(json=fixtures_json("task_run.json"))

         task_run = await client.run(task, task_input=HelloTaskInput(name="Alice"), max_retry_count=5)

workflowai/core/client/utils.py

Lines changed: 54 additions & 0 deletions
@@ -1,7 +1,13 @@
 # Sometimes, 2 payloads are sent in a single message.
 # By adding the " at the end we more or less guarantee that
 # the delimiter is not withing a quoted string
+import asyncio
 import re
+from email.utils import parsedate_to_datetime
+from time import time
+from typing import Any, Optional
+
+from httpx import HTTPStatusError

 delimiter = re.compile(r'\}\n\ndata: \{"')

@@ -13,3 +19,51 @@ def split_chunks(chunk: bytes):
         yield chunk_str[start : match.start() + 1]
         start = match.end() - 2
     yield chunk_str[start:]
+
+
+def retry_after_to_delay_seconds(retry_after: Any) -> Optional[float]:
+    if retry_after is None:
+        return None
+
+    try:
+        return float(retry_after)
+    except ValueError:
+        pass
+    try:
+        retry_after_date = parsedate_to_datetime(retry_after)
+        current_time = time()
+        return retry_after_date.timestamp() - current_time
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+# Returns two functions:
+# - _should_retry: returns True if we should retry
+# - _wait_for_exception: waits after an exception only if we should retry, otherwise raises
+# This is a bit convoluted and would be better in a function wrapper, but since we are dealing
+# with both Awaitable and AsyncGenerator, a wrapper would just be too complex
+def build_retryable_wait(
+    max_retry_delay: float = 60,
+    max_retry_count: float = 1,
+):
+    now = time()
+    retry_count = 0
+
+    def _leftover_delay():
+        # Time remaining before we hit the max retry delay
+        return max_retry_delay - (time() - now)
+
+    def _should_retry():
+        return retry_count < max_retry_count and _leftover_delay() >= 0
+
+    async def _wait_for_exception(e: HTTPStatusError):
+        nonlocal retry_count
+        retry_after = retry_after_to_delay_seconds(e.response.headers.get("Retry-After"))
+        leftover_delay = _leftover_delay()
+        if not retry_after or leftover_delay < 0 or retry_count >= max_retry_count:
+            # TODO: convert error to WorkflowAIError
+            raise e
+        await asyncio.sleep(retry_after)
+        retry_count += 1
+
+    return _should_retry, _wait_for_exception
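
build_retryable_wait packages a shared count-and-deadline budget into two closures that any request loop can reuse. A standalone sketch of the intended call pattern; the fetch coroutine and URL are invented for illustration, only build_retryable_wait and its behavior come from this diff:

import httpx

from workflowai.core.client.utils import build_retryable_wait


async def fetch_with_retries() -> httpx.Response:
    should_retry, wait_for_exception = build_retryable_wait(max_retry_delay=30, max_retry_count=3)
    async with httpx.AsyncClient() as http:
        while should_retry():
            try:
                res = await http.get("https://example.com/flaky")
                res.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
                return res
            except httpx.HTTPStatusError as e:
                # Sleeps for the server-provided Retry-After; re-raises when the header
                # is missing or the retry budget (count or elapsed time) is exhausted.
                await wait_for_exception(e)
    raise RuntimeError("retry budget exhausted")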

workflowai/core/client/utils_test.py

Lines changed: 33 additions & 1 deletion
@@ -1,6 +1,11 @@
+from typing import Optional
+from unittest.mock import Mock
+
 import pytest
+from freezegun import freeze_time
+from httpx import HTTPStatusError

-from workflowai.core.client.utils import split_chunks
+from workflowai.core.client.utils import build_retryable_wait, retry_after_to_delay_seconds, split_chunks


 @pytest.mark.parametrize(
@@ -15,3 +20,30 @@
 )
 def test_split_chunks(chunk: bytes, expected: list[bytes]):
     assert list(split_chunks(chunk)) == expected
+
+
+@freeze_time("2024-01-01T00:00:00Z")
+@pytest.mark.parametrize(
+    ("retry_after", "expected"),
+    [
+        (None, None),
+        ("10", 10),
+        ("Wed, 01 Jan 2024 00:00:10 UTC", 10),
+    ],
+)
+def test_retry_after_to_delay_seconds(retry_after: Optional[str], expected: Optional[float]):
+    assert retry_after_to_delay_seconds(retry_after) == expected
+
+
+class TestBuildRetryableWait:
+    @pytest.fixture()
+    def request_error(self):
+        response = Mock()
+        response.headers = {"Retry-After": "0.01"}
+        return HTTPStatusError(message="", request=Mock(), response=response)
+
+    async def test_should_retry_count(self, request_error: HTTPStatusError):
+        should_retry, wait_for_exception = build_retryable_wait(60, 1)
+        assert should_retry()
+        await wait_for_exception(request_error)
+        assert not should_retry()
