From b159d3ed61d2489c8d8716b64013f39267c07f6f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Sat, 16 Aug 2025 13:32:09 +0200 Subject: [PATCH 01/16] Draft, unique key enables better metadata handling --- .../_apify/_request_queue_client.py | 154 +++++++++--------- tests/integration/conftest.py | 5 + tests/integration/test_actor_request_queue.py | 121 ++++++++++++-- 3 files changed, 193 insertions(+), 87 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 519cd95a..3e552890 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -10,6 +10,7 @@ from typing_extensions import override from apify_client import ApifyClientAsync +from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -40,10 +41,7 @@ def __init__( self, *, api_client: RequestQueueClientAsync, - id: str, - name: str | None, - total_request_count: int, - handled_request_count: int, + metadata: RequestQueueMetadata, ) -> None: """Initialize a new instance. @@ -52,11 +50,8 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" - self._id = id - """The ID of the request queue.""" - - self._name = name - """The name of the request queue.""" + self._metadata = metadata + """Additional data related to the RequestQueue.""" self._queue_head = deque[str]() """A deque to store request IDs in the queue head.""" @@ -70,41 +65,55 @@ def __init__( self._should_check_for_forefront_requests = False """Whether to check for forefront requests in the next list_head call.""" - self._had_multiple_clients = False - """Whether the request queue has been accessed by multiple clients.""" - - self._initial_total_count = total_request_count - """The initial total request count (from the API) when the queue was opened.""" - - self._initial_handled_count = handled_request_count - """The initial handled request count (from the API) when the queue was opened.""" - - self._assumed_total_count = 0 - """The number of requests we assume are in the queue (tracked manually for this instance).""" - - self._assumed_handled_count = 0 - """The number of requests we assume have been handled (tracked manually for this instance).""" - self._fetch_lock = asyncio.Lock() """Fetch lock to minimize race conditions when communicating with API.""" @override async def get_metadata(self) -> RequestQueueMetadata: - total_count = self._initial_total_count + self._assumed_total_count - handled_count = self._initial_handled_count + self._assumed_handled_count - pending_count = total_count - handled_count - - return RequestQueueMetadata( - id=self._id, - name=self._name, - total_request_count=total_count, - handled_request_count=handled_count, - pending_request_count=pending_count, - created_at=datetime.now(timezone.utc), - modified_at=datetime.now(timezone.utc), - accessed_at=datetime.now(timezone.utc), - had_multiple_clients=self._had_multiple_clients, - ) + """Get metadata about the request queue.""" + if self._metadata.had_multiple_clients: + # Enhanced from API (can be delayed few seconds) + response = await self._api_client.get() + if response is None: + raise ValueError('Failed to fetch request queue metadata from the API.') + return RequestQueueMetadata( + id=response['id'], + name=response['name'], + total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), + handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), + pending_request_count=response['pendingRequestCount'], + created_at=response['createdAt'], + modified_at=max(response['modifiedAt'], self._metadata.modified_at), + accessed_at=max(response['accessedAt'], self._metadata.accessed_at), + had_multiple_clients=response['hadMultipleClients'], + ) + # Update local estimation? + # Get local estimation (will not include changes done bo another client) + return self._metadata + + + @override + async def get_metadata(self) -> RequestQueueMetadata: + """Get metadata about the request queue.""" + if self._metadata.had_multiple_clients: + # Enhanced from API (can be delayed few seconds) + response = await self._api_client.get() + if response is None: + raise ValueError('Failed to fetch request queue metadata from the API.') + return RequestQueueMetadata( + id=response['id'], + name=response['name'], + total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), + handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), + pending_request_count=response['pendingRequestCount'], + created_at=response['createdAt'], + modified_at=max(response['modifiedAt'], self._metadata.modified_at), + accessed_at=max(response['accessedAt'], self._metadata.accessed_at), + had_multiple_clients=response['hadMultipleClients'], + ) + # Update local estimation? + # Get local estimation (will not include changes done bo another client) + return self._metadata @classmethod async def open( @@ -162,27 +171,33 @@ async def open( ) apify_rqs_client = apify_client_async.request_queues() - # If both id and name are provided, raise an error. - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - - # If id is provided, get the storage by ID. - if id and name is None: - apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + match (id, name): + case (None, None): + # If both id and name are None, try to get the default storage ID from environment variables. + # The default storage ID environment variable is set by the Apify platform. It also contains + # a new storage ID after Actor's reboot or migration. + id = configuration.default_request_queue_id + case (None, name): + # If name is provided, get or create the storage by name. + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(name=name), + ).id + case (_, None): + pass + case (_, _): + # If both id and name are provided, raise an error. + raise ValueError('Only one of "id" or "name" can be specified, not both.') + if id is None: + raise RuntimeError('Unreachable code') - # If name is provided, get or create the storage by name. - if name and id is None: - id = RequestQueueMetadata.model_validate( - await apify_rqs_client.get_or_create(name=name), - ).id - apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + # Use suitable client_key to make `hadMultipleClients` response of Apify API useful. + # It should persist across migrated Actor runs on the Apify platform. + _api_max_client_key_length = 32 + client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[ + :_api_max_client_key_length + ] - # If both id and name are None, try to get the default storage ID from environment variables. - # The default storage ID environment variable is set by the Apify platform. It also contains - # a new storage ID after Actor's reboot or migration. - if id is None and name is None: - id = configuration.default_request_queue_id - apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key) # Fetch its metadata. metadata = await apify_rq_client.get() @@ -192,27 +207,18 @@ async def open( id = RequestQueueMetadata.model_validate( await apify_rqs_client.get_or_create(), ).id - apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key) # Verify that the storage exists by fetching its metadata again. metadata = await apify_rq_client.get() if metadata is None: raise ValueError(f'Opening request queue with id={id} and name={name} failed.') - metadata_model = RequestQueueMetadata.model_validate( - await apify_rqs_client.get_or_create(), - ) - - # Ensure we have a valid ID. - if id is None: - raise ValueError('Request queue ID cannot be None.') + metadata_model = RequestQueueMetadata.model_validate(metadata) return cls( api_client=apify_rq_client, - id=id, - name=name, - total_request_count=metadata_model.total_request_count, - handled_request_count=metadata_model.handled_request_count, + metadata=metadata_model, ) @override @@ -261,7 +267,7 @@ async def add_batch_of_requests( if not processed_request.was_already_present and not processed_request.was_already_handled: new_request_count += 1 - self._assumed_total_count += new_request_count + self._metadata.total_request_count += new_request_count return api_response @@ -359,7 +365,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | # Update assumed handled count if this wasn't already handled if not processed_request.was_already_handled: - self._assumed_handled_count += 1 + self._metadata.handled_request_count += 1 # Update the cache with the handled request cache_key = unique_key_to_request_id(request.unique_key) @@ -407,7 +413,7 @@ async def reclaim_request( # If the request was previously handled, decrement our handled count since # we're putting it back for processing. if request.was_already_handled and not processed_request.was_already_handled: - self._assumed_handled_count -= 1 + self._metadata.handled_request_count -= 1 # Update the cache cache_key = unique_key_to_request_id(request.unique_key) @@ -591,6 +597,8 @@ async def _list_head( # Update the queue head cache self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + # Check if there is another client working with the RequestQueue + self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) for request_data in response.get('items', []): request = Request.model_validate(request_data) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 07b6c758..c2898107 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -96,6 +96,11 @@ def apify_token() -> str: return api_token +@pytest.fixture(autouse=True) +def set_token(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) + + @pytest.fixture def apify_client_async(apify_token: str) -> ApifyClientAsync: """Create an instance of the ApifyClientAsync. diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 64a846b5..368c8a0b 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -2,14 +2,13 @@ from typing import TYPE_CHECKING -from apify_shared.consts import ApifyEnvVars +from crawlee.storages import RequestQueue from ._utils import generate_unique_resource_name from apify import Actor, Request +from apify.storage_clients import ApifyStorageClient if TYPE_CHECKING: - import pytest - from apify_client import ApifyClientAsync from .conftest import MakeActorFunction, RunActorFunction @@ -61,11 +60,7 @@ async def main() -> None: async def test_force_cloud( apify_client_async: ApifyClientAsync, - monkeypatch: pytest.MonkeyPatch, ) -> None: - assert apify_client_async.token is not None - monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_client_async.token) - request_queue_name = generate_unique_resource_name('request_queue') async with Actor: @@ -88,13 +83,7 @@ async def test_force_cloud( await request_queue_client.delete() -async def test_request_queue_is_finished( - apify_client_async: ApifyClientAsync, - monkeypatch: pytest.MonkeyPatch, -) -> None: - assert apify_client_async.token is not None - monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_client_async.token) - +async def test_request_queue_is_finished() -> None: request_queue_name = generate_unique_resource_name('request_queue') async with Actor: @@ -113,3 +102,107 @@ async def test_request_queue_is_finished( assert await request_queue.is_finished() finally: await request_queue.drop() + +# TODO, add more metadata tests + +async def test_request_queue_had_multiple_clients_localaaaa( + apify_client_async: ApifyClientAsync, +) -> None: + """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" + #request_queue_name = generate_unique_resource_name('request_queue') + rq_client = await ApifyStorageClient().create_rq_client(name=None, id=None) + client_metadata = await rq_client.get_metadata() + rq = RequestQueue(name=client_metadata.name, id=client_metadata.id, client=rq_client) + await rq.fetch_next_request() + await rq.fetch_next_request() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await rq.get_metadata()).had_multiple_clients is False + +async def test_request_queue_had_multiple_clients_local( + apify_client_async: ApifyClientAsync, +) -> None: + """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" + request_queue_name = generate_unique_resource_name('request_queue') + + async with Actor: + rq_1 = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + await rq_1.fetch_next_request() + + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = apify_client_async.request_queue(request_queue_id=rq_1.id, client_key=None) + await api_client.list_head() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await rq_1.get_metadata()).had_multiple_clients is True # Currently broken + # Check that it is correctly in the API + assert ((await api_client.get())['hadMultipleClients']) is True + + +async def test_request_queue_not_had_multiple_clients_local(apify_client_async: ApifyClientAsync,) -> None: + """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" + request_queue_name = generate_unique_resource_name('request_queue') + + async with Actor: + rq_1 = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True + await rq_1.fetch_next_request() + await rq_1.fetch_next_request() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await rq_1.get_metadata()).had_multiple_clients is False + # Check that it is correctly in the API + api_client = apify_client_async.request_queue(request_queue_id=rq_1.id) + assert ((await api_client.get())['hadMultipleClients']) is False + + +async def test_request_queue_had_multiple_clients_platform( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + async def main() -> None: + """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" + from apify_client import ApifyClientAsync + + async with Actor: + rq_1 = await Actor.open_request_queue() + await rq_1.fetch_next_request() + + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = ApifyClientAsync(token=Actor.configuration.token).request_queue( + request_queue_id=rq_1.id, client_key=None + ) + await api_client.list_head() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await rq_1.get_metadata()).had_multiple_clients is True # Currently broken + # Check that it is correctly in the API + assert ((await rq_1._client._api_client.get())['hadMultipleClients']) is True + + actor = await make_actor(label='rq-same-ref-default', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' + + +async def test_request_queue_not_had_multiple_clients_platform( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + async def main() -> None: + """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" + async with Actor: + rq_1 = await Actor.open_request_queue() + # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True + await rq_1.fetch_next_request() + await rq_1.fetch_next_request() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await rq_1.get_metadata()).had_multiple_clients is False + # Check that it is correctly in the API + assert ((await rq_1._client._api_client.get())['hadMultipleClients']) is False + + actor = await make_actor(label='rq-same-ref-default', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' From 301e2946c5298fad916a1a5ee95380195957f39c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Aug 2025 09:47:51 +0200 Subject: [PATCH 02/16] Add more tests --- .../_apify/_request_queue_client.py | 59 +++------- tests/integration/conftest.py | 15 ++- tests/integration/test_actor_request_queue.py | 111 +++++++++++------- 3 files changed, 99 insertions(+), 86 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 3e552890..1cc7e809 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -68,52 +68,31 @@ def __init__( self._fetch_lock = asyncio.Lock() """Fetch lock to minimize race conditions when communicating with API.""" - @override - async def get_metadata(self) -> RequestQueueMetadata: - """Get metadata about the request queue.""" + async def _get_metadata(self) -> RequestQueueMetadata: + """Try to get cached metadata first. If multiple clients, fuse with global metadata.""" if self._metadata.had_multiple_clients: - # Enhanced from API (can be delayed few seconds) - response = await self._api_client.get() - if response is None: - raise ValueError('Failed to fetch request queue metadata from the API.') - return RequestQueueMetadata( - id=response['id'], - name=response['name'], - total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), - handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), - pending_request_count=response['pendingRequestCount'], - created_at=response['createdAt'], - modified_at=max(response['modifiedAt'], self._metadata.modified_at), - accessed_at=max(response['accessedAt'], self._metadata.accessed_at), - had_multiple_clients=response['hadMultipleClients'], - ) - # Update local estimation? + return await self.get_metadata() # Get local estimation (will not include changes done bo another client) return self._metadata - @override async def get_metadata(self) -> RequestQueueMetadata: """Get metadata about the request queue.""" - if self._metadata.had_multiple_clients: - # Enhanced from API (can be delayed few seconds) - response = await self._api_client.get() - if response is None: - raise ValueError('Failed to fetch request queue metadata from the API.') - return RequestQueueMetadata( - id=response['id'], - name=response['name'], - total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), - handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), - pending_request_count=response['pendingRequestCount'], - created_at=response['createdAt'], - modified_at=max(response['modifiedAt'], self._metadata.modified_at), - accessed_at=max(response['accessedAt'], self._metadata.accessed_at), - had_multiple_clients=response['hadMultipleClients'], - ) - # Update local estimation? - # Get local estimation (will not include changes done bo another client) - return self._metadata + response = await self._api_client.get() + if response is None: + raise ValueError('Failed to fetch request queue metadata from the API.') + # Enhance API response by local estimations (API can be delayed few seconds, while local estimation not.) + return RequestQueueMetadata( + id=response['id'], + name=response['name'], + total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), + handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), + pending_request_count=response['pendingRequestCount'], + created_at=min(response['createdAt'], self._metadata.created_at), + modified_at=max(response['modifiedAt'], self._metadata.modified_at), + accessed_at=max(response['accessedAt'], self._metadata.accessed_at), + had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients, + ) @classmethod async def open( @@ -570,7 +549,7 @@ async def _list_head( if cached_request and cached_request.hydrated: items.append(cached_request.hydrated) - metadata = await self.get_metadata() + metadata = await self._get_metadata() return RequestQueueHead( limit=limit, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c2898107..6ec53449 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -18,13 +18,15 @@ import apify._actor from ._utils import generate_unique_resource_name +from apify import Actor from apify._models import ActorRun if TYPE_CHECKING: - from collections.abc import Awaitable, Callable, Coroutine, Iterator, Mapping + from collections.abc import AsyncGenerator, Awaitable, Callable, Coroutine, Iterator, Mapping from decimal import Decimal from apify_client.clients.resource_clients import ActorClientAsync + from crawlee.storages import RequestQueue _TOKEN_ENV_VAR = 'APIFY_TEST_USER_API_TOKEN' _API_URL_ENV_VAR = 'APIFY_INTEGRATION_TESTS_API_URL' @@ -114,6 +116,17 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: return ApifyClientAsync(apify_token, api_url=api_url) +@pytest.fixture +async def request_queue_force_cloud() -> AsyncGenerator[RequestQueue]: + """Create an instance of the Apify request queue on the platform and drop it when the test is finished.""" + request_queue_name = generate_unique_resource_name('request_queue') + + async with Actor: + rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + yield rq + await rq.drop() + + @pytest.fixture(scope='session') def sdk_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path: """Build the package wheel if it hasn't been built yet, and return the path to the wheel.""" diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 368c8a0b..69dfb1fb 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -1,15 +1,14 @@ from __future__ import annotations +import asyncio from typing import TYPE_CHECKING -from crawlee.storages import RequestQueue - from ._utils import generate_unique_resource_name from apify import Actor, Request -from apify.storage_clients import ApifyStorageClient if TYPE_CHECKING: from apify_client import ApifyClientAsync + from crawlee.storages import RequestQueue from .conftest import MakeActorFunction, RunActorFunction @@ -103,57 +102,83 @@ async def test_request_queue_is_finished() -> None: finally: await request_queue.drop() -# TODO, add more metadata tests -async def test_request_queue_had_multiple_clients_localaaaa( +async def test_request_queue_enhanced_metadata( + request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: - """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" - #request_queue_name = generate_unique_resource_name('request_queue') - rq_client = await ApifyStorageClient().create_rq_client(name=None, id=None) - client_metadata = await rq_client.get_metadata() - rq = RequestQueue(name=client_metadata.name, id=client_metadata.id, client=rq_client) - await rq.fetch_next_request() - await rq.fetch_next_request() + """Test metadata tracking. + + Multiple clients scenarios are not guaranteed to give correct results without delay. But at least multiple clients, + single producer, should be reliable on the producer side.""" + + for i in range(1, 10): + await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + # Reliable information as the API response is enhanced with local metadata estimation. + assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.list_head() + + # The presence of another non-producing client should not affect the metadata + for i in range(10, 20): + await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + # Reliable information as the API response is enhanced with local metadata estimation. + assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + + +async def test_request_queue_metadata_another_client( + request_queue_force_cloud: RequestQueue, + apify_client_async: ApifyClientAsync, +) -> None: + """Test metadata tracking. The delayed metadata should be reliable even when changed by another client.""" + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.add_request(Request.from_url('http://example.com/1').model_dump(by_alias=True, exclude={'id'})) + + # Wait to be sure that the API has updated the global metadata + await asyncio.sleep(10) + + assert (await request_queue_force_cloud.get_metadata()).total_request_count == 1 - # Check that it is correctly in the RequestQueueClient metadata - assert (await rq.get_metadata()).had_multiple_clients is False async def test_request_queue_had_multiple_clients_local( + request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: - """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" - request_queue_name = generate_unique_resource_name('request_queue') + """Test that `RequestQueue` correctly detects multiple clients. - async with Actor: - rq_1 = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - await rq_1.fetch_next_request() + Clients created with different `client_key` should appear as distinct clients.""" + await request_queue_force_cloud.fetch_next_request() - # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=rq_1.id, client_key=None) - await api_client.list_head() + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.list_head() - # Check that it is correctly in the RequestQueueClient metadata - assert (await rq_1.get_metadata()).had_multiple_clients is True # Currently broken - # Check that it is correctly in the API - assert ((await api_client.get())['hadMultipleClients']) is True + # Check that it is correctly in the RequestQueueClient metadata + assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is True + # Check that it is correctly in the API + api_response = await api_client.get() + assert api_response + assert api_response['hadMultipleClients'] is True -async def test_request_queue_not_had_multiple_clients_local(apify_client_async: ApifyClientAsync,) -> None: +async def test_request_queue_not_had_multiple_clients_local( + request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync +) -> None: """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" - request_queue_name = generate_unique_resource_name('request_queue') - async with Actor: - rq_1 = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True - await rq_1.fetch_next_request() - await rq_1.fetch_next_request() + # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True + await request_queue_force_cloud.fetch_next_request() + await request_queue_force_cloud.fetch_next_request() - # Check that it is correctly in the RequestQueueClient metadata - assert (await rq_1.get_metadata()).had_multiple_clients is False - # Check that it is correctly in the API - api_client = apify_client_async.request_queue(request_queue_id=rq_1.id) - assert ((await api_client.get())['hadMultipleClients']) is False + # Check that it is correctly in the RequestQueueClient metadata + assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False + # Check that it is correctly in the API + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) + api_response = await api_client.get() + assert api_response + assert api_response['hadMultipleClients'] is False async def test_request_queue_had_multiple_clients_platform( @@ -175,11 +200,9 @@ async def main() -> None: await api_client.list_head() # Check that it is correctly in the RequestQueueClient metadata - assert (await rq_1.get_metadata()).had_multiple_clients is True # Currently broken - # Check that it is correctly in the API - assert ((await rq_1._client._api_client.get())['hadMultipleClients']) is True + assert (await rq_1.get_metadata()).had_multiple_clients is True - actor = await make_actor(label='rq-same-ref-default', main_func=main) + actor = await make_actor(label='rq-had-multiple-clients', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' @@ -199,10 +222,8 @@ async def main() -> None: # Check that it is correctly in the RequestQueueClient metadata assert (await rq_1.get_metadata()).had_multiple_clients is False - # Check that it is correctly in the API - assert ((await rq_1._client._api_client.get())['hadMultipleClients']) is False - actor = await make_actor(label='rq-same-ref-default', main_func=main) + actor = await make_actor(label='rq-not-had-multiple-clients', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' From 281c67f8a9b86633af79c589e76d0a8d7fd27edd Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Aug 2025 11:43:25 +0200 Subject: [PATCH 03/16] Add resurrection test Seems to be some problem on the platform? --- .../_apify/_request_queue_client.py | 5 +- tests/integration/_utils.py | 11 +++-- tests/integration/test_actor_request_queue.py | 49 +++++++++++++++++-- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 1cc7e809..54fc48ed 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -157,11 +157,12 @@ async def open( # a new storage ID after Actor's reboot or migration. id = configuration.default_request_queue_id case (None, name): - # If name is provided, get or create the storage by name. + # If only name is provided, get or create the storage by name. id = RequestQueueMetadata.model_validate( await apify_rqs_client.get_or_create(name=name), ).id case (_, None): + # If only id is provided, use it. pass case (_, _): # If both id and name are provided, raise an error. @@ -170,7 +171,7 @@ async def open( raise RuntimeError('Unreachable code') # Use suitable client_key to make `hadMultipleClients` response of Apify API useful. - # It should persist across migrated Actor runs on the Apify platform. + # It should persist across migrated or resurrected Actor runs on the Apify platform. _api_max_client_key_length = 32 client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[ :_api_max_client_key_length diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index cbea845d..6b1ec069 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -1,9 +1,14 @@ from __future__ import annotations -from crawlee._utils.crypto import crypto_random_object_id - def generate_unique_resource_name(label: str) -> str: """Generates a unique resource name, which will contain the given label.""" + name_template = 'python-sdk-tests-{}-generated-{}' + template_length = len(name_template.format('', '')) + api_name_limit = 63 + label_length_limit = api_name_limit - template_length + label = label.replace('_', '-') - return f'python-sdk-tests-{label}-generated-{crypto_random_object_id(8)}' + assert len(label) <= label_length_limit, f'Max label length is {label_length_limit}, but got {len(label)}' + + return name_template.format(label, template_length) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 69dfb1fb..618b92bb 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -5,6 +5,7 @@ from ._utils import generate_unique_resource_name from apify import Actor, Request +from apify._models import ActorRun if TYPE_CHECKING: from apify_client import ApifyClientAsync @@ -157,6 +158,7 @@ async def test_request_queue_had_multiple_clients_local( # Check that it is correctly in the RequestQueueClient metadata assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is True + # Check that it is correctly in the API api_response = await api_client.get() assert api_response @@ -174,6 +176,7 @@ async def test_request_queue_not_had_multiple_clients_local( # Check that it is correctly in the RequestQueueClient metadata assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False + # Check that it is correctly in the API api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) api_response = await api_client.get() @@ -185,8 +188,9 @@ async def test_request_queue_had_multiple_clients_platform( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: + """Test that `RequestQueue` clients created with different `client_key` appear as distinct clients.""" + async def main() -> None: - """`RequestQueue` clients created with different `client_key` should appear as distinct clients.""" from apify_client import ApifyClientAsync async with Actor: @@ -199,7 +203,6 @@ async def main() -> None: ) await api_client.list_head() - # Check that it is correctly in the RequestQueueClient metadata assert (await rq_1.get_metadata()).had_multiple_clients is True actor = await make_actor(label='rq-had-multiple-clients', main_func=main) @@ -212,18 +215,54 @@ async def test_request_queue_not_had_multiple_clients_platform( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: + """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" + async def main() -> None: - """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" async with Actor: rq_1 = await Actor.open_request_queue() - # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True + # Two calls to API to create situation where unset `client_key` can cause `had_multiple_clients` to True await rq_1.fetch_next_request() await rq_1.fetch_next_request() - # Check that it is correctly in the RequestQueueClient metadata assert (await rq_1.get_metadata()).had_multiple_clients is False actor = await make_actor(label='rq-not-had-multiple-clients', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' + + +async def test_request_queue_not_had_multiple_clients_platform_resurrection( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, + apify_client_async: ApifyClientAsync, +) -> None: + """Test `RequestQueue` created from Actor does not act as multiple clients even after resurrection.""" + + async def main() -> None: + async with Actor: + rq_1 = await Actor.open_request_queue() + Actor.log.info(f'Used client key = {rq_1._client._api_client.client_key}, request queue ID = {rq_1.id}') + metadata = await rq_1.get_metadata() + Actor.log.info(metadata) + + assert metadata.had_multiple_clients is False, 'Not accessed yet, should be False' + + await rq_1.fetch_next_request() + + assert (await rq_1.get_metadata()).had_multiple_clients is False, ( + 'Accessed with same client, should be False' + ) + + actor = await make_actor(label='rq-multiple-clients-resurrection', main_func=main) + run_result = await run_actor(actor) + assert run_result.status == 'SUCCEEDED' + + # Resurrect the run, the RequestQueue should still use same client key and thus not have multiple clients. + run_client = apify_client_async.run(run_id=run_result.id) + # Redirect logs even from the resurrected run + streamed_log = await run_client.get_streamed_log(from_start=False) + await run_client.resurrect() + async with streamed_log: + run_result = ActorRun.model_validate(await run_client.wait_for_finish(wait_secs=600)) + assert run_result.status == 'SUCCEEDED' From efd9e5318c3b19244d15d5cc2ff9376253820ded Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Aug 2025 13:05:31 +0200 Subject: [PATCH 04/16] Remove internal debug logs from the test Platform acknowledged it is a bug --- tests/integration/test_actor_request_queue.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 618b92bb..296ca825 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -242,16 +242,12 @@ async def test_request_queue_not_had_multiple_clients_platform_resurrection( async def main() -> None: async with Actor: rq_1 = await Actor.open_request_queue() - Actor.log.info(f'Used client key = {rq_1._client._api_client.client_key}, request queue ID = {rq_1.id}') - metadata = await rq_1.get_metadata() - Actor.log.info(metadata) - - assert metadata.had_multiple_clients is False, 'Not accessed yet, should be False' + assert (await rq_1.get_metadata()).had_multiple_clients is False, 'Not accessed yet, should be False' await rq_1.fetch_next_request() assert (await rq_1.get_metadata()).had_multiple_clients is False, ( - 'Accessed with same client, should be False' + 'Accessed with the same client, should be False' ) actor = await make_actor(label='rq-multiple-clients-resurrection', main_func=main) From 2b2cc3042003d518a1c68a044c9a503c44814fdb Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Aug 2025 13:42:11 +0200 Subject: [PATCH 05/16] Properly update `generate_unique_resource_name` --- tests/integration/_utils.py | 7 +++++-- tests/integration/test_actor_request_queue.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index 6b1ec069..b5323272 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -1,14 +1,17 @@ from __future__ import annotations +from crawlee._utils.crypto import crypto_random_object_id + def generate_unique_resource_name(label: str) -> str: """Generates a unique resource name, which will contain the given label.""" name_template = 'python-sdk-tests-{}-generated-{}' template_length = len(name_template.format('', '')) api_name_limit = 63 - label_length_limit = api_name_limit - template_length + generated_random_id_length = 8 + label_length_limit = api_name_limit - template_length - generated_random_id_length label = label.replace('_', '-') assert len(label) <= label_length_limit, f'Max label length is {label_length_limit}, but got {len(label)}' - return name_template.format(label, template_length) + return name_template.format(label, crypto_random_object_id(generated_random_id_length)) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 296ca825..75b88c08 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -250,7 +250,7 @@ async def main() -> None: 'Accessed with the same client, should be False' ) - actor = await make_actor(label='rq-multiple-clients-resurrection', main_func=main) + actor = await make_actor(label='rq-clients-resurrection', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' From 9d37e14087afb37ee15868140344607f4b1310ce Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Aug 2025 17:20:47 +0200 Subject: [PATCH 06/16] Remove request.id --- pyproject.toml | 4 +- src/apify/scrapy/requests.py | 3 +- src/apify/storage_clients/_apify/_models.py | 4 +- .../_apify/_request_queue_client.py | 71 +++++++++---------- .../actor_source_base/requirements.txt | 2 +- tests/integration/test_request_queue.py | 29 ++++---- .../scrapy/requests/test_to_apify_request.py | 2 - .../scrapy/requests/test_to_scrapy_request.py | 4 -- uv.lock | 43 +++-------- 9 files changed, 60 insertions(+), 102 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 940f1400..1e0c8092 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,9 @@ keywords = [ "scraping", ] dependencies = [ - "apify-client>=2.0.0,<3.0.0", + "apify-client@git+https://github.com/apify/apify-client-python.git@remove-request-id", "apify-shared>=2.0.0,<3.0.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@master", + "crawlee@git+https://github.com/apify/crawlee-python.git@remove-request-id", "cachetools>=5.5.0", "cryptography>=42.0.0", "httpx>=0.27.0", diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 63bba3c7..27328c5c 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -122,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} - meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key}) + meta.update({'apify_request_unique_key': apify_request.unique_key}) # scrapy_request.meta is a property, so we have to set it like this scrapy_request._meta = meta # noqa: SLF001 @@ -134,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ url=apify_request.url, method=apify_request.method, meta={ - 'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key, }, ) diff --git a/src/apify/storage_clients/_apify/_models.py b/src/apify/storage_clients/_apify/_models.py index df981121..993ea8db 100644 --- a/src/apify/storage_clients/_apify/_models.py +++ b/src/apify/storage_clients/_apify/_models.py @@ -94,8 +94,8 @@ class CachedRequest(BaseModel): Only internal structure. """ - id: str - """The ID of the request.""" + unique_key: str + """Unique key of the request.""" was_already_handled: bool """Whether the request was already handled.""" diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index c9e7031a..375ce67a 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -10,7 +10,6 @@ from typing_extensions import override from apify_client import ApifyClientAsync -from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -59,10 +58,10 @@ def __init__( """The name of the request queue.""" self._queue_head = deque[str]() - """A deque to store request IDs in the queue head.""" + """A deque to store request unique keys in the queue head.""" self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) - """A cache to store request objects. Request ID is used as the cache key.""" + """A cache to store request objects. Request unique key is used as the cache key.""" self._queue_has_locked_requests: bool | None = None """Whether the queue has requests locked by another client.""" @@ -248,14 +247,13 @@ async def add_batch_of_requests( already_present_requests: list[ProcessedRequest] = [] for request in requests: - if self._requests_cache.get(request.id): + if self._requests_cache.get(request.unique_key): # We are not sure if it was already handled at this point, and it is not worth calling API for it. # It could have been handled by another client in the meantime, so cached information about # `request.was_already_handled` is not reliable. already_present_requests.append( ProcessedRequest.model_validate( { - 'id': request.id, 'uniqueKey': request.unique_key, 'wasAlreadyPresent': True, 'wasAlreadyHandled': request.was_already_handled, @@ -267,14 +265,13 @@ async def add_batch_of_requests( # Add new request to the cache. processed_request = ProcessedRequest.model_validate( { - 'id': request.id, 'uniqueKey': request.unique_key, 'wasAlreadyPresent': True, 'wasAlreadyHandled': request.was_already_handled, } ) self._cache_request( - unique_key_to_request_id(request.unique_key), + request.unique_key, processed_request, ) new_requests.append(request) @@ -299,7 +296,7 @@ async def add_batch_of_requests( # Remove unprocessed requests from the cache for unprocessed_request in api_response.unprocessed_requests: - self._requests_cache.pop(unique_key_to_request_id(unprocessed_request.unique_key), None) + self._requests_cache.pop(unprocessed_request.unique_key, None) else: api_response = AddRequestsResponse.model_validate( @@ -323,16 +320,16 @@ async def add_batch_of_requests( return api_response @override - async def get_request(self, request_id: str) -> Request | None: + async def get_request(self, request_unique_key: str) -> Request | None: """Get a request by ID. Args: - request_id: The ID of the request to get. + request_unique_key: Unique key of the request to get. Returns: The request or None if not found. """ - response = await self._api_client.get_request(request_id) + response = await self._api_client.get_request_by_unique_key(request_unique_key) if response is None: return None @@ -381,7 +378,7 @@ async def fetch_next_request(self) -> Request | None: return None # Use get request to ensure we have the full request object. - request = await self.get_request(request.id) + request = await self.get_request(request.unique_key) if request is None: logger.debug( 'Request fetched from the beginning of queue was not found in the RQ', @@ -407,7 +404,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if request.handled_at is None: request.handled_at = datetime.now(tz=timezone.utc) - if cached_request := self._requests_cache[request.id]: + if cached_request := self._requests_cache[request.unique_key]: cached_request.was_already_handled = request.was_already_handled try: # Update the request in the API @@ -419,14 +416,14 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | self._assumed_handled_count += 1 # Update the cache with the handled request - cache_key = unique_key_to_request_id(request.unique_key) + cache_key = request.unique_key self._cache_request( cache_key, processed_request, hydrated_request=request, ) except Exception as exc: - logger.debug(f'Error marking request {request.id} as handled: {exc!s}') + logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') return None else: return processed_request @@ -467,7 +464,7 @@ async def reclaim_request( self._assumed_handled_count -= 1 # Update the cache - cache_key = unique_key_to_request_id(request.unique_key) + cache_key = request.unique_key self._cache_request( cache_key, processed_request, @@ -481,11 +478,11 @@ async def reclaim_request( # Try to release the lock on the request try: - await self._delete_request_lock(request.id, forefront=forefront) + await self._delete_request_lock(request.unique_key, forefront=forefront) except Exception as err: - logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) + logger.debug(f'Failed to delete request lock for request {request.unique_key}', exc_info=err) except Exception as exc: - logger.debug(f'Error reclaiming request {request.id}: {exc!s}') + logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') return None else: return processed_request @@ -554,7 +551,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: return None # Update cache with hydrated request - cache_key = unique_key_to_request_id(request.unique_key) + cache_key = request.unique_key self._cache_request( cache_key, ProcessedRequest( @@ -592,7 +589,7 @@ async def _update_request( ) return ProcessedRequest.model_validate( - {'id': request.id, 'uniqueKey': request.unique_key} | response, + {'uniqueKey': request.unique_key} | response, ) async def _list_head( @@ -653,11 +650,10 @@ async def _list_head( request = Request.model_validate(request_data) # Skip requests without ID or unique key - if not request.id or not request.unique_key: + if not request.unique_key: logger.debug( 'Skipping request from queue head, missing ID or unique key', extra={ - 'id': request.id, 'unique_key': request.unique_key, }, ) @@ -665,16 +661,15 @@ async def _list_head( # Cache the request self._cache_request( - unique_key_to_request_id(request.unique_key), + request.unique_key, ProcessedRequest( - id=request.id, unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ), hydrated_request=request, ) - self._queue_head.append(request.id) + self._queue_head.append(request.unique_key) for leftover_request_id in leftover_buffer: # After adding new requests to the forefront, any existing leftover locked request is kept in the end. @@ -683,21 +678,21 @@ async def _list_head( async def _prolong_request_lock( self, - request_id: str, + request_unique_key: str, *, lock_secs: int, ) -> ProlongRequestLockResponse: """Prolong the lock on a specific request in the queue. Args: - request_id: The identifier of the request whose lock is to be prolonged. + request_unique_key: Unique key of the request whose lock is to be prolonged. lock_secs: The additional amount of time, in seconds, that the request will remain locked. Returns: A response containing the time at which the lock will expire. """ - response = await self._api_client.prolong_request_lock( - request_id=request_id, + response = await self._api_client.prolong_request_lock_by_unique_key( + request_unique_key=request_unique_key, # All requests reaching this code were the tip of the queue at the moment when they were fetched, # so if their lock expires, they should be put back to the forefront as their handling is long overdue. forefront=True, @@ -710,7 +705,7 @@ async def _prolong_request_lock( # Update the cache with the new lock expiration for cached_request in self._requests_cache.values(): - if cached_request.id == request_id: + if cached_request.unique_key == request_unique_key: cached_request.lock_expires_at = result.lock_expires_at break @@ -718,29 +713,29 @@ async def _prolong_request_lock( async def _delete_request_lock( self, - request_id: str, + request_unique_key: str, *, forefront: bool = False, ) -> None: """Delete the lock on a specific request in the queue. Args: - request_id: ID of the request to delete the lock. + request_unique_key: Unique key of the request to delete the lock. forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. """ try: - await self._api_client.delete_request_lock( - request_id=request_id, + await self._api_client.delete_request_lock_by_unique_key( + request_unique_key=request_unique_key, forefront=forefront, ) # Update the cache to remove the lock for cached_request in self._requests_cache.values(): - if cached_request.id == request_id: + if cached_request.unique_key == request_unique_key: cached_request.lock_expires_at = None break except Exception as err: - logger.debug(f'Failed to delete request lock for request {request_id}', exc_info=err) + logger.debug(f'Failed to delete request lock for request {request_unique_key}', exc_info=err) def _cache_request( self, @@ -758,7 +753,7 @@ def _cache_request( hydrated_request: The hydrated request object, if available. """ self._requests_cache[cache_key] = CachedRequest( - id=processed_request.id, + unique_key=processed_request.unique_key, was_already_handled=processed_request.was_already_handled, hydrated=hydrated_request, lock_expires_at=None, diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index 66a782ba..e13b626e 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,4 +1,4 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER uvicorn[standard] -crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@master +crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@remove-request-id diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index fe9c50e5..1db730a7 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -399,38 +399,35 @@ async def main() -> None: assert run_result.status == 'SUCCEEDED' -async def test_get_request_by_id( +async def test_get_request_by_unique_key( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: - """Test retrieving specific requests by their ID.""" + """Test retrieving specific requests by their unique_key.""" async def main() -> None: async with Actor: rq = await Actor.open_request_queue() Actor.log.info('Request queue opened') - # Add a request and get its ID + # Add a request and get its unique_key add_result = await rq.add_request('https://example.com/test') - request_id = add_result.id - Actor.log.info(f'Request added with ID: {request_id}') + request_unique_key = add_result.unique_key + Actor.log.info(f'Request added with unique_key: {request_unique_key}') - # Retrieve the request by ID - retrieved_request = await rq.get_request(request_id) + # Retrieve the request by unique_key + retrieved_request = await rq.get_request(request_unique_key) assert retrieved_request is not None, f'retrieved_request={retrieved_request}' assert retrieved_request.url == 'https://example.com/test', f'retrieved_request.url={retrieved_request.url}' - assert retrieved_request.id == request_id, ( - f'retrieved_request.id={retrieved_request.id}', - f'request_id={request_id}', - ) - Actor.log.info('Request retrieved successfully by ID') + assert retrieved_request.unique_key == request_unique_key, (f'{request_unique_key=}',) + Actor.log.info('Request retrieved successfully by unique_key') - # Test with non-existent ID - non_existent_request = await rq.get_request('non-existent-id') + # Test with non-existent unique_key + non_existent_request = await rq.get_request('non-existent-unique_key') assert non_existent_request is None, f'non_existent_request={non_existent_request}' - Actor.log.info('Non-existent ID correctly returned None') + Actor.log.info('Non-existent unique_key correctly returned None') - actor = await make_actor(label='rq-get-by-id-test', main_func=main) + actor = await make_actor(label='rq-get-by-unique-key-test', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py index 3c79fe1b..e69a7916 100644 --- a/tests/unit/scrapy/requests/test_to_apify_request.py +++ b/tests/unit/scrapy/requests/test_to_apify_request.py @@ -66,7 +66,6 @@ def test_with_id_and_unique_key(spider: Spider) -> None: url='https://example.com', method='GET', meta={ - 'apify_request_id': 'abc123', 'apify_request_unique_key': 'https://example.com', 'userData': {'some_user_data': 'hello'}, }, @@ -77,7 +76,6 @@ def test_with_id_and_unique_key(spider: Spider) -> None: assert apify_request.url == 'https://example.com' assert apify_request.method == 'GET' - assert apify_request.id == 'abc123' assert apify_request.unique_key == 'https://example.com' user_data = apify_request.user_data diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index 2b8f0ab7..13659527 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -36,7 +36,6 @@ def test_without_reconstruction(spider: Spider) -> None: assert isinstance(scrapy_request, Request) assert apify_request.url == scrapy_request.url assert apify_request.method == scrapy_request.method - assert apify_request.id == scrapy_request.meta.get('apify_request_id') assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') @@ -56,7 +55,6 @@ def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: assert isinstance(scrapy_request, Request) assert apify_request.url == scrapy_request.url assert apify_request.method == scrapy_request.method - assert apify_request.id == scrapy_request.meta.get('apify_request_id') assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') scrapy_request_headers = scrapy_request.headers.get('authorization') @@ -82,7 +80,6 @@ def test_with_reconstruction(spider: Spider) -> None: assert isinstance(scrapy_request, Request) assert apify_request.url == scrapy_request.url assert apify_request.method == scrapy_request.method - assert apify_request.id == scrapy_request.meta.get('apify_request_id') assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') assert apify_request.user_data == scrapy_request.meta.get('userData') @@ -106,7 +103,6 @@ def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: assert isinstance(scrapy_request, Request) assert apify_request.url == scrapy_request.url assert apify_request.method == scrapy_request.method - assert apify_request.id == scrapy_request.meta.get('apify_request_id') assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') scrapy_request_headers = scrapy_request.headers.get('authorization') diff --git a/uv.lock b/uv.lock index 5a238629..7461924e 100644 --- a/uv.lock +++ b/uv.lock @@ -73,10 +73,10 @@ dev = [ [package.metadata] requires-dist = [ - { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, + { name = "apify-client", git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=master" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -112,18 +112,14 @@ dev = [ [[package]] name = "apify-client" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } +version = "2.0.1" +source = { git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id#2ebd75c406e8836a27558b3cb3050868bb0471c4" } dependencies = [ { name = "apify-shared" }, { name = "colorama" }, { name = "impit" }, { name = "more-itertools" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ae/fe/1ec02e47d6aa955de6a1ef62100ac6b2ea33810b0eabc020d0a5dcca080c/apify_client-2.0.0.tar.gz", hash = "sha256:1fd46ddebca5fd3f30cf6321350533bc753d701913fa9429324b37fd54a26663", size = 359411, upload-time = "2025-08-15T09:58:50.024Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/4b/2623b6c731c45320ac73cc9c93966b2b0c857daae9911bc18bd84e602a1b/apify_client-2.0.0-py3-none-any.whl", hash = "sha256:86451edb9d69a9423b65dcfa8bed12ef53c4c54be0d61ba923583098b214242b", size = 84868, upload-time = "2025-08-15T09:58:47.94Z" }, -] [[package]] name = "apify-shared" @@ -478,7 +474,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.13" -source = { git = "https://github.com/apify/crawlee-python.git?rev=master#454de75b1516bed68dab69a3663e563704d55ce2" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id#2d1f9e642167f4949f17cb7e0d1a7f5656d8aff7" } dependencies = [ { name = "cachetools" }, { name = "colorama" }, @@ -489,8 +485,6 @@ dependencies = [ { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pyee" }, - { name = "sortedcollections" }, - { name = "sortedcontainers" }, { name = "tldextract" }, { name = "typing-extensions" }, { name = "yarl" }, @@ -1860,7 +1854,7 @@ wheels = [ [[package]] name = "requests" -version = "2.32.4" +version = "2.32.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -1868,9 +1862,9 @@ dependencies = [ { name = "idna" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload-time = "2025-06-09T16:43:07.34Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] [[package]] @@ -1973,27 +1967,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "sortedcollections" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sortedcontainers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/01/00/6d749cc1f88e7f95f5442a8abb195fa607094deba9e0475affbfb7fa8c04/sortedcollections-2.1.0.tar.gz", hash = "sha256:d8e9609d6c580a16a1224a3dc8965789e03ebc4c3e5ffd05ada54a2fed5dcacd", size = 9287, upload-time = "2021-01-18T22:15:16.623Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/39/c993a7d0c9dbf3aeca5008bdd00e4436ad9b7170527cef0a14634b47001f/sortedcollections-2.1.0-py3-none-any.whl", hash = "sha256:b07abbc73472cc459da9dd6e2607d73d1f3b9309a32dd9a57fa2c6fa882f4c6c", size = 9531, upload-time = "2021-01-18T22:15:15.36Z" }, -] - -[[package]] -name = "sortedcontainers" -version = "2.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, -] - [[package]] name = "tldextract" version = "5.3.0" From 6d87488d6401f972b997904b30bdda3b05c43354 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 20 Aug 2025 09:05:34 +0200 Subject: [PATCH 07/16] Update lock after updating the branches in downstream repos --- uv.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index 7461924e..0b739225 100644 --- a/uv.lock +++ b/uv.lock @@ -113,7 +113,7 @@ dev = [ [[package]] name = "apify-client" version = "2.0.1" -source = { git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id#2ebd75c406e8836a27558b3cb3050868bb0471c4" } +source = { git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id#3729faca3e55db9858b89edb1d9730aa11039979" } dependencies = [ { name = "apify-shared" }, { name = "colorama" }, @@ -474,7 +474,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.13" -source = { git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id#2d1f9e642167f4949f17cb7e0d1a7f5656d8aff7" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id#10c0d7d06c780c13fdb9307db666ca110220b568" } dependencies = [ { name = "cachetools" }, { name = "colorama" }, From caf9e23ab5719bb383213962974b04153f83c71d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 20 Aug 2025 16:26:56 +0200 Subject: [PATCH 08/16] Move transformation function to from client to sdk Update leftover id-based variable names --- pyproject.toml | 2 +- .../_apify/_request_queue_client.py | 95 ++++++++++++------- .../test_apify_request_queue_client.py | 38 ++++++++ uv.lock | 10 +- 4 files changed, 106 insertions(+), 39 deletions(-) create mode 100644 tests/unit/storage_clients/test_apify_request_queue_client.py diff --git a/pyproject.toml b/pyproject.toml index 1e0c8092..5516141c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ keywords = [ "scraping", ] dependencies = [ - "apify-client@git+https://github.com/apify/apify-client-python.git@remove-request-id", + "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", "crawlee@git+https://github.com/apify/crawlee-python.git@remove-request-id", "cachetools>=5.5.0", diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 375ce67a..cae8a38b 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,8 +1,11 @@ from __future__ import annotations import asyncio +import re +from base64 import b64encode from collections import deque from datetime import datetime, timedelta, timezone +from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Final @@ -320,16 +323,16 @@ async def add_batch_of_requests( return api_response @override - async def get_request(self, request_unique_key: str) -> Request | None: - """Get a request by ID. + async def get_request(self, unique_key: str) -> Request | None: + """Get a request by unique key. Args: - request_unique_key: Unique key of the request to get. + unique_key: Unique key of the request to get. Returns: The request or None if not found. """ - response = await self._api_client.get_request_by_unique_key(request_unique_key) + response = await self._api_client.get_request(unique_key_to_request_id(unique_key)) if response is None: return None @@ -357,15 +360,15 @@ async def fetch_next_request(self) -> Request | None: return None # Get the next request ID from the queue head - next_request_id = self._queue_head.popleft() + next_unique_key = self._queue_head.popleft() - request = await self._get_or_hydrate_request(next_request_id) + request = await self._get_or_hydrate_request(next_unique_key) # Handle potential inconsistency where request might not be in the main table yet if request is None: logger.debug( 'Cannot find a request from the beginning of queue, will be retried later', - extra={'nextRequestId': next_request_id}, + extra={'nextRequestUniqueKey': next_unique_key}, ) return None @@ -373,7 +376,7 @@ async def fetch_next_request(self) -> Request | None: if request.handled_at is not None: logger.debug( 'Request fetched from the beginning of queue was already handled', - extra={'nextRequestId': next_request_id}, + extra={'nextRequestUniqueKey': next_unique_key}, ) return None @@ -382,7 +385,7 @@ async def fetch_next_request(self) -> Request | None: if request is None: logger.debug( 'Request fetched from the beginning of queue was not found in the RQ', - extra={'nextRequestId': next_request_id}, + extra={'nextRequestUniqueKey': next_unique_key}, ) return None @@ -509,17 +512,17 @@ async def _ensure_head_is_non_empty(self) -> None: # Fetch requests from the API and populate the queue head await self._list_head(lock_time=self._DEFAULT_LOCK_TIME) - async def _get_or_hydrate_request(self, request_id: str) -> Request | None: - """Get a request by ID, either from cache or by fetching from API. + async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: + """Get a request by unique key, either from cache or by fetching from API. Args: - request_id: The ID of the request to get. + unique_key: Unique keu of the request to get. Returns: The request if found and valid, otherwise None. """ # First check if the request is in our cache - cached_entry = self._requests_cache.get(request_id) + cached_entry = self._requests_cache.get(unique_key) if cached_entry and cached_entry.hydrated: # If we have the request hydrated in cache, check if lock is expired @@ -527,11 +530,11 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: # Try to prolong the lock if it's expired try: lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - response = await self._prolong_request_lock(request_id, lock_secs=lock_secs) + response = await self._prolong_request_lock(unique_key, lock_secs=lock_secs) cached_entry.lock_expires_at = response.lock_expires_at except Exception: # If prolonging the lock fails, we lost the request - logger.debug(f'Failed to prolong lock for request {request_id}, returning None') + logger.debug(f'Failed to prolong lock for request {unique_key}, returning None') return None return cached_entry.hydrated @@ -540,14 +543,14 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: try: # Try to acquire or prolong the lock lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - await self._prolong_request_lock(request_id, lock_secs=lock_secs) + await self._prolong_request_lock(unique_key, lock_secs=lock_secs) # Fetch the request data - request = await self.get_request(request_id) + request = await self.get_request(unique_key) # If request is not found, release lock and return None if not request: - await self._delete_request_lock(request_id) + await self._delete_request_lock(unique_key) return None # Update cache with hydrated request @@ -555,7 +558,6 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: self._cache_request( cache_key, ProcessedRequest( - id=request_id, unique_key=request.unique_key, was_already_present=True, was_already_handled=request.handled_at is not None, @@ -563,7 +565,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: hydrated_request=request, ) except Exception as exc: - logger.debug(f'Error fetching or locking request {request_id}: {exc!s}') + logger.debug(f'Error fetching or locking request {unique_key}: {exc!s}') return None else: return request @@ -613,8 +615,8 @@ async def _list_head( logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') # Create a list of requests from the cached queue head items = [] - for request_id in list(self._queue_head)[:limit]: - cached_request = self._requests_cache.get(request_id) + for unique_key in list(self._queue_head)[:limit]: + cached_request = self._requests_cache.get(unique_key) if cached_request and cached_request.hydrated: items.append(cached_request.hydrated) @@ -671,28 +673,28 @@ async def _list_head( ) self._queue_head.append(request.unique_key) - for leftover_request_id in leftover_buffer: + for leftover_unique_key in leftover_buffer: # After adding new requests to the forefront, any existing leftover locked request is kept in the end. - self._queue_head.append(leftover_request_id) + self._queue_head.append(leftover_unique_key) return RequestQueueHead.model_validate(response) async def _prolong_request_lock( self, - request_unique_key: str, + unique_key: str, *, lock_secs: int, ) -> ProlongRequestLockResponse: """Prolong the lock on a specific request in the queue. Args: - request_unique_key: Unique key of the request whose lock is to be prolonged. + unique_key: Unique key of the request whose lock is to be prolonged. lock_secs: The additional amount of time, in seconds, that the request will remain locked. Returns: A response containing the time at which the lock will expire. """ - response = await self._api_client.prolong_request_lock_by_unique_key( - request_unique_key=request_unique_key, + response = await self._api_client.prolong_request_lock( + request_id=unique_key_to_request_id(unique_key), # All requests reaching this code were the tip of the queue at the moment when they were fetched, # so if their lock expires, they should be put back to the forefront as their handling is long overdue. forefront=True, @@ -705,7 +707,7 @@ async def _prolong_request_lock( # Update the cache with the new lock expiration for cached_request in self._requests_cache.values(): - if cached_request.unique_key == request_unique_key: + if cached_request.unique_key == unique_key: cached_request.lock_expires_at = result.lock_expires_at break @@ -713,29 +715,29 @@ async def _prolong_request_lock( async def _delete_request_lock( self, - request_unique_key: str, + unique_key: str, *, forefront: bool = False, ) -> None: """Delete the lock on a specific request in the queue. Args: - request_unique_key: Unique key of the request to delete the lock. + unique_key: Unique key of the request to delete the lock. forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. """ try: - await self._api_client.delete_request_lock_by_unique_key( - request_unique_key=request_unique_key, + await self._api_client.delete_request_lock( + request_id=unique_key_to_request_id(unique_key), forefront=forefront, ) # Update the cache to remove the lock for cached_request in self._requests_cache.values(): - if cached_request.unique_key == request_unique_key: + if cached_request.unique_key == unique_key: cached_request.lock_expires_at = None break except Exception as err: - logger.debug(f'Failed to delete request lock for request {request_unique_key}', exc_info=err) + logger.debug(f'Failed to delete request lock for request {unique_key}', exc_info=err) def _cache_request( self, @@ -758,3 +760,26 @@ def _cache_request( hydrated=hydrated_request, lock_expires_at=None, ) + + +def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str: + """Generate a deterministic request ID based on a unique key. + + Args: + unique_key: The unique key to convert into a request ID. + request_id_length: The length of the request ID. + + Returns: + A URL-safe, truncated request ID based on the unique key. + """ + # Encode the unique key and compute its SHA-256 hash + hashed_key = sha256(unique_key.encode('utf-8')).digest() + + # Encode the hash in base64 and decode it to get a string + base64_encoded = b64encode(hashed_key).decode('utf-8') + + # Remove characters that are not URL-safe ('+', '/', or '=') + url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded) + + # Truncate the key to the desired length + return url_safe_key[:request_id_length] diff --git a/tests/unit/storage_clients/test_apify_request_queue_client.py b/tests/unit/storage_clients/test_apify_request_queue_client.py new file mode 100644 index 00000000..019b2e0b --- /dev/null +++ b/tests/unit/storage_clients/test_apify_request_queue_client.py @@ -0,0 +1,38 @@ +import pytest + +from apify.storage_clients._apify._request_queue_client import unique_key_to_request_id + + +def test_unique_key_to_request_id_length() -> None: + unique_key = 'exampleKey123' + request_id = unique_key_to_request_id(unique_key, request_id_length=15) + assert len(request_id) == 15, 'Request ID should have the correct length.' + + +def test_unique_key_to_request_id_consistency() -> None: + unique_key = 'consistentKey' + request_id_1 = unique_key_to_request_id(unique_key) + request_id_2 = unique_key_to_request_id(unique_key) + assert request_id_1 == request_id_2, 'The same unique key should generate consistent request IDs.' + + +@pytest.mark.parametrize( + ('unique_key', 'expected_request_id'), + [ + ('abc', 'ungWv48BzpBQUDe'), + ('uniqueKey', 'xiWPs083cree7mH'), + ('', '47DEQpj8HBSaTIm'), + ('测试中文', 'lKPdJkdvw8MXEUp'), + ('test+/=', 'XZRQjhoG0yjfnYD'), + ], + ids=[ + 'basic_abc', + 'keyword_uniqueKey', + 'empty_string', + 'non_ascii_characters', + 'url_unsafe_characters', + ], +) +def test_unique_key_to_request_id_matches_known_values(unique_key: str, expected_request_id: str) -> None: + request_id = unique_key_to_request_id(unique_key) + assert request_id == expected_request_id, f'Unique key "{unique_key}" should produce the expected request ID.' diff --git a/uv.lock b/uv.lock index 0b739225..84bc2450 100644 --- a/uv.lock +++ b/uv.lock @@ -73,7 +73,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "apify-client", git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id" }, + { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id" }, @@ -112,14 +112,18 @@ dev = [ [[package]] name = "apify-client" -version = "2.0.1" -source = { git = "https://github.com/apify/apify-client-python.git?rev=remove-request-id#3729faca3e55db9858b89edb1d9730aa11039979" } +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "apify-shared" }, { name = "colorama" }, { name = "impit" }, { name = "more-itertools" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/ae/fe/1ec02e47d6aa955de6a1ef62100ac6b2ea33810b0eabc020d0a5dcca080c/apify_client-2.0.0.tar.gz", hash = "sha256:1fd46ddebca5fd3f30cf6321350533bc753d701913fa9429324b37fd54a26663", size = 359411, upload-time = "2025-08-15T09:58:50.024Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/4b/2623b6c731c45320ac73cc9c93966b2b0c857daae9911bc18bd84e602a1b/apify_client-2.0.0-py3-none-any.whl", hash = "sha256:86451edb9d69a9423b65dcfa8bed12ef53c4c54be0d61ba923583098b214242b", size = 84868, upload-time = "2025-08-15T09:58:47.94Z" }, +] [[package]] name = "apify-shared" From 5763b488bbf91b44953ff2a68687abaa31b6283c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 21 Aug 2025 09:39:50 +0200 Subject: [PATCH 09/16] Review comments --- .../_apify/_request_queue_client.py | 7 +- tests/integration/conftest.py | 8 +- tests/integration/test_actor_request_queue.py | 80 ----------------- tests/integration/test_request_queue.py | 86 +++++++++++++++++++ 4 files changed, 94 insertions(+), 87 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 2360d56b..6321eb10 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -69,7 +69,12 @@ def __init__( """Fetch lock to minimize race conditions when communicating with API.""" async def _get_metadata(self) -> RequestQueueMetadata: - """Try to get cached metadata first. If multiple clients, fuse with global metadata.""" + """Try to get cached metadata first. If multiple clients, fuse with global metadata. + + This method is used internally to avoid unnecessary API call unless needed (multiple clients). + Local estimation of metadata is without delay, unlike metadata from API. In situation where there is only one + client, it is the better choice. + """ if self._metadata.had_multiple_clients: return await self.get_metadata() # Get local estimation (will not include changes done bo another client) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 6ec53449..52ab7f5a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -98,11 +98,6 @@ def apify_token() -> str: return api_token -@pytest.fixture(autouse=True) -def set_token(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) - - @pytest.fixture def apify_client_async(apify_token: str) -> ApifyClientAsync: """Create an instance of the ApifyClientAsync. @@ -117,9 +112,10 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: @pytest.fixture -async def request_queue_force_cloud() -> AsyncGenerator[RequestQueue]: +async def request_queue_force_cloud(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> AsyncGenerator[RequestQueue]: """Create an instance of the Apify request queue on the platform and drop it when the test is finished.""" request_queue_name = generate_unique_resource_name('request_queue') + monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 37fe92b0..9f03c637 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -318,86 +318,6 @@ def return_unprocessed_requests(requests: list[dict], *_: Any, **__: Any) -> dic assert (stats_after['writeCount'] - stats_before['writeCount']) == 1 -async def test_request_queue_enhanced_metadata( - request_queue_force_cloud: RequestQueue, - apify_client_async: ApifyClientAsync, -) -> None: - """Test metadata tracking. - - Multiple clients scenarios are not guaranteed to give correct results without delay. But at least multiple clients, - single producer, should be reliable on the producer side.""" - - for i in range(1, 10): - await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) - # Reliable information as the API response is enhanced with local metadata estimation. - assert (await request_queue_force_cloud.get_metadata()).total_request_count == i - - # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) - await api_client.list_head() - - # The presence of another non-producing client should not affect the metadata - for i in range(10, 20): - await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) - # Reliable information as the API response is enhanced with local metadata estimation. - assert (await request_queue_force_cloud.get_metadata()).total_request_count == i - - -async def test_request_queue_metadata_another_client( - request_queue_force_cloud: RequestQueue, - apify_client_async: ApifyClientAsync, -) -> None: - """Test metadata tracking. The delayed metadata should be reliable even when changed by another client.""" - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) - await api_client.add_request(Request.from_url('http://example.com/1').model_dump(by_alias=True, exclude={'id'})) - - # Wait to be sure that the API has updated the global metadata - await asyncio.sleep(10) - - assert (await request_queue_force_cloud.get_metadata()).total_request_count == 1 - - -async def test_request_queue_had_multiple_clients_local( - request_queue_force_cloud: RequestQueue, - apify_client_async: ApifyClientAsync, -) -> None: - """Test that `RequestQueue` correctly detects multiple clients. - - Clients created with different `client_key` should appear as distinct clients.""" - await request_queue_force_cloud.fetch_next_request() - - # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) - await api_client.list_head() - - # Check that it is correctly in the RequestQueueClient metadata - assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is True - - # Check that it is correctly in the API - api_response = await api_client.get() - assert api_response - assert api_response['hadMultipleClients'] is True - - -async def test_request_queue_not_had_multiple_clients_local( - request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync -) -> None: - """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" - - # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True - await request_queue_force_cloud.fetch_next_request() - await request_queue_force_cloud.fetch_next_request() - - # Check that it is correctly in the RequestQueueClient metadata - assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False - - # Check that it is correctly in the API - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) - api_response = await api_client.get() - assert api_response - assert api_response['hadMultipleClients'] is False - - async def test_request_queue_had_multiple_clients_platform( make_actor: MakeActorFunction, run_actor: RunActorFunction, diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index fe9c50e5..e7f7fe82 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -1,12 +1,18 @@ from __future__ import annotations +import asyncio from typing import TYPE_CHECKING import pytest +from crawlee import Request + from apify import Actor if TYPE_CHECKING: + from apify_client import ApifyClientAsync + from crawlee.storages import RequestQueue + from .conftest import MakeActorFunction, RunActorFunction @@ -1195,3 +1201,83 @@ async def consumer() -> int: actor = await make_actor(label='rq-performance-pattern-test', main_func=main) run_result = await run_actor(actor) assert run_result.status == 'SUCCEEDED' + + +async def test_request_queue_enhanced_metadata( + request_queue_force_cloud: RequestQueue, + apify_client_async: ApifyClientAsync, +) -> None: + """Test metadata tracking. + + Multiple clients scenarios are not guaranteed to give correct results without delay. But at least multiple clients, + single producer, should be reliable on the producer side.""" + + for i in range(1, 10): + await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + # Reliable information as the API response is enhanced with local metadata estimation. + assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.list_head() + + # The presence of another non-producing client should not affect the metadata + for i in range(10, 20): + await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + # Reliable information as the API response is enhanced with local metadata estimation. + assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + + +async def test_request_queue_metadata_another_client( + request_queue_force_cloud: RequestQueue, + apify_client_async: ApifyClientAsync, +) -> None: + """Test metadata tracking. The delayed metadata should be reliable even when changed by another client.""" + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.add_request(Request.from_url('http://example.com/1').model_dump(by_alias=True, exclude={'id'})) + + # Wait to be sure that the API has updated the global metadata + await asyncio.sleep(10) + + assert (await request_queue_force_cloud.get_metadata()).total_request_count == 1 + + +async def test_request_queue_had_multiple_clients( + request_queue_force_cloud: RequestQueue, + apify_client_async: ApifyClientAsync, +) -> None: + """Test that `RequestQueue` correctly detects multiple clients. + + Clients created with different `client_key` should appear as distinct clients.""" + await request_queue_force_cloud.fetch_next_request() + + # Accessed with client created explicitly with `client_key=None` should appear as distinct client + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + await api_client.list_head() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is True + + # Check that it is correctly in the API + api_response = await api_client.get() + assert api_response + assert api_response['hadMultipleClients'] is True + + +async def test_request_queue_not_had_multiple_clients( + request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync +) -> None: + """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" + + # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True + await request_queue_force_cloud.fetch_next_request() + await request_queue_force_cloud.fetch_next_request() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False + + # Check that it is correctly in the API + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) + api_response = await api_client.get() + assert api_response + assert api_response['hadMultipleClients'] is False From 4fa7359b5e7c2f4bf870450cd42915c598e1227b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 21 Aug 2025 16:48:58 +0200 Subject: [PATCH 10/16] Update after crawlee update --- docs/03_concepts/code/03_rq.py | 5 +---- pyproject.toml | 2 +- tests/integration/actor_source_base/requirements.txt | 2 +- tests/integration/test_actor_request_queue.py | 1 + uv.lock | 4 ++-- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/03_concepts/code/03_rq.py b/docs/03_concepts/code/03_rq.py index fe1ea605..e9ad6a51 100644 --- a/docs/03_concepts/code/03_rq.py +++ b/docs/03_concepts/code/03_rq.py @@ -20,13 +20,10 @@ async def main() -> None: # If you try to add an existing request again, it will not do anything add_request_info = await queue.add_request( - Request.from_url('http://different-example.com/5') + Request.from_url('http://example.com/5') ) Actor.log.info(f'Add request info: {add_request_info}') - processed_request = await queue.get_request(add_request_info.id) - Actor.log.info(f'Processed request: {processed_request}') - # Finally, process the queue until all requests are handled while not await queue.is_finished(): # Fetch the next unhandled request in the queue diff --git a/pyproject.toml b/pyproject.toml index 5516141c..940f1400 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@remove-request-id", + "crawlee@git+https://github.com/apify/crawlee-python.git@master", "cachetools>=5.5.0", "cryptography>=42.0.0", "httpx>=0.27.0", diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index e13b626e..66a782ba 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,4 +1,4 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER uvicorn[standard] -crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@remove-request-id +crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@master diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index a785f1ad..b9abc2a1 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -85,6 +85,7 @@ async def test_force_cloud( ) -> None: request_queue_id = (await apify_named_rq.get_metadata()).id request_info = await apify_named_rq.add_request(Request.from_url('http://example.com')) + assert request_info.id is not None request_queue_client = apify_client_async.request_queue(request_queue_id) request_queue_details = await request_queue_client.get() diff --git a/uv.lock b/uv.lock index 84bc2450..d868a680 100644 --- a/uv.lock +++ b/uv.lock @@ -76,7 +76,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=master" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -478,7 +478,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.13" -source = { git = "https://github.com/apify/crawlee-python.git?rev=remove-request-id#10c0d7d06c780c13fdb9307db666ca110220b568" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=master#0650b7e097751b0cf6b190ef4c25b05e44169389" } dependencies = [ { name = "cachetools" }, { name = "colorama" }, From 01b581eb57fb289e1d2d8cafafb4bf626efa3e54 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 21 Aug 2025 17:00:01 +0200 Subject: [PATCH 11/16] Fix upgrading guide versions --- docs/04_upgrading/upgrading_to_v3.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index eba1f2d4..d9f179e5 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -1,6 +1,6 @@ --- -id: upgrading-to-v2 -title: Upgrading to v2 +id: upgrading-to-v3 +title: Upgrading to v3 --- This page summarizes the breaking changes between Apify Python SDK v2.x and v3.0. From bedb6ddd177dd27ac8c7421c7cb4b66e36236231 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 21 Aug 2025 17:22:37 +0200 Subject: [PATCH 12/16] Add one missing unique_key to id transformation --- src/apify/storage_clients/_apify/_request_queue_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index cae8a38b..801945ec 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -585,8 +585,10 @@ async def _update_request( Returns: The updated request """ + request_dict = request.model_dump(by_alias=True) + request_dict['id'] = unique_key_to_request_id(request.unique_key) response = await self._api_client.update_request( - request=request.model_dump(by_alias=True), + request=request_dict, forefront=forefront, ) From 1b68d1a7e13a9cd236bf876c00db5cb6f5e35bc2 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 22 Aug 2025 10:09:27 +0200 Subject: [PATCH 13/16] Review comments --- .../storage_clients/_apify/_request_queue_client.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 8b2c776c..bab29477 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -70,7 +70,7 @@ def __init__( self._fetch_lock = asyncio.Lock() """Fetch lock to minimize race conditions when communicating with API.""" - async def _get_metadata(self) -> RequestQueueMetadata: + async def _get_metadata_estimate(self) -> RequestQueueMetadata: """Try to get cached metadata first. If multiple clients, fuse with global metadata. This method is used internally to avoid unnecessary API call unless needed (multiple clients). @@ -84,7 +84,12 @@ async def _get_metadata(self) -> RequestQueueMetadata: @override async def get_metadata(self) -> RequestQueueMetadata: - """Get metadata about the request queue.""" + """Get metadata about the request queue. + + Returns: + Metadata from the API, merged with local estimation, because in some cases, the data from the API can + be delayed. + """ response = await self._api_client.get() if response is None: raise ValueError('Failed to fetch request queue metadata from the API.') @@ -613,7 +618,7 @@ async def _list_head( if cached_request and cached_request.hydrated: items.append(cached_request.hydrated) - metadata = await self._get_metadata() + metadata = await self._get_metadata_estimate() return RequestQueueHead( limit=limit, From 9cc4b263d7e811b0e1664a353e673d2dadc4d001 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 22 Aug 2025 14:21:34 +0200 Subject: [PATCH 14/16] Bump @apify/docs-theme to 1.0.201 --- website/package-lock.json | 2 +- website/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index 2420fdbc..ccf2b5a2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -6,7 +6,7 @@ "": { "name": "apify-sdk-python", "dependencies": { - "@apify/docs-theme": "^1.0.185", + "@apify/docs-theme": "^1.0.201", "@apify/docusaurus-plugin-typedoc-api": "^4.4.6", "@docusaurus/core": "^3.8.1", "@docusaurus/faster": "^3.8.1", diff --git a/website/package.json b/website/package.json index 76e37047..7d3c67e7 100644 --- a/website/package.json +++ b/website/package.json @@ -21,7 +21,7 @@ "lint:code:fix": "eslint . --fix" }, "dependencies": { - "@apify/docs-theme": "^1.0.185", + "@apify/docs-theme": "^1.0.201", "@apify/docusaurus-plugin-typedoc-api": "^4.4.6", "@docusaurus/core": "^3.8.1", "@docusaurus/faster": "^3.8.1", From c1be561f3f3fd0f71a6d3d9e7257fc6e015c58b8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 22 Aug 2025 14:40:06 +0200 Subject: [PATCH 15/16] Try to temporarily add markdown.svg --- website/static/img/markdown.svg | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 website/static/img/markdown.svg diff --git a/website/static/img/markdown.svg b/website/static/img/markdown.svg new file mode 100644 index 00000000..b5599a0f --- /dev/null +++ b/website/static/img/markdown.svg @@ -0,0 +1,5 @@ + + + + + From b2d08121061f34294ab51c236843a66c460b9912 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 25 Aug 2025 09:33:21 +0200 Subject: [PATCH 16/16] Try to bump docs again --- website/package-lock.json | 2 +- website/package.json | 2 +- website/static/img/markdown.svg | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 website/static/img/markdown.svg diff --git a/website/package-lock.json b/website/package-lock.json index ccf2b5a2..230848fa 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -6,7 +6,7 @@ "": { "name": "apify-sdk-python", "dependencies": { - "@apify/docs-theme": "^1.0.201", + "@apify/docs-theme": "^1.0.203", "@apify/docusaurus-plugin-typedoc-api": "^4.4.6", "@docusaurus/core": "^3.8.1", "@docusaurus/faster": "^3.8.1", diff --git a/website/package.json b/website/package.json index 7d3c67e7..3df0bf09 100644 --- a/website/package.json +++ b/website/package.json @@ -21,7 +21,7 @@ "lint:code:fix": "eslint . --fix" }, "dependencies": { - "@apify/docs-theme": "^1.0.201", + "@apify/docs-theme": "^1.0.203", "@apify/docusaurus-plugin-typedoc-api": "^4.4.6", "@docusaurus/core": "^3.8.1", "@docusaurus/faster": "^3.8.1", diff --git a/website/static/img/markdown.svg b/website/static/img/markdown.svg deleted file mode 100644 index b5599a0f..00000000 --- a/website/static/img/markdown.svg +++ /dev/null @@ -1,5 +0,0 @@ - - - - -