From c79a3f3c5b96551883d11a8a80a6179b4d0d8fef Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 22 Sep 2025 15:14:28 +0200 Subject: [PATCH 1/3] Make default Apify storages use alias mechanism --- docs/04_upgrading/upgrading_to_v3.md | 7 +++++++ src/apify/_configuration.py | 12 ++++++------ .../storage_clients/_apify/_dataset_client.py | 11 +++++++++-- .../_apify/_key_value_store_client.py | 11 +++++++++-- .../_apify/_request_queue_client.py | 11 +++++++++-- src/apify/storage_clients/_apify/_utils.py | 5 +++-- tests/integration/test_apify_storages.py | 17 +++++++++++++++++ 7 files changed, 60 insertions(+), 14 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index df571d34..9e92e4ea 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -45,6 +45,13 @@ async def main(): ## Removed Actor.config property - `Actor.config` property has been removed. Use `Actor.configuration` instead. +## Default storage ids in configuration changed to None +- `Configuration.default_key_value_store_id` changed from `'default'` to `None`. +- `Configuration.default_dataset_id` changed from `'default'` to `None`. +- `Configuration.default_request_queue_id` changed from `'default'` to `None`. + +As a consequence of this change, using default storage without specifying its `id` in `Configuration` will use unnamed storage. 
+ ## Storages diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index aba566b9..28158b55 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -142,7 +142,7 @@ class Configuration(CrawleeConfiguration): ] = None default_dataset_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_dataset_id', @@ -150,10 +150,10 @@ class Configuration(CrawleeConfiguration): ), description='Default dataset ID used by the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None default_key_value_store_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_key_value_store_id', @@ -161,10 +161,10 @@ class Configuration(CrawleeConfiguration): ), description='Default key-value store ID for the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None default_request_queue_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_request_queue_id', @@ -172,7 +172,7 @@ class Configuration(CrawleeConfiguration): ), description='Default request queue ID for the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None disable_outdated_warning: Annotated[ bool, diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index e5ec91d0..8b6f3e11 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -124,8 +124,10 @@ async def open( ) apify_datasets_client = apify_client_async.datasets() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed + # storage aliased as `__default__` + if not any([alias, name, id, configuration.default_dataset_id]): + alias = '__default__' if alias: # Check if there is 
pre-existing alias mapping in the default KVS. @@ -150,6 +152,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_dataset_id + if not id: + raise ValueError( + 'Dataset "id", "name", or "alias" must be specified, ' + 'or a default dataset ID must be set in the configuration.' + ) # Now create the client for the determined ID apify_dataset_client = apify_client_async.dataset(dataset_id=id) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 9011d834..79215ba2 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -115,8 +115,10 @@ async def open( ) apify_kvss_client = apify_client_async.key_value_stores() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to + # unnamed storage aliased as `__default__` + if not any([alias, name, id, configuration.default_key_value_store_id]): + alias = '__default__' if alias: # Check if there is pre-existing alias mapping in the default KVS. @@ -142,6 +144,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_key_value_store_id + if not id: + raise ValueError( + 'KeyValueStore "id", "name", or "alias" must be specified, ' + 'or a default KeyValueStore ID must be set in the configuration.' 
+ ) # Now create the client for the determined ID apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 39556d2d..893f26b9 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -200,8 +200,10 @@ async def open( ) apify_rqs_client = apify_client_async.request_queues() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to + # unnamed storage aliased as `__default__` + if not any([alias, name, id, configuration.default_request_queue_id]): + alias = '__default__' if alias: # Check if there is pre-existing alias mapping in the default KVS. @@ -226,6 +228,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_request_queue_id + if not id: + raise ValueError( + 'RequestQueue "id", "name", or "alias" must be specified, ' + 'or a default default_request_queue_id ID must be set in the configuration.' + ) # Use suitable client_key to make `hadMultipleClients` response of Apify API useful. # It should persist across migrated or resurrected Actor runs on the Apify platform. diff --git a/src/apify/storage_clients/_apify/_utils.py b/src/apify/storage_clients/_apify/_utils.py index 6d05bff3..ebae80f7 100644 --- a/src/apify/storage_clients/_apify/_utils.py +++ b/src/apify/storage_clients/_apify/_utils.py @@ -76,7 +76,7 @@ async def _get_alias_map(cls) -> dict[str, str]: Returns: Map of aliases and storage ids. 
""" - if not cls._alias_map: + if not cls._alias_map and Configuration.get_global_configuration().is_at_home: default_kvs_client = await _get_default_kvs_client() record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY) @@ -156,7 +156,8 @@ async def _get_default_kvs_client() -> KeyValueStoreClientAsync: min_delay_between_retries_millis=500, timeout_secs=360, ) - + if not configuration.default_key_value_store_id: + raise ValueError("'Configuration.default_key_value_store_id' must be set.") return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index 0cf0c9af..437ea689 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -32,3 +32,20 @@ async def test_alias_concurrent_creation_local( except AssertionError: for storage in storages: await storage.drop() + + +@pytest.mark.parametrize( + 'storage_type', + [Dataset, KeyValueStore, RequestQueue], +) +async def test_unnamed_default_without_config( + storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str +) -> None: + """Test that default Apify storage used locally is unnamed storage.""" + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(ApifyStorageClient()) + + storage = await storage_type.open() + assert storage.name is None + assert storage.id + await storage.drop() From 15374ff76d04ae4c5a0776b6231094dcaedd9cc3 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 23 Sep 2025 08:52:00 +0200 Subject: [PATCH 2/3] Update upgrading guide --- docs/04_upgrading/upgrading_to_v3.md | 2 +- tests/integration/test_apify_storages.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 9e92e4ea..608c8ee9 100644 --- a/docs/04_upgrading/upgrading_to_v3.md 
+++ b/docs/04_upgrading/upgrading_to_v3.md @@ -50,7 +50,7 @@ async def main(): - `Configuration.default_dataset_id` changed from `'default'` to `None`. - `Configuration.default_request_queue_id` changed from `'default'` to `None`. -As a consequence of this change, using default storage without specifying its `id` in `Configuration` will use unnamed storage. +Previously, using the default storage without specifying its `id` in `Configuration` would lead to using a specific storage with id `'default'`. Now it will use a newly created unnamed storage with an `id` assigned by the Apify platform; consecutive calls to get the default storage will return the same storage. ## Storages diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index 437ea689..e65c1f94 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -45,7 +45,13 @@ async def test_unnamed_default_without_config( service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client(ApifyStorageClient()) + # Open storage and make sure it has no name and it has id storage = await storage_type.open() assert storage.name is None assert storage.id + + # Make sure the same instance is returned when opened again without name or alias + storage_again = await storage_type.open() + assert storage is storage_again + await storage.drop() From 627830264ea50c07f9b0f2558bbb8ed6c1add0a9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 13:38:01 +0200 Subject: [PATCH 3/3] Add test requested in review --- tests/integration/test_apify_storages.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index e65c1f94..83ad7ebd 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -5,7 +5,7 @@ from crawlee import service_locator
from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from apify import Configuration +from apify import Actor, Configuration from apify.storage_clients import ApifyStorageClient @@ -55,3 +55,21 @@ async def test_unnamed_default_without_config( assert storage is storage_again await storage.drop() + + +@pytest.mark.parametrize( + 'storage_type', + [Dataset, KeyValueStore, RequestQueue], +) +async def test_aliases_not_stored_on_platform_when_local( + storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str +) -> None: + """Test that default Apify storage used locally is not persisting aliases to Apify based default KVS.""" + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(ApifyStorageClient()) + async with Actor(configure_logging=False): + await storage_type.open(alias='test') + default_kvs = await Actor.open_key_value_store(force_cloud=True) + + # The default KVS should be empty + assert len(await default_kvs.list_keys()) == 0