From f458ec374e03e1e09085dc97cb529338d2f85d7f Mon Sep 17 00:00:00 2001 From: Hemant Mishra Date: Fri, 28 Nov 2025 17:12:51 +0530 Subject: [PATCH 1/2] Remove extractor specific config from the base config --- .../unstable/configuration/models.py | 52 ++++++++-------- cognite/extractorutils/unstable/core/base.py | 1 + .../extractorutils/unstable/core/runtime.py | 3 +- tests/test_unstable/test_base.py | 14 ----- tests/test_unstable/test_configuration.py | 59 ++++++++++++++++++- 5 files changed, 87 insertions(+), 42 deletions(-) diff --git a/cognite/extractorutils/unstable/configuration/models.py b/cognite/extractorutils/unstable/configuration/models.py index ad593752..4691f399 100644 --- a/cognite/extractorutils/unstable/configuration/models.py +++ b/cognite/extractorutils/unstable/configuration/models.py @@ -51,6 +51,7 @@ "MetricsConfig", "ScheduleConfig", "TimeIntervalConfig", + "WithDataSetId", ] @@ -619,6 +620,32 @@ def _log_handler_default() -> list[LogHandlerConfig]: return [LogConsoleHandlerConfig(type="console", level=LogLevel.INFO)] +class WithDataSetId(ConfigModel): + """ + Base class for configuration models that include a data set reference. + """ + + data_set: EitherIdConfig | None = None + + def get_data_set(self, cdf_client: CogniteClient) -> DataSet | None: + """ + Retrieves the DataSet object based on the configuration. + + Args: + cdf_client: An instance of CogniteClient to use for retrieving the DataSet. + + Returns: + DataSet object if data_set is provided; otherwise None. + """ + if not self.data_set: + return None + + return cdf_client.data_sets.retrieve( + id=self.data_set.either_id.internal_id, + external_id=self.data_set.either_id.external_id, + ) + + class FileSizeConfig: """ Configuration parameter for setting a file size. @@ -868,31 +895,6 @@ class ExtractorConfig(ConfigModel): state_store: StateStoreConfig | None = None metrics: MetricsConfig | None = None log_handlers: list[LogHandlerConfig] = Field(default_factory=_log_handler_default) - retry_startup: bool = True - upload_queue_size: int = 50_000 - data_set: EitherIdConfig | None = None - data_set_external_id: str | None = None - - def get_data_set(self, cdf_client: CogniteClient) -> DataSet | None: - """ - Retrieves the DataSet object based on the configuration. - - Args: - cdf_client: An instance of CogniteClient to use for retrieving the DataSet. - - Returns: - DataSet object if data_set, data_set_id, or data_set_external_id is provided; otherwise None. - """ - if self.data_set_external_id: - return cdf_client.data_sets.retrieve(external_id=self.data_set_external_id) - - if not self.data_set: - return None - - return cdf_client.data_sets.retrieve( - id=self.data_set.either_id.internal_id, - external_id=self.data_set.either_id.external_id, - ) ConfigType = TypeVar("ConfigType", bound=ExtractorConfig) diff --git a/cognite/extractorutils/unstable/core/base.py b/cognite/extractorutils/unstable/core/base.py index 78a7b4a6..6b4af39e 100644 --- a/cognite/extractorutils/unstable/core/base.py +++ b/cognite/extractorutils/unstable/core/base.py @@ -142,6 +142,7 @@ class Extractor(Generic[ConfigType], CogniteLogger): CONFIG_TYPE: type[ConfigType] + RETRY_STARTUP: bool = True RESTART_POLICY: RestartPolicy = WHEN_CONTINUOUS_TASKS_CRASHES USE_DEFAULT_STATE_STORE: bool = True _statestore_singleton: AbstractStateStore | None = None diff --git a/cognite/extractorutils/unstable/core/runtime.py b/cognite/extractorutils/unstable/core/runtime.py index d545b3de..78ec701b 100644 --- a/cognite/extractorutils/unstable/core/runtime.py +++ b/cognite/extractorutils/unstable/core/runtime.py @@ -85,8 +85,7 @@ def _extractor_process_entrypoint( checkin_worker.active_revision = config.current_config_revision checkin_worker.set_on_fatal_error_handler(lambda _: on_fatal_error(controls)) checkin_worker.set_on_revision_change_handler(lambda _: on_revision_changed(controls)) - if config.application_config.retry_startup: - checkin_worker.set_retry_startup(config.application_config.retry_startup) + checkin_worker.set_retry_startup(extractor_class.RETRY_STARTUP) if not metrics: metrics = BaseMetrics(extractor_name=extractor_class.NAME, extractor_version=extractor_class.VERSION) extractor = extractor_class._init_from_runtime(config, checkin_worker, metrics) diff --git a/tests/test_unstable/test_base.py b/tests/test_unstable/test_base.py index 9d512272..1f8527d8 100644 --- a/tests/test_unstable/test_base.py +++ b/tests/test_unstable/test_base.py @@ -15,7 +15,6 @@ from cognite.extractorutils.unstable.configuration.loaders import ConfigFormat, load_io from cognite.extractorutils.unstable.configuration.models import ( ConnectionConfig, - ExtractorConfig, LocalStateStoreConfig, LogConsoleHandlerConfig, LogFileHandlerConfig, @@ -345,16 +344,3 @@ def test_pushgatewayconfig_none_credentials_from_yaml() -> None: assert pusher.password is None assert pusher.url == "http://localhost:9091" assert pusher.job_name == "test-job" - - -def test_extractor_config_upload_queue_size_with_yaml() -> None: - """Test upload_queue_size parsing from YAML configuration.""" - config_yaml = """ -upload-queue-size: 200000 -retry-startup: false -""" - stream = StringIO(config_yaml) - config = load_io(stream, ConfigFormat.YAML, ExtractorConfig) - - assert config.upload_queue_size == 200_000 - assert config.retry_startup is False diff --git a/tests/test_unstable/test_configuration.py b/tests/test_unstable/test_configuration.py index a4cefb00..eaf28e83 100644 --- a/tests/test_unstable/test_configuration.py +++ b/tests/test_unstable/test_configuration.py @@ -17,6 +17,7 @@ FileSizeConfig, LogLevel, TimeIntervalConfig, + WithDataSetId, _ClientCredentialsConfig, ) @@ -363,7 +364,6 @@ def test_get_data_set_various_configurations( ) -> None: """Test get_data_set method with various configuration scenarios.""" extractor_config = ExtractorConfig( - retry_startup=False, data_set_external_id=data_set_external_id, data_set=data_set_config, ) @@ -386,3 +386,60 @@ def test_get_data_set_various_configurations( if attr != "name": assert getattr(result, attr) == value mock_client.data_sets.retrieve.assert_called_once_with(**expected_call) + + +@pytest.mark.parametrize( + "data_set_config,expected_call,expected_result_attrs,should_return_none", + [ + # Test with data_set config using internal ID + ( + EitherIdConfig(id=12345), + {"id": 12345, "external_id": None}, + {"id": 12345, "name": "Test Dataset"}, + False, + ), + # Test with data_set config using external ID + ( + EitherIdConfig(external_id="config-dataset"), + {"id": None, "external_id": "config-dataset"}, + {"external_id": "config-dataset", "name": "Config Dataset"}, + False, + ), + # Test with data_set not provided + ( + None, + {}, + {}, + True, + ), + ], +) +def test_with_data_set_id_various_configurations( + data_set_config: EitherIdConfig | None, + expected_call: dict | None, + expected_result_attrs: dict | None, + should_return_none: bool, +) -> None: + """Test WithDataSetId.get_data_set method with various configuration scenarios.""" + with_data_set_config = WithDataSetId( + data_set=data_set_config, + ) + + # Create a mock client instead of using a real one + mock_client = Mock() + + if not should_return_none: + mock_dataset = DataSet(**expected_result_attrs) + mock_client.data_sets.retrieve.return_value = mock_dataset + + result = with_data_set_config.get_data_set(mock_client) + + if should_return_none: + assert result is None + mock_client.data_sets.retrieve.assert_not_called() + else: + assert result is not None + for attr, value in expected_result_attrs.items(): + if attr != "name": + assert getattr(result, attr) == value + mock_client.data_sets.retrieve.assert_called_once_with(**expected_call) From 10f27155ec9fa0dfc6b59212e8a58f77db849fc3 Mon Sep 17 00:00:00 2001 From: Hemant Mishra Date: Fri, 28 Nov 2025 17:30:44 +0530 Subject: [PATCH 2/2] Removed obsolete test case --- tests/test_unstable/test_configuration.py | 79 ----------------------- 1 file changed, 79 deletions(-) diff --git a/tests/test_unstable/test_configuration.py b/tests/test_unstable/test_configuration.py index eaf28e83..43a7588a 100644 --- a/tests/test_unstable/test_configuration.py +++ b/tests/test_unstable/test_configuration.py @@ -13,7 +13,6 @@ ConfigModel, ConnectionConfig, EitherIdConfig, - ExtractorConfig, FileSizeConfig, LogLevel, TimeIntervalConfig, @@ -310,84 +309,6 @@ def test_setting_log_level_from_any_case() -> None: LogLevel("not-a-log-level") -@pytest.mark.parametrize( - "data_set_external_id,data_set_config,expected_call,expected_result_attrs,should_return_none", - [ - # Test with data_set_external_id provided - ( - "test-dataset", - None, - {"external_id": "test-dataset"}, - {"external_id": "test-dataset", "name": "Test Dataset"}, - False, - ), - # Test with data_set config using internal ID - ( - None, - EitherIdConfig(id=12345), - {"id": 12345, "external_id": None}, - {"id": 12345, "name": "Test Dataset"}, - False, - ), - # Test with data_set config using external ID - ( - None, - EitherIdConfig(external_id="config-dataset"), - {"id": None, "external_id": "config-dataset"}, - {"external_id": "config-dataset", "name": "Config Dataset"}, - False, - ), - # Test that data_set_external_id takes priority over data_set - ( - "priority-dataset", - EitherIdConfig(external_id="should-be-ignored"), - {"external_id": "priority-dataset"}, - {"external_id": "priority-dataset", "name": "Priority Dataset"}, - False, - ), - # Test with neither data_set_external_id nor data_set provided - ( - None, - None, - {}, - {}, - True, - ), - ], -) -def test_get_data_set_various_configurations( - data_set_external_id: str | None, - data_set_config: EitherIdConfig | None, - expected_call: dict | None, - expected_result_attrs: dict | None, - should_return_none: bool, -) -> None: - """Test get_data_set method with various configuration scenarios.""" - extractor_config = ExtractorConfig( - data_set_external_id=data_set_external_id, - data_set=data_set_config, - ) - - # Create a mock client instead of using a real one - mock_client = Mock() - - if not should_return_none: - mock_dataset = DataSet(**expected_result_attrs) - mock_client.data_sets.retrieve.return_value = mock_dataset - - result = extractor_config.get_data_set(mock_client) - - if should_return_none: - assert result is None - mock_client.data_sets.retrieve.assert_not_called() - else: - assert result is not None - for attr, value in expected_result_attrs.items(): - if attr != "name": - assert getattr(result, attr) == value - mock_client.data_sets.retrieve.assert_called_once_with(**expected_call) - - @pytest.mark.parametrize( "data_set_config,expected_call,expected_result_attrs,should_return_none", [