From 92d5eb62c1a6bb767ae694136e781317adc54838 Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Tue, 24 Mar 2026 11:36:05 +0100 Subject: [PATCH 1/8] Add context to validation This makes it easy for us to pass context data to Pydantic validation. One of its use cases is shown in the test, where we update this context, thereby allowing nested models to validate based on the value provided by the parent. --- .../unstable/configuration/loaders.py | 18 +++-- tests/test_unstable/test_configuration.py | 74 ++++++++++++++++++- 2 files changed, 83 insertions(+), 9 deletions(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index b718fd87..7ff321e4 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -6,7 +6,7 @@ from enum import Enum from io import StringIO from pathlib import Path -from typing import TextIO, TypeVar +from typing import Any, TextIO, TypeVar from cognite.client import CogniteClient from cognite.client.exceptions import CogniteAPIError @@ -36,13 +36,14 @@ class ConfigFormat(Enum): YAML = "yaml" -def load_file(path: Path, schema: type[_T]) -> _T: +def load_file(path: Path, schema: type[_T], context: dict[str, Any] | None = None) -> _T: """ Load a configuration file from the given path and parse it into the specified schema. Args: path: Path to the configuration file. schema: The schema class to parse the configuration into. + context: Optional context to pass to the schema during validation. Returns: An instance of the schema populated with the configuration data.
@@ -58,7 +59,7 @@ def load_file(path: Path, schema: type[_T]) -> _T: raise InvalidConfigError(f"Unknown file type {path.suffix}") with open(path) as stream: - return load_io(stream, file_format, schema) + return load_io(stream, file_format, schema, context=context if context is not None else {}) def load_from_cdf( @@ -108,7 +109,7 @@ def load_from_cdf( raise new_e from e -def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T]) -> _T: +def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T], context: dict[str, Any] | None = None) -> _T: """ Load a configuration from a stream (e.g., file or string) and parse it into the specified schema. @@ -116,6 +117,7 @@ def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T]) -> _T: stream: A text stream containing the configuration data. file_format: The format of the configuration data. schema: The schema class to parse the configuration into. + context: Optional context to pass to the schema during validation. Returns: An instance of the schema populated with the configuration data. @@ -134,7 +136,7 @@ def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T]) -> _T: if "key-vault" in data: data.pop("key-vault") - return load_dict(data, schema) + return load_dict(data, schema, context=context if context is not None else {}) def _make_loc_str(loc: tuple) -> str: @@ -155,13 +157,14 @@ def _make_loc_str(loc: tuple) -> str: return loc_str -def load_dict(data: dict, schema: type[_T]) -> _T: +def load_dict(data: dict, schema: type[_T], context: dict[str, Any] | None = None) -> _T: """ Load a configuration from a dictionary and parse it into the specified schema. Args: data: A dictionary containing the configuration data. schema: The schema class to parse the configuration into. + context: Optional context to pass to the schema during validation. Returns: An instance of the schema populated with the configuration data. 
@@ -170,10 +173,11 @@ def load_dict(data: dict, schema: type[_T]) -> _T: InvalidConfigError: If the configuration is invalid. """ try: - return schema.model_validate(data) + return schema.model_validate(data, context=context if context is not None else {}) except ValidationError as e: messages = [] + # TODO: Check why there's an extra . for err in e.errors(): loc = err.get("loc") if loc is None: diff --git a/tests/test_unstable/test_configuration.py b/tests/test_unstable/test_configuration.py index be1393b4..28c5533c 100644 --- a/tests/test_unstable/test_configuration.py +++ b/tests/test_unstable/test_configuration.py @@ -1,22 +1,26 @@ import os +import re from io import StringIO +from typing import Any from unittest.mock import Mock import pytest from cognite.client.credentials import OAuthClientCredentials from cognite.client.data_classes import DataSet -from pydantic import Field +from pydantic import Field, ValidationInfo, field_validator, model_validator -from cognite.extractorutils.exceptions import InvalidConfigError +from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError from cognite.extractorutils.unstable.configuration.loaders import ConfigFormat, load_io from cognite.extractorutils.unstable.configuration.models import ( ConfigModel, ConnectionConfig, EitherIdConfig, + ExtractorConfig, FileSizeConfig, LogLevel, TimeIntervalConfig, WithDataSetId, + _ClientCertificateConfig, _ClientCredentialsConfig, ) @@ -137,6 +141,25 @@ - thumbprint1 - thumbprint2 """ +TEST_REMOTE_CONFIG = """ +--- +sources: +- name: abc + option: option1 + +- name: def + option: option2 + +tasks: +- name: task1 + source: abc + +- name: task2 + source: def + +- name: task3 + source: ghi +""" @pytest.mark.parametrize("config_str", [CONFIG_EXAMPLE_ONLY_REQUIRED, CONFIG_EXAMPLE_ONLY_REQUIRED2]) @@ -148,6 +171,7 @@ def test_load_from_io(config_str: str) -> None: assert config.base_url == "https://baseurl.com" assert config.integration.external_id == 
"test-pipeline" assert config.authentication.type == "client-credentials" + assert isinstance(config.authentication, _ClientCredentialsConfig) assert config.authentication.client_secret == "very_secret123" assert list(config.authentication.scopes) == ["scopea", "scopeb"] @@ -165,6 +189,7 @@ def test_full_config_client_credentials(config_str: str) -> None: assert config.authentication.type == "client-credentials" assert config.authentication.client_id == "testid" + assert isinstance(config.authentication, _ClientCredentialsConfig) assert config.authentication.client_secret == "very_secret123" assert config.authentication.token_url == "https://get-a-token.com/token" assert list(config.authentication.scopes) == ["scopea", "scopeb"] @@ -192,6 +217,7 @@ def test_full_config_client_certificates(config_str: str) -> None: assert config.authentication.type == "client-certificate" assert config.authentication.client_id == "testid" + assert isinstance(config.authentication, _ClientCertificateConfig) assert config.authentication.password == "very-strong-password" assert config.authentication.path.as_posix() == "/path/to/cert.pem" assert config.authentication.authority_url == "https://you-are-authorized.com" @@ -272,6 +298,50 @@ def test_file_size_config_equality() -> None: assert file_size_3 != file_size_1 +class Instance(ConfigModel): + name: str + option: str + + +class TaskConfig(ConfigModel): + name: str + source: str + + @field_validator( + "source", + mode="after", + ) + @classmethod + def validate_instance(cls, value: str, info: ValidationInfo) -> str: + source_names = (info.context or {}).get("source_names", []) + if value not in source_names: + raise ValueError(f"'{value}' is not defined in the list of sources.") + return value + + +class TestRemoteConfig(ExtractorConfig): + sources: list[Instance] + tasks: list[TaskConfig] + + @model_validator(mode="before") + @classmethod + def map_instances(cls, data: dict[str, Any], validation_info: ValidationInfo) -> dict[str, 
Any]: + if validation_info.context is not None: + validation_info.context.update( + {"source_names": [source["name"] for source in data.get("sources", [])]}, + ) + return data + + +def test_config_with_context() -> None: + stream = StringIO(TEST_REMOTE_CONFIG) + with pytest.raises( + InvalidConfigError, + match=re.escape("Invalid config: 'ghi' is not defined in the list of sources.: tasks[2].source"), + ): + load_io(stream, ConfigFormat.YAML, TestRemoteConfig) + + @pytest.mark.parametrize( "expression", ["12.3kbkb", "10XY", "abcMB", "5.5.5GB", "MB", "", " ", "10 M B", "10MB extra", "tenMB"] ) From 9216878bea9d460312bb3d1e66f309d2aba45c0c Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Tue, 24 Mar 2026 11:43:47 +0100 Subject: [PATCH 2/8] refactor: review suggestion among others --- .../unstable/configuration/loaders.py | 14 +++++++++----- tests/test_unstable/test_configuration.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index 7ff321e4..b0ab5e76 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -59,11 +59,15 @@ def load_file(path: Path, schema: type[_T], context: dict[str, Any] | None = Non raise InvalidConfigError(f"Unknown file type {path.suffix}") with open(path) as stream: - return load_io(stream, file_format, schema, context=context if context is not None else {}) + return load_io(stream, file_format, schema, context=context) def load_from_cdf( - cognite_client: CogniteClient, external_id: str, schema: type[_T], revision: int | None = None + cognite_client: CogniteClient, + external_id: str, + schema: type[_T], + revision: int | None = None, + context: dict[str, Any] | None = None, ) -> tuple[_T, int]: """ Load a configuration from a CDF integration using the provided external ID and schema. 
@@ -73,6 +77,7 @@ def load_from_cdf( external_id: The external ID of the integration to load configuration from. schema: The schema class to parse the configuration into. revision: the specific revision of the configuration to load, otherwise get the latest. + context: Optional context to pass to the schema during validation. Returns: A tuple containing the parsed configuration instance and the revision number. @@ -98,7 +103,7 @@ def load_from_cdf( data = response.json() try: - return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema), data["revision"] + return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema, context), data["revision"] except InvalidConfigError as e: e.attempted_revision = data["revision"] @@ -136,7 +141,7 @@ def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T], context if "key-vault" in data: data.pop("key-vault") - return load_dict(data, schema, context=context if context is not None else {}) + return load_dict(data, schema, context=context) def _make_loc_str(loc: tuple) -> str: @@ -177,7 +182,6 @@ def load_dict(data: dict, schema: type[_T], context: dict[str, Any] | None = Non except ValidationError as e: messages = [] - # TODO: Check why there's an extra . 
for err in e.errors(): loc = err.get("loc") if loc is None: diff --git a/tests/test_unstable/test_configuration.py b/tests/test_unstable/test_configuration.py index 28c5533c..2a79dfec 100644 --- a/tests/test_unstable/test_configuration.py +++ b/tests/test_unstable/test_configuration.py @@ -315,7 +315,7 @@ class TaskConfig(ConfigModel): def validate_instance(cls, value: str, info: ValidationInfo) -> str: source_names = (info.context or {}).get("source_names", []) if value not in source_names: - raise ValueError(f"'{value}' is not defined in the list of sources.") + raise ValueError(f"'{value}' is not defined in the list of sources") return value @@ -337,7 +337,7 @@ def test_config_with_context() -> None: stream = StringIO(TEST_REMOTE_CONFIG) with pytest.raises( InvalidConfigError, - match=re.escape("Invalid config: 'ghi' is not defined in the list of sources.: tasks[2].source"), + match=re.escape("Invalid config: 'ghi' is not defined in the list of sources: tasks[2].source"), ): load_io(stream, ConfigFormat.YAML, TestRemoteConfig) From 3a4a5ac123dc5ece8180d8b3552c87503f389460 Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Tue, 24 Mar 2026 11:55:01 +0100 Subject: [PATCH 3/8] test: fix exception handling --- tests/test_unstable/test_configuration.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_unstable/test_configuration.py b/tests/test_unstable/test_configuration.py index 2a79dfec..08bc8e5e 100644 --- a/tests/test_unstable/test_configuration.py +++ b/tests/test_unstable/test_configuration.py @@ -9,7 +9,8 @@ from cognite.client.data_classes import DataSet from pydantic import Field, ValidationInfo, field_validator, model_validator -from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError +from cognite.extractorutils.exceptions import InvalidConfigError +from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError as UnstableInvalidConfigError from 
cognite.extractorutils.unstable.configuration.loaders import ConfigFormat, load_io from cognite.extractorutils.unstable.configuration.models import ( ConfigModel, @@ -298,7 +299,7 @@ def test_file_size_config_equality() -> None: assert file_size_3 != file_size_1 -class Instance(ConfigModel): +class Source(ConfigModel): name: str option: str @@ -320,7 +321,7 @@ def validate_instance(cls, value: str, info: ValidationInfo) -> str: class TestRemoteConfig(ExtractorConfig): - sources: list[Instance] + sources: list[Source] tasks: list[TaskConfig] @model_validator(mode="before") @@ -336,7 +337,7 @@ def map_instances(cls, data: dict[str, Any], validation_info: ValidationInfo) -> def test_config_with_context() -> None: stream = StringIO(TEST_REMOTE_CONFIG) with pytest.raises( - InvalidConfigError, + UnstableInvalidConfigError, match=re.escape("Invalid config: 'ghi' is not defined in the list of sources: tasks[2].source"), ): load_io(stream, ConfigFormat.YAML, TestRemoteConfig) From 70410413428869ed3af60b519e630e662a7b592c Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Tue, 24 Mar 2026 12:00:02 +0100 Subject: [PATCH 4/8] chore: update version and changelog --- CHANGELOG.md | 5 +++++ cognite/extractorutils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3925e84..3629d51a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ Changes are grouped as follows - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## 7.12.1 + +### Added +* In the `unstable` package: Adds context to the config validation layer. This also allows us to customize validation. 
+ ## 7.12.0 ### Added diff --git a/cognite/extractorutils/__init__.py b/cognite/extractorutils/__init__.py index a8cec2e1..cdfe2b11 100644 --- a/cognite/extractorutils/__init__.py +++ b/cognite/extractorutils/__init__.py @@ -16,7 +16,7 @@ Cognite extractor utils is a Python package that simplifies the development of new extractors. """ -__version__ = "7.12.0" +__version__ = "7.12.1" from .base import Extractor __all__ = ["Extractor"] diff --git a/pyproject.toml b/pyproject.toml index 77ece1f1..bda4ed8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cognite-extractor-utils" -version = "7.12.0" +version = "7.12.1" description = "Utilities for easier development of extractors for CDF" authors = [ {name = "Mathias Lohne", email = "mathias.lohne@cognite.com"} From 6b844a34825bf656afe44ccb79e602c346423cb3 Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Fri, 27 Mar 2026 11:01:53 +0100 Subject: [PATCH 5/8] Update cognite/extractorutils/unstable/configuration/loaders.py Co-authored-by: Trygve Utstumo --- cognite/extractorutils/unstable/configuration/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index b0ab5e76..56daa3d9 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -43,7 +43,7 @@ def load_file(path: Path, schema: type[_T], context: dict[str, Any] | None = Non Args: path: Path to the configuration file. schema: The schema class to parse the configuration into. - context: Optional context to pass to the schema during validation. + context: Optional Pydantic validation context; see ``load_dict`` for semantics. Returns: An instance of the schema populated with the configuration data. 
From 3a5cf16ce1edbe9bacfc0076a9aa3b9f904f7981 Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Fri, 27 Mar 2026 11:02:14 +0100 Subject: [PATCH 6/8] Update cognite/extractorutils/unstable/configuration/loaders.py Co-authored-by: Trygve Utstumo --- cognite/extractorutils/unstable/configuration/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index 56daa3d9..948f11a0 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -77,7 +77,7 @@ def load_from_cdf( external_id: The external ID of the integration to load configuration from. schema: The schema class to parse the configuration into. revision: the specific revision of the configuration to load, otherwise get the latest. - context: Optional context to pass to the schema during validation. + context: Optional Pydantic validation context; see ``load_dict`` for semantics. Returns: A tuple containing the parsed configuration instance and the revision number. From 99bbf53ad1f8de3f20f32f36011e57a2aef270c9 Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Fri, 27 Mar 2026 11:02:24 +0100 Subject: [PATCH 7/8] Update cognite/extractorutils/unstable/configuration/loaders.py Co-authored-by: Trygve Utstumo --- cognite/extractorutils/unstable/configuration/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index 948f11a0..a3921970 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -122,7 +122,7 @@ def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T], context stream: A text stream containing the configuration data. 
file_format: The format of the configuration data. schema: The schema class to parse the configuration into. - context: Optional context to pass to the schema during validation. + context: Optional Pydantic validation context; see ``load_dict`` for semantics. Returns: An instance of the schema populated with the configuration data. From 0efdd04dc3d71399e53d0056f32766be7ce866ba Mon Sep 17 00:00:00 2001 From: Babatunde Aromire Date: Fri, 27 Mar 2026 11:02:54 +0100 Subject: [PATCH 8/8] Update cognite/extractorutils/unstable/configuration/loaders.py Co-authored-by: Trygve Utstumo --- cognite/extractorutils/unstable/configuration/loaders.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cognite/extractorutils/unstable/configuration/loaders.py b/cognite/extractorutils/unstable/configuration/loaders.py index a3921970..2f2e849d 100644 --- a/cognite/extractorutils/unstable/configuration/loaders.py +++ b/cognite/extractorutils/unstable/configuration/loaders.py @@ -169,7 +169,13 @@ def load_dict(data: dict, schema: type[_T], context: dict[str, Any] | None = Non Args: data: A dictionary containing the configuration data. schema: The schema class to parse the configuration into. - context: Optional context to pass to the schema during validation. + context: Optional Pydantic validation context: forwarded to + ``schema.model_validate(..., context=...)`` and exposed to validators as + ``ValidationInfo.context``. Pydantic reuses one dict for the entire validation + run, so validators can add or change keys to pass data to validators that run + later (for example a model validator stashing derived data for nested field + validators). The dict you pass in is therefore mutated in place; pass a fresh + dict if you need the original object unchanged after load. Returns: An instance of the schema populated with the configuration data.