From d538a6cdf9054d42f998d2b99af2ef8597097659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=AF=D0=BA=D0=B8=D0=BC=D0=B5=D0=BD=D0=BA=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=90=D0=BB=D0=B5=D0=BA?= =?UTF-8?q?=D1=81=D0=B0=D0=BD=D0=B4=D1=80=D0=BE=D0=B2=D0=B8=D1=87?= Date: Tue, 7 Apr 2026 15:02:32 +0300 Subject: [PATCH 1/3] [DOP-23676] add external_id and external_url for Dataset --- .../2026-04-07_947c82ba59ba_external_url.py | 28 +++++++++++++++++++ data_rentgen/db/models/dataset.py | 10 +++++++ data_rentgen/server/schemas/v1/dataset.py | 7 +++++ data_rentgen/server/services/dataset.py | 4 +++ data_rentgen/server/utils/lineage_response.py | 4 +++ .../test_server/fixtures/factories/dataset.py | 2 ++ .../test_lineage/test_column_lineage.py | 4 +++ .../test_lineage/test_dataset_lineage.py | 22 +++++++++++++++ tests/test_server/utils/convert_to_json.py | 2 ++ 9 files changed, 83 insertions(+) create mode 100644 data_rentgen/db/migrations/versions/2026-04-07_947c82ba59ba_external_url.py diff --git a/data_rentgen/db/migrations/versions/2026-04-07_947c82ba59ba_external_url.py b/data_rentgen/db/migrations/versions/2026-04-07_947c82ba59ba_external_url.py new file mode 100644 index 00000000..303a3532 --- /dev/null +++ b/data_rentgen/db/migrations/versions/2026-04-07_947c82ba59ba_external_url.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2024-present MTS PJSC +# SPDX-License-Identifier: Apache-2.0 +"""Add external_url and external_id for datasets + +Revision ID: 947c82ba59ba +Revises: 4e119cb7481e +Create Date: 2026-04-07 14:16:26.411705 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "947c82ba59ba" +down_revision = "4e119cb7481e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column("dataset", sa.Column("external_id", sa.String(), nullable=True)) + op.add_column("dataset", sa.Column("external_url", sa.String(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("dataset", "external_url") + op.drop_column("dataset", "external_id") diff --git a/data_rentgen/db/models/dataset.py b/data_rentgen/db/models/dataset.py index fb92cdc7..23e6e360 100644 --- a/data_rentgen/db/models/dataset.py +++ b/data_rentgen/db/models/dataset.py @@ -50,6 +50,16 @@ class Dataset(Base): lazy="noload", doc="Dataset tag values", ) + external_id: Mapped[str | None] = mapped_column( + String, + nullable=True, + doc="External ID for integration with other systems", + ) + external_url: Mapped[str | None] = mapped_column( + String, + nullable=True, + doc="External link to other systems", + ) search_vector: Mapped[str] = mapped_column( TSVECTOR, diff --git a/data_rentgen/server/schemas/v1/dataset.py b/data_rentgen/server/schemas/v1/dataset.py index bc40b87e..cb762805 100644 --- a/data_rentgen/server/schemas/v1/dataset.py +++ b/data_rentgen/server/schemas/v1/dataset.py @@ -30,10 +30,17 @@ class DatasetSchemaV1(BaseModel): model_config = ConfigDict(from_attributes=True) +class ExternalUrlResponseV1(BaseModel): + url: str + model_config = ConfigDict(from_attributes=True, extra="ignore") + + class DatasetResponseV1(BaseModel): id: str = Field(description="Dataset id", coerce_numbers_to_str=True) location: LocationResponseV1 = Field(description="Corresponding Location") name: str = Field(description="Dataset name") + external_id: str | None = Field(description="External ID for integration with other systems") + external_url: str | None = Field(description="Link to dataset in a external system") schema: DatasetSchemaV1 | None = Field( # type: ignore[assignment] description="Schema", default=None, diff --git a/data_rentgen/server/services/dataset.py b/data_rentgen/server/services/dataset.py index 2e6f3ad4..c37c466b 100644 --- a/data_rentgen/server/services/dataset.py +++ b/data_rentgen/server/services/dataset.py @@ -18,6 +18,8 @@ class DatasetData: id: int name: str location: Location + external_id: str | None + external_url: str | None @dataclass @@ -66,6 +68,8 @@ async def paginate( id=dataset.id, name=dataset.name, location=dataset.location, + external_id=dataset.external_id, + external_url=dataset.external_url, ), tags=[ TagData( diff --git a/data_rentgen/server/utils/lineage_response.py b/data_rentgen/server/utils/lineage_response.py index 2f07e3eb..44635db8 100644 --- a/data_rentgen/server/utils/lineage_response.py +++ b/data_rentgen/server/utils/lineage_response.py @@ -268,6 +268,8 @@ def _get_datasets( id=str(dataset.id), location=LocationResponseV1.model_validate(dataset.location), name=dataset.name, + external_id=dataset.external_id, + external_url=dataset.external_url, schema=schema, ) return datasets @@ -330,6 +332,8 @@ def _get_datasets_with_dataset_granularity( id=str(dataset.id), location=LocationResponseV1.model_validate(dataset.location), name=dataset.name, + external_id=dataset.external_id, + external_url=dataset.external_url, schema=schema, ) return datasets diff --git a/tests/test_server/fixtures/factories/dataset.py b/tests/test_server/fixtures/factories/dataset.py index 6859fe66..59ff4090 100644 --- a/tests/test_server/fixtures/factories/dataset.py +++ b/tests/test_server/fixtures/factories/dataset.py @@ -24,6 +24,8 @@ def dataset_factory(**kwargs): "id": randint(0, 10000000), "location_id": randint(0, 10000000), "name": random_string(32), + "external_id": random_string(), + "external_url": None, } data.update(kwargs) diff --git a/tests/test_server/test_lineage/test_column_lineage.py b/tests/test_server/test_lineage/test_column_lineage.py index 35909a15..0c741a86 100644 --- a/tests/test_server/test_lineage/test_column_lineage.py +++ b/tests/test_server/test_lineage/test_column_lineage.py @@ -1342,6 +1342,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_column_lineage( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.outputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -1482,6 +1484,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_column_lineage_f "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.outputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, diff --git a/tests/test_server/test_lineage/test_dataset_lineage.py b/tests/test_server/test_lineage/test_dataset_lineage.py index 8702d2d2..c1f073de 100644 --- a/tests/test_server/test_lineage/test_dataset_lineage.py +++ b/tests/test_server/test_lineage/test_dataset_lineage.py @@ -341,6 +341,8 @@ async def test_get_dataset_lineage_with_granularity_dataset( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -417,6 +419,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_direction( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -485,6 +489,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_depth( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -563,6 +569,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_symlinks( if dataset.id in inputs_by_dataset_id or dataset.id in outputs_by_dataset_id else None ), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -633,6 +641,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_and_until( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, } for dataset in datasets }, @@ -1495,6 +1505,8 @@ async def test_get_dataset_lineage_unmergeable_schema_and_output_type( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema_to_json(response_schema, "LATEST_KNOWN"), + "external_id": dataset.external_id, + "external_url": dataset.external_url, }, }, "jobs": jobs_to_json(jobs), @@ -1685,18 +1697,24 @@ async def test_get_dataset_lineage_with_granularity_dataset_without_output_schem "name": lineage_dataset.name, "location": location_to_json(lineage_dataset.location), "schema": schema_to_json(response_schema, "EXACT_MATCH"), + "external_id": lineage_dataset.external_id, + "external_url": lineage_dataset.external_url, }, str(datasets[0].id): { "id": str(datasets[0].id), "name": datasets[0].name, "location": location_to_json(datasets[0].location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": datasets[0].external_id, + "external_url": datasets[0].external_url, }, str(datasets[2].id): { "id": str(datasets[2].id), "name": datasets[2].name, "location": location_to_json(datasets[2].location), "schema": schema_to_json(lineage.inputs[0].schema, "EXACT_MATCH"), + "external_id": datasets[2].external_id, + "external_url": datasets[2].external_url, }, }, "jobs": {}, @@ -1749,6 +1767,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_ignore_self_referenc "name": dataset.name, "location": location_to_json(dataset.location), "schema": None, + "external_id": dataset.external_id, + "external_url": dataset.external_url, }, }, "jobs": {}, @@ -1802,6 +1822,8 @@ async def test_get_dataset_lineage_with_granularity_dataset_ignore_not_connected "name": dataset.name, "location": location_to_json(dataset.location), "schema": None, + "external_id": dataset.external_id, + "external_url": dataset.external_url, }, }, "jobs": {}, diff --git a/tests/test_server/utils/convert_to_json.py b/tests/test_server/utils/convert_to_json.py index c1cd6cdb..ee4ac5c6 100644 --- a/tests/test_server/utils/convert_to_json.py +++ b/tests/test_server/utils/convert_to_json.py @@ -232,6 +232,8 @@ def dataset_to_json( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema, + "external_id": dataset.external_id or None, + "external_url": dataset.external_url or None, } From 21c51bb1310b21b30f340aa28a89037251cb1850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=AF=D0=BA=D0=B8=D0=BC=D0=B5=D0=BD=D0=BA=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=90=D0=BB=D0=B5=D0=BA?= =?UTF-8?q?=D1=81=D0=B0=D0=BD=D0=B4=D1=80=D0=BE=D0=B2=D0=B8=D1=87?= Date: Tue, 7 Apr 2026 15:07:02 +0300 Subject: [PATCH 2/3] [DOP-23676] add external_id and external_url for Dataset --- docs/changelog/next_release/432.improvement.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/changelog/next_release/432.improvement.rst diff --git a/docs/changelog/next_release/432.improvement.rst b/docs/changelog/next_release/432.improvement.rst new file mode 100644 index 00000000..a640d200 --- /dev/null +++ b/docs/changelog/next_release/432.improvement.rst @@ -0,0 +1 @@ +Added optional ``external_id`` and ``external_url`` fields on datasets (database, API responses) for linking datasets to external systems. From 5437e5b9516f01b3f9ce73217edce30b9a20abe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=AF=D0=BA=D0=B8=D0=BC=D0=B5=D0=BD=D0=BA=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=90=D0=BB=D0=B5=D0=BA?= =?UTF-8?q?=D1=81=D0=B0=D0=BD=D0=B4=D1=80=D0=BE=D0=B2=D0=B8=D1=87?= Date: Tue, 7 Apr 2026 15:26:31 +0300 Subject: [PATCH 3/3] [DOP-23676] fixes --- data_rentgen/server/schemas/v1/dataset.py | 5 ----- tests/test_server/utils/convert_to_json.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/data_rentgen/server/schemas/v1/dataset.py b/data_rentgen/server/schemas/v1/dataset.py index cb762805..6e1b6d15 100644 --- a/data_rentgen/server/schemas/v1/dataset.py +++ b/data_rentgen/server/schemas/v1/dataset.py @@ -30,11 +30,6 @@ class DatasetSchemaV1(BaseModel): model_config = ConfigDict(from_attributes=True) -class ExternalUrlResponseV1(BaseModel): - url: str - model_config = ConfigDict(from_attributes=True, extra="ignore") - - class DatasetResponseV1(BaseModel): id: str = Field(description="Dataset id", coerce_numbers_to_str=True) location: LocationResponseV1 = Field(description="Corresponding Location") diff --git a/tests/test_server/utils/convert_to_json.py b/tests/test_server/utils/convert_to_json.py index ee4ac5c6..dfff6ef5 100644 --- a/tests/test_server/utils/convert_to_json.py +++ b/tests/test_server/utils/convert_to_json.py @@ -232,8 +232,8 @@ def dataset_to_json( "name": dataset.name, "location": location_to_json(dataset.location), "schema": schema, - "external_id": dataset.external_id or None, - "external_url": dataset.external_url or None, + "external_id": dataset.external_id, + "external_url": dataset.external_url, }